# SETUP

In [4]:
from os import listdir
from os.path import isfile, join
import pandas as pd
import re
import json
import requests
from datetime import datetime, date, timedelta
import os
from langdetect import detect
import openpyxl
import camelot
import openai
import spacy


# SCRAPPING (Antoine)

In [5]:
now = datetime.now() 
today = now.strftime("%Y-%m-%d")
yesterday = (now - timedelta(days=365)).strftime("%Y-%m-%d") #new files not added often so increase the days number in timedelta to avoid empty string

#search page url:
url = "https://public-search.emploi.belgique.be/website-service/joint-work-convention/search"

#download page that will be added to each document name to have a full downloadable link
dl_url = "https://public-search.emploi.belgique.be/website-download-service/joint-work-convention/"

#request - 
# If you want to filter on a specific CP (here 200), instead of dates. Both dates and CP filters can also be combined in the 'json' dict parameter
# r = requests.post(url,json={"jc":"2000000"})
r = requests.post(url,json={"jc":"2000000","signatureDate": {'start': "2018-01-01T00:00:00.000Z", 'end': today+"T00:00:00.000Z"}})

data = r.json()

#function that checks if json file already exists
def where_json(file_name):
    return os.path.exists(file_name)

#checking json file and opening it if it exists
if where_json("data.json"):
    with open("data.json","r") as file:
        existing_data = json.loads(file.read())
else: #creating an empty list
    existing_data = []

new_data = []
for item in data:


    #checks if entry already existing in database
    if not any(d['depositNumber'] == item['depositNumber'] for d in existing_data):
        split = item['documentLink'].split('/')

        #gets the Commission Paritaire number
        item['CPnumber'] = split[0]

        #gets the file number
        item['DocNumber'] = split[1][:-4]

        #replaces name of the file with complete downloadable link
        item['documentLink'] = dl_url + item['documentLink']

        #downloads the pdf from link
        response = requests.get(item['documentLink'])
        if response.status_code == 200:

            #saves the pdf in directory depending on CP number
            if not os.path.exists(f"{item['CPnumber']}"):      
                os.makedirs(f"{item['CPnumber']}")

            file_path = os.path.join(f"{item['CPnumber']}",os.path.basename(item['documentLink']))

            with open(file_path, 'wb') as f:
                f.write(response.content)

        #append item data dict to list
        new_data.append(item)

        #add email alert here??? with download link?
    
    #replaces the previous existing data with new data
    if new_data: 
        existing_data = new_data

#save list as json file
json_object = json.dumps(existing_data, indent=4)

with open("data.json", "w") as outfile:
    outfile.write(json_object)

# OCR (Piero)

In [6]:
# add ocr code 

def ocr_fr_detect_v2(file):
    """ 
    This function takes a pdf file as an input and outputs a txt file with the same name.
    The txt file contains only the french text contained in the pdf document.
    Takes approximatly 20 seconds for 6 pages
    """ 
    vowels = ['a','e','i','o','u']
    fr = []
    duch = {'da', 'sl', 'de', 'nl', 'et' ,'no', 'af','fi', 'tl', 'sv', 'so'}
    french = {'hr', 'ca', 'fr','ro', 'it', 'lv', 'en', 'es', 'cy'}
    # check the file extension
    if file.endswith(".pdf"):
        #print(file) # debug, prints filename
        tables = camelot.read_pdf(file, flavor='stream' , pages= 'all', edge_tol=0)
        # for every detected table (page and text structure)
        for i in range(len(tables)):
            col_lang = []
            # make a df
            data = tables[i].df
            # replace new line (\n) with space
            data.replace('\\n',' ',regex=True, inplace = True)
            # for every column detected
            for j in range(len(data.columns)):
                # put all the text of that column in a list # this takes also out empty rows and lone numbers (as pagenumber)
                text_list = [x for x in tables[i].df[j].values if x != '' if not x.isdigit()] 
                # convert the list to text
                col_text = (' '.join(text_list))
                # if there is at least one vowel (we cannot detect language for numbers)
                if any(char in vowels for char in col_text):
                    # detect language
                    try:
                        language = detect(col_text)
                        col_lang.append(language)
                    except:
                        col_lang.append('Error')
                        #print("This row throws and error:", i, j, col_text)
                    
                else:
                    col_lang.append('None')
            #print(col_lang)
            for k in range(len(data)):
                # put all the text of that column in a list # this takes also out empty rows and lone numbers (as pagenumber)
                #text_list = [x for x in tables[i].df[j].values if x != '' if not x.isdigit()] 
                # for every columns in the row 
                for g in range(len(data.columns)): 
                    text = tables[i].df[g].values[k] 
                    language = col_lang[g]
                    if text == '':
                        pass
                    elif language in french:
                        #print(language,': ', text)
                        fr.append(text)
                    elif language in duch or language == 'None':
                        #print(language,': ', text)
                        pass
                    else: 
                        pass                            
        # prepare the text
        french_text = (' '.join(fr))
        #reunite halved words
        french_text = french_text.replace("- ", "")
        text_file = os.path.basename(os.path.splitext(file)[0] + "_fr.txt")
        filepath = os.path.join("200/", text_file)
        # Outputs the french text in a text file
        with open(filepath, "w") as output:
            output.write(french_text)
    else:
        print('not a pdf')
        pass

all_pdf_titles = [f for f in listdir("200") if isfile(join("200", f))]
for pdf in all_pdf_titles :
    ocr_fr_detect_v2(f"200/{pdf}")


# CLASSIFICATION (Olivier)

In [7]:
#import the classification table
df_mapping = pd.read_excel('Classification_excel.xlsx', index_col=None)
df_mapping["Key words"] = df_mapping["Key words"].str.lower()

#Create df for each of the 4 categories
df_type = df_mapping.loc[df_mapping['Category'] == 'CLA type']
df_status = df_mapping.loc[df_mapping['Category'] == 'CLA status']
df_sector = df_mapping.loc[df_mapping['Category'] == 'Sector']
df_theme = df_mapping.loc[df_mapping['Category'] == 'Theme']

In [8]:
#Collect the list of pdfs titles

all_pdf_titles = []
for file in os.listdir("200/"):
    if file.endswith(".txt"):
        all_pdf_titles.append(file)
# all_pdf_titles

In [9]:
#Create a dictionnary with PDF ID and the four classifications (type, status, sector and theme)
dict_pdf = {}
for pdf in all_pdf_titles :
    pdf_class = {}
    pdf_txt = open(f"200/{pdf}", 'r')
    pdf_words = pdf_txt.read().lower()

    # Add the type to the list pdf_class
    class_type_label_cnt = df_type.loc[df_type['Key words'] == "www.cnt-nar.be", 'Class'].to_string(index=False)
    class_type_label_ind = df_type.loc[df_type['Key words'].isnull(), 'Class'].to_string(index=False)

    elements = df_type["Key words"].dropna().tolist()
    for elem in elements :
        if elem in pdf_words:
            if class_type_label_cnt not in pdf_class :
                pdf_class["CLA type"] = class_type_label_cnt
                break
        else : 
            if class_type_label_ind not in pdf_class :
                pdf_class["CLA type"] = class_type_label_ind

    # Add the status to the list pdf_class
    class_status_label_update = df_status.loc[df_status['Key words'] == "erratum", 'Class'].to_string(index=False)
    class_status_label_new = df_status.loc[df_status['Key words'].isnull(), 'Class'].to_string(index=False)

    elements = df_status["Key words"].dropna().tolist()
    for elem in elements :
        if elem in pdf_words:
            if class_status_label_update not in pdf_class :
                pdf_class["CLA status"] = class_status_label_update
                break
        else : 
            if class_status_label_new not in pdf_class :
                pdf_class["CLA status"] = class_status_label_new

    # Add the sector to the list pdf_class
    class_sector_label = df_sector['Class'].to_list()
    result_2 = []
    elements = df_sector["Key words"].dropna().tolist()
    for elem in elements :
        if elem in pdf_words:
            result_2.append(True)
        else : 
            result_2.append(False)

    if True in result_2 :
        pdf_class["Sector"] = df_sector.loc[df_sector["Key words"]== elem, "Class"].to_string(index=False)
    else :
        if "Sector not specified" not in pdf_class :
            pdf_class["Sector"] = "Sector not specified"

    #Add the theme to the list pdf_class

    class_theme_label = df_theme['Class'].to_list()
    result_3 = []
    elements = df_theme["Key words"].dropna().tolist()
    for elem in elements :
        if elem in pdf_words:
            result_3.append(True)
        else : 
            result_3.append(False)
    class_theme = []
    if True in result_3 :
        for elem in elements :
            if elem in pdf_words :
                if df_theme.loc[df_theme["Key words"]== elem, "Class"].to_string(index=False) not in pdf_class :
                    class_theme.append((df_theme.loc[df_theme["Key words"]== elem, "Class"].to_string(index=False)))
                    pdf_class["Theme"] = class_theme
    else :
        if "Unknown_theme" not in pdf_class :
            pdf_class["Theme"] =  "Unknown_theme"
    dict_pdf[pdf] = pdf_class
# dict_pdf


# TEXT SUMMARY WITH CHATGPT (Tania)

In [10]:
def get_summary_long_file(filename):

    with open (filename, "rb") as f:
        text_from_file = f.read()

    text_to_summarize = str(text_from_file)

    # Set the model to use
    model_engine = "text-davinci-003"

    # Set the maximum context length (in tokens) allowed by the model
    max_context_length = 2048

    # Split the text into chunks of the maximum allowed context length
    text = text_to_summarize
    text_chunks = [text[i:i+max_context_length] for i in range(0, len(text), max_context_length)]

    # Send each chunk of text to the model and store the results
    results = []
    for chunk in text_chunks:
        response = openai.Completion.create(
            engine=model_engine,
            prompt=chunk,
            max_tokens=1024,
            temperature=0.5,
        )
        results.append(response["choices"][0]["text"])

    # Concatenate the results into a single string
    result_text = "".join(results)

    # Prompt
    prompt = f"Résume ce document en français: {result_text}"

    # Use the model to generate a summary of the text
    summary_response = openai.Completion.create(
        engine=model_engine,
        prompt=prompt,
        max_tokens=1024,
        temperature=0.5,
    )

    summary = summary_response["choices"][0]["text"]

    cla_summary_json[filename] = summary

In [11]:
chatGPT_api_key = "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"
# Set the API key
openai.api_key = chatGPT_api_key
#Collect the list of pdfs titles
all_txt_titles = []
for file in os.listdir("200/"):
    if file.endswith(".txt"):
        all_txt_titles.append(file)
# all_pdf_titles
all_summaries = {}
for txt in all_txt_titles :
    file_txt = open(f"200/{txt}", 'r')
    txt_string = file_txt.read().lower()
    txt_string = txt_string[:10000]
    prompt = f"Résume ce document: {txt_string}"
    response = openai.Completion.create(
    engine="text-davinci-003",
    prompt=prompt,
    max_tokens=590,
    n=1,
    stop=None,
    temperature=0.5)
    summary = response["choices"][0]["text"]
    all_summaries[txt] = summary
all_summaries

{'200-2021-013464_fr.txt': ".\n\nCette convention collective de travail du 18 novembre 2021 a été conclue au sein de la commission paritaire auxiliaire pour employés afin de modifier la convention collective du 9 juin 2016 concernant la prime de fin d'année. Elle remplace la deuxième condition relative à l'ancienneté, qui se trouve dans l'article 3 de la convention collective du 9 juin 2016, par une ancienneté d'au moins six mois au moment du paiement de la prime, et prend en compte les périodes d'occupation intérimaire pour le calcul de l'ancienneté. La présente convention collective de travail entre en vigueur le 1er janvier 2021 et est conclue pour une durée indéterminée. Elle peut être dénoncée par une des parties, moyennant un préavis de 3 mois, adressé par lettre recommandée au président de la commission paritaire auxiliaire pour employés.",
 '200-2021-013468_fr.txt': "ioll collective de travail entre en vigueur le 1er janvier 2022 et prend fin le 31 décembre 2023.\n\nCette conve

In [12]:
df_all_summaries = pd.DataFrame([all_summaries]).T.reset_index(level=0)
df_all_summaries.rename(columns={df_all_summaries.columns[0]: "filename"}, inplace = True)
df_all_summaries.rename(columns={df_all_summaries.columns[1]: "summary"}, inplace = True)
df_all_summaries["filename"] = df_all_summaries["filename"].str.rstrip('_fr.txt')
df_all_summaries


Unnamed: 0,filename,summary
0,200-2021-013464,.\n\nCette convention collective de travail du...
1,200-2021-013468,ioll collective de travail entre en vigueur le...
2,200-2021-013476,\n\nCette convention collective de travail est...
3,200-2021-013474,\n\nCette convention collective de travail s'a...
4,200-2021-013466,\n\nCe document est une convention collective ...
5,200-2018-013057,\n\nLa présente convention collective de trava...
6,200-2019-010016,onné. §4 les parties s'engagent à encourager ...
7,200-2019-010020,"\n\nLe Service Public Fédéral Emploi, Travail ..."
8,200-2021-011085,\n\nLa présente convention collective de trava...
9,200-2019-010014,- donné le temps jusqu'au 31/12/2022 de trouve...


# PARENTS DETECTION (imported Rafaella)

In [13]:
#no code --> import from another dataframe.
df_parents = pd.read_csv("df_full.csv")

df_parents["pdf_file_name"] = df_parents["pdf_file_name"].str.rstrip('.pdf')
df_parents.rename(columns={'pdf_file_name': 'filename'}, inplace = True)
del df_parents['Unnamed: 0']
df_parents['filename'] = df_parents['filename'].replace(['ALL/200/200-2021-011262'], '200-2021-011262')
df_parents['filename'] = df_parents['filename'].replace(['ALL/200/200-2019-010021'], '200-2019-010021')
df_parents['filename'] = df_parents['filename'].replace(['ALL/200/200-2019-009410'], '200-2019-009410')
df_parents['filename'] = df_parents['filename'].replace(['ALL/200/200-2021-013468'], '200-2021-013468')
df_parents['filename'] = df_parents['filename'].replace(['ALL/200/200-2019-010019'], '200-2019-010019')
df_parents = df_parents[["filename", "parent_name"]]
df_parents["parent_name"] = df_parents["parent_name"].str.rstrip('.pdf')
df_parents.drop(df_parents.columns[0], axis=1)
df_parents = df_parents.loc[:,~df_parents.columns.duplicated()].copy()
df_parents
#remove index because no more datafram

Unnamed: 0,filename,parent_name
0,200-2021-013471,
1,200-2021-011262,
2,200-2021-011085,
3,200-2020-000391,200-2019-010017
4,200-2019-010016,
5,200-2018-013526,
6,200-2019-010021,
7,200-2021-013465,
8,200-2019-010018,
9,200-2021-013472,


# MERGE METADATA / CLASSIFICATION / SUMMARY

In [14]:
# Create DF for classification
df_dict_pdf = pd.DataFrame(dict_pdf).T.reset_index(level=0)
df_dict_pdf.rename(columns={df_dict_pdf.columns[0]: "filename"}, inplace = True)
df_dict_pdf["filename"] = df_dict_pdf["filename"].str.rstrip('.pdf_fr.txt')

In [15]:
# Create DF with meta
with open("data.json","r") as file:
    existing_data = json.loads(file.read())
df_csv=pd.DataFrame(existing_data)
df_csv.rename(columns={df_csv.columns[25]: "filename"}, inplace = True)
# df_csv = df_csv[['themesFr','themesNl','filename']]
df_csv["filename"] = df_csv["filename"].str.rstrip('.pdf')
df_csv = df_csv.loc[df_csv["filename"].str.startswith('200', na=False)]
# df_csv
# df.to_csv('test.csv')

In [38]:
# Merge both df on filename
import numpy as np
merged_df = pd.merge(df_csv, df_dict_pdf, on="filename")

merged_df2 = pd.merge(merged_df, df_all_summaries, on="filename")
merged_df3 = pd.merge(merged_df2, df_parents, on= "filename")
merged_df3 = merged_df3[["filename","jcId", "titleFr", "validityDate", "depositDate", "CLA type", "CLA status", "Sector", "Theme", "summary", "parent_name"]]

merged_df3 = merged_df3.replace({np.nan:None})
merged_df3.head()

Unnamed: 0,filename,jcId,titleFr,validityDate,depositDate,CLA type,CLA status,Sector,Theme,summary,parent_name
0,200-2022-009993,2000000,Pension complémentaire pour les employés de l'...,,2022-05-18T10:00:00.000+00:00,Concluded on industry level,New,"LA SELLERIE, DE LA FABRICATION DE COURROIES ET...","[Salaire, Fin au contrat de travail et chômage...",", le plus grand nombre d'ouvriers (exprimé en ...",
1,200-2021-015135,2000000,Prime corona,2022-12-31T11:00:00.000+00:00,2021-12-21T11:00:00.000+00:00,Concluded on industry level,New,Sector not specified,[Primes et indemnités pour le travailleur actif],er août 2021. Cette convention collective de t...,200-2021-013469
2,200-2021-013476,2000000,Dispense de l'obligation de disponibilité adap...,2024-12-31T11:00:00.000+00:00,2021-11-24T11:00:00.000+00:00,Concluded on industry level,New,"LA SELLERIE, DE LA FABRICATION DE COURROIES ET...","[Mesures pour l'emploi et groupes cibles, Mesu...",\n\nCette convention collective de travail est...,
3,200-2021-013464,2000000,Modification de la CCT concernant la prime de ...,,2021-11-24T11:00:00.000+00:00,Concluded on industry level,New,Sector not specified,"[Fin au contrat de travail et chômage, Primes ...",.\n\nCette convention collective de travail du...,
4,200-2021-013465,2000000,Pouvoir d'achat,,2021-11-24T11:00:00.000+00:00,Concluded on industry level,New,Sector not specified,"[Salaire, Salaire, Salaire, Salaire, Fin au co...",\n\nCette convention collective de travail s'a...,


In [39]:
merged_df3.to_csv('meta_data_classification_summary.csv', index=False)

# COMPARISON OF SUMMARIES

In [40]:
list_filename = merged_df3["filename"].tolist()
list_parent_name = merged_df3["parent_name"].tolist()
dico_parents = dict(zip(list_filename, list_parent_name))
dico_parents

{'200-2022-009993': None,
 '200-2021-015135': '200-2021-013469',
 '200-2021-013476': None,
 '200-2021-013464': None,
 '200-2021-013465': None,
 '200-2021-013467': None,
 '200-2021-013468': None,
 '200-2021-013471': None,
 '200-2021-013472': None,
 '200-2021-013473': None,
 '200-2021-013474': None,
 '200-2021-013475': None,
 '200-2021-013463': '200-2020-000391',
 '200-2021-013466': None,
 '200-2021-013469': None,
 '200-2021-011262': None,
 '200-2021-011085': None,
 '200-2020-012501': None,
 '200-2020-009333': None,
 '200-2020-009334': '200-2019-010016',
 '200-2020-000391': '200-2019-010017',
 '200-2019-010014': None,
 '200-2019-010015': None,
 '200-2019-010016': None,
 '200-2019-010017': None,
 '200-2019-010018': None,
 '200-2019-010019': None,
 '200-2019-010020': None,
 '200-2019-010021': None,
 '200-2019-009410': None,
 '200-2018-013526': None,
 '200-2018-013527': None,
 '200-2018-013057': None}

In [42]:
# Comparin the summaries of CLA and their parents   
comparison_summary = {}
for key, value in dico_parents.items():
    if value is None :
        comparison_summary[key]= "No parent CLA"
    else :
        summary = merged_df3.loc[merged_df3['filename'] == key, 'summary']
        summary_2 = merged_df3.loc[merged_df3['filename'] == value, 'summary']
        # Define the prompt
        prompt = f"Résume en quelques mots les différences entre {summary} et {summary_2}"

        # Query the API
        response = openai.Completion.create(
            engine="text-davinci-003",
            prompt=prompt,
            max_tokens=1024,
            n=1,
            stop=None,
            temperature=0.5
        )

        # Print the response
        diffs = response["choices"][0]["text"]
        comparison_summary[key]= diffs
        
comparison_summary


{'200-2022-009993': 'No parent CLA',
 '200-2021-015135': '\n\n1er août 2021: CCT pour les salariés du secteur privé; 14: CCT pour les salariés du secteur public.',
 '200-2021-013476': 'No parent CLA',
 '200-2021-013464': 'No parent CLA',
 '200-2021-013465': 'No parent CLA',
 '200-2021-013467': 'No parent CLA',
 '200-2021-013468': 'No parent CLA',
 '200-2021-013471': 'No parent CLA',
 '200-2021-013472': 'No parent CLA',
 '200-2021-013473': 'No parent CLA',
 '200-2021-013474': 'No parent CLA',
 '200-2021-013475': 'No parent CLA',
 '200-2021-013463': '\n\nLa première différence concerne la date, le 23 avril 2019 pour la première et le 20 mai 2019 pour la seconde. La seconde différence concerne le contenu, la première est une convention collective et la seconde est une modification des frais de transport.',
 '200-2021-013466': 'No parent CLA',
 '200-2021-013469': 'No parent CLA',
 '200-2021-011262': 'No parent CLA',
 '200-2021-011085': 'No parent CLA',
 '200-2020-012501': 'No parent CLA',


In [44]:
#Create a dataframe with the result before to merge to the whole database
df = pd.DataFrame([comparison_summary]).T.reset_index(level=0)
df.rename(columns={'index': 'filename'}, inplace = True)
# df.rename(columns={'0': 'parent_comparison'}, inplace = True)
df.columns.values[1] = "parent_comparison"
df

Unnamed: 0,filename,parent_comparison
0,200-2022-009993,No parent CLA
1,200-2021-015135,\n\n1er août 2021: CCT pour les salariés du se...
2,200-2021-013476,No parent CLA
3,200-2021-013464,No parent CLA
4,200-2021-013465,No parent CLA
5,200-2021-013467,No parent CLA
6,200-2021-013468,No parent CLA
7,200-2021-013471,No parent CLA
8,200-2021-013472,No parent CLA
9,200-2021-013473,No parent CLA


# FINAL LOAD TO CSV

In [45]:
merged_df4 = pd.merge(merged_df3, df, on= "filename")
merged_df4

Unnamed: 0,filename,jcId,titleFr,validityDate,depositDate,CLA type,CLA status,Sector,Theme,summary,parent_name,parent_comparison
0,200-2022-009993,2000000,Pension complémentaire pour les employés de l'...,,2022-05-18T10:00:00.000+00:00,Concluded on industry level,New,"LA SELLERIE, DE LA FABRICATION DE COURROIES ET...","[Salaire, Fin au contrat de travail et chômage...",", le plus grand nombre d'ouvriers (exprimé en ...",,No parent CLA
1,200-2021-015135,2000000,Prime corona,2022-12-31T11:00:00.000+00:00,2021-12-21T11:00:00.000+00:00,Concluded on industry level,New,Sector not specified,[Primes et indemnités pour le travailleur actif],er août 2021. Cette convention collective de t...,200-2021-013469,\n\n1er août 2021: CCT pour les salariés du se...
2,200-2021-013476,2000000,Dispense de l'obligation de disponibilité adap...,2024-12-31T11:00:00.000+00:00,2021-11-24T11:00:00.000+00:00,Concluded on industry level,New,"LA SELLERIE, DE LA FABRICATION DE COURROIES ET...","[Mesures pour l'emploi et groupes cibles, Mesu...",\n\nCette convention collective de travail est...,,No parent CLA
3,200-2021-013464,2000000,Modification de la CCT concernant la prime de ...,,2021-11-24T11:00:00.000+00:00,Concluded on industry level,New,Sector not specified,"[Fin au contrat de travail et chômage, Primes ...",.\n\nCette convention collective de travail du...,,No parent CLA
4,200-2021-013465,2000000,Pouvoir d'achat,,2021-11-24T11:00:00.000+00:00,Concluded on industry level,New,Sector not specified,"[Salaire, Salaire, Salaire, Salaire, Fin au co...",\n\nCette convention collective de travail s'a...,,No parent CLA
5,200-2021-013467,2000000,Modification de la CCT relative au FSE,2023-12-31T11:00:00.000+00:00,2021-11-24T11:00:00.000+00:00,Concluded on industry level,New,Sector not specified,"[Salaire, Salaire, Mesures pour l'emploi et gr...","\n\nCette convention collective de travail, en...",,No parent CLA
6,200-2021-013468,2000000,Formation 2022-2023,2023-12-31T11:00:00.000+00:00,2021-11-24T11:00:00.000+00:00,Concluded on industry level,New,Sector not specified,"[Salaire, Temps de travail, Recrutement et for...",ioll collective de travail entre en vigueur le...,,No parent CLA
7,200-2021-013471,2000000,Régime de chômage avec complément d'entreprise...,2023-06-30T10:00:00.000+00:00,2021-11-24T11:00:00.000+00:00,Concluded on industry level,New,"LA SELLERIE, DE LA FABRICATION DE COURROIES ET...","[Mesures pour l'emploi et groupes cibles, Fin ...",\n\nCette convention collective de travail s'a...,,No parent CLA
8,200-2021-013472,2000000,Régime de chômage avec complément d'entreprise...,2023-06-30T10:00:00.000+00:00,2021-11-24T11:00:00.000+00:00,Concluded on industry level,New,"LA SELLERIE, DE LA FABRICATION DE COURROIES ET...","[Mesures pour l'emploi et groupes cibles, Fin ...",\n\nLa présente convention collective de trava...,,No parent CLA
9,200-2021-013473,2000000,Crédit-temps 2021-2022,2022-12-31T11:00:00.000+00:00,2021-11-24T11:00:00.000+00:00,Concluded on industry level,New,"LA SELLERIE, DE LA FABRICATION DE COURROIES ET...","[Salaire, Recrutement et formation, Fonds soci...",\n\nCette convention collective de travail du ...,,No parent CLA


In [46]:
df_test = merged_df4.dropna()
list_summary=df_test["parent_comparison"].tolist()
list_summary


['\n\n1er août 2021: CCT pour les salariés du secteur privé; 14: CCT pour les salariés du secteur public.',
 "\n\nErratum 19 concerne la Commission Paritaire et 23 concerne l'engagement des parties à encourager la diversité.",
 '\n\nLes différences sont que le 20 est une modification des frais de transport, tandis que le 24 est un calcul basé sur des chiffres bruts.']

In [47]:
merged_df4.to_csv('VF_meta_data_classification_summary_parents.csv', index=False)