# install libs

In [None]:
!pip install flatten_dict
!pip install unidecode
!pip install gender-guesser

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# define functions

## XML parsing

In [None]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from flatten_dict import flatten

def parse_xml(url, timeout=60):
    """Download an XML document and load it into a DataFrame.

    The document is converted to a nested dict with ``element_to_dict``,
    flattened with ``flatten`` (keys joined by underscores), and each
    flattened key becomes one column of the returned frame.

    Args:
        url: Address of the XML document to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the whole batch loop.

    Returns:
        A ``pandas.DataFrame`` with one column per flattened key.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        xml.etree.ElementTree.ParseError: If the payload is not well-formed XML.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of feeding an error page to the XML parser.
    response.raise_for_status()

    # Parse the raw bytes so ElementTree honours the encoding declared in the
    # XML prolog rather than the encoding requests guessed from the headers.
    root = ET.fromstring(response.content)

    # Nested dict -> flat dict whose keys are the underscore-joined paths.
    flat_dict = flatten(element_to_dict(root), reducer='underscore')

    return pd.DataFrame.from_dict(flat_dict, orient='index').transpose()


def parse_xml_from_text(input_text):
    """Flatten a nested mapping and return it as a DataFrame.

    NOTE(review): despite the parameter name, ``input_text`` is handed straight
    to ``flatten``, so it is presumably expected to be an already-nested dict
    (e.g. the output of ``element_to_dict``) rather than raw XML text -- confirm
    against callers.

    Returns:
        A ``pandas.DataFrame`` with one column per underscore-joined key.
    """
    flattened = flatten(input_text, reducer='underscore')
    return pd.DataFrame.from_dict(flattened, orient='index').T

# Helper function to convert ElementTree object to nested dictionary
def element_to_dict(element):
    """Recursively convert an ElementTree element into ``{tag: content}``.

    ``content`` is a dict that may hold:
      * ``'attributes'`` -- the element's attribute mapping, when non-empty;
      * ``'text'`` -- the stripped text, when it is not blank;
      * one entry per child tag; a tag that occurs several times is collected
        into a list, in document order.
    """
    content = {}

    if element.attrib:
        content['attributes'] = element.attrib

    stripped_text = element.text.strip() if element.text else ''
    if stripped_text:
        content['text'] = stripped_text

    for child in element:
        child_content = element_to_dict(child)[child.tag]

        if child.tag not in content:
            # First time this tag is seen under the current element.
            content[child.tag] = child_content
            continue

        previous = content[child.tag]
        if isinstance(previous, list):
            previous.append(child_content)
        else:
            # Second occurrence: promote the single entry to a list.
            content[child.tag] = [previous, child_content]

    return {element.tag: content}

In [None]:
def explode_and_concat(df, column, explode=True):
    """Expand a column of dicts (or lists of dicts) into flat columns.

    Args:
        df: Source frame; it is copied, never modified.
        column: Name of the column holding the nested records.
        explode: When true, list cells are first exploded to one row per item.

    Returns:
        A new frame with a fresh 0..n-1 index in which ``column`` has been
        replaced by its normalised fields, each named ``f"{column}_{field}"``.
    """
    working = df.copy()
    if explode:
        working = working.explode(column)

    # Flatten every record and namespace the new columns with the source name.
    flattened = pd.json_normalize(working[column]).add_prefix(column + '_')

    # Both sides get a positional index so rows line up one-to-one.
    side_by_side = pd.concat(
        [working.reset_index(drop=True), flattened.reset_index(drop=True)],
        axis=1,
    )

    # The nested column is now redundant.
    return side_by_side.drop(columns=[column])



In [None]:
from tqdm.auto import tqdm

def get_complete_df_from_batch_files(
    n_batches=101,
    url_template=(
        "https://raw.githubusercontent.com/louispaulet/hatvp_viz/main/"
        "datasets/base/xml_batches/declarations_hatvp_batch_{batch}.xml"
    ),
):
    """Download every XML batch file and stack the parsed rows.

    Args:
        n_batches: Number of batch files to fetch; files are numbered from 1
            to ``n_batches`` inclusive. Lower it for a quick smoke test.
        url_template: URL pattern containing a ``{batch}`` placeholder that is
            replaced by the batch number.

    Returns:
        The concatenation of ``get_stocks_for_url`` for each batch. The
        per-batch row index is kept as-is, so index labels repeat across
        batches. An empty frame is returned when ``n_batches`` is 0.
    """
    batch_urls = [url_template.format(batch=number) for number in range(1, n_batches + 1)]
    frames = [get_stocks_for_url(batch_url) for batch_url in tqdm(batch_urls)]

    # pd.concat refuses an empty list, so handle the degenerate case explicitly.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


## data cleaning

In [None]:
def get_stocks_for_url(url):
    """Fetch one XML batch and return one row per declared financial holding.

    Steps:
      1. parse the batch with ``parse_xml``;
      2. normalise the ``declarations_declaration`` records into columns
         (no explode is requested for this step);
      3. keep the identification columns plus the nested holdings column;
      4. explode and normalise the holdings;
      5. shorten the column names (drop the long prefixes and the ``.text``
         suffix, then turn remaining dots into underscores).

    Raises:
        KeyError: If the batch lacks one of the expected declaration columns.
    """
    declaration_col = 'declarations_declaration'
    holdings_col = "declarations_declaration_participationFinanciereDto.items.items"

    declarations = explode_and_concat(parse_xml(url), declaration_col, False)

    wanted_columns = [
        "declarations_declaration_uuid.text",
        "declarations_declaration_dateDepot.text",
        "declarations_declaration_general.declarant.nom.text",
        "declarations_declaration_general.declarant.prenom.text",
        "declarations_declaration_general.declarant.dateNaissance.text",
        "declarations_declaration_general.qualiteDeclarantForPDF.text",
        holdings_col,
    ]
    holdings = explode_and_concat(declarations[wanted_columns], holdings_col, True)

    # Fragments are stripped in this order before dots become underscores.
    fragments_to_strip = (
        'declarations_declaration_general.',
        'declarations_declaration_participationFinanciereDto.items.items_',
        '.text',
    )

    def _shorten(column_name):
        for fragment in fragments_to_strip:
            column_name = column_name.replace(fragment, '')
        return column_name.replace('.', '_')

    holdings.columns = [_shorten(name) for name in holdings.columns]

    # clean_df = clean_stock_dataset(exploded_df)

    return holdings

In [None]:
from unidecode import unidecode

def clean_stock_dataset(clean_df):
    """Return a cleaned, renamed copy of the raw holdings frame.

    The input frame is left untouched (the previous implementation rewrote the
    caller's columns in place). Only the columns that end up in the result are
    processed; ``remuneration`` and ``actiConseil`` used to be normalised and
    then dropped, so they are no longer required to be present.

    Cleaning rules:
      * ``evaluation`` / ``nombreParts``: missing values count as 0, then the
        column is cast to ``int``;
      * ``declarations_declaration_dateDepot``: parsed as
        ``%d/%m/%Y %H:%M:%S``;
      * ``nomSociete``: missing values and any value containing ``'Donnée'``
        become the placeholder ``'Données non publiées'``; names are then
        stripped of accents with ``unidecode`` and upper-cased.

    Returns:
        A new DataFrame with columns ``document_uuid``, ``document_datetime``,
        ``owner_name``, ``owner_surname``, ``owner_birtdate``,
        ``owner_qualification``, ``company_name``, ``position_value`` and
        ``shares_nb`` (the ``owner_birtdate`` spelling is what the rest of the
        pipeline expects).

    Raises:
        KeyError: If one of the source columns is missing.
    """
    # Source column -> published column; dict order fixes the output order.
    column_renames = {
        "declarations_declaration_uuid": "document_uuid",
        "declarations_declaration_dateDepot": "document_datetime",
        "declarant_nom": "owner_name",
        "declarant_prenom": "owner_surname",
        "declarant_dateNaissance": "owner_birtdate",
        "qualiteDeclarantForPDF": "owner_qualification",
        "nomSociete": "company_name",
        "evaluation": "position_value",
        "nombreParts": "shares_nb",
    }
    unpublished_label = 'Données non publiées'

    # Work on a private copy so the caller's frame is never modified.
    cleaned = clean_df[list(column_renames)].copy()

    # A holding without a value / share count is treated as 0.
    for numeric_col in ("evaluation", "nombreParts"):
        cleaned[numeric_col] = cleaned[numeric_col].fillna(0).astype(int)

    cleaned["declarations_declaration_dateDepot"] = pd.to_datetime(
        cleaned["declarations_declaration_dateDepot"], format='%d/%m/%Y %H:%M:%S'
    )

    # Collapse every variant of the "unpublished" message onto one label.
    company = cleaned["nomSociete"].fillna(unpublished_label)
    company = company.mask(company.str.contains('Donnée', na=False), unpublished_label)
    # Accent-free upper case makes spelling variants of a company comparable.
    cleaned["nomSociete"] = company.apply(unidecode).str.upper()

    return cleaned.rename(columns=column_renames)



# clean_df['nomSociete'].value_counts()

## data enrichment

In [None]:
import gender_guesser.detector as gender

def add_gender_guess_col(df):
    """Add a ``predicted_gender`` column guessed from ``owner_surname``.

    ``owner_surname`` is capitalised and cut at the first whitespace or hyphen
    before being handed to ``gender_guesser``. A short manual list then
    resolves names the detector reports as ``"unknown"``. Finally the labels
    are collapsed to three values: ``"male"``, ``"female"`` and
    ``"hybrid or unknown"``.

    The frame is modified in place and also returned.
    """
    detector = gender.Detector()
    scratch_col = 'owner_surname_simplified'

    # Keep only the first component of compound names for the detector.
    df[scratch_col] = (
        df["owner_surname"].apply(str.capitalize).str.split(r'[\s-]').str[0]
    )

    df['predicted_gender'] = (
        df[scratch_col].apply(str.capitalize).map(detector.get_gender)
    )

    # Manual fix for names the detector could not resolve.
    manual_overrides = (
        ("male", [
            'Crescent',
            'Martens',
            'Normane',
            'Yohann',
            'Stephane',
            'Pap',
            'Herve',
            'Francois',
        ]),
        ("female", [
            'Mahel',
            'Lux',
            'Anais',
            'Francoise',
            'Segolene',
        ]),
    )
    for label, known_names in manual_overrides:
        still_unknown = df['predicted_gender'] == "unknown"
        df.loc[still_unknown & df[scratch_col].isin(known_names), "predicted_gender"] = label

    # Collapse the detector's finer-grained labels.
    relabel = {
        "mostly_male": "male",
        "mostly_female": "female",
        "andy": "hybrid or unknown",
        "unknown": "hybrid or unknown",
    }
    df['predicted_gender'] = df['predicted_gender'].map(
        lambda guess: relabel.get(guess, guess)
    )

    # The helper column was only needed for the lookup.
    del df[scratch_col]

    return df

In [None]:
def add_is_popular_stock_col(df, min_count=10):
    """Flag rows whose company appears at least ``min_count`` times.

    The previous implementation went through ``value_counts().to_frame()`` and
    looked the result up by the column names ``'company_name'`` / ``'index'``;
    those names changed in pandas 2.0, which made it raise ``KeyError``. The
    counts are now used directly, which works on every pandas version.

    Args:
        df: Frame with a ``company_name`` column; modified in place.
        min_count: Minimum number of rows for a company to count as popular.

    Returns:
        The same ``df`` with a boolean ``is_popular_stock`` column added.
    """
    occurrences = df['company_name'].value_counts()
    popular_names = occurrences[occurrences >= min_count].index

    df['is_popular_stock'] = df['company_name'].isin(popular_names)
    return df

In [None]:
from datetime import datetime

def fix_birthdate(df, broken_year, expected_year):
    """Replace a mistyped year inside the ``owner_birtdate`` strings.

    Args:
        df: Frame whose ``owner_birtdate`` column still holds date strings;
            modified in place.
        broken_year: Literal text to look for (e.g. ``'0971'``).
        expected_year: Text to put in its place (e.g. ``'1971'``).

    Returns:
        The same ``df``.
    """
    # Literal match, computed once; missing birthdates simply do not match.
    needs_fix = df['owner_birtdate'].str.contains(broken_year, regex=False, na=False)
    df.loc[needs_fix, 'owner_birtdate'] = df.loc[needs_fix, 'owner_birtdate'].str.replace(
        broken_year, expected_year, regex=False
    )
    return df


def add_owner_age_col(df, reference_date=None):
    """Parse ``owner_birtdate`` and add the owner's age in whole years.

    Args:
        df: Frame with ``owner_birtdate`` as ``dd/mm/YYYY`` strings; modified
            in place.
        reference_date: Date the age is computed at. Defaults to
            ``datetime.now()``; pass a fixed value for reproducible output.

    Returns:
        The same ``df`` with ``owner_birtdate`` converted to datetime and a
        float ``owner_age`` column (NaN where the birthdate is missing).
    """
    # Known typos in the source data; they must be fixed before parsing.
    known_year_typos = (
        ('0971', '1971'),
        ('1057', '1957'),
        ('7970', '1970'),
        ('1661', '1961'),
        ('1067', '1967'),
        ('1059', '1959'),
    )
    for broken_year, expected_year in known_year_typos:
        df = fix_birthdate(df, broken_year, expected_year)

    df['owner_birtdate'] = pd.to_datetime(df['owner_birtdate'], format='%d/%m/%Y')

    current_date = datetime.now() if reference_date is None else reference_date
    elapsed = current_date - df['owner_birtdate']

    # Casting to timedelta64[Y] is rejected by pandas >= 2.0, so count whole
    # mean Gregorian years (365.2425 days = 31,556,952 s) explicitly instead.
    seconds_per_year = 31_556_952
    df['owner_age'] = elapsed.dt.total_seconds() // seconds_per_year
    return df

## data filtering

In [None]:
def condense_df_by_removing_empty_rows_and_only_keeping_latest_document_for_each_unique_owner(df):
    """Keep published holdings from each owner's most recent document only.

    Rows whose company name contains ``"DONNEES NON PUBLIEES"`` are dropped.
    Owners are identified by ``(owner_name, owner_surname, owner_birtdate)``;
    for each owner only the rows of the document with the latest
    ``document_datetime`` are kept.

    The previous implementation computed the "latest document" filter but
    discarded its result, so every document was returned.

    Args:
        df: Enriched holdings frame; not modified.

    Returns:
        A new frame sorted by ``document_datetime``, most recent first.
    """
    is_unpublished = df.company_name.str.contains("DONNEES NON PUBLIEES")
    # Copy so the datetime conversion below never writes into a view of ``df``.
    published = df[~is_unpublished].copy()

    published['document_datetime'] = pd.to_datetime(published['document_datetime'])
    published = published.sort_values('document_datetime', ascending=False)

    # After the descending sort, the first row seen for an owner belongs to
    # that owner's most recent document.
    owner_key = ['owner_name', 'owner_surname', 'owner_birtdate']
    latest_uuids = published.drop_duplicates(subset=owner_key)['document_uuid']

    return published[published['document_uuid'].isin(latest_uuids)]

# RUN

## get complete DF from XML batch files

In [None]:
# Download and parse every XML batch file, then stack the per-batch frames
# into a single DataFrame (needs network access; a progress bar is shown).
complete_df = get_complete_df_from_batch_files()

  0%|          | 0/101 [00:00<?, ?it/s]

## clean and enrich DF

In [None]:
# data cleaning: normalise types and company names, keep and rename the
# columns used downstream
clean_df = clean_stock_dataset(complete_df)
# enrichment step: each helper adds its column(s) to the frame it receives and
# returns that same frame, so the three calls are simply chained
enriched_df = add_gender_guess_col(clean_df)      # adds predicted_gender
enriched_df = add_owner_age_col(enriched_df)      # parses owner_birtdate, adds owner_age
enriched_df = add_is_popular_stock_col(enriched_df)  # adds is_popular_stock

In [None]:
enriched_df

Unnamed: 0,document_uuid,document_datetime,owner_name,owner_surname,owner_birtdate,owner_qualification,company_name,position_value,shares_nb,predicted_gender,owner_age,is_popular_stock
0,4344aaa1-874d-4e6d-9b1a-45f7725b710c,2022-07-11 15:40:13,ABAD,DAMIEN,1980-04-05,Député/Ain(01),ORANGE,877,83,male,43.0,True
1,4344aaa1-874d-4e6d-9b1a-45f7725b710c,2022-07-11 15:40:13,ABAD,DAMIEN,1980-04-05,Député/Ain(01),CREDIT AGRICOLE SA,2910,341,male,43.0,True
2,4344aaa1-874d-4e6d-9b1a-45f7725b710c,2022-07-11 15:40:13,ABAD,DAMIEN,1980-04-05,Député/Ain(01),AIRBUS,1929,20,male,43.0,True
3,4344aaa1-874d-4e6d-9b1a-45f7725b710c,2022-07-11 15:40:13,ABAD,DAMIEN,1980-04-05,Député/Ain(01),L'OREAL,6552,20,male,43.0,True
4,fa8d18ec-0db9-4a39-b1f4-caba0c31329b,2022-11-27 18:18:23,ABAD,DAMIEN,1980-04-05,Député/Ain(01),ORANGE,877,83,male,43.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
102,19140875-1488-43e7-95a7-63d0b7212a19,2021-06-25 13:29:08,zuili,nicolas,1965-03-08,Maire ou adjoint municipal/Rouen (76),BNP PARIBAS,10820,200,male,58.0,True
103,19140875-1488-43e7-95a7-63d0b7212a19,2021-06-25 13:29:08,zuili,nicolas,1965-03-08,Maire ou adjoint municipal/Rouen (76),RENAULT,7073,200,male,58.0,True
104,19140875-1488-43e7-95a7-63d0b7212a19,2021-06-25 13:29:08,zuili,nicolas,1965-03-08,Maire ou adjoint municipal/Rouen (76),SAINT GOBAIN,11238,200,male,58.0,True
105,19140875-1488-43e7-95a7-63d0b7212a19,2021-06-25 13:29:08,zuili,nicolas,1965-03-08,Maire ou adjoint municipal/Rouen (76),UNIBAIL RODAMCO,7991,100,male,58.0,False


## create a filtered version

In [None]:
# Build the condensed view used for the "filtered" export: rows carrying the
# unpublished-data placeholder are dropped and rows are ordered by most recent
# document first.
filtered_df = condense_df_by_removing_empty_rows_and_only_keeping_latest_document_for_each_unique_owner(enriched_df)

In [None]:
filtered_df

Unnamed: 0,document_uuid,document_datetime,owner_name,owner_surname,owner_birtdate,owner_qualification,company_name,position_value,shares_nb,predicted_gender,owner_age,is_popular_stock
144,3496a04c-b003-4817-8c49-63ab224c2037,2023-03-06 16:53:48,pechenard,frederic,1957-03-12,Elu régional/Ile-de-France (11)/VICE PRESIDENT,DANONE SA,10156,213,male,66.0,False
142,3496a04c-b003-4817-8c49-63ab224c2037,2023-03-06 16:53:48,pechenard,frederic,1957-03-12,Elu régional/Ile-de-France (11)/VICE PRESIDENT,CASINO GUICHARD PERRACHON,1599,200,male,66.0,False
145,3496a04c-b003-4817-8c49-63ab224c2037,2023-03-06 16:53:48,pechenard,frederic,1957-03-12,Elu régional/Ile-de-France (11)/VICE PRESIDENT,ELIOR GROUP,939,500,male,66.0,False
146,3496a04c-b003-4817-8c49-63ab224c2037,2023-03-06 16:53:48,pechenard,frederic,1957-03-12,Elu régional/Ile-de-France (11)/VICE PRESIDENT,ERAMET,5868,90,male,66.0,False
147,3496a04c-b003-4817-8c49-63ab224c2037,2023-03-06 16:53:48,pechenard,frederic,1957-03-12,Elu régional/Ile-de-France (11)/VICE PRESIDENT,ESSILORLUXOTTICA SA,7527,50,male,66.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...
111,8f3de601-5851-4399-a759-feb8cfdc0ba7,2017-10-28 11:58:59,MASSON,jean-Louis,1947-03-25,Sénateur/Moselle(57),ATOS 732,8563,66,male,76.0,False
112,8f3de601-5851-4399-a759-feb8cfdc0ba7,2017-10-28 11:58:59,MASSON,jean-Louis,1947-03-25,Sénateur/Moselle(57),CARREFOUR 172,3792,228,male,76.0,False
113,8f3de601-5851-4399-a759-feb8cfdc0ba7,2017-10-28 11:58:59,MASSON,jean-Louis,1947-03-25,Sénateur/Moselle(57),DANONE 644,8688,132,male,76.0,False
138,8f3de601-5851-4399-a759-feb8cfdc0ba7,2017-10-28 11:58:59,MASSON,jean-Louis,1947-03-25,Sénateur/Moselle(57),SODEXO 220,12451,126,male,76.0,False


In [None]:
# Persist both views as CSV. The paths are under /content/drive, so this
# presumably expects Google Drive to be mounted in Colab -- verify before running.
# to_csv is called without index=False, so the row index is written as the
# first column of each file.
filtered_df.to_csv("/content/drive/MyDrive/AI/HATVP/datasets/stocks/HATVP_stocks_v11_filtered.csv")
enriched_df.to_csv("/content/drive/MyDrive/AI/HATVP/datasets/stocks/HATVP_stocks_v11_enriched.csv")

# DEAD EXPLORATION CODE HERE

## Multiple names for a same company

"SODEXO 220" and "SODEXO" are the same company.

This problem still needs a fix.

In [None]:
filtered_df[filtered_df.company_name.str.contains('SODEXO')]

Unnamed: 0,document_uuid,document_datetime,owner_name,owner_surname,owner_birtdate,owner_qualification,company_name,position_value,shares_nb,predicted_gender,owner_age,is_popular_stock
203,e110c8f7-0235-4ddc-a116-2ec087551b78,2022-01-23 23:29:27,GORDAT,Gérald,1981-05-23,Membre d’EPCI/Le Grand Charolais/Président,SODEXO,1040,15,male,42.0,False
114,698ac63b-6a2e-4c2f-81a5-723b416b5385,2020-09-03 17:10:24,BARRIOL,Denis,1970-10-22,Membre d’EPCI/Saint Etienne Métropole/Vice Pré...,SODEXO,557,9,male,52.0,False
101,d4b90360-466c-4527-9d63-f89a05d83784,2020-07-21 19:22:10,Debil,Victor,1982-06-06,Membre d’EPCI/Agglomeration du Beauvaisis/Vice...,SODEXO,15435,240,male,40.0,False
173,4aafedcc-17a6-4413-a95a-ce269f788215,2020-07-15 20:31:14,SEGAUD,Carl,1973-04-07,Maire ou adjoint municipal/Maire,SODEXO,5310,85,male,50.0,False
120,8f3de601-5851-4399-a759-feb8cfdc0ba7,2017-10-28 11:58:59,MASSON,jean-Louis,1947-03-25,Sénateur/Moselle(57),SODEXO,6719,68,male,76.0,False
138,8f3de601-5851-4399-a759-feb8cfdc0ba7,2017-10-28 11:58:59,MASSON,jean-Louis,1947-03-25,Sénateur/Moselle(57),SODEXO 220,12451,126,male,76.0,False


CREDI AGRICOLE S.A. and CREDIT AGRICOLE SA and CREDIT AGRICOLE are also the same company

In [None]:
filtered_df[filtered_df.company_name.str.contains('AGRICO')].company_name.value_counts()

CREDIT AGRICOLE                                                       134
CREDIT AGRICOLE SA                                                     30
PS DE LA CAISSE LOCALE DU CREDIT AGRICOLE                               3
PS CREDIT AGRICOLE                                                      3
PARTS SOCIETAIRES CREDIT AGRICOLE ENGAGEMENT                            2
PARTS SOCIETAIRES CREDIT AGRICOLE VOLONTAIRE                            2
CREDIT AGRICOLE CENTRE FRANCE                                           2
CREDIT AGRICOLE PARTS SOCIALES                                          2
CREDIT AGRICOLE CAISSE LOCALE DE BRUYERES                               2
CREDIT AGRICOLE COMPTE-TITRE                                            2
COMPTE TITRES CREDIT AGRICOLE                                           2
CREDIT AGRICOLE PEA                                                     2
CREDIT AGRICOLAE                                                        1
COMPTE PEA CREDIT AGRICOLE            

In [None]:
filtered_df

Unnamed: 0,document_uuid,document_datetime,owner_name,owner_surname,owner_birtdate,owner_qualification,company_name,position_value,shares_nb,predicted_gender,owner_age,is_popular_stock
144,3496a04c-b003-4817-8c49-63ab224c2037,2023-03-06 16:53:48,pechenard,frederic,1957-03-12,Elu régional/Ile-de-France (11)/VICE PRESIDENT,DANONE SA,10156,213,male,66.0,False
142,3496a04c-b003-4817-8c49-63ab224c2037,2023-03-06 16:53:48,pechenard,frederic,1957-03-12,Elu régional/Ile-de-France (11)/VICE PRESIDENT,CASINO GUICHARD PERRACHON,1599,200,male,66.0,False
145,3496a04c-b003-4817-8c49-63ab224c2037,2023-03-06 16:53:48,pechenard,frederic,1957-03-12,Elu régional/Ile-de-France (11)/VICE PRESIDENT,ELIOR GROUP,939,500,male,66.0,False
146,3496a04c-b003-4817-8c49-63ab224c2037,2023-03-06 16:53:48,pechenard,frederic,1957-03-12,Elu régional/Ile-de-France (11)/VICE PRESIDENT,ERAMET,5868,90,male,66.0,False
147,3496a04c-b003-4817-8c49-63ab224c2037,2023-03-06 16:53:48,pechenard,frederic,1957-03-12,Elu régional/Ile-de-France (11)/VICE PRESIDENT,ESSILORLUXOTTICA SA,7527,50,male,66.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...
111,8f3de601-5851-4399-a759-feb8cfdc0ba7,2017-10-28 11:58:59,MASSON,jean-Louis,1947-03-25,Sénateur/Moselle(57),ATOS 732,8563,66,male,76.0,False
112,8f3de601-5851-4399-a759-feb8cfdc0ba7,2017-10-28 11:58:59,MASSON,jean-Louis,1947-03-25,Sénateur/Moselle(57),CARREFOUR 172,3792,228,male,76.0,False
113,8f3de601-5851-4399-a759-feb8cfdc0ba7,2017-10-28 11:58:59,MASSON,jean-Louis,1947-03-25,Sénateur/Moselle(57),DANONE 644,8688,132,male,76.0,False
138,8f3de601-5851-4399-a759-feb8cfdc0ba7,2017-10-28 11:58:59,MASSON,jean-Louis,1947-03-25,Sénateur/Moselle(57),SODEXO 220,12451,126,male,76.0,False


### TF-IDF kernels option

In [None]:
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.cluster import AgglomerativeClustering

# def link_similar_company_names(df, company_colname):
#     # Extract company names
#     company_names = df[company_colname].tolist()

#     # Create TF-IDF vectorizer
#     vectorizer = TfidfVectorizer()
#     X = vectorizer.fit_transform(company_names)

#     # Apply clustering algorithm
#     clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.85, linkage='average')
#     labels = clustering.fit_predict(X.toarray())

#     # Assign cluster labels to the dataframe
#     df['cluster_label'] = labels

#     # Identify representative company names for each cluster
#     representative_names = df.groupby('cluster_label')[company_colname].agg(lambda x: x.value_counts().index[0]).reset_index()

#     # Merge representative names back into the original dataframe
#     df = df.merge(representative_names, on='cluster_label', how='left')
    
#     df = df.rename(columns={'company_name_x': company_colname})
#     df = df.rename(columns={'company_name_y': 'representative_name'})

#     return df


### Word embedding kernel option

In [None]:
# !pip install sentence-transformers

In [None]:
# from sentence_transformers import SentenceTransformer
# from sklearn.cluster import AgglomerativeClustering
# import pandas as pd

# def link_similar_company_names(df, company_colname):
#     # Extract company names
#     company_names = df[company_colname].tolist()
#     company_names = ["".join([c for c in name if not c.isdigit()]) for name in company_names]

#     # Load pre-trained model
#     model = SentenceTransformer('all-MiniLM-L6-v2')

#     # Generate embeddings for each company name
#     embeddings = model.encode(company_names, convert_to_tensor=True)

#     # Apply clustering algorithm
#     clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.75)
#     labels = clustering.fit_predict(embeddings)

#     # Assign cluster labels to the dataframe
#     df['cluster_label'] = labels

#     # Identify representative company names for each cluster
#     representative_names = df.groupby('cluster_label')[company_colname].agg(lambda x: x.value_counts().index[0]).reset_index()

#     # Merge representative names back into the original dataframe
#     df = df.merge(representative_names, on='cluster_label', how='left')

#     df = df.rename(columns={f'{company_colname}_x': company_colname})
#     df = df.rename(columns={f'{company_colname}_y': 'representative_name'})

#     return df
# # 

In [None]:
# trans_df_link = link_similar_company_names(filtered_df.copy(), 'company_name')

In [None]:
# trans_df_link

In [None]:
# trans_df_link.representative_name.value_counts()

In [None]:
# trans_df_link[trans_df_link.representative_name.str.contains('AGRICO')]

In [None]:
# fake_list_of_company_names = ["ACCOR", "ACCOR SA", "CREDIT AGRICOLE", "CREDIT AGRICOLE SA"]

In [None]:
# # Load pre-trained model
# model = SentenceTransformer('all-MiniLM-L6-v2')

# # Generate embeddings for each company name
# embeddings = model.encode(fake_list_of_company_names, convert_to_tensor=True)
# embeddings

In [None]:
# from sklearn.metrics.pairwise import cosine_similarity

# # Calculate cosine similarity
# cos_sim = cosine_similarity(embeddings)

# # Print the cosine similarity matrix
# for i in range(len(fake_list_of_company_names)):
#     for j in range(len(fake_list_of_company_names)):
#         print(f"Cosine similarity between '{fake_list_of_company_names[i]}' and '{fake_list_of_company_names[j]}': {cos_sim[i][j]}")

### cosine similarity of company name embeddings

Reason for failure: this approach gives the best matching results of all the methods tried ("company sa" and "company" are matched), but it still treats "company" and "company group" as different companies.

In [None]:
# from sentence_transformers import SentenceTransformer
# from sklearn.metrics.pairwise import cosine_similarity
# import pandas as pd
# import numpy as np

# def link_similar_company_names(df, company_colname):
#     # Extract company names
#     company_names = df[company_colname].tolist()
#     company_names = ["".join([c for c in name if not c.isdigit()]) for name in company_names]

#     # Load pre-trained model
#     model = SentenceTransformer('all-MiniLM-L6-v2')

#     # Generate embeddings for each company name
#     embeddings = model.encode(company_names, convert_to_tensor=True)

#     # Compute cosine similarity
#     cosine_similarities = cosine_similarity(embeddings)

#     # Create a dataframe to store representative names
#     representative_names = pd.DataFrame(columns=[company_colname, 'representative_name'])
    
#     # Go through each row of the similarity matrix
#     for idx, row in enumerate(cosine_similarities):
#         # Get the indices of company names that have a cosine similarity greater than 0.8 with the current company name
#         similar_idxs = np.where(row > 0.8)[0]
#         if len(similar_idxs) > 1:
#             # If there are similar company names, choose the most common one as the representative name
#             similar_names = [company_names[i] for i in similar_idxs]
#             rep_name = max(set(similar_names), key=similar_names.count)
#             for i in similar_idxs:
#                 representative_names = representative_names.append({company_colname: company_names[i], 'representative_name': rep_name}, ignore_index=True)
#         else:
#             # If there are no similar company names, the company name is its own representative name
#             representative_names = representative_names.append({company_colname: company_names[idx], 'representative_name': company_names[idx]}, ignore_index=True)

#     # Merge representative names back into the original dataframe
#     df = df.merge(representative_names, on=company_colname, how='left')

#     return df


In [None]:
# filtered_df

In [None]:
# cosine_df = link_similar_company_names(filtered_df.copy(), 'company_name')

In [None]:
# cosine_df

In [None]:
# from sentence_transformers import SentenceTransformer
# from sklearn.metrics.pairwise import cosine_similarity
# import pandas as pd
# import numpy as np
# from tqdm import tqdm

# def link_similar_company_names(df, company_colname):
#     # Extract company names
#     company_names = df[company_colname].tolist()
#     company_names = ["".join([c for c in name if not c.isdigit()]) for name in company_names]

#     # Load pre-trained model
#     model = SentenceTransformer('all-MiniLM-L6-v2')

#     # Generate embeddings for each company name
#     embeddings = model.encode(company_names, convert_to_tensor=True)

#     # Compute cosine similarity
#     cosine_similarities = cosine_similarity(embeddings)

#     # Create a dataframe to store representative names
#     representative_names = pd.DataFrame(columns=[company_colname, 'representative_name'])

#     # Go through each row of the similarity matrix with tqdm
#     for idx, row in tqdm(enumerate(cosine_similarities), total=len(cosine_similarities), desc="Processing company names"):
#         # Get the indices of company names that have a cosine similarity greater than 0.8 with the current company name
#         similar_idxs = np.where(row > 0.8)[0]
#         if len(similar_idxs) > 1:
#             # If there are similar company names, choose the most common one as the representative name
#             similar_names = [company_names[i] for i in similar_idxs]
#             rep_name = max(set(similar_names), key=similar_names.count)
#             for i in similar_idxs:
#                 representative_names = representative_names.append({company_colname: company_names[i], 'representative_name': rep_name}, ignore_index=True)
#         else:
#             # If there are no similar company names, the company name is its own representative name
#             representative_names = representative_names.append({company_colname: company_names[idx], 'representative_name': company_names[idx]}, ignore_index=True)

#     # Merge representative names back into the original dataframe
#     df = df.merge(representative_names, on=company_colname, how='left')

#     return df


In [None]:
# from sentence_transformers import SentenceTransformer
# from sklearn.metrics.pairwise import cosine_similarity
# import pandas as pd
# import numpy as np

# def link_similar_company_names(df, company_colname):
#     # Extract company names
#     company_names = df[company_colname].tolist()
#     company_names = ["".join([c for c in name if not c.isdigit()]) for name in company_names]
#     company_names = [name.replace(' ', '') for name in company_names]


#     # Load pre-trained model
#     model = SentenceTransformer('all-MiniLM-L6-v2')

#     # Generate embeddings for each company name
#     embeddings = model.encode(company_names, convert_to_tensor=True)

#     # Compute cosine similarity
#     cosine_similarities = cosine_similarity(embeddings)

#     # Create a list to store representative names
#     representative_names_list = []

#     # Go through each row of the similarity matrix
#     for idx, row in tqdm(enumerate(cosine_similarities), total=len(cosine_similarities), desc="Processing company names"):
#     # for idx, row in enumerate(cosine_similarities):
#         # Get the indices of company names that have a cosine similarity greater than 0.8 with the current company name
#         similar_idxs = np.where(row > 0.8)[0]
#         if len(similar_idxs) > 1:
#             # If there are similar company names, choose the most common one as the representative name
#             similar_names = [company_names[i] for i in similar_idxs]
#             rep_name = max(set(similar_names), key=similar_names.count)
#             for i in similar_idxs:
#                 representative_names_list.append({company_colname: company_names[i], 'representative_name': rep_name})
#         else:
#             # If there are no similar company names, the company name is its own representative name
#             representative_names_list.append({company_colname: company_names[idx], 'representative_name': company_names[idx]})

#     # Convert the list to a DataFrame
#     representative_names = pd.DataFrame(representative_names_list)

#     # Merge representative names back into the original dataframe
#     df = df.merge(representative_names, on=company_colname, how='left')

#     return df


In [None]:
# cosine_df = link_similar_company_names(filtered_df.copy(), 'company_name')

In [None]:
# cosine_df

In [None]:
# df = filtered_df.copy()
# company_colname = "company_name"


# company_names = df[company_colname].tolist()

# # remove digits from company name to make matching easier
# company_names_simplified = ["".join([c for c in name if not c.isdigit()]) for name in company_names]
# company_names_simplified = [name.replace(' ', '') for name in company_names_simplified]

# # Load pre-trained model
# model = SentenceTransformer('all-MiniLM-L6-v2')

# # Generate embeddings for each company name
# embeddings = model.encode(company_names_simplified, convert_to_tensor=True)

# # Compute cosine similarity
# cosine_similarities = cosine_similarity(embeddings)
# cosine_similarities

In [None]:
# # Create a list to store representative names
# representative_names_list = []

# # Go through each row of the similarity matrix
# for idx, row in tqdm(enumerate(cosine_similarities), total=len(cosine_similarities), desc="Processing company names"):
# # for idx, row in enumerate(cosine_similarities):
#     # Get the indices of company names that have a cosine similarity greater than 0.8 with the current company name
#     similar_idxs = np.where(row > 0.8)[0]
#     if len(similar_idxs) > 1:
#         # If there are similar company names, choose the most common one as the representative name
#         similar_names = [company_names[i] for i in similar_idxs]
#         rep_name = max(set(similar_names), key=similar_names.count)
#         for i in similar_idxs:
#             representative_names_list.append({company_colname: company_names[i], 'representative_name': rep_name})
#     else:
#         # If there are no similar company names, the company name is its own representative name
#         representative_names_list.append({company_colname: company_names[idx], 'representative_name': company_names[idx]})

# # Convert the list to a DataFrame
# representative_names = pd.DataFrame(representative_names_list)

In [None]:
# representative_names.drop_duplicates()