In [63]:
import numpy as np
import pandas as pd
import os

import kagglehub
import json
import yaml

import random




In [3]:
# Download the latest version of the company dataset through kagglehub and
# keep the returned location of the downloaded files in `path`.
path = kagglehub.dataset_download(
    "peopledatalabssf/free-7-million-company-dataset"
)

# Show which files were fetched and where they are stored.
print(os.listdir(path))
print("Path to dataset files:", path)

Resuming download from 36700160 bytes (255257255 bytes left)...
Resuming download from https://www.kaggle.com/api/v1/datasets/download/peopledatalabssf/free-7-million-company-dataset?dataset_version_number=1 (36700160/291957415) bytes left.


100%|██████████| 278M/278M [00:15<00:00, 16.6MB/s] 

Extracting model files...





['companies_sorted.csv']
Path to dataset files: C:\Users\jonas\.cache\kagglehub\datasets\peopledatalabssf\free-7-million-company-dataset\versions\1


In [27]:
# Load the synthetic-generation configuration from its YAML file.
with open("config_synthetic_generation.yaml", 'r') as config_file:
    config = yaml.safe_load(config_file)

# Legal-form endings exactly as listed in the configuration.
organisation_endings = config['organisation_endings']
print(organisation_endings)

# Lowercased variants of the same endings.
lc_org_endings = [ending.lower() for ending in organisation_endings]
print(lc_org_endings)

['AG', 'GmbH', 'KG', 'OHG', 'e.K.', 'eK', 'UG', 'Inc.', 'Inc', 'Ltd.', 'Ltd', 'LLC', 'PLC', 'LP', 'LLP', 'SA', 'SARL', 'Sàrl', 'SAS', 'SNC', 'EURL', 'S.p.A.', 'S.r.l.', 'S.a.p.a.', 'S.n.c.', 'S.a.s.', 'SpA', 'Srl', 'Sapa', 'Sas']
['ag', 'gmbh', 'kg', 'ohg', 'e.k.', 'ek', 'ug', 'inc.', 'inc', 'ltd.', 'ltd', 'llc', 'plc', 'lp', 'llp', 'sa', 'sarl', 'sàrl', 'sas', 'snc', 'eurl', 's.p.a.', 's.r.l.', 's.a.p.a.', 's.n.c.', 's.a.s.', 'spa', 'srl', 'sapa', 'sas']


In [40]:
# Load only the two columns this notebook uses. `usecols` keeps pandas from
# parsing and holding the remaining columns of the CSV in memory; the explicit
# selection afterwards pins the column order to (name, country).
companies_df = (
    pd.read_csv(
        os.path.join(path, "companies_sorted.csv"),
        usecols=["name", "country"],
    )[["name", "country"]]
    .dropna(how='any')  # drop rows where the name or the country is missing
)

In [41]:
# Preview the first rows of the name/country frame.
companies_df.head()

Unnamed: 0,name,country
0,ibm,united states
1,tata consultancy services,india
2,accenture,ireland
3,us army,united states
4,ey,united kingdom


In [56]:
# Keep only companies whose country is in the list below.
countries = ['switzerland','germany','france','italy','austria']

# .copy() makes `companies` an independent frame, so the column assignment
# below never writes into a slice of companies_df (SettingWithCopyWarning).
companies = (
    companies_df[companies_df["country"].isin(countries)]
    .dropna(how='any')  # drop rows with a missing name or country
    .copy()
)

# Clean both ends of every name: surrounding whitespace first, then the
# punctuation ",.:;()?!" together with any spaces mixed in between, then any
# whitespace that is left. Stripping punctuation alone would turn
# "acme gmbh ." into "acme gmbh " and the trailing space would hide the ending.
companies['name'] = (
    companies['name']
    .str.strip()
    .str.strip(' ,.:;()?!')
    .str.strip()
)

print(f"size of company_names: {len(companies)}")

size of company_names: 386141


In [57]:
# Fixed-seed sample of 30 rows for a reproducible look at the filtered names.
companies.sample(n=30, random_state=42)

Unnamed: 0,name,country
2262779,renolit italia srl,italy
914766,colli,italy
1085650,studio rinaldo,italy
2684421,eurovod,france
474732,la linkup factory,france
1165423,chut on vous ecoute,france
3320449,bluewalker gmbh,germany
5073362,cler srl,france
3864747,teleporter music,germany
1784428,efrei international,france


We note that all company names are lowercased, so we should use the lowercased company endings

We note that company endings like "s.r.l." are tokenized into ['s','.','r','.','l','.'], so they wouldn't match the company endings.

In [None]:
import re

# Strip the endings with the same punctuation set used for the company names
# (plus spaces), then de-duplicate; np.unique also returns them sorted.
lc_org_endings = np.unique(
    list(map(lambda ending: ending.strip(' ,.:;()?!'), lc_org_endings))
)

print(lc_org_endings)

['ag' 'e.k' 'ek' 'eurl' 'gmbh' 'inc' 'kg' 'llc' 'llp' 'lp' 'ltd' 'ohg'
 'plc' 's.a.p.a' 's.a.s' 's.n.c' 's.p.a' 's.r.l' 'sa' 'sapa' 'sarl' 'sas'
 'snc' 'spa' 'srl' 'sàrl' 'ug']


In [59]:
def strip_name(name : str,suffixes : list) -> str:
    """Remove a trailing legal-form ending (e.g. "gmbh", "s.r.l") from a name.

    An ending is only removed when it is a separate trailing word, i.e. it is
    preceded by whitespace. This keeps names such as "imag" or "mesa" intact
    even though they happen to end in "ag" / "sa". When several endings match,
    the longest one is removed, independent of the order of ``suffixes``.

    Parameters
    ----------
    name : str
        Company name to clean.
    suffixes : list
        Iterable of ending strings (a list, tuple or numpy array of strings).
        Empty strings are ignored.

    Returns
    -------
    str
        The name without the ending and without surrounding whitespace, or
        ``name`` unchanged when no ending applies or when removing it would
        leave nothing.
    """
    longest_match = ""
    for suffix in suffixes:
        # Skip empty endings, endings no longer than the current best match,
        # and endings that would consume the entire name.
        if len(suffix) <= len(longest_match) or len(suffix) >= len(name):
            continue
        # The character right before the ending must be whitespace so that the
        # ending is a word of its own rather than the tail of a longer word.
        if name.endswith(suffix) and name[-len(suffix) - 1].isspace():
            longest_match = suffix

    if not longest_match:
        return name

    stripped = name[:-len(longest_match)].strip()
    # Never hand back an empty company name.
    return stripped if stripped else name



In [60]:
# Remove the legal-form ending from every company name, then display the same
# fixed-seed sample of 30 rows with the new column.
companies['stripped_name'] = companies['name'].apply(
    lambda company_name: strip_name(company_name, lc_org_endings)
)
companies.sample(n=30, random_state=42)

Unnamed: 0,name,country,stripped_name
2262779,renolit italia srl,italy,renolit italia
914766,colli,italy,colli
1085650,studio rinaldo,italy,studio rinaldo
2684421,eurovod,france,eurovod
474732,la linkup factory,france,la linkup factory
1165423,chut on vous ecoute,france,chut on vous ecoute
3320449,bluewalker gmbh,germany,bluewalker
5073362,cler srl,france,cler
3864747,teleporter music,germany,teleporter music
1784428,efrei international,france,efrei international


All company names are still lowercased. We will apply .title(), .capitalize(), or the identity (leave the name unchanged), each with probability 1/3.

In [67]:
# Seed Python's global `random` generator so the random.randint draws used for
# the capitalization below are reproducible across runs.
random.seed(42)

def change_capitalization(name : str, type :int):
    """Return `name` in one of three capitalization variants.

    `type` selects the variant (the parameter name shadows the builtin `type`
    inside this function):
        0 -> name unchanged
        1 -> name.capitalize()
        2 -> name.title()

    Raises ValueError for any other value of `type`.
    """
    # Reject unsupported selectors up front.
    if type not in (0, 1, 2):
        raise ValueError

    if type == 0:
        return name
    return name.capitalize() if type == 1 else name.title()

# Give every stripped name a capitalization variant chosen by one
# random.randint(0, 2) draw per row, then show the same fixed-seed sample.
companies['stripped_name'] = companies['stripped_name'].apply(
    lambda stripped: change_capitalization(stripped, random.randint(0, 2))
)
companies.sample(n=30, random_state=42)


Unnamed: 0,name,country,stripped_name
2262779,renolit italia srl,italy,Renolit italia
914766,colli,italy,Colli
1085650,studio rinaldo,italy,Studio rinaldo
2684421,eurovod,france,Eurovod
474732,la linkup factory,france,la linkup factory
1165423,chut on vous ecoute,france,Chut on vous ecoute
3320449,bluewalker gmbh,germany,Bluewalker
5073362,cler srl,france,Cler
3864747,teleporter music,germany,Teleporter Music
1784428,efrei international,france,Efrei International


In [68]:
# Report the row count of the final frame.
print(f"size of final company names dataset: {companies.shape[0]}")

size of final company names dataset: 386141


In [69]:
# Save the dataset: only the processed names, written under the header "name".
output_path = "auxiliary_data/final/organization_names.csv"

# to_csv does not create missing folders, so make sure the target directory
# exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

companies.to_csv(output_path, columns=['stripped_name'], header=['name'], index=False)