# Cleaning WIKI pre-processed file

This notebook reads the pre-processed WIKI dataset from local data folder and performs the cleaning of country, LEI and company's names. The final result is saved locally and in the WIKI S3 bucket.

In [None]:
import pandas as pd
import os
from dotenv import dotenv_values, load_dotenv

# Import CountryCleaner
from financial_entity_cleaner.location import CountryCleaner

# Import the module for cleaning company's name
from financial_entity_cleaner.company import CompanyNameCleaner

# Import BankingIdCleaner
from financial_entity_cleaner.id import BankingIdCleaner

In [2]:
env_var = dotenv_values('../../.env')

In [3]:
import boto3
s3_resource = boto3.resource(
    service_name="s3",
    endpoint_url=env_var['S3_ENDPOINT'],
    aws_access_key_id=env_var['S3_ACCESS_KEY'],
    aws_secret_access_key=env_var['S3_SECRET_KEY'],
)
bucket_name = env_var['S3_BUCKET']
bucket = s3_resource.Bucket(bucket_name)

## 1. Read WIKI pre-processed file

In [3]:
# Save locally to "dataset" folder
saved_path = "../../../dataset/pre_processed/"
company = "wiki_company.csv"
companies = "wiki_companies.csv"

In [4]:
company_df = pd.read_csv(os.path.join(saved_path, company), low_memory=False)
companies_df = pd.read_csv(os.path.join(saved_path, companies), low_memory=False)

In [5]:
display(company_df.head(2))
display(companies_df.head(2))

Unnamed: 0,wikidata_id,name,lei,perm_id,bloomberg_id,siren,isin,siret,country_name,hq
0,Q66,Boeing,RVHJWBXLJ1RFUBSY1F30,4295903076.0,,,US0970231058,,United States of America,United States of America
1,Q67,Airbus,529900FCMZ4LKXFD0R69,,,383474814.0,,,France,France


Unnamed: 0,wikidata_id,name,alias,country_name,hq
0,Q66,Boeing,The Boeing Company,United States of America,United States of America
1,Q66,Boeing,Boeing Company,United States of America,United States of America


## 2. Cleaning County information

In [None]:
# Create an object based on CountryCleaner() class
country_cleaner_obj=CountryCleaner()

In [9]:
country_cleaner_obj.letter_case='lower'
country_cleaner_obj.output_info = [CountryCleaner.ATTRIBUTE_ALPHA2,
                                   CountryCleaner.ATTRIBUTE_SHORT_NAME]

In [10]:
company_df = country_cleaner_obj.clean_df(df=company_df, 
                                            cols=['country_name','hq'],
                                            output_names_as= 'suffix')

Normalizing countries...100%|██████████████████████████████████████████████████| 173935/173935 [01:25<00:00, 2025.59it/s]


In [11]:
companies_df = country_cleaner_obj.clean_df(df=companies_df, 
                                            cols=['country_name','hq'],
                                            output_names_as= 'suffix')

Normalizing countries...100%|██████████████████████████████████████████████████| 93410/93410 [00:43<00:00, 2130.53it/s]


In [12]:
display(company_df.head(2))
display(companies_df.head(2))

Unnamed: 0,wikidata_id,name,lei,perm_id,bloomberg_id,siren,isin,siret,country_name,hq,country_name_short_name,country_name_alpha2,hq_short_name,hq_alpha2
0,Q66,Boeing,RVHJWBXLJ1RFUBSY1F30,4295903076.0,,,US0970231058,,United States of America,United States of America,united states,us,united states,us
1,Q67,Airbus,529900FCMZ4LKXFD0R69,,,383474814.0,,,France,France,france,fr,france,fr


Unnamed: 0,wikidata_id,name,alias,country_name,hq,country_name_short_name,country_name_alpha2,hq_short_name,hq_alpha2
0,Q66,Boeing,The Boeing Company,United States of America,United States of America,united states,us,united states,us
1,Q66,Boeing,Boeing Company,United States of America,United States of America,united states,us,united states,us


## 3. Cleaning IDs

In [13]:
# Create an object based on the BankingIdCleaner() class
id_cleaner_obj = BankingIdCleaner()

In [14]:
# Setup cleaning properties
id_cleaner_obj.invalid_ids_as_nan = True
id_cleaner_obj.validation_as_categorical = True
id_cleaner_obj.output_lettercase = "BankingIdCleaner.UPPER_LETTER_CASE"

In [15]:
company_df = id_cleaner_obj.clean_df(company_df, cols=['lei','isin','siren','siret'], 
                                    remove_cols= True, 
                                    output_names_as= 'suffix',
                                    types = ['lei','isin','siren','siret'])

Normalizing IDs...100%|██████████████████████████████████████████████████| 173935/173935 [01:30<00:00, 1924.04it/s]


In [16]:
company_df.head()

Unnamed: 0,wikidata_id,name,perm_id,bloomberg_id,country_name,hq,country_name_short_name,country_name_alpha2,hq_short_name,hq_alpha2,lei_cleaned_id,lei_isvalid_id,isin_cleaned_id,isin_isvalid_id,siren_cleaned_id,siren_isvalid_id,siret_cleaned_id,siret_isvalid_id
0,Q66,Boeing,4295903076.0,,United States of America,United States of America,united states,us,united states,us,RVHJWBXLJ1RFUBSY1F30,1.0,US0970231058,1.0,,,,
1,Q67,Airbus,,,France,France,france,fr,france,fr,529900FCMZ4LKXFD0R69,1.0,,,,,,
2,Q95,Google,4295899948.0,,United States of America,United States of America,united states,us,united states,us,7ZW8QJWVPR4P1J1KQY45,1.0,,,,,,
3,Q248,Intel,4295906830.0,,United States of America,United States of America,united states,us,united states,us,KNX4USFCNGPY45LOCE31,1.0,US4581401001,1.0,,,,
4,Q312,Apple,4295905573.0,,United States of America,United States of America,united states,us,united states,us,HWUPKR0MPOU8FGXBT394,1.0,US0378331005,1.0,,,,


In [17]:
company_df.drop(['country_name',
                'hq',
                'isin_isvalid_id',
                'siren_isvalid_id',
                'siret_isvalid_id',
                'lei_isvalid_id'],axis='columns',inplace=True)

## 4. Cleaning Company's names

In [18]:
# Create a CompanyNameCleaner object
company_cleaner_obj = CompanyNameCleaner()

In [19]:
# Set cleaner properties
company_cleaner_obj.normalize_legal_terms = True
company_cleaner_obj.letter_case="lower"
company_cleaner_obj.remove_accents = True
company_cleaner_obj.default_cleaning_rules = ['place_word_the_at_the_beginning',
                                                'remove_words_in_asterisk', 
                                                'remove_words_in_parentheses',
                                                'remove_question_marks_in_parentheses', 
                                                'replace_hyphen_by_space', 
                                                'replace_underscore_by_space', 
                                                'remove_text_puctuation_except_dot', 
                                                'remove_math_symbols', 
                                                'remove_parentheses', 
                                                'remove_brackets', 
                                                'remove_curly_brackets', 
                                                'remove_single_quote_next_character', 
                                                'remove_double_quote', 
                                                'enforce_single_space_between_words']
company_cleaner_obj.post_cleaning_rules = ['remove_all_punctuation']

In [20]:
company_df = company_cleaner_obj.clean_df(company_df,
                                            'name',
                                            'name_clean',
                                            'country_name_alpha2',
                                            'True')
companies_df = company_cleaner_obj.clean_df(companies_df,
                                            'alias',
                                            'alias_clean',
                                            'country_name_alpha2',
                                            'True')

Cleaning company name...100%|██████████████████████████████████████████████████| 189/189 [01:19<00:00,  2.37it/s]
Cleaning company name...100%|██████████████████████████████████████████████████| 173/173 [00:46<00:00,  3.71it/s]


## 5. Check and save

In [21]:
print(f'Total company in WIKIDATA : {company_df.shape[0]}')

Total company in WIKIDATA : 173935


In [22]:
saved_path = "../../../dataset/cleaned/"
company_clean = "company_cleaned.csv"
companies_clean = "companies_cleaned.csv"


company_filename = os.path.join(saved_path, company_clean)
companies_filename = os.path.join(saved_path, companies_clean)

company_df.to_csv(company_filename,encoding='utf-8',header=True, index=False)
companies_df.to_csv(companies_filename,encoding='utf-8',header=True, index=False)

In [23]:
# Company
s3_filename = 'WIKI/cleaned/company_cleaned.csv'
s3_resource.meta.client.upload_file(Filename=company_filename,
                                    Bucket=env_var['S3_BUCKET'], 
                                    Key=s3_filename)

# Companies
s3_filename = 'WIKI/cleaned/companies_cleaned.csv'
s3_resource.meta.client.upload_file(Filename=companies_filename,
                                    Bucket=env_var['S3_BUCKET'], 
                                    Key=s3_filename)