# Cleaning GLEIF pre-processed file

...

In [48]:
import pandas as pd
pd.set_option('display.max_columns',50)
import os
from dotenv import dotenv_values, load_dotenv

# Import CountryCleaner
from financial_entity_cleaner.location import CountryCleaner

# Import BankingIdCleaner
from financial_entity_cleaner.id import BankingIdCleaner

# Import the module for cleaning company's name
from financial_entity_cleaner.company import CompanyNameCleaner

In [17]:
env_var = dotenv_values('../../.env')

In [18]:
import boto3
s3_resource = boto3.resource(
    service_name="s3",
    endpoint_url=env_var['S3_ENDPOINT'],
    aws_access_key_id=env_var['S3_ACCESS_KEY'],
    aws_secret_access_key=env_var['S3_SECRET_KEY'],
)
bucket_name = env_var['S3_BUCKET']
bucket = s3_resource.Bucket(bucket_name)

## 1. Read the GLEIF pre-processed file

In [19]:
# Save locally to "dataset" folder
saved_path = "../../../dataset/raw/GLEIF"
filename = "gleif.csv"

In [20]:
gleif_df = pd.read_csv(os.path.join(saved_path, filename),sep='|', low_memory=False)

In [21]:
gleif_df.head()

Unnamed: 0,LEI,Entity.LegalName,Entity.OtherEntityNames.OtherEntityName.1,Entity.OtherEntityNames.OtherEntityName.2,Entity.OtherEntityNames.OtherEntityName.3,Entity.OtherEntityNames.OtherEntityName.4,Entity.OtherEntityNames.OtherEntityName.5,Entity.LegalAddress.City,Entity.LegalAddress.Country,Entity.HeadquartersAddress.City,Entity.HeadquartersAddress.Country
0,001GPB6A9XPE8XJICC14,Fidelity Advisor Leveraged Company Stock Fund,FIDELITY ADVISOR SERIES I - Fidelity Advisor L...,,,,,BOSTON,US,Boston,US
1,004L5FPTUREIWK9T2N63,"Hutchin Hill Capital, LP",,,,,,Wilmington,US,New York,US
2,00EHHQ2ZHDCFXJCPCL46,Vanguard Russell 1000 Growth Index Trust,,,,,,MALVERN,US,Valley Forge,US
3,00GBW0Z2GYIER7DHDS71,"ARISTEIA CAPITAL, L.L.C.",,,,,,WILMINGTON,US,Greenwich,US
4,00KLB2PFTM3060S2N216,Oakmark International Fund,HARRIS ASSOCIATES INVESTMENT TRUST - Oakmark I...,,,,,BOSTON,US,Chicago,US


## 2. Cleaning country information

In [22]:
country_cleaner_obj=CountryCleaner()

In [23]:
country_cleaner_obj.letter_case='lower'
country_cleaner_obj.output_info = [CountryCleaner.ATTRIBUTE_ALPHA2,
                                   CountryCleaner.ATTRIBUTE_SHORT_NAME]

In [24]:
gleif_df = country_cleaner_obj.clean_df(df=gleif_df, 
                                            cols=['Entity.LegalAddress.Country','Entity.HeadquartersAddress.Country'],
                                            output_names_as= 'suffix')

Normalizing countries...100%|██████████████████████████████████████████████████| 2344185/2344185 [13:31<00:00, 2887.47it/s]


In [26]:
gleif_df.head()

Unnamed: 0,LEI,Entity.LegalName,Entity.OtherEntityNames.OtherEntityName.1,Entity.OtherEntityNames.OtherEntityName.2,Entity.OtherEntityNames.OtherEntityName.3,Entity.OtherEntityNames.OtherEntityName.4,Entity.OtherEntityNames.OtherEntityName.5,Entity.LegalAddress.City,Entity.LegalAddress.Country,Entity.HeadquartersAddress.City,Entity.HeadquartersAddress.Country,Entity.LegalAddress.Country_short_name,Entity.LegalAddress.Country_alpha2,Entity.HeadquartersAddress.Country_short_name,Entity.HeadquartersAddress.Country_alpha2
0,001GPB6A9XPE8XJICC14,Fidelity Advisor Leveraged Company Stock Fund,FIDELITY ADVISOR SERIES I - Fidelity Advisor L...,,,,,BOSTON,US,Boston,US,united states,us,united states,us
1,004L5FPTUREIWK9T2N63,"Hutchin Hill Capital, LP",,,,,,Wilmington,US,New York,US,united states,us,united states,us
2,00EHHQ2ZHDCFXJCPCL46,Vanguard Russell 1000 Growth Index Trust,,,,,,MALVERN,US,Valley Forge,US,united states,us,united states,us
3,00GBW0Z2GYIER7DHDS71,"ARISTEIA CAPITAL, L.L.C.",,,,,,WILMINGTON,US,Greenwich,US,united states,us,united states,us
4,00KLB2PFTM3060S2N216,Oakmark International Fund,HARRIS ASSOCIATES INVESTMENT TRUST - Oakmark I...,,,,,BOSTON,US,Chicago,US,united states,us,united states,us


## 3. Cleaning LEI

In [27]:
# Create an object based on the BankingIdCleaner() class
id_cleaner_obj = BankingIdCleaner()

In [28]:
# Setup cleaning properties
id_cleaner_obj.invalid_ids_as_nan = True
id_cleaner_obj.validation_as_categorical = True
id_cleaner_obj.output_lettercase = "BankingIdCleaner.UPPER_LETTER_CASE"

In [29]:
gleif_df.head()

Unnamed: 0,LEI,Entity.LegalName,Entity.OtherEntityNames.OtherEntityName.1,Entity.OtherEntityNames.OtherEntityName.2,Entity.OtherEntityNames.OtherEntityName.3,Entity.OtherEntityNames.OtherEntityName.4,Entity.OtherEntityNames.OtherEntityName.5,Entity.LegalAddress.City,Entity.LegalAddress.Country,Entity.HeadquartersAddress.City,Entity.HeadquartersAddress.Country,Entity.LegalAddress.Country_short_name,Entity.LegalAddress.Country_alpha2,Entity.HeadquartersAddress.Country_short_name,Entity.HeadquartersAddress.Country_alpha2
0,001GPB6A9XPE8XJICC14,Fidelity Advisor Leveraged Company Stock Fund,FIDELITY ADVISOR SERIES I - Fidelity Advisor L...,,,,,BOSTON,US,Boston,US,united states,us,united states,us
1,004L5FPTUREIWK9T2N63,"Hutchin Hill Capital, LP",,,,,,Wilmington,US,New York,US,united states,us,united states,us
2,00EHHQ2ZHDCFXJCPCL46,Vanguard Russell 1000 Growth Index Trust,,,,,,MALVERN,US,Valley Forge,US,united states,us,united states,us
3,00GBW0Z2GYIER7DHDS71,"ARISTEIA CAPITAL, L.L.C.",,,,,,WILMINGTON,US,Greenwich,US,united states,us,united states,us
4,00KLB2PFTM3060S2N216,Oakmark International Fund,HARRIS ASSOCIATES INVESTMENT TRUST - Oakmark I...,,,,,BOSTON,US,Chicago,US,united states,us,united states,us


In [30]:
gleif_df = id_cleaner_obj.clean_df(gleif_df, cols=['LEI'], 
                                    remove_cols= True, 
                                    output_names_as= 'suffix',
                                    types = ['lei'])

Normalizing IDs...100%|██████████████████████████████████████████████████| 2344185/2344185 [07:43<00:00, 5056.64it/s]


In [37]:
gleif_df.head()

Unnamed: 0,Entity.LegalName,Entity.OtherEntityNames.OtherEntityName.1,Entity.OtherEntityNames.OtherEntityName.2,Entity.OtherEntityNames.OtherEntityName.3,Entity.OtherEntityNames.OtherEntityName.4,Entity.OtherEntityNames.OtherEntityName.5,Entity.LegalAddress.City,Entity.LegalAddress.Country,Entity.HeadquartersAddress.City,Entity.HeadquartersAddress.Country,Entity.LegalAddress.Country_short_name,Entity.LegalAddress.Country_alpha2,Entity.HeadquartersAddress.Country_short_name,Entity.HeadquartersAddress.Country_alpha2,LEI_cleaned_id
0,Fidelity Advisor Leveraged Company Stock Fund,FIDELITY ADVISOR SERIES I - Fidelity Advisor L...,,,,,BOSTON,US,Boston,US,united states,us,united states,us,001GPB6A9XPE8XJICC14
1,"Hutchin Hill Capital, LP",,,,,,Wilmington,US,New York,US,united states,us,united states,us,004L5FPTUREIWK9T2N63
2,Vanguard Russell 1000 Growth Index Trust,,,,,,MALVERN,US,Valley Forge,US,united states,us,united states,us,00EHHQ2ZHDCFXJCPCL46
3,"ARISTEIA CAPITAL, L.L.C.",,,,,,WILMINGTON,US,Greenwich,US,united states,us,united states,us,00GBW0Z2GYIER7DHDS71
4,Oakmark International Fund,HARRIS ASSOCIATES INVESTMENT TRUST - Oakmark I...,,,,,BOSTON,US,Chicago,US,united states,us,united states,us,00KLB2PFTM3060S2N216


In [None]:
gleif_df.drop(['LEI_isvalid_id'], axis=1, inplace=True)

In [33]:
gleif_df.head()

Unnamed: 0,Entity.LegalName,Entity.OtherEntityNames.OtherEntityName.1,Entity.OtherEntityNames.OtherEntityName.2,Entity.OtherEntityNames.OtherEntityName.3,Entity.OtherEntityNames.OtherEntityName.4,Entity.OtherEntityNames.OtherEntityName.5,Entity.LegalAddress.City,Entity.LegalAddress.Country,Entity.HeadquartersAddress.City,Entity.HeadquartersAddress.Country,Entity.LegalAddress.Country_short_name,Entity.LegalAddress.Country_alpha2,Entity.HeadquartersAddress.Country_short_name,Entity.HeadquartersAddress.Country_alpha2,LEI_cleaned_id
0,Fidelity Advisor Leveraged Company Stock Fund,FIDELITY ADVISOR SERIES I - Fidelity Advisor L...,,,,,BOSTON,US,Boston,US,united states,us,united states,us,001GPB6A9XPE8XJICC14
1,"Hutchin Hill Capital, LP",,,,,,Wilmington,US,New York,US,united states,us,united states,us,004L5FPTUREIWK9T2N63
2,Vanguard Russell 1000 Growth Index Trust,,,,,,MALVERN,US,Valley Forge,US,united states,us,united states,us,00EHHQ2ZHDCFXJCPCL46
3,"ARISTEIA CAPITAL, L.L.C.",,,,,,WILMINGTON,US,Greenwich,US,united states,us,united states,us,00GBW0Z2GYIER7DHDS71
4,Oakmark International Fund,HARRIS ASSOCIATES INVESTMENT TRUST - Oakmark I...,,,,,BOSTON,US,Chicago,US,united states,us,united states,us,00KLB2PFTM3060S2N216


## 4. Cleaning Company's names

In [39]:
# Create a CompanyNameCleaner object
company_cleaner_obj = CompanyNameCleaner()

In [40]:
# Set cleaner properties
company_cleaner_obj.normalize_legal_terms = True
company_cleaner_obj.letter_case="lower"
company_cleaner_obj.remove_accents = True
company_cleaner_obj.default_cleaning_rules = ['place_word_the_at_the_beginning',
                                                'remove_words_in_asterisk', 
                                                'remove_words_in_parentheses',
                                                'remove_question_marks_in_parentheses', 
                                                'replace_hyphen_by_space', 
                                                'replace_underscore_by_space', 
                                                'remove_text_puctuation_except_dot', 
                                                'remove_math_symbols', 
                                                'remove_parentheses', 
                                                'remove_brackets', 
                                                'remove_curly_brackets', 
                                                'remove_single_quote_next_character', 
                                                'remove_double_quote', 
                                                'enforce_single_space_between_words']
company_cleaner_obj.post_cleaning_rules = ['remove_all_punctuation']

In [41]:
gleif_df.head()

Unnamed: 0,Entity.LegalName,Entity.OtherEntityNames.OtherEntityName.1,Entity.OtherEntityNames.OtherEntityName.2,Entity.OtherEntityNames.OtherEntityName.3,Entity.OtherEntityNames.OtherEntityName.4,Entity.OtherEntityNames.OtherEntityName.5,Entity.LegalAddress.City,Entity.LegalAddress.Country,Entity.HeadquartersAddress.City,Entity.HeadquartersAddress.Country,Entity.LegalAddress.Country_short_name,Entity.LegalAddress.Country_alpha2,Entity.HeadquartersAddress.Country_short_name,Entity.HeadquartersAddress.Country_alpha2,LEI_cleaned_id
0,Fidelity Advisor Leveraged Company Stock Fund,FIDELITY ADVISOR SERIES I - Fidelity Advisor L...,,,,,BOSTON,US,Boston,US,united states,us,united states,us,001GPB6A9XPE8XJICC14
1,"Hutchin Hill Capital, LP",,,,,,Wilmington,US,New York,US,united states,us,united states,us,004L5FPTUREIWK9T2N63
2,Vanguard Russell 1000 Growth Index Trust,,,,,,MALVERN,US,Valley Forge,US,united states,us,united states,us,00EHHQ2ZHDCFXJCPCL46
3,"ARISTEIA CAPITAL, L.L.C.",,,,,,WILMINGTON,US,Greenwich,US,united states,us,united states,us,00GBW0Z2GYIER7DHDS71
4,Oakmark International Fund,HARRIS ASSOCIATES INVESTMENT TRUST - Oakmark I...,,,,,BOSTON,US,Chicago,US,united states,us,united states,us,00KLB2PFTM3060S2N216


In [42]:
gleif_df = company_cleaner_obj.clean_df(gleif_df,
                                            'Entity.LegalName',
                                            'name_legal_clean',
                                            'Entity.LegalAddress.Country_alpha2',
                                            'True')

gleif_df = company_cleaner_obj.clean_df(gleif_df,
                                            'Entity.LegalName',
                                            'name_headq_clean',
                                            'Entity.HeadquartersAddress.Country_alpha2',
                                            'True')

Cleaning company name...100%|██████████████████████████████████████████████████| 230/230 [30:16<00:00,  7.90s/it]   
Cleaning company name...100%|██████████████████████████████████████████████████| 230/230 [28:47<00:00,  7.51s/it]   


In [43]:
gleif_df = company_cleaner_obj.clean_df(gleif_df,
                                            'Entity.OtherEntityNames.OtherEntityName.1',
                                            'name1_legal_clean',
                                            'Entity.LegalAddress.Country_alpha2',
                                            'True')
gleif_df = company_cleaner_obj.clean_df(gleif_df,
                                            'Entity.OtherEntityNames.OtherEntityName.2',
                                            'name2_legal_clean',
                                            'Entity.LegalAddress.Country_alpha2',
                                            'True')
gleif_df = company_cleaner_obj.clean_df(gleif_df,
                                            'Entity.OtherEntityNames.OtherEntityName.3',
                                            'name3_legal_clean',
                                            'Entity.LegalAddress.Country_alpha2',
                                            'True')
gleif_df = company_cleaner_obj.clean_df(gleif_df,
                                            'Entity.OtherEntityNames.OtherEntityName.4',
                                            'name4_legal_clean',
                                            'Entity.LegalAddress.Country_alpha2',
                                            'True')
gleif_df = company_cleaner_obj.clean_df(gleif_df,
                                            'Entity.OtherEntityNames.OtherEntityName.5',
                                            'name5_legal_clean',
                                            'Entity.LegalAddress.Country_alpha2',
                                            'True')


Cleaning company name...100%|██████████████████████████████████████████████████| 230/230 [02:49<00:00,  1.36it/s] 
Cleaning company name...100%|██████████████████████████████████████████████████| 230/230 [01:19<00:00,  2.90it/s]
Cleaning company name...100%|██████████████████████████████████████████████████| 230/230 [00:55<00:00,  4.17it/s]
Cleaning company name...100%|██████████████████████████████████████████████████| 230/230 [00:52<00:00,  4.42it/s]
Cleaning company name...100%|██████████████████████████████████████████████████| 230/230 [00:50<00:00,  4.53it/s]


In [44]:
gleif_df = company_cleaner_obj.clean_df(gleif_df,
                                            'Entity.OtherEntityNames.OtherEntityName.1',
                                            'name1_headq_clean',
                                            'Entity.HeadquartersAddress.Country_alpha2',
                                            'True')
gleif_df = company_cleaner_obj.clean_df(gleif_df,
                                            'Entity.OtherEntityNames.OtherEntityName.2',
                                            'name2_headq_clean',
                                            'Entity.HeadquartersAddress.Country_alpha2',
                                            'True')
gleif_df = company_cleaner_obj.clean_df(gleif_df,
                                            'Entity.OtherEntityNames.OtherEntityName.3',
                                            'name3_headq_clean',
                                            'Entity.HeadquartersAddress.Country_alpha2',
                                            'True')
gleif_df = company_cleaner_obj.clean_df(gleif_df,
                                            'Entity.OtherEntityNames.OtherEntityName.4',
                                            'name4_headq_clean',
                                            'Entity.HeadquartersAddress.Country_alpha2',
                                            'True')
gleif_df = company_cleaner_obj.clean_df(gleif_df,
                                            'Entity.OtherEntityNames.OtherEntityName.5',
                                            'name5_headq_clean',
                                            'Entity.HeadquartersAddress.Country_alpha2',
                                            'True')

Cleaning company name...100%|██████████████████████████████████████████████████| 230/230 [03:03<00:00,  1.25it/s] 
Cleaning company name...100%|██████████████████████████████████████████████████| 230/230 [01:10<00:00,  3.25it/s]
Cleaning company name...100%|██████████████████████████████████████████████████| 230/230 [00:53<00:00,  4.32it/s]
Cleaning company name...100%|██████████████████████████████████████████████████| 230/230 [00:48<00:00,  4.78it/s]
Cleaning company name...100%|██████████████████████████████████████████████████| 230/230 [00:46<00:00,  5.00it/s]


In [49]:
gleif_df.head()

Unnamed: 0,Entity.LegalName,Entity.OtherEntityNames.OtherEntityName.1,Entity.OtherEntityNames.OtherEntityName.2,Entity.OtherEntityNames.OtherEntityName.3,Entity.OtherEntityNames.OtherEntityName.4,Entity.OtherEntityNames.OtherEntityName.5,Entity.LegalAddress.City,Entity.LegalAddress.Country,Entity.HeadquartersAddress.City,Entity.HeadquartersAddress.Country,Entity.LegalAddress.Country_short_name,Entity.LegalAddress.Country_alpha2,Entity.HeadquartersAddress.Country_short_name,Entity.HeadquartersAddress.Country_alpha2,LEI_cleaned_id,name_legal_clean,name_headq_clean,name1_legal_clean,name2_legal_clean,name3_legal_clean,name4_legal_clean,name5_legal_clean,name1_headq_clean,name2_headq_clean,name3_headq_clean,name4_headq_clean,name5_headq_clean
0,Fidelity Advisor Leveraged Company Stock Fund,FIDELITY ADVISOR SERIES I - Fidelity Advisor L...,,,,,BOSTON,US,Boston,US,united states,us,united states,us,001GPB6A9XPE8XJICC14,fidelity advisor leveraged company stock fund,fidelity advisor leveraged company stock fund,fidelity advisor series i fidelity advisor lev...,,,,,fidelity advisor series i fidelity advisor lev...,,,,
1,"Hutchin Hill Capital, LP",,,,,,Wilmington,US,New York,US,united states,us,united states,us,004L5FPTUREIWK9T2N63,hutchin hill capital limited partnership,hutchin hill capital limited partnership,,,,,,,,,,
2,Vanguard Russell 1000 Growth Index Trust,,,,,,MALVERN,US,Valley Forge,US,united states,us,united states,us,00EHHQ2ZHDCFXJCPCL46,vanguard russell 1000 growth index trust,vanguard russell 1000 growth index trust,,,,,,,,,,
3,"ARISTEIA CAPITAL, L.L.C.",,,,,,WILMINGTON,US,Greenwich,US,united states,us,united states,us,00GBW0Z2GYIER7DHDS71,aristeia capital limited liability company,aristeia capital limited liability company,,,,,,,,,,
4,Oakmark International Fund,HARRIS ASSOCIATES INVESTMENT TRUST - Oakmark I...,,,,,BOSTON,US,Chicago,US,united states,us,united states,us,00KLB2PFTM3060S2N216,oakmark international fund,oakmark international fund,harris associates investment trust oakmark int...,,,,,harris associates investment trust oakmark int...,,,,


## 5. Check and save

In [50]:
print(f'Total company in RMI : {gleif_df.shape[0]}')

Total company in RMI : 2344185


In [51]:
gleif_df.columns

Index(['Entity.LegalName', 'Entity.OtherEntityNames.OtherEntityName.1',
       'Entity.OtherEntityNames.OtherEntityName.2',
       'Entity.OtherEntityNames.OtherEntityName.3',
       'Entity.OtherEntityNames.OtherEntityName.4',
       'Entity.OtherEntityNames.OtherEntityName.5', 'Entity.LegalAddress.City',
       'Entity.LegalAddress.Country', 'Entity.HeadquartersAddress.City',
       'Entity.HeadquartersAddress.Country',
       'Entity.LegalAddress.Country_short_name',
       'Entity.LegalAddress.Country_alpha2',
       'Entity.HeadquartersAddress.Country_short_name',
       'Entity.HeadquartersAddress.Country_alpha2', 'LEI_cleaned_id',
       'name_legal_clean', 'name_headq_clean', 'name1_legal_clean',
       'name2_legal_clean', 'name3_legal_clean', 'name4_legal_clean',
       'name5_legal_clean', 'name1_headq_clean', 'name2_headq_clean',
       'name3_headq_clean', 'name4_headq_clean', 'name5_headq_clean'],
      dtype='object')

In [52]:
new_names = [
    'legal_name', 'other_name1',
    'other_name2',
    'other_name3',
    'other_name4',
    'other_name5', 'legal_city',
    'legal_country', 'headquarter_city',
    'headquarter_country',
    'legal_country_short_name',
    'legal_country_alpha2',
    'headquarter_country_short_name',
    'headquarter_country_alpha2', 'lei',
    'name_legal_clean', 'name_headq_clean', 'name1_legal_clean',
    'name2_legal_clean', 'name3_legal_clean', 'name4_legal_clean',
    'name5_legal_clean', 'name1_headq_clean', 'name2_headq_clean',
    'name3_headq_clean', 'name4_headq_clean', 'name5_headq_clean'
]

gleif_df.columns = new_names

In [53]:
saved_path = "../../../dataset/cleaned/"
filename = "gleif_cleaned.csv"

gleif_filename = os.path.join(saved_path, filename)
gleif_df.to_csv(gleif_filename,encoding='utf-8',header=True, index=False)

In [54]:
s3_filename = 'GLEIF/cleaned/gleif_cleaned.csv'
s3_resource.meta.client.upload_file(Filename=gleif_filename,
                                    Bucket=env_var['S3_BUCKET'], 
                                    Key=s3_filename)

In [7]:
for obj in bucket.objects.filter(Prefix="GLEIF/"):
    print(obj.key)

GLEIF/
GLEIF/cleaned/
GLEIF/cleaned/gleif_cleaned.csv
GLEIF/pre_processed/
GLEIF/pre_processed/gleif_pre_processed.csv
GLEIF/raw/
GLEIF/raw/20221031-0800-gleif-goldencopy-lei2-golden-copy.csv
GLEIF/raw/ISIN_LEI_20221101.csv
GLEIF/raw/gleif.csv


In [6]:
import pandas as pd 
import os 

pd.set_option('display.max_columns',50)

In [19]:
saved_path = "../../../dataset/cleaned/"
filename = "gleif_cleaned.csv"
gleif_filename = os.path.join(saved_path, filename)

gleif_df = pd.read_csv(gleif_filename)

  gleif_df = pd.read_csv(gleif_filename)


In [21]:
gleif_df = gleif_df[gleif_df['lei'].notna()]

In [22]:
gleif_df.to_csv(gleif_filename,encoding='utf-8',header=True, index=False)