# Cleaning GLEIF pre-processed file

This notebook reads the pre-processed GLEIF dataset from the local data folder and cleans its country, ID (LEI) and company name columns. The final result is saved locally and uploaded to the GLEIF S3 bucket.

In [1]:
import pandas as pd
import os

In [2]:
# Import CountryCleaner
from financial_entity_cleaner.location import CountryCleaner

In [3]:
# Import the module for cleaning company's name
from financial_entity_cleaner.company import CompanyNameCleaner

In [4]:
# Import BankingIdCleaner
from financial_entity_cleaner.id import BankingIdCleaner

## 1. Read GLEIF pre-processed file

In [5]:
# Location of the pre-processed GLEIF file that this notebook reads.
# Built with os.path.join instead of a backslash literal: "\.", "\d" and "\p"
# are invalid escape sequences, and a hard-coded "\" separator only works on Windows.
saved_path = os.path.join("..", "..", "data", "pre_processed")
filename = "gleif_pre_processed.csv"

In [8]:
# Load the pre-processed GLEIF file. low_memory=False makes pandas read the file
# in a single pass instead of in chunks.
# The CSV contains a leftover "Unnamed: 0" index column; drop it so it is not
# carried into the cleaned output (errors="ignore" keeps this safe if the
# upstream file stops writing it).
gleif_df = (
    pd.read_csv(os.path.join(saved_path, filename), low_memory=False)
    .drop(columns=["Unnamed: 0"], errors="ignore")
)

In [9]:
# Preview the first rows of the loaded frame
gleif_df.head(n=5)

Unnamed: 0,lei,legal_name,other_name1,other_name2,other_name3,other_name4,other_name5,legal_country,headquarter_country,status
0,001GPB6A9XPE8XJICC14,FIDELITY ADVISOR SERIES I - Fidelity Advisor L...,,,,,,US,US,ACTIVE
1,004L5FPTUREIWK9T2N63,"Hutchin Hill Capital, LP",,,,,,US,US,ACTIVE
2,00EHHQ2ZHDCFXJCPCL46,Vanguard Russell 1000 Growth Index Trust,,,,,,US,US,ACTIVE
3,00GBW0Z2GYIER7DHDS71,"ARISTEIA CAPITAL, L.L.C.",,,,,,US,US,ACTIVE
4,00KLB2PFTM3060S2N216,HARRIS ASSOCIATES INVESTMENT TRUST - Oakmark I...,,,,,,US,US,ACTIVE


## 2. Cleaning Country information

In [10]:
# Instantiate the cleaner that will process the country columns
country_cleaner_obj = CountryCleaner()

In [11]:
# Configure the cleaner: lower-case letter case, and request two output
# attributes for every cleaned column (alpha2 code and short name).
country_cleaner_obj.letter_case = 'lower'
country_cleaner_obj.output_info = [
    CountryCleaner.ATTRIBUTE_ALPHA2,
    CountryCleaner.ATTRIBUTE_SHORT_NAME,
]

In [12]:
# Clean both country columns; the requested attributes are added as new columns
# named with the attribute as a suffix (e.g. legal_country_alpha2).
# This is slow: the recorded progress bar shows 4:50:10 for the
# headquarter_country column alone.
gleif_df = country_cleaner_obj.clean_df(
    df=gleif_df,
    cols=['legal_country', 'headquarter_country'],
    output_names_as='suffix',
)

Cleaning column [headquarter_country]: 100%|██████████████████████████████████████████████████| 2252977/2252977 [4:50:10<00:00, 129.40it/s]  


In [14]:
# This cell used to run `gleif_df=spott_df`, but `spott_df` is never defined in
# this notebook: on a fresh kernel it raises NameError, and with leftover kernel
# state it silently replaces the GLEIF frame with another dataset.
# Instead, verify that the country columns needed by the company-name cleaning
# further down are present.
missing_cols = {'legal_country_alpha2', 'headquarter_country_alpha2'} - set(gleif_df.columns)
assert not missing_cols, 'Country cleaning did not produce: {}'.format(sorted(missing_cols))

In [None]:
# Drop the raw country columns. Reassigning (rather than inplace=True) and
# errors='ignore' keep the cell safe to re-run: a second execution no longer
# raises KeyError for columns that are already gone.
gleif_df = gleif_df.drop(columns=['legal_country', 'headquarter_country'], errors='ignore')

In [15]:
# Preview the frame after the country cleaning step
gleif_df.head(n=5)

Unnamed: 0,lei,legal_name,other_name1,other_name2,other_name3,other_name4,other_name5,legal_country,headquarter_country,status,legal_country_short_name,legal_country_alpha2,headquarter_country_short_name,headquarter_country_alpha2
0,001GPB6A9XPE8XJICC14,FIDELITY ADVISOR SERIES I - Fidelity Advisor L...,,,,,,US,US,ACTIVE,united states,us,united states,us
1,004L5FPTUREIWK9T2N63,"Hutchin Hill Capital, LP",,,,,,US,US,ACTIVE,united states,us,united states,us
2,00EHHQ2ZHDCFXJCPCL46,Vanguard Russell 1000 Growth Index Trust,,,,,,US,US,ACTIVE,united states,us,united states,us
3,00GBW0Z2GYIER7DHDS71,"ARISTEIA CAPITAL, L.L.C.",,,,,,US,US,ACTIVE,united states,us,united states,us
4,00KLB2PFTM3060S2N216,HARRIS ASSOCIATES INVESTMENT TRUST - Oakmark I...,,,,,,US,US,ACTIVE,united states,us,united states,us


## 3. Cleaning LEI

In [None]:
# Instantiate the cleaner that will process the LEI column
id_cleaner_obj = BankingIdCleaner()

In [None]:
# Setup cleaning properties
id_cleaner_obj.invalid_ids_as_nan = True
id_cleaner_obj.validation_as_categorical = True
# This was previously the quoted text "BankingIdCleaner.UPPER_LETTER_CASE",
# i.e. a plain string that merely spells a constant's name instead of a
# letter-case value.
# NOTE(review): assumes the property accepts 'upper' in the same way the other
# cleaners in this notebook accept letter_case='lower' — confirm against
# BankingIdCleaner.
id_cleaner_obj.output_lettercase = "upper"

In [None]:
# Run the ID cleaner over the 'lei' column, declaring its ID type as 'lei'
gleif_df = id_cleaner_obj.clean_df(
    gleif_df,
    cols=['lei'],
    remove_cols=True,
    output_names_as='suffix',
    types=['lei'],
)

In [None]:
# Preview the frame after the LEI cleaning step
gleif_df.head(n=5)

## 5. Cleaning Company's names

In [16]:
# Instantiate the cleaner that will process the company name columns
company_cleaner_obj = CompanyNameCleaner()

In [17]:
# Configure the company name cleaner: legal-term normalisation on, lower-case
# letter case, accents removed.
company_cleaner_obj.normalize_legal_terms = True
company_cleaner_obj.letter_case = "lower"
company_cleaner_obj.remove_accents = True

# Cleaning rules, kept in the original order. The rule names are keys defined
# by the library and must stay spelled exactly as written here.
company_cleaner_obj.default_cleaning_rules = [
    'place_word_the_at_the_beginning',
    'remove_words_in_asterisk',
    'remove_words_in_parentheses',
    'remove_question_marks_in_parentheses',
    'replace_hyphen_by_space',
    'replace_underscore_by_space',
    'remove_text_puctuation_except_dot',
    'remove_math_symbols',
    'remove_parentheses',
    'remove_brackets',
    'remove_curly_brackets',
    'remove_single_quote_next_character',
    'remove_double_quote',
    'enforce_single_space_between_words',
]

In [20]:
# Clean 'legal_name' into 'name_legal_clean', using the legal country's alpha2 code.
# The last argument is now the boolean False: the string 'False' that was passed
# before is truthy, so any boolean check on it behaved as True.
# NOTE(review): the meaning of this fifth positional flag is not visible here —
# confirm against CompanyNameCleaner.clean_df.
gleif_df = company_cleaner_obj.clean_df(gleif_df, 'legal_name', 'name_legal_clean', 'legal_country_alpha2', False)

In [24]:
# Clean 'legal_name' into 'name_headq_clean', using the headquarter country's alpha2 code.
# The last argument is now the boolean False: the string 'False' that was passed
# before is truthy, so any boolean check on it behaved as True.
# NOTE(review): the meaning of this fifth positional flag is not visible here —
# confirm against CompanyNameCleaner.clean_df.
gleif_df = company_cleaner_obj.clean_df(gleif_df, 'legal_name', 'name_headq_clean', 'headquarter_country_alpha2', False)

In [29]:
# Clean each alternative name (other_name1 .. other_name5) with the legal country.
# Two fixes over the copy-pasted version:
#   * every call used to read 'other_name1', so name2..name5_legal_clean were
#     just copies of the first alternative name;
#   * the last argument was the truthy string 'False' instead of the boolean.
# NOTE(review): the meaning of the fifth positional flag is not visible here —
# confirm against CompanyNameCleaner.clean_df.
for name_idx in range(1, 6):
    gleif_df = company_cleaner_obj.clean_df(
        gleif_df,
        'other_name{}'.format(name_idx),
        'name{}_legal_clean'.format(name_idx),
        'legal_country_alpha2',
        False,
    )

In [30]:
# Clean each alternative name (other_name1 .. other_name5) with the headquarter country.
# Two fixes over the copy-pasted version:
#   * every call used to read 'other_name1', so name2..name5_headq_clean were
#     just copies of the first alternative name;
#   * the last argument was the truthy string 'False' instead of the boolean.
# NOTE(review): the meaning of the fifth positional flag is not visible here —
# confirm against CompanyNameCleaner.clean_df.
for name_idx in range(1, 6):
    gleif_df = company_cleaner_obj.clean_df(
        gleif_df,
        'other_name{}'.format(name_idx),
        'name{}_headq_clean'.format(name_idx),
        'headquarter_country_alpha2',
        False,
    )

In [31]:
# Preview the frame after the company name cleaning step
gleif_df.head(n=5)

Unnamed: 0,lei,legal_name,other_name1,other_name2,other_name3,other_name4,other_name5,legal_country,headquarter_country,status,...,name1_legal_clean,name2_legal_clean,name3_legal_clean,name4_legal_clean,name5_legal_clean,name1_headq_clean,name2_headq_clean,name3_headq_clean,name4_headq_clean,name5_headq_clean
0,001GPB6A9XPE8XJICC14,FIDELITY ADVISOR SERIES I - Fidelity Advisor L...,,,,,,US,US,ACTIVE,...,,,,,,,,,,
1,004L5FPTUREIWK9T2N63,"Hutchin Hill Capital, LP",,,,,,US,US,ACTIVE,...,,,,,,,,,,
2,00EHHQ2ZHDCFXJCPCL46,Vanguard Russell 1000 Growth Index Trust,,,,,,US,US,ACTIVE,...,,,,,,,,,,
3,00GBW0Z2GYIER7DHDS71,"ARISTEIA CAPITAL, L.L.C.",,,,,,US,US,ACTIVE,...,,,,,,,,,,
4,00KLB2PFTM3060S2N216,HARRIS ASSOCIATES INVESTMENT TRUST - Oakmark I...,,,,,,US,US,ACTIVE,...,,,,,,,,,,


In [32]:
# Report the number of rows (one per GLEIF record) in the cleaned frame
print('Total companies in GLEIF {}'.format(len(gleif_df.index)))

Total companies in GLEIF 2252977


## 6. Save cleaned GLEIF

In [33]:
# Save locally to the "data/cleaned" folder.
# The path is built with os.path.join instead of a backslash literal: "\.", "\d"
# and "\c" are invalid escape sequences, and a hard-coded "\" separator only
# works on Windows.
saved_path = os.path.join("..", "..", "data", "cleaned")
filename = "gleif_cleaned.csv"
# Create the target folder first so the write does not fail when it is missing.
os.makedirs(saved_path, exist_ok=True)
gleif_filename = os.path.join(saved_path, filename)
gleif_df.to_csv(gleif_filename, header=True, index=False)

In [34]:
# Save final results to S3
# NOTE(review): neither `s3_resource` nor `bucket_name` is defined in the visible
# part of this notebook, and the recorded output of this cell is
# "NameError: name 's3_resource' is not defined". A setup cell that creates both
# must run before this upload — TODO add/restore it.
s3_filename = 'GLEIF/cleaned/gleif_cleaned.csv'
s3_resource.meta.client.upload_file(Filename=gleif_filename, Bucket=bucket_name, Key=s3_filename)

NameError: name 's3_resource' is not defined