# Cleaning SPOTT pre-processed file

This notebook reads the pre-processed SPOTT dataset from the local data folder and cleans the country, ID and company-name columns. It also looks up each company's LEI from its cleaned ISIN. The final result is saved locally and uploaded to S3 under `SPOTT/cleaned/`.

In [1]:
import pandas as pd
import os

In [2]:
# Import CountryCleaner
from financial_entity_cleaner.location import CountryCleaner

In [3]:
# Import the module for cleaning company's name
from financial_entity_cleaner.company import CompanyNameCleaner

In [4]:
# Import BankingIdCleaner
from financial_entity_cleaner.id import BankingIdCleaner

## 1. Read SPOTT pre-processed file

In [5]:
# Location of the pre-processed SPOTT file that this notebook reads.
# The folder is built with os.path.join so it works on any OS; a literal such as
# "..\..\data" relies on invalid escape sequences and is a valid path on Windows only.
saved_path = os.path.join("..", "..", "data", "pre_processed")
filename = "spott_pre_processed.csv"

In [6]:
# Load the pre-processed SPOTT file into a DataFrame
pre_processed_file = os.path.join(saved_path, filename)
spott_df = pd.read_csv(pre_processed_file)

In [7]:
# Preview the first rows to confirm the file loaded with the expected columns
spott_df.head()

Unnamed: 0,company_name,parent_company,subsidiaries,market_cap,bbg_ticker,reuters_ticker,isin,sedol,landbank_ha,gpsnr_member,activities,small_holders,locations,headquarter_country,notes,website,landbank_oil_ha,rspo_member,member_since
0,Austindo Nusantara Jaya Tbk PT,,,,,,,,,No,,,,,,,,,
1,Bakrie Sumatera Plantations Tbk PT,PT Bakrie & Brothers Tbk,PT Huma Indah Mekar (HIM),20000000.0,UNSP IJ Equity,UNSP.JK,ID1000099708,,19789.0,No,Natural rubber cultivation and processing,Unclear whether company has industrial plantat...,Indonesia (Sumatra),Indonesia,,https://www.bakriesumatera.com/,,,
2,Bridgestone Corporation,,"Firestone Natural Rubber Company, LLC",29197000000.0,5108 JP Equity,5108.T,JP3830800003,,,Yes,"Natural rubber cultivation, processing and dis...",Company has industrial plantation suppliers; C...,"Liberia (Harbel), Indonesia (Kalimantan, Sumat...",Japan,,https://www.bridgestone.com,,,
3,Cheng Shin Rubber Industry Co Ltd (正新橡胶工业股份有限公司),,"Cheng Shin Rubber (China) Co., Ltd. and Cheng ...",4124000000.0,2105 TT Equity,,TW0002105004,,,No,"Natural rubber manufacturing, trading and dist...",Company has scheme/outgrower smallholder suppl...,"China (Xiamen, Chongqing, Zhangzhou)",Taiwan,,https://www.csttires.com/int/about-cst/,,,
4,Continental AG,,"General Tire, Continental Reifen Deutschland GmbH",15804000000.0,CON GR Equity,,DE0005439004,,,No,Natural rubber manufacturing,Company has scheme/outgrower smallholder suppl...,"Germany (Hanover, Korbatch), Czech Republic (O...",Germany,,https://www.continental.com/en/,,,


## 2. Cleaning Country information

In [8]:
# Instantiate the cleaner used for the country column
country_cleaner_obj = CountryCleaner()

In [9]:
# Output settings: lower-case values, and return the alpha-2 code plus the short country name
country_cleaner_obj.letter_case = 'lower'
country_cleaner_obj.output_info = [
    CountryCleaner.ATTRIBUTE_ALPHA2,
    CountryCleaner.ATTRIBUTE_SHORT_NAME,
]

In [10]:
# Clean the headquarter country; the new columns are named with the original column name
# plus a suffix (headquarter_country_short_name, headquarter_country_alpha2)
spott_df = country_cleaner_obj.clean_df(
    df=spott_df,
    cols=['headquarter_country'],
    output_names_as='suffix',
)

Cleaning column [headquarter_country]: 100%|██████████████████████████████████████████████████| 245/245 [00:01<00:00, 190.62it/s]


In [11]:
# The raw country column is superseded by the cleaned columns, so remove it
spott_df.drop(columns=['headquarter_country'], inplace=True)

In [12]:
# Preview: the cleaned country columns should now appear at the end of the frame
spott_df.head()

Unnamed: 0,company_name,parent_company,subsidiaries,market_cap,bbg_ticker,reuters_ticker,isin,sedol,landbank_ha,gpsnr_member,activities,small_holders,locations,notes,website,landbank_oil_ha,rspo_member,member_since,headquarter_country_short_name,headquarter_country_alpha2
0,Austindo Nusantara Jaya Tbk PT,,,,,,,,,No,,,,,,,,,,
1,Bakrie Sumatera Plantations Tbk PT,PT Bakrie & Brothers Tbk,PT Huma Indah Mekar (HIM),20000000.0,UNSP IJ Equity,UNSP.JK,ID1000099708,,19789.0,No,Natural rubber cultivation and processing,Unclear whether company has industrial plantat...,Indonesia (Sumatra),,https://www.bakriesumatera.com/,,,,indonesia,id
2,Bridgestone Corporation,,"Firestone Natural Rubber Company, LLC",29197000000.0,5108 JP Equity,5108.T,JP3830800003,,,Yes,"Natural rubber cultivation, processing and dis...",Company has industrial plantation suppliers; C...,"Liberia (Harbel), Indonesia (Kalimantan, Sumat...",,https://www.bridgestone.com,,,,japan,jp
3,Cheng Shin Rubber Industry Co Ltd (正新橡胶工业股份有限公司),,"Cheng Shin Rubber (China) Co., Ltd. and Cheng ...",4124000000.0,2105 TT Equity,,TW0002105004,,,No,"Natural rubber manufacturing, trading and dist...",Company has scheme/outgrower smallholder suppl...,"China (Xiamen, Chongqing, Zhangzhou)",,https://www.csttires.com/int/about-cst/,,,,taiwan,tw
4,Continental AG,,"General Tire, Continental Reifen Deutschland GmbH",15804000000.0,CON GR Equity,,DE0005439004,,,No,Natural rubber manufacturing,Company has scheme/outgrower smallholder suppl...,"Germany (Hanover, Korbatch), Czech Republic (O...",,https://www.continental.com/en/,,,,germany,de


## 3. Cleaning IDs

In [13]:
# Instantiate the cleaner used to validate the banking identifiers (ISIN, SEDOL, LEI)
id_cleaner_obj = BankingIdCleaner()

In [14]:
# Setup cleaning properties
id_cleaner_obj.invalid_ids_as_nan = True
id_cleaner_obj.validation_as_categorical = True
# The letter case has to be an actual option value, not the quoted text of an attribute
# reference (a string literal such as "SomeClass.CONSTANT" is never resolved to the constant).
# NOTE(review): 'upper' follows the plain-string convention used for the other cleaners in this
# notebook ('lower'); confirm the accepted values against the installed library version.
id_cleaner_obj.output_lettercase = "upper"

In [15]:
# Validate the ISIN and SEDOL columns; the source columns are replaced by
# <col>_cleaned_id and <col>_isvalid_id
spott_df = id_cleaner_obj.clean_df(
    spott_df,
    cols=['isin', 'sedol'],
    remove_cols=True,
    output_names_as='suffix',
    types=['isin', 'sedol'],
)

Column [sedol] Type [sedol] : 100%|██████████████████████████████████████████████████| 245/245 [00:02<00:00, 88.98it/s] 


In [16]:
# Preview the cleaned ID columns and their validation flags
spott_df.head()

Unnamed: 0,company_name,parent_company,subsidiaries,market_cap,bbg_ticker,reuters_ticker,landbank_ha,gpsnr_member,activities,small_holders,...,website,landbank_oil_ha,rspo_member,member_since,headquarter_country_short_name,headquarter_country_alpha2,isin_cleaned_id,isin_isvalid_id,sedol_cleaned_id,sedol_isvalid_id
0,Austindo Nusantara Jaya Tbk PT,,,,,,,No,,,...,,,,,,,,,,
1,Bakrie Sumatera Plantations Tbk PT,PT Bakrie & Brothers Tbk,PT Huma Indah Mekar (HIM),20000000.0,UNSP IJ Equity,UNSP.JK,19789.0,No,Natural rubber cultivation and processing,Unclear whether company has industrial plantat...,...,https://www.bakriesumatera.com/,,,,indonesia,id,ID1000099708,1.0,,
2,Bridgestone Corporation,,"Firestone Natural Rubber Company, LLC",29197000000.0,5108 JP Equity,5108.T,,Yes,"Natural rubber cultivation, processing and dis...",Company has industrial plantation suppliers; C...,...,https://www.bridgestone.com,,,,japan,jp,JP3830800003,1.0,,
3,Cheng Shin Rubber Industry Co Ltd (正新橡胶工业股份有限公司),,"Cheng Shin Rubber (China) Co., Ltd. and Cheng ...",4124000000.0,2105 TT Equity,,,No,"Natural rubber manufacturing, trading and dist...",Company has scheme/outgrower smallholder suppl...,...,https://www.csttires.com/int/about-cst/,,,,taiwan,tw,TW0002105004,1.0,,
4,Continental AG,,"General Tire, Continental Reifen Deutschland GmbH",15804000000.0,CON GR Equity,,,No,Natural rubber manufacturing,Company has scheme/outgrower smallholder suppl...,...,https://www.continental.com/en/,,,,germany,de,DE0005439004,1.0,,


## 4. Getting LEI from cleaned ISIN

In [17]:
# Credentials: on the AWS Account page, open the "Command Line or Programmatic Access" link of the
# appropriate role, copy the export scripts and paste the copied text into ~/credentials.env

from dotenv import dotenv_values, load_dotenv
import os
import pathlib

# Folder searched for credentials.env: CREDENTIAL_DOTENV_DIR, else PWD, else /opt/app-root/src
dotenv_dir = os.environ.get(
    'CREDENTIAL_DOTENV_DIR',
    os.environ.get('PWD', '/opt/app-root/src'),
)
dotenv_path = pathlib.Path(dotenv_dir, 'credentials.env')

# Only load when the file is present; values in the file override variables already set
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [18]:
import boto3

# S3 connection for the landing bucket; endpoint, keys and bucket name are read from the
# environment and a missing variable raises KeyError
s3_resource = boto3.resource(
    "s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)

bucket_name = os.environ['S3_LANDING_BUCKET']
bucket = s3_resource.Bucket(bucket_name)

In [19]:
# Stream the GLEIF ISIN-to-LEI mapping file from S3 straight into a DataFrame
isin_lei_object = bucket.Object('GLEIF/raw/ISIN_LEI_20221101.csv')
isin_lei = isin_lei_object.get()['Body']
isin_lei_df = pd.read_csv(
    isin_lei,
    encoding='utf-8',
    sep=',',
    low_memory=False,
)

In [20]:
# Check the size, column names and dtypes of the ISIN-LEI lookup table
isin_lei_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7728459 entries, 0 to 7728458
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   LEI     object
 1   ISIN    object
dtypes: object(2)
memory usage: 117.9+ MB


In [22]:
# Lower-case the lookup columns by name rather than by position, so a change in the
# column order of the source file cannot silently swap the LEI and ISIN labels.
# Re-running this cell is a no-op because rename ignores labels that are not present.
isin_lei_df = isin_lei_df.rename(columns={'LEI': 'lei', 'ISIN': 'isin'})

In [30]:
# Preview the lookup table after the column rename
isin_lei_df.head()

Unnamed: 0,lei,isin
0,724500V6UOK62XEZ2L78,NLEN03247298
1,529900W18LQJJN6SJ336,DE000SN4ESX0
2,724500V6UOK62XEZ2L78,NLENX7751979
3,724500V6UOK62XEZ2L78,NLICE3689953
4,959800T2W59YXMVKRU25,ES0A04191961


In [25]:
# Merge ISIN to get LEI.
# The lookup is reduced to one row per non-null ISIN first: pandas matches null keys to each
# other and repeats left rows for duplicated right keys, and either case would attach wrong
# LEIs or inflate the number of companies.
isin_lei_lookup = isin_lei_df.dropna(subset=['isin']).drop_duplicates(subset=['isin'])
rows_before_merge = len(spott_df)
spott_df = spott_df.merge(isin_lei_lookup, left_on=['isin_cleaned_id'], right_on=['isin'], how='left')
# A left merge against unique keys must keep exactly one row per company
assert len(spott_df) == rows_before_merge, "LEI merge changed the number of rows"

In [31]:
# The merge key from the lookup table duplicates isin_cleaned_id, so remove it
spott_df.drop(columns=['isin'], inplace=True)

In [32]:
# Validate the merged LEI; the lei column is replaced by lei_cleaned_id and lei_isvalid_id
spott_df = id_cleaner_obj.clean_df(
    spott_df,
    cols=['lei'],
    remove_cols=True,
    output_names_as='suffix',
    types=['lei'],
)

Column [lei] Type [lei] : 100%|██████████████████████████████████████████████████| 245/245 [00:01<00:00, 204.79it/s]


In [33]:
# Preview the LEI columns added by the merge and the validation step
spott_df.head()

Unnamed: 0,company_name,parent_company,subsidiaries,market_cap,bbg_ticker,reuters_ticker,landbank_ha,gpsnr_member,activities,small_holders,...,rspo_member,member_since,headquarter_country_short_name,headquarter_country_alpha2,isin_cleaned_id,isin_isvalid_id,sedol_cleaned_id,sedol_isvalid_id,lei_cleaned_id,lei_isvalid_id
0,Austindo Nusantara Jaya Tbk PT,,,,,,,No,,,...,,,,,,,,,,
1,Bakrie Sumatera Plantations Tbk PT,PT Bakrie & Brothers Tbk,PT Huma Indah Mekar (HIM),20000000.0,UNSP IJ Equity,UNSP.JK,19789.0,No,Natural rubber cultivation and processing,Unclear whether company has industrial plantat...,...,,,indonesia,id,ID1000099708,1.0,,,,
2,Bridgestone Corporation,,"Firestone Natural Rubber Company, LLC",29197000000.0,5108 JP Equity,5108.T,,Yes,"Natural rubber cultivation, processing and dis...",Company has industrial plantation suppliers; C...,...,,,japan,jp,JP3830800003,1.0,,,549300DHPOF90OYYD780,1.0
3,Cheng Shin Rubber Industry Co Ltd (正新橡胶工业股份有限公司),,"Cheng Shin Rubber (China) Co., Ltd. and Cheng ...",4124000000.0,2105 TT Equity,,,No,"Natural rubber manufacturing, trading and dist...",Company has scheme/outgrower smallholder suppl...,...,,,taiwan,tw,TW0002105004,1.0,,,,
4,Continental AG,,"General Tire, Continental Reifen Deutschland GmbH",15804000000.0,CON GR Equity,,,No,Natural rubber manufacturing,Company has scheme/outgrower smallholder suppl...,...,,,germany,de,DE0005439004,1.0,,,529900A7YD9C0LLXM621,1.0


## 5. Cleaning company names

In [34]:
# Instantiate the cleaner used for the company names
company_cleaner_obj = CompanyNameCleaner()

In [35]:
# Configure the company-name cleaner: normalised legal terms, lower-case output without accents
company_cleaner_obj.normalize_legal_terms = True
company_cleaner_obj.letter_case = "lower"
company_cleaner_obj.remove_accents = True

# Cleaning rules, listed in the order they are given to the cleaner
company_cleaner_obj.default_cleaning_rules = [
    'place_word_the_at_the_beginning',
    'remove_words_in_asterisk',
    'remove_words_in_parentheses',
    'remove_question_marks_in_parentheses',
    'replace_hyphen_by_space',
    'replace_underscore_by_space',
    'remove_text_puctuation_except_dot',
    'remove_math_symbols',
    'remove_parentheses',
    'remove_brackets',
    'remove_curly_brackets',
    'remove_single_quote_next_character',
    'remove_double_quote',
    'enforce_single_space_between_words',
]

In [36]:
# Clean company_name into company_name_clean, passing the cleaned alpha-2 country column.
# The last argument is passed as the boolean True: the string 'True' only worked because any
# non-empty string is truthy (so 'False' would have enabled the option as well).
# NOTE(review): presumably this flag controls merging of default and country-specific legal
# terms - confirm the parameter name against the installed library version.
spott_df = company_cleaner_obj.clean_df(
    spott_df,
    'company_name',
    'company_name_clean',
    'headquarter_country_alpha2',
    True,
)

In [41]:
# Turn the row index into an explicit company_id column
spott_df = spott_df.reset_index().rename(columns={'index': 'company_id'})

In [42]:
# Preview the final frame, including company_id and company_name_clean
spott_df.head()

Unnamed: 0,company_id,company_name,parent_company,subsidiaries,market_cap,bbg_ticker,reuters_ticker,landbank_ha,gpsnr_member,activities,...,member_since,headquarter_country_short_name,headquarter_country_alpha2,isin_cleaned_id,isin_isvalid_id,sedol_cleaned_id,sedol_isvalid_id,lei_cleaned_id,lei_isvalid_id,company_name_clean
0,0,Austindo Nusantara Jaya Tbk PT,,,,,,,No,,...,,,,,,,,,,austindo nusantara jaya tbk pt
1,1,Bakrie Sumatera Plantations Tbk PT,PT Bakrie & Brothers Tbk,PT Huma Indah Mekar (HIM),20000000.0,UNSP IJ Equity,UNSP.JK,19789.0,No,Natural rubber cultivation and processing,...,,indonesia,id,ID1000099708,1.0,,,,,bakrie sumatera plantations tbk perseroan terb...
2,2,Bridgestone Corporation,,"Firestone Natural Rubber Company, LLC",29197000000.0,5108 JP Equity,5108.T,,Yes,"Natural rubber cultivation, processing and dis...",...,,japan,jp,JP3830800003,1.0,,,549300DHPOF90OYYD780,1.0,bridgestone corporation
3,3,Cheng Shin Rubber Industry Co Ltd (正新橡胶工业股份有限公司),,"Cheng Shin Rubber (China) Co., Ltd. and Cheng ...",4124000000.0,2105 TT Equity,,,No,"Natural rubber manufacturing, trading and dist...",...,,taiwan,tw,TW0002105004,1.0,,,,,cheng shin rubber industry co limited
4,4,Continental AG,,"General Tire, Continental Reifen Deutschland GmbH",15804000000.0,CON GR Equity,,,No,Natural rubber manufacturing,...,,germany,de,DE0005439004,1.0,,,529900A7YD9C0LLXM621,1.0,continental aktiengesellschaft


In [43]:
# Report the number of rows (one per company) in the cleaned frame
total_companies = spott_df.shape[0]
print(f'Total companies in SPOTT {total_companies}')

Total companies in SPOTT 245


## 6. Save cleaned SPOTT

In [44]:
# Save locally to the "data/cleaned" folder.
# The folder is built with os.path.join so it works on any OS; a literal such as
# "..\..\data" relies on invalid escape sequences and is a valid path on Windows only.
saved_path = os.path.join("..", "..", "data", "cleaned")
filename = "spott_cleaned.csv"
# Create the output folder on first use so to_csv does not fail on a fresh checkout
os.makedirs(saved_path, exist_ok=True)
spott_filename = os.path.join(saved_path, filename)
spott_df.to_csv(spott_filename, header=True, index=False)

In [45]:
# Upload the locally saved CSV to the landing bucket under the SPOTT/cleaned/ prefix
s3_filename = 'SPOTT/cleaned/spott_cleaned.csv'
s3_client = s3_resource.meta.client
s3_client.upload_file(
    Filename=spott_filename,
    Bucket=bucket_name,
    Key=s3_filename,
)