# GLEIF Pre-processing

In [1]:
import pandas as pd

## 1. Read GLEIF from S3

In [2]:
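# From the AWS Account page, copy the export script for the appropriate role via the "Command Line or Programmatic Access" link.
# Paste the copied text into a file named credentials.env in the directory given by the
# CREDENTIAL_DOTENV_DIR environment variable (falls back to $PWD, then /opt/app-root/src).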

from dotenv import dotenv_values, load_dotenv
import os
import pathlib

# Directory that holds credentials.env: an explicit CREDENTIAL_DOTENV_DIR wins,
# otherwise the shell's PWD, otherwise the default app root.
_default_dotenv_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', _default_dotenv_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')

# Load the file only when it is present; override=True lets its values
# replace variables that are already set in the environment.
_dotenv_found = os.path.exists(dotenv_path)
if _dotenv_found:
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [16]:
import boto3
# Connection settings for the landing S3 endpoint, all read from the environment.
# A missing variable raises KeyError here, before any connection is attempted.
_s3_settings = {
    "service_name": "s3",
    "endpoint_url": os.environ['S3_LANDING_ENDPOINT'],
    "aws_access_key_id": os.environ['S3_LANDING_ACCESS_KEY'],
    "aws_secret_access_key": os.environ['S3_LANDING_SECRET_KEY'],
}
s3_resource = boto3.resource(**_s3_settings)

# Handle on the landing bucket used for both the raw download and the final upload.
bucket_name = os.environ['S3_LANDING_BUCKET']
bucket = s3_resource.Bucket(bucket_name)

In [4]:
# Fetch the raw GLEIF golden-copy export from the landing bucket and keep the
# 'Body' of the response, which is what gets handed to the CSV reader.
_gleif_raw_key = 'GLEIF/raw/20221031-0800-gleif-goldencopy-lei2-golden-copy.csv'
_gleif_response = bucket.Object(_gleif_raw_key).get()
gleif_file = _gleif_response['Body']

In [5]:
# Parse the comma-separated, UTF-8 export into a DataFrame.
# low_memory=False disables pandas' chunked parsing of the file.
_csv_options = {'encoding': 'utf-8', 'sep': ',', 'low_memory': False}
gleif_df = pd.read_csv(gleif_file, **_csv_options)

In [6]:
# List every column label of the raw frame, one per line, to inspect the schema.
column_labels = list(gleif_df.columns)
for col_name in column_labels:
    print(col_name)

LEI
Entity.LegalName
Entity.LegalName.xmllang
Entity.OtherEntityNames.OtherEntityName.1
Entity.OtherEntityNames.OtherEntityName.1.xmllang
Entity.OtherEntityNames.OtherEntityName.1.type
Entity.OtherEntityNames.OtherEntityName.2
Entity.OtherEntityNames.OtherEntityName.2.xmllang
Entity.OtherEntityNames.OtherEntityName.2.type
Entity.OtherEntityNames.OtherEntityName.3
Entity.OtherEntityNames.OtherEntityName.3.xmllang
Entity.OtherEntityNames.OtherEntityName.3.type
Entity.OtherEntityNames.OtherEntityName.4
Entity.OtherEntityNames.OtherEntityName.4.xmllang
Entity.OtherEntityNames.OtherEntityName.4.type
Entity.OtherEntityNames.OtherEntityName.5
Entity.OtherEntityNames.OtherEntityName.5.xmllang
Entity.OtherEntityNames.OtherEntityName.5.type
Entity.TransliteratedOtherEntityNames.TransliteratedOtherEntityName.1
Entity.TransliteratedOtherEntityNames.TransliteratedOtherEntityName.1.xmllang
Entity.TransliteratedOtherEntityNames.TransliteratedOtherEntityName.1.type
Entity.TransliteratedOtherEntityName

In [9]:
# Select columns: identifier, legal name, the five alternative-name slots,
# both country fields and the entity status.
cols = (
    ['LEI', 'Entity.LegalName']
    + ['Entity.OtherEntityNames.OtherEntityName.{}'.format(slot) for slot in range(1, 6)]
    + ['Entity.LegalAddress.Country', 'Entity.HeadquartersAddress.Country', 'Entity.EntityStatus']
)
# Keep only those columns, in the order listed above.
gleif_df = gleif_df[cols]

In [11]:
# Rename with an explicit old -> new mapping instead of assigning names by
# position: each short name stays tied to its GLEIF source column even if the
# selection order changes, and re-running the cell is a harmless no-op.
column_renames = {
    'LEI': 'lei',
    'Entity.LegalName': 'legal_name',
    'Entity.OtherEntityNames.OtherEntityName.1': 'other_name1',
    'Entity.OtherEntityNames.OtherEntityName.2': 'other_name2',
    'Entity.OtherEntityNames.OtherEntityName.3': 'other_name3',
    'Entity.OtherEntityNames.OtherEntityName.4': 'other_name4',
    'Entity.OtherEntityNames.OtherEntityName.5': 'other_name5',
    'Entity.LegalAddress.Country': 'legal_country',
    'Entity.HeadquartersAddress.Country': 'headquarter_country',
    'Entity.EntityStatus': 'status',
}
# rename returns a new frame, so the previous object is not mutated in place.
gleif_df = gleif_df.rename(columns=column_renames)

In [12]:
# Preview the first five rows of the trimmed, renamed frame.
gleif_df.head(n=5)

Unnamed: 0,lei,legal_name,other_name1,other_name2,other_name3,other_name4,other_name5,legal_country,headquarter_country,status
0,001GPB6A9XPE8XJICC14,FIDELITY ADVISOR SERIES I - Fidelity Advisor L...,,,,,,US,US,ACTIVE
1,004L5FPTUREIWK9T2N63,"Hutchin Hill Capital, LP",,,,,,US,US,ACTIVE
2,00EHHQ2ZHDCFXJCPCL46,Vanguard Russell 1000 Growth Index Trust,,,,,,US,US,ACTIVE
3,00GBW0Z2GYIER7DHDS71,"ARISTEIA CAPITAL, L.L.C.",,,,,,US,US,ACTIVE
4,00KLB2PFTM3060S2N216,HARRIS ASSOCIATES INVESTMENT TRUST - Oakmark I...,,,,,,US,US,ACTIVE


In [13]:
# Report how many rows (one per LEI record) the frame contains.
n_companies = len(gleif_df.index)
print('Total companies in GLEIF {}'.format(n_companies))

Total companies in GLEIF 2252977


## 2. Save pre-processed GLEIF

In [14]:
# Save locally to the "data" folder.
# The path is assembled from components so the separator matches the host OS.
# The previous backslash literal relied on invalid escape sequences and is not
# a directory path on POSIX systems.
saved_path = os.path.join("..", "..", "data", "pre_processed")
# Create the target directory if needed; to_csv does not create missing folders.
os.makedirs(saved_path, exist_ok=True)
filename = "gleif_pre_processed.csv"
gleif_filename = os.path.join(saved_path, filename)
# Write with a header row and without the positional index column.
gleif_df.to_csv(gleif_filename, header=True, index=False)

In [18]:
# Save final results to S3: push the locally written CSV to the
# pre-processed area of the landing bucket.
s3_filename = 'GLEIF/pre_processed/gleif_pre_processed.csv'
_s3_client = s3_resource.meta.client
_s3_client.upload_file(
    Filename=gleif_filename,
    Bucket=bucket_name,
    Key=s3_filename,
)