# CORPWATCH Pre-processing

This notebook reads from S3 the three main CORPWATCH datasets that contain information about companies. The pre-processing here consists of selecting some information from these files and combining them to produce one dataset of unique company records.

In [1]:
import pandas as pd

## 1. Setup to read CORPWATCH from S3

In [2]:
# On the AWS Account page, open the "Command Line or Programmatic Access" link for the
# appropriate role and copy the export scripts.
# Paste the copied text into a file named credentials.env inside the directory given by
# CREDENTIAL_DOTENV_DIR (falls back to PWD, then to /opt/app-root/src).

from dotenv import dotenv_values, load_dotenv
import os
import pathlib

# Resolve the directory that holds credentials.env: explicit override first,
# then the PWD environment variable, then the default app root.
default_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', default_dir)
dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'

# Load the file only when it is present; override=True is passed so that values
# from the file take precedence over variables already set in the environment.
if dotenv_path.exists():
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [3]:
import boto3
# Build the S3 resource for the landing bucket. Endpoint and credentials are read
# from the environment; a missing variable raises KeyError at this point.
s3_resource = boto3.resource(
    service_name="s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)
# Bucket name is kept separately because the upload step at the end needs it too.
bucket_name = os.environ['S3_LANDING_BUCKET']
# Bucket handle used by the cells below to fetch the raw CorpWatch files.
bucket = s3_resource.Bucket(bucket_name)

## 2. Link company names, location and lei from CorpWatch files

In [13]:
# Fetch the main companies table (tab-separated) from the landing bucket
companies_obj = bucket.Object('CorpWatch/raw/companies.csv')
companies_file = companies_obj.get()['Body']
corpwatch_df = pd.read_csv(companies_file, sep='\t', encoding='utf-8', low_memory=False)

In [14]:
# Summarize the raw companies table: columns, non-null counts and dtypes
corpwatch_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1283087 entries, 0 to 1283086
Data columns (total 6 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   row_id        1283087 non-null  int64  
 1   cw_id         653524 non-null   float64
 2   cik           1221379 non-null  float64
 3   company_name  1283087 non-null  object 
 4   source_type   1283087 non-null  object 
 5   source_id     1283087 non-null  int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 58.7+ MB


In [15]:
# Keep only the identifier and name columns, then drop exact duplicate rows
corpwatch_df = corpwatch_df[['cw_id', 'cik', 'company_name']].drop_duplicates()

In [16]:
# Discard rows that have no CorpWatch ID
corpwatch_df = corpwatch_df.dropna(subset=['cw_id'])

In [17]:
# Preview the first rows after column selection and filtering
corpwatch_df.head()

Unnamed: 0,cw_id,cik,company_name
0,781188.0,13.0,CORPORATE INCOME FUND SEVENTY NINTH SHORT TERM...
1,1.0,20.0,K TRON INTERNATIONAL INC
2,1249859.0,63.0,FNW BANCORP INC
3,2.0,1750.0,AAR CORP
4,3.0,1800.0,ABBOTT LABORATORIES


In [18]:
# Report how many rows remain after de-duplication and filtering
print('Total companies in CORPWATCH {}'.format(len(corpwatch_df)))

Total companies in CORPWATCH 653524


In [19]:
# Fetch the company locations table (tab-separated) from the landing bucket
locations_obj = bucket.Object('CorpWatch/raw/company_locations.csv')
locations_file = locations_obj.get()['Body']
locations_df = pd.read_csv(locations_file, sep='\t', encoding='utf-8', low_memory=False)

In [20]:
# Summarize the raw locations table: columns, dtypes and memory usage
locations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2234325 entries, 0 to 2234324
Data columns (total 16 columns):
 #   Column        Dtype  
---  ------        -----  
 0   location_id   int64  
 1   cw_id         float64
 2   date          object 
 3   type          object 
 4   raw_address   object 
 5   street_1      object 
 6   street_2      object 
 7   city          object 
 8   state         object 
 9   postal_code   object 
 10  country       float64
 11  country_code  object 
 12  subdiv_code   object 
 13  min_year      int64  
 14  max_year      int64  
 15  most_recent   int64  
dtypes: float64(2), int64(4), object(10)
memory usage: 272.7+ MB


In [26]:
# Keep CorpWatch ID, city and country code only, then drop exact duplicate rows
locations_df = locations_df[['cw_id', 'city', 'country_code']].drop_duplicates()

In [27]:
# Keep only rows where ID, city and country code are all present
locations_df = locations_df.dropna(subset=['cw_id', 'city', 'country_code'])

In [28]:
# Preview the first rows of the cleaned locations table
locations_df.head()

Unnamed: 0,cw_id,city,country_code
0,1.0,PITMAN,US
1,2.0,WOOD DALE,US
2,3.0,ABBOTT PARK,US
4,4.0,NEW YORK,US
6,5.0,NEW YORK,US


In [29]:
# Attach city/country to each company; the left join keeps companies that have no location
corpwatch_df = corpwatch_df.merge(locations_df, on='cw_id', how='left')

In [30]:
# The row count grows here: a company that exists in several locations
# appears once per location after the merge
print('Total companies in CORPWATCH {}'.format(len(corpwatch_df)))

Total companies in CORPWATCH 719598


In [31]:
# Preview the company records with city and country code attached
corpwatch_df.head()

Unnamed: 0,cw_id,cik,company_name,city,country_code
0,781188.0,13.0,CORPORATE INCOME FUND SEVENTY NINTH SHORT TERM...,NEW YORK,US
1,1.0,20.0,K TRON INTERNATIONAL INC,PITMAN,US
2,1249859.0,63.0,FNW BANCORP INC,ELGIN,US
3,2.0,1750.0,AAR CORP,WOOD DALE,US
4,3.0,1800.0,ABBOTT LABORATORIES,ABBOTT PARK,US


In [35]:
# Fetch the CIK-to-LEI mapping (pipe-separated) from the landing bucket
lei_obj = bucket.Object('CorpWatch/raw/cik_lei.csv')
lei_file = lei_obj.get()['Body']
cik_lei_df = pd.read_csv(lei_file, sep='|', encoding='utf-8', low_memory=False)

In [36]:
# Keep CIK, name and LEI, rename them to match the corpwatch_df column naming,
# and drop exact duplicate rows
cik_lei_df = (
    cik_lei_df[['CIK', 'Name', 'LEI']]
    .rename(columns={'CIK': 'cik', 'Name': 'company_name_lei', 'LEI': 'lei'})
    .drop_duplicates()
)

In [37]:
# Preview the first rows of the CIK-to-LEI mapping
cik_lei_df.head()

Unnamed: 0,cik,company_name_lei,lei
0,1750,Aar Corp,MP76T5YQX3YK5VVAQ802
1,1800,Abbott Laboratories,549300ZYY7524VF6JP88
2,2024,Ace Hardware Corp,5493002WMDWGNHBLKW26
3,2034,Aceto Corp,7MUSABK5TKG3168JQL21
4,2110,Columbia Acorn Trust,549300KHJ8TKYZRHYE36


In [38]:
# Attach the LEI (and the LEI file's company name) by CIK; the left join keeps
# companies that have no LEI entry
corpwatch_df = corpwatch_df.merge(cik_lei_df, on='cik', how='left')

In [40]:
# Preview the merged records; LEI columns are empty where no CIK match was found
corpwatch_df.head()

Unnamed: 0,cw_id,cik,company_name,city,country_code,company_name_lei,lei
0,781188.0,13.0,CORPORATE INCOME FUND SEVENTY NINTH SHORT TERM...,NEW YORK,US,,
1,1.0,20.0,K TRON INTERNATIONAL INC,PITMAN,US,,
2,1249859.0,63.0,FNW BANCORP INC,ELGIN,US,,
3,2.0,1750.0,AAR CORP,WOOD DALE,US,Aar Corp,MP76T5YQX3YK5VVAQ802
4,3.0,1800.0,ABBOTT LABORATORIES,ABBOTT PARK,US,Abbott Laboratories,549300ZYY7524VF6JP88


In [39]:
# Report the number of company records after the LEI merge
print('Total companies in CORPWATCH {}'.format(len(corpwatch_df)))

Total companies in CORPWATCH 719598


## 3. Save CorpWatch pre-processed file

In [41]:
# Save locally to the "data" folder.
# The path is assembled from its components rather than written as a backslash-separated
# literal: "\.", "\d" and "\p" are invalid string escapes, and a backslash is only a
# path separator on Windows. On Windows the resulting value is unchanged.
saved_path = os.path.join("..", "..", "data", "pre_processed")
# Create the target directory if it is missing so that to_csv does not fail on a fresh checkout.
os.makedirs(saved_path, exist_ok=True)
filename = "corpwatch_pre_processed.csv"
# Kept as a plain string because the S3 upload step reuses it as the local file name.
corpwatch_filename = os.path.join(saved_path, filename)
corpwatch_df.to_csv(corpwatch_filename, header=True, index=False)

In [42]:
# Upload the locally saved CSV to the pre-processed area of the landing bucket
s3_filename = 'CorpWatch/pre_processed/corpwatch_pre_processed.csv'
s3_resource.meta.client.upload_file(
    Filename=corpwatch_filename,
    Bucket=bucket_name,
    Key=s3_filename,
)