# Pre-processing for WIKI Dataset

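This notebook reads the raw WIKI result files from the S3 landing bucket, concatenates them into a single table, drops the `Idx` column and renames the remaining columns, and then saves the result both locally and under the `WIKI/pre_processed/` prefix of the same S3 bucket.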

In [1]:
import pandas as pd

## 1. Read WIKI from S3

In [2]:
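# From the AWS Account page, copy the export script for the appropriate role using the "Command Line or Programmatic Access" link.
# Paste the copied text into a file named credentials.env in the directory given by CREDENTIAL_DOTENV_DIR
# (if that variable is unset, the code below falls back to $PWD, then to /opt/app-root/src).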

from dotenv import dotenv_values, load_dotenv
import os
import pathlib

# Resolve the directory holding credentials.env: CREDENTIAL_DOTENV_DIR wins,
# then the shell's PWD, then a fixed default location.
fallback_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')

# Only load the file when it is present; values from the file override any
# variables already set in the process environment (override=True).
credentials_file_present = os.path.exists(dotenv_path)
if credentials_file_present:
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [3]:
import boto3

# Connection settings for the landing bucket all come from environment
# variables; a missing variable raises KeyError here.
s3_resource = boto3.resource(
    "s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)

# Handle on the bucket used for both reading the raw files and uploading the result.
bucket_name = os.environ['S3_LANDING_BUCKET']
bucket = s3_resource.Bucket(bucket_name)

In [4]:
# Key prefix under which the raw result files are stored in the bucket.
folder_s3 = 'WIKI/raw/'
# Placeholder for the combined table; populated by the loading cell below.
wiki_df = None

In [5]:
# Read the 20 raw result files (results_1.csv ... results_20.csv) from the
# raw prefix and stack them row-wise into a single DataFrame.
#
# The frames are collected in a list and concatenated once. Calling pd.concat
# inside the loop re-copies every previously loaded row on each iteration, and
# appending onto the existing wiki_df meant that re-running this cell on its
# own duplicated all rows. Building wiki_df from scratch makes the cell
# idempotent.
wiki_frames = []
for count in range(1, 21):
    filename = 'results_' + str(count) + '.csv'
    wiki_ds = bucket.Object(folder_s3 + filename).get()['Body']
    wiki_temp = pd.read_csv(wiki_ds, encoding='utf-8', delimiter=',')
    wiki_frames.append(wiki_temp)

# Row labels are left as read from each file (not reset), matching the
# previous behaviour; the index is not written out when the table is saved.
wiki_df = pd.concat(wiki_frames, axis=0)

In [8]:
# Remove the 'Idx' column, then give the remaining five columns their
# working names (assigned positionally, so the column order matters).
wiki_column_names = ['short_name', 'aliases', 'company_name', 'country', 'lei']
wiki_df.drop(columns=['Idx'], inplace=True)
wiki_df.columns = wiki_column_names

In [9]:
# Preview the first five rows of the combined table.
wiki_df.head(n=5)

Unnamed: 0,short_name,aliases,company_name,country,lei
0,Boeing,"['The Boeing Company', 'Boeing Company']",THE BOEING COMPANY,United States of America,RVHJWBXLJ1RFUBSY1F30
1,Airbus,['Airbus Commercial Aircraft'],Airbus SE,France,529900FCMZ4LKXFD0R69
2,Google,"['Google Inc.', 'Google LLC']",Google LLC,United States of America,7ZW8QJWVPR4P1J1KQY45
3,Intel,"['Intel Corporation', 'N M Electronics', 'Inte...",Intel Corporation,United States of America,KNX4USFCNGPY45LOCE31
4,Apple Inc.,"['Apple Computer, Inc.', 'Apple Computer Inc',...",Apple Inc.,United States of America,HWUPKR0MPOU8FGXBT394


In [10]:
# Report the number of rows in the combined table.
total_rows = len(wiki_df)
print('Total companies in WIKI {}'.format(total_rows))

Total companies in WIKI 198509


## 2. Save pre-processed WIKI

In [11]:
# Save locally to the "data" folder.
# The directory is assembled from its components with os.path.join instead of
# a hard-coded "..\..\data\pre_processed" literal: that literal relied on
# invalid escape sequences (\. \d \p) and only formed a real path on Windows;
# on POSIX systems it was a single directory name containing backslashes.
saved_path = os.path.join("..", "..", "data", "pre_processed")
# Create the target directory if it does not exist yet, so to_csv cannot fail
# on a missing folder.
os.makedirs(saved_path, exist_ok=True)
filename = "wiki_pre_processed.csv"
wiki_filename = os.path.join(saved_path, filename)
# Write with a header row and without the DataFrame index.
wiki_df.to_csv(wiki_filename, header=True, index=False)

In [13]:
# Upload the locally written CSV to the bucket under the pre-processed prefix.
s3_filename = 'WIKI/pre_processed/wiki_pre_processed.csv'
s3_resource.meta.client.upload_file(
    Filename=wiki_filename,
    Bucket=bucket_name,
    Key=s3_filename,
)