# Pre-processing for WIKI Dataset

...

In [1]:
import pandas as pd

## 1. Set up

In [2]:
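# From the AWS Account page, copy the export scripts from the appropriate role using the "Command Line or Programmatic Access" link.
# Paste the copied text into ~/credentials.env.
# NOTE(review): the cells below read their S3 settings (S3_ENDPOINT, S3_ACCESS_KEY, S3_SECRET_KEY, S3_BUCKET)
# from '../../.env', not from ~/credentials.env — confirm which instructions are current.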

from dotenv import dotenv_values, load_dotenv
import os
import pathlib

In [3]:
# Load the project settings into a plain mapping (os.environ is left untouched).
env_var = dotenv_values('../../.env')

# dotenv_values() yields an empty mapping when the file cannot be found, so check
# the keys the S3 set-up reads and fail here with a clear message rather than with
# a bare KeyError in a later cell.
required_keys = ('S3_ENDPOINT', 'S3_ACCESS_KEY', 'S3_SECRET_KEY', 'S3_BUCKET')
missing_keys = [key for key in required_keys if key not in env_var]
if missing_keys:
    raise KeyError("Missing settings in '../../.env': " + ", ".join(missing_keys))

In [4]:
import boto3
# Build the S3 resource from the endpoint and credentials held in `env_var`.
s3_resource = boto3.resource(
    "s3",
    endpoint_url=env_var['S3_ENDPOINT'],
    aws_access_key_id=env_var['S3_ACCESS_KEY'],
    aws_secret_access_key=env_var['S3_SECRET_KEY'],
)

# Handle on the bucket that the read and listing cells go through.
bucket_name = env_var['S3_BUCKET']
bucket = s3_resource.Bucket(bucket_name)

## 2. Read WIKI from S3

### Company

In [62]:
# Stream the raw company export from S3 and parse it into a DataFrame.
company = bucket.Object('WIKI/raw/company.csv').get()['Body']
company_df = pd.read_csv(
    company,
    sep=',',
    encoding='utf-8',
    low_memory=False,
)

In [66]:
# Preview the first five rows of the company table.
company_df.head(n=5)

Unnamed: 0,wikidata_id,name,lei,perm_id,bloomberg_id,siren,isin,siret,country_name,hq
0,Q66,Boeing,RVHJWBXLJ1RFUBSY1F30,4295903076.0,,,US0970231058,,United States of America,United States of America
1,Q67,Airbus,529900FCMZ4LKXFD0R69,,,383474814.0,,,France,France
2,Q95,Google,7ZW8QJWVPR4P1J1KQY45,4295899948.0,,,,,United States of America,United States of America
3,Q248,Intel,KNX4USFCNGPY45LOCE31,4295906830.0,,,US4581401001,,United States of America,United States of America
4,Q312,Apple,HWUPKR0MPOU8FGXBT394,4295905573.0,,,US0378331005,,United States of America,United States of America


In [65]:
# Remove the 'index' column. Re-binding the name (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError.
company_df = company_df.drop(columns='index', errors='ignore')

In [67]:
# Save locally
saved_path = "../../../dataset/pre_processed/"
filename = "wiki_company.csv"

# to_csv() does not create missing parent directories, so make sure the target exists.
os.makedirs(saved_path, exist_ok=True)

company_filename = os.path.join(saved_path, filename)
company_df.to_csv(company_filename, encoding='utf-8', header=True, index=False)

In [68]:
# Push the locally saved company file to the pre-processed prefix of the bucket.
s3_filename = 'WIKI/pre_processed/company.csv'
s3_resource.meta.client.upload_file(
    Filename=company_filename,
    Bucket=bucket_name,
    Key=s3_filename,
)

### Companies 

In [23]:
# Stream the raw companies export from S3 and parse it into a DataFrame.
companies = bucket.Object('WIKI/raw/companies.csv').get()['Body']
companies_df = pd.read_csv(
    companies,
    sep=',',
    encoding='utf-8',
    low_memory=False,
)

In [26]:
# Remove the 'index' column. Re-binding the name (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError.
companies_df = companies_df.drop(columns='index', errors='ignore')

In [None]:
# NOTE(review): this repeats the load from the "Company" section and rebinds
# `company_df` to the raw CSV contents; nothing else in the "Companies" section
# visible here reads it — confirm whether the cell is still needed.
company = bucket.Object('WIKI/raw/company.csv').get()['Body']
company_df = pd.read_csv(
    company,
    sep=',',
    encoding='utf-8',
    low_memory=False,
)

In [55]:
# Save locally
saved_path = "../../../dataset/pre_processed/"
filename = "wiki_companies.csv"

# to_csv() does not create missing parent directories, so make sure the target exists.
os.makedirs(saved_path, exist_ok=True)

companies_filename = os.path.join(saved_path, filename)
companies_df.to_csv(companies_filename, encoding='utf-8', header=True, index=False)

In [56]:
# Push the locally saved companies file to the pre-processed prefix of the bucket.
s3_filename = 'WIKI/pre_processed/companies.csv'
s3_resource.meta.client.upload_file(
    Filename=companies_filename,
    Bucket=bucket_name,
    Key=s3_filename,
)

In [58]:
# List every key stored under the pre-processed prefix to confirm the uploads.
pre_processed_objects = bucket.objects.filter(Prefix="WIKI/pre_processed")
for s3_object in pre_processed_objects:
    print(s3_object.key)

WIKI/pre_processed/
WIKI/pre_processed/companies.csv
WIKI/pre_processed/company.csv
WIKI/pre_processed/wiki_pre_processed.csv
