# Pre-processing for WRI Dataset

This notebook reads the main WRI dataset from S3 and keeps only the company records and their country information.
The file content is not changed, apart from the column names. The result of this pre-processing is saved back to S3.

In [1]:
import pandas as pd

## 1. Read WRI from S3

In [2]:
# From the AWS Account page, copy the export scripts from the appropriate role using the "Command Line or Programmatic Access" link
# Paste the copied text into ~/credentials.env

from dotenv import dotenv_values, load_dotenv
import os
import pathlib

# Directory that holds credentials.env: CREDENTIAL_DOTENV_DIR if set,
# otherwise PWD, otherwise the default '/opt/app-root/src'.
fallback_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir, 'credentials.env')

# Load the file only when it is present; override=True lets its values
# replace variables that are already set in the environment.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [3]:
import boto3
# Build the S3 resource from the S3_LANDING_* environment variables
# (a missing variable raises KeyError).
s3_resource = boto3.resource(
    service_name="s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)

# Landing bucket that holds the raw WRI file.
bucket_name = os.environ['S3_LANDING_BUCKET']
bucket = s3_resource.Bucket(bucket_name)

# Fetch the object and keep its 'Body' entry for the CSV reader.
wri_key = 'WRI/global_power_plant_database_v_1_3/global_power_plant_database.csv'
wri_object = bucket.Object(wri_key)
wri_file = wri_object.get()['Body']

In [4]:
# Explicit dtypes for the columns whose type is pinned when parsing.
wri_dtypes = {
    'latitude': 'float64',
    'longitude': 'float64',
    'capacity_mw': 'float64',
    'other_fuel3': 'str',
}

# Parse the object body fetched from S3 as a comma-separated UTF-8 file.
# NOTE(review): the head() output further down shows mojibake such as
# 'SociÃ©te', so some source values look double-encoded — confirm upstream.
wri_df = pd.read_csv(
    wri_file,
    encoding='utf-8',
    delimiter=',',
    dtype=wri_dtypes,
)

In [5]:
# Keep only the rows that have an owner (i.e. a company) filled in.
mask = wri_df['owner'].notna()
wri_df = wri_df.loc[mask]

In [6]:
# Keep the company name and country columns, then drop repeated
# (owner, country) pairs.
wri_df = wri_df[['owner', 'country']].drop_duplicates()

In [7]:
# Rename 'owner' to 'company_name' ('country' keeps its name) and preview
# the first rows of the resulting dataset.
wri_df = wri_df.rename(columns={'owner': 'company_name'})
wri_df.head()

Unnamed: 0,company_name,country
19,SociÃ©te AlgÃ©rienne de Production de l\'Elect...,DZA
25,SociÃ©tÃ© AlgÃ©rienne de Production de lâ€™Ele...,DZA
27,Sonelgaz,DZA
34,Sonelgaz Production de lâ€™Electricite,DZA
36,Sharikat Kahraba Hadjret En-Nouss,DZA


In [8]:
# Report how many (company, country) rows remain.
total_companies = wri_df.shape[0]
print(f'Total companies in WRI {total_companies}')

Total companies in WRI 10181


## 2. Save pre-processed WRI

In [9]:
# Save locally to the "data/pre_processed" folder, two levels above the
# notebook's working directory.
# The path is built with os.path.join instead of a hard-coded backslash
# string: "..\..\data\pre_processed" only works on Windows and contains
# invalid escape sequences ("\.", "\d", "\p") that Python warns about.
saved_path = os.path.join("..", "..", "data", "pre_processed")
# Create the target folder if it is missing so to_csv does not fail on a
# fresh checkout.
os.makedirs(saved_path, exist_ok=True)
filename = "wri_pre_processed.csv"
# Kept as a plain string because it is later passed as the upload Filename.
wri_filename = os.path.join(saved_path, filename)
# Write with a header row and without the DataFrame index.
wri_df.to_csv(wri_filename, header=True, index=False)

In [10]:
# Upload the locally saved CSV to the landing bucket under the
# pre-processed WRI key.
s3_filename = 'WRI/pre_processed/wri_pre_processed.csv'
s3_client = s3_resource.meta.client
s3_client.upload_file(
    Filename=wri_filename,
    Bucket=bucket_name,
    Key=s3_filename,
)