# Pre-processing for SPOTT Dataset

This notebook reads the three SPOTT datasets from S3, concatenates them together and saves the final result locally and into the SPOTT S3 bucket.

In [1]:
import pandas as pd

## 1. Read SPOTT from S3

In [2]:
# From the AWS Account page, copy the export scripts from the appropriate role using the "Command Line or Programmatic Access" link
# Paste the copied text into ~/credentials.env

from dotenv import dotenv_values, load_dotenv
import os
import pathlib

dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', os.environ.get('PWD', '/opt/app-root/src'))
dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path,override=True)

In [8]:
import boto3
s3_resource = boto3.resource(
    service_name="s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)
bucket_name = os.environ['S3_LANDING_BUCKET']
bucket = s3_resource.Bucket(bucket_name)

spott_rubber = bucket.Object('SPOTT/raw/SPOTT-natural-rubber-companies-profile-data-downloaded-2022-10-28.csv').get()['Body']
spott_oil = bucket.Object('SPOTT/raw/SPOTT-palm-oil-companies-profile-data-downloaded-2022-10-28.csv').get()['Body']
spott_pulp = bucket.Object('SPOTT/raw/SPOTT-timber-pulp-paper-companies-profile-data-downloaded-2022-10-28.csv').get()['Body']

In [9]:
spott_df1 = pd.read_csv(spott_rubber, encoding='utf-8', delimiter=',', skiprows=7)
spott_df2 = pd.read_csv(spott_oil, encoding='utf-8', delimiter=',', skiprows=7)
spott_df3 = pd.read_csv(spott_pulp, encoding='utf-8', delimiter=',', skiprows=7)

In [12]:
spott_df = pd.concat([spott_df1, spott_df2, spott_df3], axis=0)

In [13]:
spott_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 245 entries, 0 to 102
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Company                  245 non-null    object 
 1   Parent company           92 non-null     object 
 2   Subsidiaries             131 non-null    object 
 3   Market cap (USD)         98 non-null     float64
 4   Bloomberg ticker         240 non-null    object 
 5   Thomson Reuters ticker   156 non-null    object 
 6   ISIN                     97 non-null     object 
 7   Sedol                    30 non-null     object 
 8   Landbank (ha)            89 non-null     float64
 9   GPSNR member?            31 non-null     object 
 10  Activities               237 non-null    object 
 11  Smallholders             243 non-null    object 
 12  Locations                240 non-null    object 
 13  Headquarters             244 non-null    object 
 14  Notes                    7

In [15]:
spott_df.columns = ['company_name', 'parent_company', 'subsidiaries', 'market_cap', 'bbg_ticker', 'reuters_ticker', 
                    'isin', 'sedol', 'landbank_ha', 'gpsnr_member', 'activities', 'small_holders', 'locations',
                    'headquarter_country', 'notes', 'website', 'landbank_oil_ha', 'rspo_member', 'member_since']

In [18]:
spott_df.head()

Unnamed: 0,company_name,parent_company,subsidiaries,market_cap,bbg_ticker,reuters_ticker,isin,sedol,landbank_ha,gpsnr_member,activities,small_holders,locations,headquarter_country,notes,website,landbank_oil_ha,rspo_member,member_since
0,Austindo Nusantara Jaya Tbk PT,,,,,,,,,No,,,,,,,,,
1,Bakrie Sumatera Plantations Tbk PT,PT Bakrie & Brothers Tbk,PT Huma Indah Mekar (HIM),20000000.0,UNSP IJ Equity,UNSP.JK,ID1000099708,,19789.0,No,Natural rubber cultivation and processing,Unclear whether company has industrial plantat...,Indonesia (Sumatra),Indonesia,,https://www.bakriesumatera.com/,,,
2,Bridgestone Corporation,,"Firestone Natural Rubber Company, LLC",29197000000.0,5108 JP Equity,5108.T,JP3830800003,,,Yes,"Natural rubber cultivation, processing and dis...",Company has industrial plantation suppliers; C...,"Liberia (Harbel), Indonesia (Kalimantan, Sumat...",Japan,,https://www.bridgestone.com,,,
3,Cheng Shin Rubber Industry Co Ltd (正新橡胶工业股份有限公司),,"Cheng Shin Rubber (China) Co., Ltd. and Cheng ...",4124000000.0,2105 TT Equity,,TW0002105004,,,No,"Natural rubber manufacturing, trading and dist...",Company has scheme/outgrower smallholder suppl...,"China (Xiamen, Chongqing, Zhangzhou)",Taiwan,,https://www.csttires.com/int/about-cst/,,,
4,Continental AG,,"General Tire, Continental Reifen Deutschland GmbH",15804000000.0,CON GR Equity,,DE0005439004,,,No,Natural rubber manufacturing,Company has scheme/outgrower smallholder suppl...,"Germany (Hanover, Korbatch), Czech Republic (O...",Germany,,https://www.continental.com/en/,,,


In [20]:
print('Total companies in SPOTT {}'.format(spott_df.shape[0]))

Total companies in SPOTT 245


## 2. Save pre-processed SPOTT

In [22]:
# Save locally to "data" folder
saved_path = "..\..\data\pre_processed"
filename = "spott_pre_processed.csv"
spott_filename = os.path.join(saved_path, filename)
spott_df.to_csv(spott_filename, header=True, index=False)

In [23]:
# Save final results to S3
s3_filename = 'SPOTT/pre_processed/spott_pre_processed.csv'
s3_resource.meta.client.upload_file(Filename=spott_filename, Bucket=bucket_name, Key=s3_filename)