# Upload data to S3

In this notebook, we upload the datasets to S3 programmatically.

In [None]:
import sqlite3
from pathlib import Path

import boto3
import pandas as pd

## Upload the CSV file downloaded from Kaggle

https://www.kaggle.com/datasets/zanjibar/100-million-data-csv

In [None]:
# Read the raw trade CSV. header=None because the columns are named
# explicitly in a later cell rather than taken from the file.
data = pd.read_csv(
    filepath_or_buffer="dataset/custom_1988_2020.csv",
    header=None,
)

In [None]:
# Number of rows that were loaded.
data.shape[0]

In [None]:
# Preview the first ten rows.
data.iloc[:10]

In [None]:
# Name the eight positional columns of the trade table.
trade_column_names = [
    "year_month",
    "export_import",
    "country",
    "custom",
    "hs9",
    "q1",
    "q2",
    "value",
]
data.columns = trade_column_names

Convert the data to a Parquet file.

In [None]:
# Write the trade table to Parquet using the fastparquet engine.
data.to_parquet(
    path='dataset/trade_1988_2020.parquet',
    engine='fastparquet',
)

In [None]:
# S3 client used by every bucket and upload call in this notebook.
s3 = boto3.client(service_name='s3')

In [None]:
# Create the project bucket. A bucket this account already owns is fine, so
# that case is reported and skipped. Any other failure (for example the name
# being taken by another account) is left to propagate, because the uploads
# below could not succeed either.
#
# The exception class comes from the client itself (`s3.exceptions`);
# `boto3.exceptions` has no "already exists" error, so referencing one there
# raises AttributeError as soon as create_bucket fails.
try:
    s3.create_bucket(Bucket='trade-final-project-bucket')
except s3.exceptions.BucketAlreadyOwnedByYou:
    print("Bucket 'trade-final-project-bucket' already exists; reusing it.")

In [None]:
# Upload the Parquet file under the same key as its local path.
s3.upload_file(
    Filename="dataset/trade_1988_2020.parquet",
    Bucket="trade-final-project-bucket",
    Key="dataset/trade_1988_2020.parquet",
)

## Upload the files downloaded from the IMF

International Financial Statistics (IFS), 1988-2023 quarterly, all countries, 

Exchange Rates, National Currency Per U.S. Dollar, Period Average Rate

GDP, Nominal, Domestic Currency, Seasonally Adjusted

https://data.imf.org/?sk=4c514d48-b6ba-49ed-8ab9-52b0c1a0179b&sId=1390030341854

### Matching country codes between the different datasets

Country codes: https://www.kaggle.com/datasets/zanjibar/100-million-data-csv

In [None]:
# `data2` (the IMF exchange-rate workbook) is otherwise only loaded further
# down, in the "Load exchange rate data" section, so this cell failed with a
# NameError on a fresh kernel. Load it here so the notebook runs top to bottom.
data2 = pd.read_excel("dataset/International_Financial_Statistics_EX.xlsx", header=1)

# The first column of the workbook supplies the names to be matched.
countrycodes = list(data2.iloc[:, 0])

In [None]:
# Connection to the SQLite code tables; reused by the later
# read_sql_query cells (country_eng and the hs*_eng tables).
conn = sqlite3.connect(database="dataset/codes.db")

In [None]:
# Load the whole country_eng table from the SQLite database.
countrycodes2 = pd.read_sql_query(
    sql="select * from country_eng",
    con=conn,
)

In [None]:
# Display the country table loaded from country_eng.
countrycodes2

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
def find_best_match(name, choices):
    """Return the entry of `choices` that best matches `name`.

    The lookup is delegated to `process.extractOne`; only the first element
    of its result (the matched choice) is returned, the rest is discarded.
    """
    top_choice, *_ = process.extractOne(name, choices)
    return top_choice

In [None]:
# For every name in `countrycodes`, find the closest Country_name in the
# code table and keep that table row (as a record) in `matched`.
matched = []
candidate_names = countrycodes2["Country_name"]
for imf_name in countrycodes:
    best_matched = find_best_match(imf_name, candidate_names)
    matching_rows = countrycodes2[candidate_names == best_matched]
    matched.append(matching_rows.to_records(index=False)[0])

In [None]:
# Turn the list of names into a one-column frame called "Country_name".
countrycodes = pd.DataFrame(countrycodes, columns=["Country_name"])
countrycodes

In [None]:
# Build a frame from the collected records; the matched name is stored as
# "Country_name2" so it does not clash with the original "Country_name".
matched_fields = ["Code", "Country_name2", "Area"]
matched = pd.DataFrame.from_records(matched, columns=matched_fields)
matched

In [None]:
# Put each original name side by side with its matched code-table row.
side_by_side = [countrycodes, matched]
countrycodes = pd.concat(side_by_side, axis="columns")
countrycodes

In [None]:
# The CSV is corrected by hand after it is first written, so only create it
# when it does not exist yet. Writing it unconditionally would overwrite the
# manual corrections every time the notebook is re-run.
countrycodes_csv = Path("dataset/countrycodes.csv")
if countrycodes_csv.exists():
    print(f"{countrycodes_csv} already exists; keeping it (delete the file to regenerate it).")
else:
    # The index is written as well (pandas default), so the file gains a
    # leading unnamed column when it is read back.
    countrycodes.to_csv(countrycodes_csv)

Manually correct the unmatched rows in `dataset/countrycodes.csv` before running the next cell.

In [None]:
# Reload the country-code mapping from disk.
countrycodes = pd.read_csv(filepath_or_buffer="dataset/countrycodes.csv")

In [None]:
# Upload the country-code mapping under the same key as its local path.
s3.upload_file(
    Filename="dataset/countrycodes.csv",
    Bucket="trade-final-project-bucket",
    Key="dataset/countrycodes.csv",
)

### Load exchange rate data

In [None]:
# Exchange-rate workbook; header=1 takes the second row of the sheet as the header.
data2 = pd.read_excel(
    io="dataset/International_Financial_Statistics_EX.xlsx",
    header=1,
)

In [None]:
# Give the first column, whatever its current label, the name "Country".
first_column = data2.columns[0]
data2 = data2.rename(columns={first_column: "Country"})

In [None]:
# Inspect the mapping's column labels before merging on "Country_name".
countrycodes.columns

In [None]:
# Inner-join the mapping with the exchange-rate table: the mapping's
# "Country_name" is matched against the first column of data2.
new_data2 = pd.merge(
    countrycodes,
    data2,
    how="inner",
    left_on="Country_name",
    right_on=data2.columns[0],
)

In [None]:
# Drop the columns at positions 0 and 5.
# NOTE(review): presumably the index column saved in the CSV and the
# duplicated merge key from data2 -- confirm against the merged frame.
unwanted_columns = new_data2.columns[[0, 5]]
new_data2 = new_data2.drop(columns=unwanted_columns)

The rates are currently expressed as domestic currency per US dollar. Convert them to domestic currency per Japanese yen by dividing every country's rate by Japan's rate for the same period.

In [None]:
# Re-base the rates from "domestic currency per US dollar" to "domestic
# currency per Japanese yen" by dividing every row by Japan's row. Columns
# from position 4 onward are treated as the rate columns, the same split the
# melt step below uses for its id/value columns.
subset = new_data2[new_data2["Country_name"] == "Japan"].iloc[:, 4:]
if subset.empty:
    raise ValueError('No "Japan" row found in new_data2; cannot convert the rates to yen.')

# Use Japan's row as a Series indexed by column label so that div() broadcasts
# it across all rows. Dividing by the one-row DataFrame instead would also
# align on the row index, and with fill_value=0 every country except Japan
# would be divided by zero.
japan_rates = subset.iloc[0]

new_data2.iloc[:, 4:] = new_data2.iloc[:, 4:].div(japan_rates, axis="columns")

In [None]:
# Column labels with every space replaced by an underscore.
new_columns = [label.replace(" ", "_") for label in new_data2.columns]

new_columns

In [None]:
# Apply the underscore-separated labels built in the previous cell.
new_data2.columns = new_columns

In [None]:
# Reshape from wide to long: the first four columns stay as identifiers and
# every remaining column becomes one row per country, labelled by "Time".
id_columns = new_data2.columns[:4]
period_columns = new_data2.columns[4:]
new_data2 = new_data2.melt(
    id_vars=id_columns,
    value_vars=period_columns,
    var_name="Time",
)

In [None]:
# Save the long-format exchange-rate table as Parquet.
new_data2.to_parquet(path="dataset/exchangerate2.parquet")

In [None]:
# Upload the exchange-rate Parquet file under the same key as its local path.
s3.upload_file(
    Filename="dataset/exchangerate2.parquet",
    Bucket="trade-final-project-bucket",
    Key="dataset/exchangerate2.parquet",
)

### Load GDP data 

In [None]:
# GDP workbook; header=1 takes the second row of the sheet as the header.
data3 = pd.read_excel(
    io="dataset/International_Financial_Statistics_GDP.xlsx",
    header=1,
)

In [None]:
# Give the first column, whatever its current label, the name "Country".
first_column = data3.columns[0]
data3 = data3.rename(columns={first_column: "Country"})

In [None]:
# Inspect the mapping's column labels before merging on "Country_name".
countrycodes.columns

In [None]:
# Inner-join the mapping with the GDP table: the mapping's "Country_name"
# is matched against the first column of data3.
new_data3 = pd.merge(
    countrycodes,
    data3,
    how="inner",
    left_on="Country_name",
    right_on=data3.columns[0],
)

In [None]:
# Drop the columns at positions 0 and 5.
# NOTE(review): presumably the index column saved in the CSV and the
# duplicated merge key from data3 -- confirm against the merged frame.
unwanted_columns = new_data3.columns[[0, 5]]
new_data3 = new_data3.drop(columns=unwanted_columns)

In [None]:
# Japan's GDP rows. This value is not referenced again in the cells below.
is_japan = new_data3["Country_name"] == "Japan"
subset = new_data3.loc[is_japan]

In [None]:
# Column labels with every space replaced by an underscore.
new_columns = [label.replace(" ", "_") for label in new_data3.columns]

new_columns

In [None]:
# Apply the underscore-separated labels built in the previous cell.
new_data3.columns = new_columns

In [None]:
# Reshape from wide to long: the first four columns stay as identifiers and
# every remaining column becomes one row per country, labelled by "Time".
id_columns = new_data3.columns[:4]
period_columns = new_data3.columns[4:]
new_data3 = new_data3.melt(
    id_vars=id_columns,
    value_vars=period_columns,
    var_name="Time",
)

In [None]:
# Save the long-format GDP table as Parquet.
new_data3.to_parquet(path="dataset/gdp_quarter.parquet")

In [None]:
# Upload the GDP Parquet file under the same key as its local path.
s3.upload_file(
    Filename="dataset/gdp_quarter.parquet",
    Bucket="trade-final-project-bucket",
    Key="dataset/gdp_quarter.parquet",
)

## Upload the HS code and name tables

https://www.kaggle.com/datasets/zanjibar/100-million-data-csv

In [None]:
# Load the whole hs9_eng table from the SQLite database.
hs9codes = pd.read_sql_query(
    sql="select * from hs9_eng",
    con=conn,
)

In [None]:
# Export the hs9 table to CSV (the index is written too, as by default).
hs9codes.to_csv(path_or_buf="dataset/hs9codes.csv")

In [None]:
# Upload the hs9 CSV under the same key as its local path.
s3.upload_file(
    Filename="dataset/hs9codes.csv",
    Bucket="trade-final-project-bucket",
    Key="dataset/hs9codes.csv",
)

In [None]:
# Load the whole hs6_eng table from the SQLite database.
hs6codes = pd.read_sql_query(
    sql="select * from hs6_eng",
    con=conn,
)

In [None]:
# Export the hs6 table to CSV (the index is written too, as by default).
hs6codes.to_csv(path_or_buf="dataset/hs6codes.csv")

In [None]:
# Upload the hs6 CSV under the same key as its local path.
s3.upload_file(
    Filename="dataset/hs6codes.csv",
    Bucket="trade-final-project-bucket",
    Key="dataset/hs6codes.csv",
)

In [None]:
# Load the whole hs2_eng table from the SQLite database.
hs2codes = pd.read_sql_query(
    sql="select * from hs2_eng",
    con=conn,
)

In [None]:
# Export the hs2 table to CSV (the index is written too, as by default).
hs2codes.to_csv(path_or_buf="dataset/hs2codes.csv")

In [None]:
# Upload the hs2 CSV under the same key as its local path.
s3.upload_file(
    Filename="dataset/hs2codes.csv",
    Bucket="trade-final-project-bucket",
    Key="dataset/hs2codes.csv",
)

## Upload the CSV files downloaded from e-Stat

https://www.e-stat.go.jp/en/stat-search/files?page=1&toukei=00350300&tstat=000001013141

In [None]:
# Upload the 2021-2023 export file under the same key as its local path.
s3.upload_file(
    Filename="dataset/trade_ex_2021_2023.csv",
    Bucket="trade-final-project-bucket",
    Key="dataset/trade_ex_2021_2023.csv",
)

In [None]:
# Upload the 2021-2023 import file. The object key mirrors the local path,
# like every other upload in this notebook; the previous key had a stray
# leading "e" ("etrade_im_..."), which did not match the export file's naming.
# NOTE(review): if anything already reads the old "dataset/etrade_im_2021_2023.csv"
# key, update that consumer as well.
s3.upload_file("dataset/trade_im_2021_2023.csv", "trade-final-project-bucket", "dataset/trade_im_2021_2023.csv")