In [116]:
import boto3
import pandas as pd

## Upload the CSV file downloaded from Kaggle.

https://www.kaggle.com/datasets/zanjibar/100-million-data-csv

In [26]:
# Read the raw trade CSV; header=None keeps pandas from treating the first
# data row as column names (names are assigned in a later cell).
data = pd.read_csv("dataset/custom_1988_2020.csv", header=None)

In [27]:
# Row count of the loaded trade table.
len(data)

113607322

In [28]:
# Preview the first 10 rows; columns still carry the default integer labels here.
data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7
0,198801,1,103,100,190,0,35843,34353
1,198801,1,103,100,120991000,0,1590,4154
2,198801,1,103,100,210390900,0,4500,2565
3,198801,1,103,100,220890200,0,3000,757
4,198801,1,103,100,240220000,0,26000,40668
5,198801,1,103,100,250410000,0,5,8070
6,198801,1,103,100,271000700,0,374,2485
7,198801,1,103,100,271220000,0,400,616
8,198801,1,103,100,280300000,0,1900,2020
9,198801,1,103,100,280610000,0,1000,227


In [29]:
# Give the eight positional columns of the raw trade table readable names.
data.columns = [
    "year_month",
    "export_import",
    "country",
    "custom",
    "hs9",
    "q1",
    "q2",
    "value",
]

Convert the data to a Parquet file.

In [30]:
# Persist the named trade table as Parquet using the fastparquet engine.
data.to_parquet(
    "dataset/trade_1988_2020.parquet",
    engine="fastparquet",
)

In [122]:
# Low-level S3 client used by every bucket/upload cell below.
s3 = boto3.client(service_name="s3")

In [3]:
# Create the project bucket, tolerating the case where this account already
# owns it. boto3.exceptions has no S3ObjectAlreadyExistsError, so naming it in
# the except clause would itself raise AttributeError as soon as create_bucket
# failed; the modelled service errors live on the client as s3.exceptions.*.
# BucketAlreadyExists (name taken by another account) is deliberately NOT
# caught: the uploads below could not succeed in that case.
# NOTE(review): outside us-east-1, create_bucket also requires
# CreateBucketConfiguration={'LocationConstraint': <region>} — confirm region.
try:
    s3.create_bucket(Bucket='trade-final-project-bucket')
except s3.exceptions.BucketAlreadyOwnedByYou:
    print("Bucket 'trade-final-project-bucket' already exists; reusing it.")

In [3]:
# Push the Parquet trade table to the project bucket under the same relative key.
s3.upload_file(
    Filename="dataset/trade_1988_2020.parquet",
    Bucket="trade-final-project-bucket",
    Key="dataset/trade_1988_2020.parquet",
)

## Upload the Excel file downloaded from the IMF.

International Financial Statistics (IFS), 1988-2023 monthly, all countries, 

Exchange Rates, National Currency Per U.S. Dollar, Period Average Rate

https://data.imf.org/?sk=4c514d48-b6ba-49ed-8ab9-52b0c1a0179b&sId=1390030341854

In [79]:
# Load the IMF exchange-rate workbook; header=1 takes the sheet's second row
# as the column names.
data2 = pd.read_excel(
    "dataset/International_Financial_Statistics_.xlsx",
    header=1,
)

In [80]:
# Row count of the exchange-rate table.
len(data2)

212

In [21]:
# Preview: first column holds the country name, the rest are monthly columns.
data2.head()

Unnamed: 0.1,Unnamed: 0,Jan 1988,Feb 1988,Mar 1988,Apr 1988,May 1988,Jun 1988,Jul 1988,Aug 1988,Sep 1988,...,Jun 2022,Jul 2022,Aug 2022,Sep 2022,Oct 2022,Nov 2022,Dec 2022,Jan 2023,Feb 2023,Mar 2023
0,"Afghanistan, Islamic Rep. of",39.27643,39.27643,39.27643,39.27643,39.27643,39.27643,39.27643,39.27643,39.27643,...,,,,,,,,,,
1,Albania,,,,,,,,,,...,113.25,115.29,115.47,118.01,119.26,115.15,108.52,107.9,107.98,106.91
2,Algeria,5.072,5.2615,5.2951,5.3867,5.5401,5.7655,6.1058,6.3977,6.5118,...,145.8061,146.3142,142.4505,140.5599,140.2631,139.2511,137.5775,136.2533,136.4209,135.9905
3,"Andorra, Principality of",,,,,,,,,,...,0.946448,0.982424,0.987319,1.009716,1.017743,0.98027,0.944393,0.928591,0.933262,0.934071
4,Angola,2.9918e-08,2.9918e-08,2.9918e-08,2.9918e-08,2.9918e-08,2.9918e-08,2.9918e-08,2.9918e-08,2.9918e-08,...,430.624952,430.474643,429.656502,430.515843,451.695645,500.900607,504.4643,503.801764,504.114361,504.466905


### Matching country codes between the different datasets.

IMF: https://www.imf.org/external/pubs/ft/weo/2022/02/weodata/co.pdf

Japanese government: https://www.customs.go.jp/toukei/sankou/code/country_e.htm

In [63]:
# IMF country names: the first column of the exchange-rate table, as a plain list.
countrycodes = data2.iloc[:, 0].tolist()

In [32]:
import sqlite3

In [33]:
# Open the SQLite file that the lookup-table queries below read from
# (country_eng, hs9_eng, hs6_eng, hs2_eng).
# NOTE(review): no conn.close() is visible in this notebook — confirm whether
# the connection should be closed once the last query has run.
conn = sqlite3.connect("dataset/codes.db")

In [34]:
# Pull the full country lookup table from the codes database.
countrycodes2 = pd.read_sql_query(
    sql="select * from country_eng",
    con=conn,
)

In [64]:
# Display the lookup table (columns: Country, Country_name, Area).
countrycodes2

Unnamed: 0,Country,Country_name,Area
0,103,Korea,Asia
1,104,North_Korea,Asia
2,105,China,Asia
3,106,Taiwan,Asia
4,107,Mongolia,Asia
...,...,...,...
227,627,Northern_Mariana_Islands_(USA),Oceania
228,628,Palau,Oceania
229,701,For_Order,Special_Area
230,702,Unknown,Special_Area


In [35]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [65]:
def find_best_match(name, choices):
    """Return the entry of ``choices`` that fuzzy-matches ``name`` best.

    Delegates to ``process.extractOne`` and keeps only the first element of
    its result (the matched choice), discarding the remaining elements.
    """
    top_choice = process.extractOne(name, choices)[0]
    return top_choice

In [66]:
# For each IMF country name, find the closest Country_name in the lookup table
# and keep the first lookup row carrying that name as a record
# (Country, Country_name, Area).
matched = [
    countrycodes2.loc[
        countrycodes2["Country_name"]
        == find_best_match(imf_name, countrycodes2["Country_name"])
    ].to_records(index=False)[0]
    for imf_name in countrycodes
]

In [67]:
# Wrap the IMF name list in a one-column frame so it can sit beside the matches.
countrycodes = pd.DataFrame(countrycodes, columns=["Country_name"])
countrycodes

Unnamed: 0,Country_name
0,"Afghanistan, Islamic Rep. of"
1,Albania
2,Algeria
3,"Andorra, Principality of"
4,Angola
...,...
207,"Yemen, People's Dem. Rep. of"
208,"Yemen, Rep. of"
209,Yugoslavia
210,Zambia


In [72]:
# Turn the list of matched lookup records into a frame with explicit column
# names; "Country_name2" keeps it distinct from the IMF "Country_name" column.
matched = pd.DataFrame.from_records(matched, columns=["Code", "Country_name2", "Area"])
matched

Unnamed: 0,Code,Country_name,Area
0,130,Afghanistan,Asia
1,229,Albania,Central_and_East_Europe_Russia
2,503,Algeria,Africa
3,212,Andorra,Western_Europe
4,535,Angola,Africa
...,...,...,...
207,149,Yemen,Middle_East
208,149,Yemen,Middle_East
209,236,Latvia,Central_and_East_Europe_Russia
210,554,Zambia,Africa


In [73]:
# Place each IMF name next to its fuzzy-matched code/name/area, aligned on index.
countrycodes = pd.concat(
    [countrycodes, matched],
    axis="columns",
)
countrycodes

Unnamed: 0,Country_name,Code,Country_name.1,Area
0,"Afghanistan, Islamic Rep. of",130,Afghanistan,Asia
1,Albania,229,Albania,Central_and_East_Europe_Russia
2,Algeria,503,Algeria,Africa
3,"Andorra, Principality of",212,Andorra,Western_Europe
4,Angola,535,Angola,Africa
...,...,...,...,...
207,"Yemen, People's Dem. Rep. of",149,Yemen,Middle_East
208,"Yemen, Rep. of",149,Yemen,Middle_East
209,Yugoslavia,236,Latvia,Central_and_East_Europe_Russia
210,Zambia,554,Zambia,Africa


In [74]:
# Save the automatic matches for manual review. The index is written too,
# which is why an 'Unnamed: 0' column appears when the file is read back.
countrycodes.to_csv("dataset/countrycodes.csv")

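Manually correct the wrongly matched entries in `dataset/countrycodes.csv` before continuing (for example, Yugoslavia was fuzzy-matched to Latvia above).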

In [117]:
# Reload the mapping file; this rebinds countrycodes to whatever is on disk.
countrycodes = pd.read_csv("dataset/countrycodes.csv")

In [118]:
# Check the reloaded column names (includes the saved index as 'Unnamed: 0').
countrycodes.columns

Index(['Unnamed: 0', 'Country_name', 'Code', 'Country_name2', 'Area'], dtype='object')

In [98]:
# Attach the country codes to the exchange-rate rows: inner join of the mapping's
# Country_name against the first column of the exchange-rate table.
new_data2 = pd.merge(
    countrycodes,
    data2,
    how="inner",
    left_on="Country_name",
    right_on=data2.columns[0],
)

In [105]:
# Preview the merged frame; note the leftover 'Unnamed: 0_x' / 'Unnamed: 0_y' columns.
new_data2.head()

Unnamed: 0,Unnamed: 0_x,Country_name,Code,Country_name2,Area,Unnamed: 0_y,Jan 1988,Feb 1988,Mar 1988,Apr 1988,...,Jun 2022,Jul 2022,Aug 2022,Sep 2022,Oct 2022,Nov 2022,Dec 2022,Jan 2023,Feb 2023,Mar 2023
0,0,"Afghanistan, Islamic Rep. of",130,Afghanistan,Asia,"Afghanistan, Islamic Rep. of",39.27643,39.27643,39.27643,39.27643,...,,,,,,,,,,
1,1,Albania,229,Albania,Central_and_East_Europe_Russia,Albania,,,,,...,113.25,115.29,115.47,118.01,119.26,115.15,108.52,107.9,107.98,106.91
2,2,Algeria,503,Algeria,Africa,Algeria,5.072,5.2615,5.2951,5.3867,...,145.8061,146.3142,142.4505,140.5599,140.2631,139.2511,137.5775,136.2533,136.4209,135.9905
3,3,"Andorra, Principality of",212,Andorra,Western_Europe,"Andorra, Principality of",,,,,...,0.946448,0.982424,0.987319,1.009716,1.017743,0.98027,0.944393,0.928591,0.933262,0.934071
4,4,Angola,535,Angola,Africa,Angola,2.9918e-08,2.9918e-08,2.9918e-08,2.9918e-08,...,430.624952,430.474643,429.656502,430.515843,451.695645,500.900607,504.4643,503.801764,504.114361,504.466905


In [108]:
# Remove the two merge leftovers: the saved CSV index ('Unnamed: 0_x') and the
# duplicated IMF country-name column ('Unnamed: 0_y'). Dropping by name rather
# than by position [0, 5] keeps the cell safe to re-run: a second positional
# drop would silently remove 'Country_name' and another real column.
# errors="ignore" makes a repeat run a no-op.
new_data2 = new_data2.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y"], errors="ignore")

In [109]:
# Confirm the helper columns are gone and the monthly columns follow 'Area'.
new_data2.head()

Unnamed: 0,Country_name,Code,Country_name2,Area,Jan 1988,Feb 1988,Mar 1988,Apr 1988,May 1988,Jun 1988,...,Jun 2022,Jul 2022,Aug 2022,Sep 2022,Oct 2022,Nov 2022,Dec 2022,Jan 2023,Feb 2023,Mar 2023
0,"Afghanistan, Islamic Rep. of",130,Afghanistan,Asia,39.27643,39.27643,39.27643,39.27643,39.27643,39.27643,...,,,,,,,,,,
1,Albania,229,Albania,Central_and_East_Europe_Russia,,,,,,,...,113.25,115.29,115.47,118.01,119.26,115.15,108.52,107.9,107.98,106.91
2,Algeria,503,Algeria,Africa,5.072,5.2615,5.2951,5.3867,5.5401,5.7655,...,145.8061,146.3142,142.4505,140.5599,140.2631,139.2511,137.5775,136.2533,136.4209,135.9905
3,"Andorra, Principality of",212,Andorra,Western_Europe,,,,,,,...,0.946448,0.982424,0.987319,1.009716,1.017743,0.98027,0.944393,0.928591,0.933262,0.934071
4,Angola,535,Angola,Africa,2.9918e-08,2.9918e-08,2.9918e-08,2.9918e-08,2.9918e-08,2.9918e-08,...,430.624952,430.474643,429.656502,430.515843,451.695645,500.900607,504.4643,503.801764,504.114361,504.466905


In [110]:
# Write into dataset/ because the upload cell reads "dataset/exchangerate.parquet";
# the original wrote to the working directory, so that upload could not find it.
# The result is not assigned back: to_parquet returns None when given a path,
# which would have replaced the frame in new_data2 with None.
new_data2.to_parquet("dataset/exchangerate.parquet")

In [111]:
# Push the exchange-rate Parquet file to the project bucket.
s3.upload_file(
    Filename="dataset/exchangerate.parquet",
    Bucket="trade-final-project-bucket",
    Key="dataset/exchangerate.parquet",
)

In [None]:
# Pull the full hs9_eng lookup table from the codes database.
hs9codes = pd.read_sql_query(
    sql="select * from hs9_eng",
    con=conn,
)

In [120]:
# Export the hs9 lookup table to CSV (the DataFrame index is written as well).
hs9codes.to_csv("dataset/hs9codes.csv")

In [None]:
# Push the hs9 lookup CSV to the project bucket.
s3.upload_file(
    Filename="dataset/hs9codes.csv",
    Bucket="trade-final-project-bucket",
    Key="dataset/hs9codes.csv",
)

In [124]:
# Pull the full hs6_eng lookup table from the codes database.
hs6codes = pd.read_sql_query(
    sql="select * from hs6_eng",
    con=conn,
)

In [125]:
# Export the hs6 lookup table to CSV (the DataFrame index is written as well).
hs6codes.to_csv("dataset/hs6codes.csv")

In [None]:
# Push the hs6 lookup CSV to the project bucket.
s3.upload_file(
    Filename="dataset/hs6codes.csv",
    Bucket="trade-final-project-bucket",
    Key="dataset/hs6codes.csv",
)

In [126]:
# Pull the full hs2_eng lookup table from the codes database.
hs2codes = pd.read_sql_query(
    sql="select * from hs2_eng",
    con=conn,
)

In [127]:
# Export the hs2 lookup table to CSV (the DataFrame index is written as well).
# NOTE(review): unlike hs9codes.csv and hs6codes.csv, no upload cell for this
# file is visible in the notebook — confirm whether one is missing.
hs2codes.to_csv("dataset/hs2codes.csv")

## Upload the CSV files downloaded from e-Stat.

https://www.e-stat.go.jp/en/stat-search/files?page=1&toukei=00350300&tstat=000001013141

In [None]:
# Push the 2021-2023 export CSV to the project bucket.
s3.upload_file(
    Filename="dataset/trade_ex_2021_2023.csv",
    Bucket="trade-final-project-bucket",
    Key="dataset/trade_ex_2021_2023.csv",
)

In [None]:
# Push the 2021-2023 import CSV to the project bucket. The object key now
# mirrors the local file name, like the export upload; the original key had a
# stray leading "e" ("dataset/etrade_im_2021_2023.csv").
# NOTE(review): if the object was already uploaded under the misspelled key,
# downstream readers or the old object may need to be updated/removed.
s3.upload_file("dataset/trade_im_2021_2023.csv", "trade-final-project-bucket", "dataset/trade_im_2021_2023.csv")