# Container throughput forecast using a random forest model

In this project, we aim to build a prediction model for port container throughput in Thailand.

We first load all the data collected for the years 2001-2021. This includes the inbound and outbound container throughput, which are the values the model predicts. The features are as follows:

- Consumer price index
- Export value
- Import value
- GDP constant
- Inflation rate
- Interest rate
- Manufacture product index
- Population
- Unemployment rate
- USD to THB conversion rate

In [134]:
import numpy as np
import pandas as pd

# Load labels for model prediction

In [135]:
# Calendar skeleton: one row per (year, month) from January 2001 to December 2021,
# so that months absent from the throughput file still get a row after the merge.
months = ['January', 'February', 'March', 'April', 'May', 'June',
          'July', 'August', 'September', 'October', 'November', 'December']
years = np.arange(2001, 2022)

label_df = pd.DataFrame({
    'year': np.repeat(years, len(months)),  # 2001 x12, 2002 x12, ...
    'month': months * len(years),           # January..December, repeated per year
})

# Attach the recorded throughput to the skeleton; unmatched months stay NaN.
throughput_df = pd.read_csv("container_throughput_label.csv")
label_df = label_df.merge(throughput_df, how='left', on=['year', 'month'])

# One-hot encode the text column(s); in the rendered output this is the month name.
label_df = pd.get_dummies(label_df)

label_df

Unnamed: 0,year,inbound,outbound,month_April,month_August,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September
0,2001,,,0,0,0,0,1,0,0,0,0,0,0,0
1,2001,,,0,0,0,1,0,0,0,0,0,0,0,0
2,2001,,,0,0,0,0,0,0,0,1,0,0,0,0
3,2001,,,1,0,0,0,0,0,0,0,0,0,0,0
4,2001,,,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,2021,344497.0,356412.0,0,1,0,0,0,0,0,0,0,0,0,0
248,2021,336237.0,355128.0,0,0,0,0,0,0,0,0,0,0,0,1
249,2021,355733.0,357439.0,0,0,0,0,0,0,0,0,0,0,1,0
250,2021,349185.0,326402.0,0,0,0,0,0,0,0,0,0,1,0,0


In [136]:
# Calendar skeleton: one row per (year, month) from January 2001 to December 2021.
months = ['January', 'February', 'March', 'April', 'May', 'June',
          'July', 'August', 'September', 'October', 'November', 'December']
years = np.arange(2001, 2022)

label_df = pd.DataFrame({
    'year': np.repeat(years, len(months)),
    'month': months * len(years),
})

# Left-join the throughput labels so every calendar month keeps a row.
throughput_df = pd.read_csv("container_throughput_label.csv")
label_df = label_df.merge(throughput_df, how='left', on=['year', 'month'])

# Replace month names by their calendar number (January=1 ... December=12).
# The join above needs the names, so the conversion happens afterwards.
month_to_int = {name: number for number, name in enumerate(months, start=1)}
label_df['month'] = [month_to_int[name] for name in label_df['month']]

label_df.head()


Unnamed: 0,year,month,inbound,outbound
0,2001,1,,
1,2001,2,,
2,2001,3,,
3,2001,4,,
4,2001,5,,


# Load features for model prediction

In [75]:
# Read the raw feature tables, one CSV file per economic indicator.
# The file list and the target names below are matched by position.
feature_files = [
    "export_value.csv",          # -> exportval_df
    "GDP_constant.csv",          # -> gdp_df
    "import_value.csv",          # -> importval_df
    "inflation_%.csv",           # -> inflate_df
    "interest_rate.csv",         # -> interest_df
    "manufac_prod_index.csv",    # -> manu_df
    "population.csv",            # -> pop_df
    "unemployment.csv",          # -> unemp_df
    "usd_thb.csv",               # -> ex_df
    "consumer_price_index.csv",  # -> cons_df
]
(exportval_df, gdp_df, importval_df, inflate_df, interest_df,
 manu_df, pop_df, unemp_df, ex_df, cons_df) = [pd.read_csv(path) for path in feature_files]

In [76]:
# Keep only the merge keys and the value column. Selecting the columns also
# discards 'id', and unlike an explicit drop('id') it does not raise KeyError
# when this cell is run again. .copy() gives an independent frame so the
# assignment below does not trigger pandas' SettingWithCopyWarning.
exportval_df = exportval_df[["year", "month", "export_value"]].copy()

# Remove thousands separators, then convert to a numeric dtype.
# astype(str) makes this tolerant of missing values (they come out as NaN)
# and of the column already being numeric on a re-run.
exportval_df['export_value'] = pd.to_numeric(
    exportval_df['export_value'].astype(str).str.replace(',', '', regex=False)
)


In [77]:
# Keep only the merge keys and the value column. Selecting the columns also
# discards 'id', and unlike an explicit drop('id') it does not raise KeyError
# when this cell is run again. .copy() gives an independent frame so the
# assignment below does not trigger pandas' SettingWithCopyWarning.
gdp_df = gdp_df[["year", "month", "GDP_constant"]].copy()

# Remove thousands separators, then convert to a numeric dtype.
# astype(str) makes this tolerant of missing values (they come out as NaN)
# and of the column already being numeric on a re-run.
gdp_df['GDP_constant'] = pd.to_numeric(
    gdp_df['GDP_constant'].astype(str).str.replace(',', '', regex=False)
)

In [78]:
# Keep only the merge keys and the value column. Selecting the columns also
# discards 'id', and unlike an explicit drop('id') it does not raise KeyError
# when this cell is run again. .copy() gives an independent frame so the
# assignment below does not trigger pandas' SettingWithCopyWarning.
importval_df = importval_df[["year", "month", "import_value"]].copy()

# Remove thousands separators, then convert to a numeric dtype.
# astype(str) makes this tolerant of missing values (they come out as NaN)
# and of the column already being numeric on a re-run.
importval_df['import_value'] = pd.to_numeric(
    importval_df['import_value'].astype(str).str.replace(',', '', regex=False)
)

In [79]:
# Keep only the merge keys and the value column. Selecting the columns also
# discards 'id', and unlike an explicit drop('id') it does not raise KeyError
# when this cell is run again. .copy() gives an independent frame so the
# assignment below does not trigger pandas' SettingWithCopyWarning.
inflate_df = inflate_df[["year", "month", "inflation_percentage_change"]].copy()

# Remove the percent sign, then convert to a numeric dtype.
# astype(str) makes this tolerant of missing values (they come out as NaN)
# and of the column already being numeric on a re-run.
inflate_df['inflation_percentage_change'] = pd.to_numeric(
    inflate_df['inflation_percentage_change'].astype(str).str.replace('%', '', regex=False)
)

In [80]:
# Keep only the merge keys and the value column. Selecting the columns also
# discards 'id', and unlike an explicit drop('id') it does not raise KeyError
# when this cell is run again. .copy() gives an independent frame so the
# assignment below does not trigger pandas' SettingWithCopyWarning.
interest_df = interest_df[["year", "month", "interest_rate"]].copy()

# Make sure the value column has a numeric dtype.
interest_df['interest_rate'] = pd.to_numeric(interest_df['interest_rate'])

In [81]:
# Keep only the merge keys and the value column. Selecting the columns also
# discards 'id', and unlike an explicit drop('id') it does not raise KeyError
# when this cell is run again. .copy() gives an independent frame so the
# assignment below does not trigger pandas' SettingWithCopyWarning.
manu_df = manu_df[["year", "month", "manufac_prod_index"]].copy()

# Make sure the value column has a numeric dtype.
manu_df['manufac_prod_index'] = pd.to_numeric(manu_df['manufac_prod_index'])

In [82]:
# Keep only the merge keys and the value column. Selecting the columns also
# discards 'id', and unlike an explicit drop('id') it does not raise KeyError
# when this cell is run again. .copy() gives an independent frame so the
# assignment below does not trigger pandas' SettingWithCopyWarning.
pop_df = pop_df[["year", "month", "population"]].copy()

# Remove thousands separators, then convert to a numeric dtype.
# astype(str) makes this tolerant of missing values (they come out as NaN)
# and of the column already being numeric on a re-run.
pop_df['population'] = pd.to_numeric(
    pop_df['population'].astype(str).str.replace(',', '', regex=False)
)

In [83]:
# Keep only the merge keys and the value column. Selecting the columns also
# discards 'id', and unlike an explicit drop('id') it does not raise KeyError
# when this cell is run again. The ' n.a. ' placeholder (spaces included) marks
# a missing value in the file; replace() returns a new frame with those as NaN.
unemp_df = unemp_df[["year", "month", "unemployment_rate"]].replace(' n.a. ', np.nan)

# With the placeholder gone the column can be converted to a numeric dtype.
unemp_df['unemployment_rate'] = pd.to_numeric(unemp_df['unemployment_rate'])

In [84]:
# Keep only the merge keys and the value column. Selecting the columns also
# discards 'id', and unlike an explicit drop('id') it does not raise KeyError
# when this cell is run again. The ' n.a. ' placeholder (spaces included) marks
# a missing value in the file; replace() returns a new frame with those as NaN.
ex_df = ex_df[["year", "month", "exchange_rate"]].replace(' n.a. ', np.nan)

# With the placeholder gone the column can be converted to a numeric dtype.
ex_df['exchange_rate'] = pd.to_numeric(ex_df['exchange_rate'])

In [85]:
# Keep only the merge keys and the value column. Selecting the columns also
# discards 'id', and unlike an explicit drop('id') it does not raise KeyError
# when this cell is run again. The ' n.a. ' placeholder (spaces included) marks
# a missing value in the file; replace() returns a new frame with those as NaN.
cons_df = cons_df[["year", "month", "consumer_price_index"]].replace(' n.a. ', np.nan)

# With the placeholder gone the column can be converted to a numeric dtype.
cons_df['consumer_price_index'] = pd.to_numeric(cons_df['consumer_price_index'])

# Merge all DataFrames into one

In [86]:
# Calendar skeleton for the feature table: one row per (year, month),
# January 2001 through December 2021, in chronological order.
months = ['January', 'February', 'March', 'April', 'May', 'June',
          'July', 'August', 'September', 'October', 'November', 'December']
years = np.arange(2001, 2022)

f_df = pd.DataFrame({
    'year': np.repeat(years, len(months)),  # 2001 x12, 2002 x12, ...
    'month': months * len(years),           # January..December, repeated per year
})


In [87]:
# Left-join every feature table onto the calendar skeleton, one after another,
# so each (year, month) row ends up with one column per indicator.
for feature_table in (exportval_df, gdp_df, importval_df, inflate_df, interest_df,
                      manu_df, pop_df, unemp_df, ex_df, cons_df):
    f_df = f_df.merge(feature_table, how='left', on=['year', 'month'])

# Replace month names by their calendar number; this has to come after the
# joins because those match on the month name.
month_to_int = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12,
}
f_df['month'] = [month_to_int[name] for name in f_df['month']]

# Fill gaps left by the joins by interpolating down each column (row order is chronological).
f_df = f_df.interpolate(axis=0)

In [88]:
# Preview the first five rows of the merged feature table.
f_df.head(n=5)

Unnamed: 0,year,month,export_value,GDP_constant,import_value,inflation_percentage_change,interest_rate,manufac_prod_index,population,unemployment_rate,exchange_rate,consumer_price_index
0,2001,1,279973.0,459359.0,255061.0,0.72,2.5,52.47,62308887.0,5.73,43.12,68.8
1,2001,2,279973.0,459359.0,255061.0,0.44,2.0,53.02,62308887.0,4.25,42.64,69.1
2,2001,3,279973.0,459359.0,255061.0,0.0,2.0,52.18,62308887.0,4.04,43.9,69.1
3,2001,4,283056.0,442241.0,255379.0,0.72,2.0,51.29,62308887.0,4.06,45.46,69.6
4,2001,5,283056.0,442241.0,255379.0,0.29,2.0,52.51,62308887.0,4.24,45.48,69.8


## Generate Training and Testing set

In [158]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression

# Column names of the feature matrix, kept for labelling the tree plot later.
feature_list = list(f_df.columns)

# Use only the years after 2002 from both tables, so features and labels
# cover the same months in the same order.
recent_features = f_df[f_df['year'] > 2002]
recent_labels = label_df[label_df['year'] > 2002]

# Features keep year and month; labels are whatever remains once the two
# key columns are removed (the inbound and outbound throughput).
features = np.array(recent_features)
labels = np.array(recent_labels.drop(['year', 'month'], axis=1))

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): this is a shuffled split of monthly time-series data, so test
# months are interleaved with training months - confirm that is intended.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.25, random_state=42)


In [163]:
# Random forest regressor: trees limited to depth 5, each split considering
# sqrt(n_features) candidate features; random_state fixes the result.
# n_jobs=-1 builds the trees on all available CPU cores. With random_state
# fixed the fitted forest is the same as with a single job, it is only faster.
# NOTE(review): one million trees makes this cell very slow and memory-hungry;
# a forest this size is rarely needed - consider a much smaller n_estimators
# and check that the error below does not change materially.
rf = RandomForestRegressor(n_estimators = 1000000, max_features = 'sqrt', max_depth = 5,
                           random_state = 18, n_jobs = -1)
rf.fit(train_features, train_labels)


In [164]:

# Predict the held-out rows. `result` and `test_labels` have one column per
# target (inbound and outbound), so the means below pool both targets.
result = rf.predict(test_features)

# Absolute error of every prediction, in the same unit as the throughput labels.
errors = np.abs(result - test_labels)
# NOTE(review): the unit is assumed to be containers, going by the label file
# name - confirm against the data source (it may be TEU).
print('Mean Absolute Error:', round(np.mean(errors), 2), 'containers.')

# Absolute percentage error per prediction; assumes no test label is zero.
mape = 100 * (errors / test_labels)

# "Accuracy" here means 100 minus the mean absolute percentage error.
accuracy = 100 - np.mean(mape)
print('Accuracy: ', round(accuracy, 2), '%.')

Mean Absolute Error: 12129.91 degrees.
Accuracy:  95.0 %.


In [94]:
from sklearn.tree import export_graphviz
import pydot

In [147]:
# Pick one fitted tree of the forest (index 5) and write it out in Graphviz
# DOT format, with nodes labelled by the feature column names.
tree = rf.estimators_[5]
export_graphviz(
    tree,
    out_file='tree.dot',
    feature_names=feature_list,
    rounded=True,
    precision=1,
)

# Load the DOT file back and render it to PNG; the single-element unpacking
# requires the file to contain exactly one graph.
[graph] = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png')


![](tree.png)