# Container forcast using raindom forest model

In this project, we aims the build a prediction model for container throughput in Thailand port. 

We first load all the data collected throughout year 2001-2021. This included inbound and outbound container throughput. As for the features, we use features as follows

- Consumer price index
- Export value
- Import value
- GDP constant
- Inflation rate
- Interest rate
- Manufacture product index
- Population
- Unemployment rate
- USD to THB conversion rate

In [2]:
import numpy as np
import pandas as pd

# Loading label for model prediction

In [3]:
month_to_int = {'January':1, 'February':2, 'March':3, 'April':4, 'May':5, 
'June':6, 'July':7, 'August':8, 'September':9, 'October':10, 'November':11, 'December':12}


labels_df = pd.read_csv("container_throughput_label.csv")
labels_df['month'] = labels_df['month'].map(lambda x: month_to_int[x])

labels_df.head()


Unnamed: 0,year,month,inbound,outbound
0,2001,1,91751,82123
1,2001,2,83475,83014
2,2001,3,95149,95933
3,2001,4,86302,91967
4,2001,5,96431,101828


# Load features for model prediction

In [4]:
features_df = pd.read_csv("features.csv")
features_df = features_df.drop('id', axis=1)


features_df['month'] = features_df['month'].map(lambda x: month_to_int[x])
features_df = features_df.interpolate(axis=0)

features_df.head()

Unnamed: 0,year,month,export_value,GDP_constant,import_value,inflation_percentage_change,interest_rate,manufac_prod_index,population,unemployment_rate,exchange_rate,consumer_price_index
0,2001,1,279973.0,459359.0,255061.0,0.72,2.5,52.47,62308887.0,5.73,43.12,68.8
1,2001,2,279973.0,459359.0,255061.0,0.44,2.0,53.02,62308887.0,4.25,42.64,69.1
2,2001,3,279973.0,459359.0,255061.0,0.0,2.0,52.18,62308887.0,4.04,43.9,69.1
3,2001,4,283056.0,442241.0,255379.0,0.72,2.0,51.29,62308887.0,4.06,45.46,69.6
4,2001,5,283056.0,442241.0,255379.0,0.29,2.0,52.51,62308887.0,4.24,45.48,69.8


## Generate Training and Testing set

In [102]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.tree import export_graphviz
import pydot

In [6]:
# Convert pandas dataframe to numpy array
feature_list = list(features_df.columns)
features = np.array(features_df)
labels = np.array(labels_df.drop(['year','month'], axis=1))
# Split data into training and testing set with 25% of test set
train_features, test_features, train_labels, test_labels = train_test_split(features, 
                                                            labels, test_size=0.25, random_state=42)


# Create Random forest model

In [98]:
# Create set of parameters to perform parameter optimization
# n_estimators = [int(x) for x in np.linspace(10000, 50000, num=3)]
n_estimators = [900]
max_features = ['log2', 'sqrt', None]
max_depth = [int(x) for x in np.linspace(1, 100, num = 5)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_state = [int(x) for x in np.linspace(0, 50, num=5)]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'random_state': random_state}

print(random_grid)

{'n_estimators': [900], 'max_features': ['log2', 'sqrt', None], 'max_depth': [1, 25, 50, 75, 100, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False], 'random_state': [0, 12, 25, 37, 50]}


# Train model

In [99]:
%timeit
# Random search of parameters, using 3 fold cross validation, 
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, 
                            cv = 3, scoring = 'neg_mean_absolute_error', verbose=1, 
                            random_state=42, n_jobs = -1)


# Fit the random search model
rf_random.fit(train_features, train_labels)
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'random_state': 50,
 'n_estimators': 900,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'log2',
 'max_depth': 75,
 'bootstrap': True}

In [100]:
best_model = rf_random.best_estimator_
best_model.fit(train_features, train_labels)
predictions = best_model.predict(test_features)

In [109]:
# Print evaluation metric
print(f'Mean Absolute Error (MAE): {mean_absolute_error(test_labels, predictions)}')
print(f'Mean Squared Error (MSE): {mean_squared_error(test_labels, predictions)}')
print(f'Root Mean Squared Error (RMSE): {np.sqrt(mean_absolute_error(test_labels, predictions))}')
print(f'Mean Absolute Percentage Error (MAPE): {mean_absolute_percentage_error(test_labels, predictions)}')

Mean Absolute Error (MAE): 14946.331758365259
Mean Squared Error (MSE): 383561103.8205525
Root Mean Squared Error (RMSE): 122.2551911305416
Mean Absolute Percentage Error (MAPE): 0.06523886339257695


# Show a tree in Random forest

In [110]:
tree = best_model.estimators_[0]
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)

(graph, ) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png')


![](tree.png)