In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt 
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.gaussian_process import GaussianProcessRegressor

In [2]:
# Load the training table from the parent directory; the path is relative to
# the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../train.csv")
# Bare last expression so the notebook renders the (truncated) frame.
df

Unnamed: 0,date,Price,Country,Price_7d,Price_14d,Price_21d,Price_28d,Price_7d_der,Price_14d_der,Price_21d_der,Price_28d_der,Month,Day,target
0,2017-12-27,0.0,Australia,0.0,6.650000e+00,6.65,19.25,0.000000,0.000,-0.119048,-1.442857,12,27,0.00
1,2017-12-28,0.0,Australia,0.0,6.650000e+00,6.65,19.25,0.000000,0.000,0.000000,0.000000,12,28,0.00
2,2017-12-29,0.0,Australia,0.0,6.650000e+00,6.65,19.25,0.000000,0.000,0.000000,0.000000,12,29,0.00
3,2017-12-30,0.0,Australia,0.0,-1.776357e-15,6.65,19.25,0.000000,-0.475,0.000000,0.000000,12,30,0.00
4,2017-12-31,0.0,Australia,0.0,-1.776357e-15,6.65,19.25,0.000000,0.000,0.000000,0.000000,12,31,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15789,2019-06-21,0.0,Unspecified,42.1,4.210000e+01,42.10,42.10,0.000000,0.000,0.000000,0.000000,6,21,219.88
15790,2019-06-22,0.0,Unspecified,42.1,4.210000e+01,42.10,42.10,0.000000,0.000,0.000000,0.000000,6,22,219.88
15791,2019-06-23,0.0,Unspecified,42.1,4.210000e+01,42.10,42.10,0.000000,0.000,0.000000,0.000000,6,23,219.88
15792,2019-06-24,0.0,Unspecified,42.1,4.210000e+01,42.10,42.10,0.000000,0.000,0.000000,0.000000,6,24,219.88


In [6]:
# Top 10 countries by summed "Price", largest first.
train_countries = (
    df.groupby("Country")["Price"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .index
    .tolist()
)
train_countries

['United Kingdom',
 'EIRE',
 'Germany',
 'Norway',
 'France',
 'Spain',
 'Hong Kong',
 'Portugal',
 'Singapore',
 'Netherlands']

In [7]:
# Fit one random-forest baseline per country in `train_countries` and print
# its hold-out RMSE (rounded to the nearest integer).

# One seed for both the split and the forest: without seeding the forest the
# printed scores change on every run even though the split is fixed.
SEED = 2021

for country in train_countries:
    country_df = df.loc[df["Country"] == country, :]
    # Features are selected by position: every column from the 4th up to, but
    # excluding, the last one. Presumably these are the lagged-price,
    # derivative and calendar columns -- TODO confirm against the CSV layout.
    X = country_df.iloc[:, 3:-1]
    y = country_df.loc[:, "target"]
    # NOTE(review): the rows are dated and consecutive days can carry nearly
    # identical features and targets, so a shuffled split likely puts
    # near-duplicates on both sides and makes the RMSE optimistic. Consider a
    # chronological hold-out (sort by date, shuffle=False) -- confirm intent.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.25, shuffle=True, random_state=SEED
    )
    pipe = Pipeline(steps=[('scaler', StandardScaler()),
                           ('RF', RandomForestRegressor(random_state=SEED))])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_valid)
    eval_rmse = round(np.sqrt(mean_squared_error(y_valid, y_pred)))
    print(country + " RMSE: " + str(eval_rmse))

United Kingdom RMSE: 35312
EIRE RMSE: 2312
Germany RMSE: 366
Norway RMSE: 2932
France RMSE: 576
Spain RMSE: 243
Hong Kong RMSE: 853
Portugal RMSE: 511
Singapore RMSE: 290
Netherlands RMSE: 121
