# Import

In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import os
import time
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

import scipy.stats as stats

import seaborn as sns

from datetime import datetime

import re
import random

from tqdm.notebook import tqdm

import dask.dataframe as dd
from dask.distributed import LocalCluster, Client

sys.path.insert(0, 'tools/')

from tools import * 

In [2]:
import warnings
warnings.simplefilter('ignore')

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn import neighbors
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import validation_curve

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

# Read files

In [4]:
%%time

X_train = pd.read_csv('dades/processed/training_data.csv', index_col='index')
# y_train = pd.read_csv('dades/processed/training_label.csv', index_col='index')
X_train = X_train.reset_index(drop=True)
y_train = X_train.ctx0.copy()
#X_train.drop(columns=['ctx0'], axis=1, inplace=True)

X_val1 = pd.read_csv('dades/processed/validation_data.csv', index_col='index')
# y_val1 = pd.read_csv('dades/processed/validation_label.csv', index_col='index')
X_val1 = X_val1.reset_index(drop=True)
y_val1 = X_val1.ctx0.copy()
#X_val1.drop(columns=['ctx0'], axis=1, inplace=True)

X_val2 = pd.read_csv('dades/processed/testing_data.csv', index_col='index')
# y_val2 = pd.read_csv('dades/processed/testing_label.csv', index_col='index')
X_val2 = X_val2.reset_index(drop=True)
y_val2 = X_val2.ctx0.copy()
#X_val2.drop(columns=['ctx0'], axis=1, inplace=True)


CPU times: user 34.2 s, sys: 3.1 s, total: 37.3 s
Wall time: 37.3 s


In [5]:
cat_y_train = ((y_train.copy()*100)/1).round()/100
cat_y_val1 = ((y_val1.copy()*100)/1).round()/100
cat_y_val2 = ((y_val2.copy()*100)/1).round()/100

In [6]:
month = X_train.month.unique().tolist()

# Run pipeline

In [7]:
X_train.columns

Index(['station_id', 'year', 'month', 'dayofweek', 'day', 'dayofyear', 'hour',
       'capacity', 'ctx0', 'ctx1', 'ctx2', 'ctx3', 'ctx4', 'festius',
       'festius_sun', 'festius_sun_sat', 'weekend', 'VALOR_TM_D5',
       'VALOR_TX_D5', 'VALOR_TN_D5', 'VALOR_HRM_D5', 'VALOR_PPT_D5',
       'VALOR_PM_D5', 'VALOR_RS24h_D5', 'VALOR_VVM10_D5', 'VALOR_DVM10_D5',
       'VALOR_VVX10_D5', 'VALOR_DVVX10_D5', 'VALOR_HRX_D5', 'VALOR_HRN_D5',
       'VALOR_PX_D5', 'VALOR_PN_D5', 'VALOR_TM_X2', 'VALOR_TX_X2',
       'VALOR_TN_X2', 'VALOR_HRM_X2', 'VALOR_HRX_X2', 'VALOR_HRN_X2',
       'VALOR_TM_X4', 'VALOR_TX_X4', 'VALOR_TN_X4', 'VALOR_HRM_X4',
       'VALOR_PPT_X4', 'VALOR_PM_X4', 'VALOR_RS24h_X4', 'VALOR_VVM10_X4',
       'VALOR_DVM10_X4', 'VALOR_VVX10_X4', 'VALOR_DVVX10_X4', 'VALOR_HRX_X4',
       'VALOR_HRN_X4', 'VALOR_PX_X4', 'VALOR_PN_X4', 'VALOR_TM_X8',
       'VALOR_TX_X8', 'VALOR_TN_X8', 'VALOR_HRM_X8', 'VALOR_PPT_X8',
       'VALOR_PM_X8', 'VALOR_RS24h_X8', 'VALOR_VVM10_X8', 'VALOR_DVM1

In [8]:
columns_meteo = ['VALOR_TN_X4','VALOR_TM_X4', 'VALOR_TX_X4', 'VALOR_PPT_X4']

class Config:
    num_attribs0 = ['year', 'capacity', 'ctx1', 'ctx2', 'ctx3', 'ctx4'] + columns_meteo
    cat_attribs0 = ['month', 'dayofyear', 'day', 'hour']
    cat_attribs1 = ['station_id'] # dayofweek
    gen_attribs0 = ['festius_sun', 'weekend']
    target_col = ['ctx0']

    seed=42

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    

In [9]:
config=Config()
seed_everything(config.seed)

In [10]:
def build_preprocessor(config):
    num_attribs0 = config.num_attribs0
    cat_attribs0 = config.cat_attribs0
    cat_attribs1 = config.cat_attribs1
    gen_attribs0 = config.gen_attribs0

    num_transformer0 = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ('std_scaler', (StandardScaler())),
    ])

    categorical_transformer0 = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant",fill_value=0)),
        ('ordinal_encoder', (OrdinalEncoder())),
    ])
    
    categorical_transformer1 = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant",fill_value=0)),
        ('one_hot_encoder', (OneHotEncoder(handle_unknown='ignore'))),
    ])
    
    generic_transformer0 = Pipeline([
        ("imputer", SimpleImputer(strategy="constant",fill_value=0)),
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ("num0", num_transformer0, num_attribs0),
            ("gen1", generic_transformer0, gen_attribs0),
            ("cat0", categorical_transformer0, cat_attribs0),
            ("cat1", categorical_transformer1, cat_attribs1),
        ],
        remainder="drop"
    )
    
    return preprocessor

In [11]:
%%time 

full_pipeline = build_preprocessor(config)

CPU times: user 68 µs, sys: 11 µs, total: 79 µs
Wall time: 85.4 µs


In [12]:
full_pipeline.fit(X_train)

In [13]:
X_train_prepared = full_pipeline.transform(X_train[X_train.month.isin([2, 3, 4])].copy())
X_val_prepared1 = full_pipeline.transform(X_val1[X_val1.month.isin([2, 3, 4])].copy())
X_val_prepared2 = full_pipeline.transform(X_val2[X_val2.month.isin([2, 3, 4])].copy())

cat_y_train_prepared = cat_y_train[X_train.month.isin([2, 3, 4])].copy()
cat_y_val1_prepared = cat_y_val1[X_val1.month.isin([2, 3, 4])].copy()
cat_y_val2_prepared = cat_y_val2[X_val2.month.isin([2, 3, 4])].copy()

print("x_train_prepared:",X_train_prepared.shape,"y_train: ",y_train.shape)
print("x_test_prepared:",X_val_prepared1.shape,"y_test: ",y_val1.shape)
print("x_test_prepared:",X_val_prepared2.shape,"y_test: ",y_val2.shape)

x_train_prepared: (1693038, 424) y_train:  (6993034,)
x_test_prepared: (219387, 424) y_test:  (899816,)
x_test_prepared: (1000367, 424) y_test:  (1694015,)


# Predicción______________________________

a) Regresión lineal: relación lineal entre las variables de entrada y la variable de salida. 

b) Regresión Redes Neuronales (RNN -redes neuronales recurrentes-): pueden capturar relaciones no lineales entre las variables de entrada y salida.

## LinearRegression

In [14]:
res = {}

with tqdm(range (12), unit="months", mininterval=0, disable=False) as bar:
    for index in bar:
        bar.set_description(f"Month {month[index-2]}-{month[index-1]}-{month[index]}")
        
        print('///////////////////////////////////////////////////////////')
        
        print(f"Month {month[index-2]}-{month[index-1]}-{month[index]}")
        
        X_train_prepared = full_pipeline.transform(X_train[X_train.month.isin([month[index-2], month[index-1], month[index]])].copy())
        X_val_prepared1 = full_pipeline.transform(X_val1[X_val1.month.isin([month[index-2], month[index-1], month[index]])].copy())
        
        cat_y_train_prepared = cat_y_train[X_train.month.isin([month[index-2], month[index-1], month[index]])].copy()
        cat_y_val1_prepared = cat_y_val1[X_val1.month.isin([month[index-2], month[index-1], month[index]])].copy()
        
        print("x_train_prepared:",X_train_prepared.shape,"y_train: ",y_train.shape)
        print("x_test_prepared:",X_val_prepared1.shape,"y_test: ",y_val1.shape)
        
        # define model 
        lin_reg = LinearRegression(n_jobs=5)
        
        # cros validate trainig
#         cross_val_evaluation(lin_reg, X_train_prepared, cat_y_train_prepared, 'LinearRegression', n_jobs=15)
        
        lin_reg.fit(X_train_prepared, cat_y_train_prepared)
        
        res[index] = [
            test_model(lin_reg, X_train_prepared, cat_y_train_prepared, show=False)[['mse_t','rmse_t','mae_t','r2_t']],
            test_model(lin_reg, X_val_prepared1, cat_y_val1_prepared, X_val_prepared2, cat_y_val2_prepared, show=False)[['mse_t','rmse_t','mae_t','r2_t', 'mse_v','rmse_v','mae_v','r2_v']]
        ]
        
        print('///////////////////////////////////////////////////////////')
        
        print(res[index])
        
        print('///////////////////////////////////////////////////////////')

  0%|          | 0/12 [00:00<?, ?months/s]

///////////////////////////////////////////////////////////
Month 11-12-1
x_train_prepared: (1776741, 424) y_train:  (6993034,)
x_test_prepared: (225462, 424) y_test:  (899816,)
///////////////////////////////////////////////////////////
[mse_t     0.012477
rmse_t    0.111701
mae_t     0.072535
r2_t      0.846247
dtype: object, mse_t     0.009103
rmse_t     0.09541
mae_t     0.062124
r2_t      0.858594
mse_v     0.013345
rmse_v    0.115519
mae_v     0.076697
r2_v      0.816071
dtype: object]
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
Month 12-1-2
x_train_prepared: (1744743, 424) y_train:  (6993034,)
x_test_prepared: (220593, 424) y_test:  (899816,)
///////////////////////////////////////////////////////////
[mse_t     0.011437
rmse_t    0.106942
mae_t     0.067241
r2_t      0.860103
dtype: object, mse_t     0.008996
rmse_t    0.094847
mae_t     0.060245
r2_t      0.860396
mse_v     0.013391
rmse_v    0.115717


In [15]:
res

{0: [mse_t     0.012477
  rmse_t    0.111701
  mae_t     0.072535
  r2_t      0.846247
  dtype: object,
  mse_t     0.009103
  rmse_t     0.09541
  mae_t     0.062124
  r2_t      0.858594
  mse_v     0.013345
  rmse_v    0.115519
  mae_v     0.076697
  r2_v      0.816071
  dtype: object],
 1: [mse_t     0.011437
  rmse_t    0.106942
  mae_t     0.067241
  r2_t      0.860103
  dtype: object,
  mse_t     0.008996
  rmse_t    0.094847
  mae_t     0.060245
  r2_t      0.860396
  mse_v     0.013391
  rmse_v    0.115717
  mae_v     0.076264
  r2_v      0.815439
  dtype: object],
 2: [mse_t     0.010452
  rmse_t    0.102236
  mae_t     0.060319
  r2_t      0.879884
  dtype: object,
  mse_t     0.009559
  rmse_t    0.097769
  mae_t     0.061011
  r2_t      0.854228
  mse_v     0.013463
  rmse_v    0.116028
  mae_v     0.076118
  r2_v      0.814446
  dtype: object],
 3: [mse_t     0.010747
  rmse_t    0.103667
  mae_t     0.062073
  r2_t      0.884233
  dtype: object,
  mse_t     0.009735
  rms

In [16]:
# first thre are the best ones

## KNN3
### Not ideal to store and sort large data.
https://www.kaggle.com/general/352216

## SVM 
### SVM is not suitable for large datasets because of its high training time and it also takes more time in training compared to Naïve Bayes. 
https://www.datacamp.com/tutorial/svm-classification-scikit-learn-python#advantages

# Decision Tree

In [17]:
from sklearn.tree import DecisionTreeRegressor

In [18]:
X_train_prepared = full_pipeline.transform(X_train[X_train.month.isin([2, 3, 4])].copy())
X_val_prepared1 = full_pipeline.transform(X_val1[X_val1.month.isin([2, 3, 4])].copy())
X_val_prepared2 = full_pipeline.transform(X_val2[X_val2.month.isin([2, 3, 4])].copy())

cat_y_train_prepared = cat_y_train[X_train.month.isin([2, 3, 4])].copy()
cat_y_val1_prepared = cat_y_val1[X_val1.month.isin([2, 3, 4])].copy()
cat_y_val2_prepared = cat_y_val2[X_val2.month.isin([2, 3, 4])].copy()

print("x_train_prepared:",X_train_prepared.shape,"y_train: ",y_train.shape)
print("x_test_prepared:",X_val_prepared1.shape,"y_test: ",y_val1.shape)
print("x_test_prepared:",X_val_prepared2.shape,"y_test: ",y_val2.shape)

x_train_prepared: (1693038, 424) y_train:  (6993034,)
x_test_prepared: (219387, 424) y_test:  (899816,)
x_test_prepared: (1000367, 424) y_test:  (1694015,)


In [19]:
%%time 

for i in tqdm(range(10,17)):
    print('///////////////////////////////////////////////////////////')
        
    # Build decision tree
    tree = DecisionTreeRegressor(max_depth=i+1)

#     cross_val_evaluation(tree, X_train_prepared, y_train,'Decision Tree')
    tree.fit(X_train_prepared, cat_y_train_prepared)

    res[index] = [
        test_model(tree, X_train_prepared, cat_y_train_prepared, show=False)[['mse_t','rmse_t','mae_t','r2_t']],
        test_model(tree, X_val_prepared1, cat_y_val1_prepared, X_val_prepared2, cat_y_val2_prepared, show=False)[['mse_t','rmse_t','mae_t','r2_t', 'mse_v','rmse_v','mae_v','r2_v']]
    ]

    print('///////////////////////////////////////////////////////////')

    print(res[index][0])
    print(res[index][1])
    
    print('///////////////////////////////////////////////////////////')

# depth = 11 0.1099 

  0%|          | 0/7 [00:00<?, ?it/s]

///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
mse_t     0.009622
rmse_t    0.098092
mae_t     0.056461
r2_t      0.896349
dtype: object
mse_t     0.009191
rmse_t     0.09587
mae_t     0.060656
r2_t       0.86334
mse_v     0.012909
rmse_v    0.113619
mae_v     0.076038
r2_v      0.822072
dtype: object
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
mse_t     0.009411
rmse_t    0.097009
mae_t     0.055819
r2_t      0.898624
dtype: object
mse_t     0.009178
rmse_t    0.095803
mae_t     0.060515
r2_t      0.863531
mse_v     0.013021
rmse_v    0.114112
mae_v     0.076091
r2_v      0.820525
dtype: object
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
mse_t   

In [20]:
res = {}

with tqdm(range (12), unit="months", mininterval=0, disable=False) as bar:
    for index in bar:
        bar.set_description(f"Month {month[index-2]}-{month[index-1]}-{month[index]}")
        
        print('///////////////////////////////////////////////////////////')
        
        print(f"Month {month[index-2]}-{month[index-1]}-{month[index]}")
        
        X_train_prepared = full_pipeline.transform(X_train[X_train.month.isin([month[index-2], month[index-1], month[index]])].copy())
        X_val_prepared1 = full_pipeline.transform(X_val1[X_val1.month.isin([month[index-2], month[index-1], month[index]])].copy())
        
        cat_y_train_prepared = cat_y_train[X_train.month.isin([month[index-2], month[index-1], month[index]])].copy()
        cat_y_val1_prepared = cat_y_val1[X_val1.month.isin([month[index-2], month[index-1], month[index]])].copy()
        
        print("x_train_prepared:",X_train_prepared.shape,"y_train: ",y_train.shape)
        print("x_test_prepared:",X_val_prepared1.shape,"y_test: ",y_val1.shape)
        
        # define model 
        tree = DecisionTreeRegressor(criterion='friedman_mse', max_depth=11, random_state=config.seed)
        
        # cros validate trainig
#         cross_val_evaluation(tree, X_train_prepared, cat_y_train_prepared, 'DecisionTreeRegressor', n_jobs=15)
        
        tree.fit(X_train_prepared, cat_y_train_prepared)
        
        res[index] = [
            test_model(tree, X_train_prepared, cat_y_train_prepared, show=False)[['mse_t','rmse_t','mae_t','r2_t']],
            test_model(tree, X_val_prepared1, cat_y_val1_prepared, X_val_prepared2, cat_y_val2_prepared, show=False)[['mse_t','rmse_t','mae_t','r2_t', 'mse_v','rmse_v','mae_v','r2_v']]
        ]


    print('///////////////////////////////////////////////////////////')

    print(res[index][0])
    print(res[index][1])

    print('///////////////////////////////////////////////////////////')


  0%|          | 0/12 [00:00<?, ?months/s]

///////////////////////////////////////////////////////////
Month 11-12-1
x_train_prepared: (1776741, 424) y_train:  (6993034,)
x_test_prepared: (225462, 424) y_test:  (899816,)
///////////////////////////////////////////////////////////
Month 12-1-2
x_train_prepared: (1744743, 424) y_train:  (6993034,)
x_test_prepared: (220593, 424) y_test:  (899816,)
///////////////////////////////////////////////////////////
Month 1-2-3
x_train_prepared: (1740536, 424) y_train:  (6993034,)
x_test_prepared: (220244, 424) y_test:  (899816,)
///////////////////////////////////////////////////////////
Month 2-3-4
x_train_prepared: (1693038, 424) y_train:  (6993034,)
x_test_prepared: (219387, 424) y_test:  (899816,)
///////////////////////////////////////////////////////////
Month 3-4-5
x_train_prepared: (1743633, 424) y_train:  (6993034,)
x_test_prepared: (226867, 424) y_test:  (899816,)
///////////////////////////////////////////////////////////
Month 4-5-6
x_train_prepared: (1723800, 424) y_train:  (6

In [21]:
res

{0: [mse_t     0.011462
  rmse_t    0.107059
  mae_t     0.068714
  r2_t       0.85876
  dtype: object,
  mse_t     0.008569
  rmse_t    0.092568
  mae_t     0.058568
  r2_t      0.866893
  mse_v     0.012806
  rmse_v    0.113163
  mae_v     0.075142
  r2_v      0.823497
  dtype: object],
 1: [mse_t      0.01039
  rmse_t    0.101933
  mae_t     0.062323
  r2_t        0.8729
  dtype: object,
  mse_t     0.008529
  rmse_t    0.092355
  mae_t      0.05812
  r2_t      0.867636
  mse_v     0.012872
  rmse_v    0.113456
  mae_v     0.075574
  r2_v      0.822581
  dtype: object],
 2: [mse_t     0.009335
  rmse_t     0.09662
  mae_t     0.054526
  r2_t      0.892718
  dtype: object,
  mse_t     0.009055
  rmse_t    0.095157
  mae_t     0.059947
  r2_t      0.861911
  mse_v     0.013029
  rmse_v    0.114144
  mae_v     0.076489
  r2_v      0.820423
  dtype: object],
 3: [mse_t     0.009622
  rmse_t    0.098092
  mae_t     0.056461
  r2_t      0.896349
  dtype: object,
  mse_t     0.009192
  rms

In [22]:
# find best months and then iterate for 20 depth to find best depth
# next go to random forest to test using the same depth and months with different random seeds

## Random Forest

In [23]:
X_train_prepared = full_pipeline.transform(X_train[X_train.month.isin([2, 3, 4])].copy())
X_val_prepared1 = full_pipeline.transform(X_val1[X_val1.month.isin([2, 3, 4])].copy())
X_val_prepared2 = full_pipeline.transform(X_val2[X_val2.month.isin([2, 3, 4])].copy())

cat_y_train_prepared = cat_y_train[X_train.month.isin([2, 3, 4])].copy()
cat_y_val1_prepared = cat_y_val1[X_val1.month.isin([2, 3, 4])].copy()
cat_y_val2_prepared = cat_y_val2[X_val2.month.isin([2, 3, 4])].copy()

print("x_train_prepared:",X_train_prepared.shape,"y_train: ",y_train.shape)
print("x_test_prepared:",X_val_prepared1.shape,"y_test: ",y_val1.shape)
print("x_test_prepared:",X_val_prepared2.shape,"y_test: ",y_val2.shape)

x_train_prepared: (1693038, 424) y_train:  (6993034,)
x_test_prepared: (219387, 424) y_test:  (899816,)
x_test_prepared: (1000367, 424) y_test:  (1694015,)


In [24]:
%%time 

for i in tqdm(range(10,17)):
    print('///////////////////////////////////////////////////////////')
        

    # define model
    forest=RandomForestRegressor(n_estimators=10,
                                 max_depth=i+1,
                                 criterion='friedman_mse',
                                 n_jobs=15,
                                 random_state=config.seed
                                )

#     cross_val_evaluation(tree, X_train_prepared, y_train,'Decision Tree')
    forest.fit(X_train_prepared, cat_y_train_prepared)

    res[index] = [
        test_model(forest, X_train_prepared, cat_y_train_prepared, show=False)[['mse_t','rmse_t','mae_t','r2_t']],
        test_model(forest, X_val_prepared1, cat_y_val1_prepared, X_val_prepared2, cat_y_val2_prepared, show=False)[['mse_t','rmse_t','mae_t','r2_t', 'mse_v','rmse_v','mae_v','r2_v']]
    ]

    print('///////////////////////////////////////////////////////////')

    print(res[index][0])
    print(res[index][1])
    
    print('///////////////////////////////////////////////////////////')

# depth = 11 0.1099 


# ///////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////
# mse_t     0.009441
# rmse_t    0.097165
# mae_t     0.056063
# r2_t        0.8983
# dtype: object
# mse_t     0.009046
# rmse_t     0.09511
# mae_t     0.060023
# r2_t      0.865498
# mse_v     0.012595
# rmse_v    0.112229
# mae_v     0.075283
# r2_v      0.826398
# dtype: object
# ///////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////
# mse_t     0.009213
# rmse_t    0.095983
# mae_t     0.055513
# r2_t      0.900757
# dtype: object
# mse_t     0.009025
# rmse_t    0.095002
# mae_t     0.059863
# r2_t      0.865804
# mse_v     0.012559
# rmse_v    0.112069
# mae_v      0.07513
# r2_v      0.826893
# dtype: object
# ///////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////
# mse_t     0.008962
# rmse_t    0.094667
# mae_t     0.054915
# r2_t      0.903461
# dtype: object
# mse_t     0.009004
# rmse_t    0.094892
# mae_t     0.059685
# r2_t      0.866116
# mse_v     0.012546
# rmse_v    0.112008
# mae_v     0.075044
# r2_v      0.827082
# dtype: object
# ///////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////
# mse_t     0.008701
# rmse_t    0.093279
# mae_t     0.054299
# r2_t      0.906272
# dtype: object
# mse_t     0.008992
# rmse_t    0.094824
# mae_t     0.059538
# r2_t      0.866307
# mse_v     0.012546
# rmse_v    0.112011
# mae_v     0.074984
# r2_v      0.827073
# dtype: object
# ///////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////
# mse_t     0.008431
# rmse_t     0.09182
# mae_t     0.053627
# r2_t      0.909181
# dtype: object
# mse_t     0.008976
# rmse_t    0.094742
# mae_t     0.059384
# r2_t      0.866537
# mse_v     0.012547
# rmse_v    0.112013
# mae_v      0.07494
# r2_v      0.827066
# dtype: object
# ///////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////
# mse_t     0.008158
# rmse_t    0.090321
# mae_t     0.052921
# r2_t      0.912122
# dtype: object
# mse_t     0.008971
# rmse_t    0.094716
# mae_t     0.059273
# r2_t       0.86661
# mse_v     0.012559
# rmse_v    0.112066
# mae_v     0.074936
# r2_v      0.826902
# dtype: object
# ///////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////
# mse_t     0.007881
# rmse_t    0.088777
# mae_t     0.052191
# r2_t      0.915099
# dtype: object
# mse_t     0.008971
# rmse_t    0.094715
# mae_t     0.059162
# r2_t      0.866612
# mse_v     0.012576
# rmse_v    0.112143
# mae_v     0.074941
# r2_v      0.826664
# dtype: object
# ///////////////////////////////////////////////////////////

  0%|          | 0/7 [00:00<?, ?it/s]

///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
mse_t     0.009441
rmse_t    0.097165
mae_t     0.056063
r2_t        0.8983
dtype: object
mse_t     0.009046
rmse_t     0.09511
mae_t     0.060023
r2_t      0.865498
mse_v     0.012595
rmse_v    0.112229
mae_v     0.075283
r2_v      0.826398
dtype: object
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
mse_t     0.009213
rmse_t    0.095983
mae_t     0.055513
r2_t      0.900757
dtype: object
mse_t     0.009025
rmse_t    0.095002
mae_t     0.059863
r2_t      0.865804
mse_v     0.012559
rmse_v    0.112069
mae_v      0.07513
r2_v      0.826893
dtype: object
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
mse_t   

In [24]:
sample_data = pd.read_csv('dades/processed/kaggle_sample_data.csv')

In [25]:
sample_data.shape

(54999, 68)

In [26]:
X_test_prepared = full_pipeline.transform(sample_data)

print("x_test_prepared:",X_test_prepared.shape)

x_test_prepared: (54999, 424)


In [21]:
from sklearn.ensemble import RandomForestRegressor

In [27]:
res = {}

with tqdm(range (12), unit="months", mininterval=0, disable=False) as bar:
    for index in bar:
        bar.set_description(f"Month {month[index-2]}-{month[index-1]}-{month[index]}")
        
        print('///////////////////////////////////////////////////////////')
        
        print(f"Month {month[index-2]}-{month[index-1]}-{month[index]}")
        
        X_train_prepared = full_pipeline.transform(X_train[X_train.month.isin([month[index-2], month[index-1], month[index]])].copy())
        X_val_prepared1 = full_pipeline.transform(X_val1[X_val1.month.isin([month[index-2], month[index-1], month[index]])].copy())
        
        cat_y_train_prepared = cat_y_train[X_train.month.isin([month[index-2], month[index-1], month[index]])].copy()
        cat_y_val1_prepared = cat_y_val1[X_val1.month.isin([month[index-2], month[index-1], month[index]])].copy()
        
        print("x_train_prepared:",X_train_prepared.shape,"y_train: ",y_train.shape)
        print("x_test_prepared:",X_val_prepared1.shape,"y_test: ",y_val1.shape)
        
        # define model
        forest=RandomForestRegressor(n_estimators=10,
                                     max_depth=11,
                                     criterion='friedman_mse',
                                     n_jobs=15,
                                     random_state=config.seed
                                    )
        
        # cros validate trainig
#         cross_val_evaluation(forest, X_train_prepared, cat_y_train_prepared, 'RandomForestRegressor', n_jobs=15)
        
        forest.fit(X_train_prepared, cat_y_train_prepared)
        
        res[index] = [
            test_model(forest, X_train_prepared, cat_y_train_prepared, show=False)[['mse_t','rmse_t','mae_t','r2_t']],
            test_model(forest, X_val_prepared1, cat_y_val1_prepared, X_val_prepared2, cat_y_val2_prepared, show=False)[['mse_t','rmse_t','mae_t','r2_t', 'mse_v','rmse_v','mae_v','r2_v']]
        ]

        print('///////////////////////////////////////////////////////////')

        print(res[index][0])
        print(res[index][1])

        print('///////////////////////////////////////////////////////////')
        
        yhat = forest.predict(X_test_prepared)

        print(yhat.shape)

        sample_data['percentage_docks_available'] = yhat

        sample_data['percentage_docks_available'].to_csv(f'predicton_RandomForest{index}.csv', header=True, index_label='index')
        

  0%|          | 0/12 [00:00<?, ?months/s]

///////////////////////////////////////////////////////////
Month 11-12-1
x_train_prepared: (884790, 424) y_train:  (3541851,)
x_test_prepared: (221050, 424) y_test:  (882910,)
///////////////////////////////////////////////////////////
[mse_t     0.011133
rmse_t    0.105515
mae_t     0.070008
r2_t      0.834626
dtype: object, mse_t     0.008431
rmse_t    0.091819
mae_t     0.058058
r2_t      0.868086
mse_v     0.012533
rmse_v    0.111949
mae_v     0.074582
r2_v      0.827264
dtype: object]
///////////////////////////////////////////////////////////
(54999,)
///////////////////////////////////////////////////////////
Month 12-1-2
x_train_prepared: (868540, 424) y_train:  (3541851,)
x_test_prepared: (216275, 424) y_test:  (882910,)
///////////////////////////////////////////////////////////
[mse_t     0.011213
rmse_t    0.105892
mae_t     0.069874
r2_t      0.830229
dtype: object, mse_t     0.008429
rmse_t    0.091808
mae_t     0.057719
r2_t      0.868058
mse_v     0.012578
rmse_v     0


KeyboardInterrupt



In [None]:
res

# Gaussian Process

## Gradiend Boosting

## Generate Sample

In [None]:
sample_data = pd.read_csv('dades/processed/kaggle_sample_data.csv')

In [None]:
sample_data

# apply pipeline

# Prediction 

In [None]:
X_test_prepared = full_pipeline.transform(sample_data)

print("x_train_prepared:",X_train_prepared.shape,"y_train: ",y_train.shape)
print("x_test_prepared:",X_val_prepared1.shape,"y_test: ",y_val1.shape)
print("x_test_prepared:",X_val_prepared2.shape,"y_test: ",y_val2.shape)
print("x_test_prepared:",X_test_prepared.shape)

In [None]:
yhat = forest.predict(X_test_prepared)


In [None]:
yhat.shape

In [None]:
sample_data['percentage_docks_available'] = yhat

In [None]:
sample_data['percentage_docks_available'].to_csv('predicton_RandomForest.csv', header=True, index_label='index')