In [1]:
from os import pardir
import pandas as pd
import numpy as np

from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import RandomForestRegressor
from sklearn.ensemble      import ExtraTreesRegressor
from sklearn.ensemble      import AdaBoostRegressor
from sklearn.ensemble      import GradientBoostingRegressor
from xgboost               import XGBRegressor
from lightgbm              import LGBMRegressor
from catboost              import CatBoostRegressor
from sklearn.linear_model  import LinearRegression
from sklearn import metrics 
from sklearn.preprocessing import StandardScaler
import time

In [2]:


data = pd.read_csv(r"C:\Users\rnr31\Documents\GitHub\Ravella_DS22_Strive_School_Excercises\02.Chapter\12. TimeSeries\climate.csv")
data = data.drop(["Date Time"], axis=1)

def pairing(data, seq_len=6):

    x = []
    y = []

    for i in range(0,(data.shape[0] - seq_len+1), seq_len+1 ): # range is reduced by the len of seq + 1 so that we do not go out of bounds
                                                            # we step for that same amount of steps as the seq_len
        seq = np.zeros( (seq_len, data.shape[1]) ) #creating a matrix of zeros with the shape of seq_len and the number of columns of the data
        
        for j in range(seq_len):  # filling the matrix with the data

            seq[j] = data.values[i+j]  # filling the matrix with the data we add i to make the jump of the seq_len

        x.append(seq.flatten())  # flattening the matrix and appending it to the x list
        y.append( data["T (degC)"][i+seq_len] )  # appending the target to the y list 

    return np.array(x), np.array(y)

#print(data.shape)

x, y = pairing(data)

In [3]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(test_size=2)
for train_index, test_index in tscv.split(x):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]


In [4]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [5]:
regressors = {
   "Decision Tree": DecisionTreeRegressor(),
  # "Extra Trees":   ExtraTreesRegressor(n_estimators=100),
   "Random Forest": RandomForestRegressor(n_estimators=100),
  # "AdaBoost":      AdaBoostRegressor(n_estimators=100),
  # "Skl GBM":       GradientBoostingRegressor(n_estimators=100),
  # "XGBoost":       XGBRegressor(n_estimators=100),
  # "LightGBM":      LGBMRegressor(n_estimators=100),
  # "CatBoost":      CatBoostRegressor(n_estimators=100),
  'Linear Regression' : LinearRegression()
}

In [6]:
results = pd.DataFrame({'Model': [], 'MSE': [], 'MAB': [], " % error": [], 'Time': []})
rang = abs(y_train.max()) - abs(y_train.min())
for model_name, model in regressors.items():
    
    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
        
    pred = model.predict(x_test)
    
    results = results.append({"Model":    model_name,
                              "MSE": metrics.mean_squared_error(y_test, pred),
                              "MAB": metrics.mean_absolute_error(y_test, pred),
                              " % error": metrics.mean_squared_error(y_test, pred) / rang,
                              "Time":     total_time},
                              ignore_index=True)
### END SOLUTION


results_ord = results.sort_values(by=['MSE'], ascending=True, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['MSE', 'MAE'], vmin=0, vmax=100, color='#5fba7d')

print(results_ord)




               Model       MSE       MAB   % error        Time
1  Linear Regression  0.035095  0.187259  0.002487    1.235386
2      Random Forest  0.046086  0.207800  0.003266  334.399595
3      Decision Tree  0.046250  0.175000  0.003278    5.586050
