In [3]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# Load the monthly global-temperature CSV, keep only the rows whose Source is
# 'GCAG', and keep only the two columns used by the rest of the notebook.
df = pd.read_csv('https://datahub.io/core/global-temp/r/monthly.csv')
df = (
    # A single .loc selection returns a fresh frame, so the date conversion
    # below never writes into a slice of the raw data (the original chained
    # filtering + column assignment raised SettingWithCopyWarning).
    df.loc[df.Source == 'GCAG', ['Date', 'Mean']]
      # `infer_datetime_format` is deprecated in pandas 2.x; plain
      # to_datetime parses the column without it.
      .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)


In [4]:
# Rename the date column to 'ds' and the value column to 'y'.
column_aliases = dict(Date='ds', Mean='y')
df = df.rename(columns=column_aliases)

In [16]:
# Sanity check: (number of rows, number of columns) of the prepared frame.
df.shape

(1644, 2)

In [5]:
# Time-based split: rows dated before the cutoff are used for training,
# rows dated on or after it are held out for testing.
cutoff = pd.to_datetime('1980-01-01')

train = df[df.ds < cutoff]
# `>=` rather than `>`: with strict inequalities on both sides, a row dated
# exactly on the cutoff would land in neither split.
test = df[df.ds >= cutoff]

In [6]:
# Baseline setup: the only predictor is the date column, the target is 'y'.
target = 'y'
features = ['ds']

# Feature matrix (DataFrame) and target vector (Series) for the training split.
y_train = train.loc[:, target]
X_train = train.loc[:, features]

In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Naive baseline: predict the training-set mean for every training row.
train_mean = y_train.mean()
y_pred_train = [train_mean for _ in range(len(y_train))]

# Compute each metric once, then report.
train_mse = mean_squared_error(y_train, y_pred_train)
train_mae = mean_absolute_error(y_train, y_pred_train)
train_r2 = r2_score(y_train, y_pred_train)

print('Mean Baseline:')
print('Train Root Mean Squared Error:', np.sqrt(train_mse))
print('Train Mean Absolute Error:', train_mae)
print('Train R^2 Score:', train_r2)

Mean Baseline:
Train Root Mean Squared Error: 0.18645599413236597
Train Mean Absolute Error: 0.14993029444444445
Train R^2 Score: 0.0


In [8]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Ridge regression on the mean-imputed, standardized feature,
# evaluated with k-fold cross-validation on the training split.
k = 3
pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    Ridge(alpha=1.0),
)

scores = cross_val_score(
    pipeline,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=k,
)
# The scorer returns negated MAE, so flip the sign for display.
print(f'MAE for {k} folds:', -scores)

MAE for 3 folds: [0.10536987 0.12809387 0.14546971]


In [9]:
# Mean MAE across folds (scores hold negated MAE, hence the sign flip).
(-scores).mean()

0.12631114671258378

In [10]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the mean-imputed feature (no scaling step),
# evaluated with k-fold cross-validation on the training split.
k = 3
pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    RandomForestRegressor(
        n_estimators=100,
        random_state=42,  # fixed seed so the forest is reproducible
        n_jobs=-1,
    ),
)

scores = cross_val_score(
    pipeline,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=k,
)
# The scorer returns negated MAE, so flip the sign for display.
print(f'MAE for {k} folds:', -scores)

MAE for 3 folds: [0.11980662 0.16267479 0.17137098]


In [11]:
# Mean MAE across folds (scores hold negated MAE, hence the sign flip).
(-scores).mean()

0.15128412937499988

In [12]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the mean-imputed, standardized feature,
# evaluated with k-fold cross-validation on the training split.
k = 3
pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    LinearRegression(),
)

scores = cross_val_score(
    pipeline,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=k,
)
# The scorer returns negated MAE, so flip the sign for display.
print(f'MAE for {k} folds:', -scores)

MAE for 3 folds: [0.10539642 0.12807661 0.1455487 ]


In [13]:
# Mean MAE across folds (scores hold negated MAE, hence the sign flip).
(-scores).mean()

0.12634057674962032

In [14]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Hyperparameter search for the Ridge pipeline: imputation strategy x alpha.
pipeline = make_pipeline(
    SimpleImputer(),
    StandardScaler(),
    Ridge()
)

# Step names are the lowercased class names assigned by make_pipeline.
# The variable keeps its original name so any later cell that refers to
# `param_distributions` still works.
param_distributions = {
    'simpleimputer__strategy': ['mean', 'median'],
    'ridge__alpha': [0.1, 1, 10],
}

# The grid has only 2 x 3 = 6 combinations, so search it exhaustively.
# The previous RandomizedSearchCV(n_iter=100) asked for more samples than the
# grid contains, so it fell back to these same 6 candidates anyway, and it had
# no random_state. GridSearchCV exposes the same best_params_ / best_score_
# attributes.
search = GridSearchCV(
    pipeline,
    param_grid=param_distributions,
    cv=5,
    scoring='neg_mean_absolute_error',
    verbose=1,  # summary line only; verbose=10 logged every batch of fits
    return_train_score=True,
    n_jobs=-1
)

search.fit(X_train, y_train);

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1765s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.8s finished


In [15]:
print('Best hyperparameters', search.best_params_)
print('Cross-validation MAE', -search.best_score_)

Best hyperparameters {'simpleimputer__strategy': 'mean', 'ridge__alpha': 10}
Cross-validation MAE 0.14206799747082244
