In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from lofo import LOFOImportance, Dataset, plot_importance
from sklearn.linear_model import LinearRegression, Ridge
import plotly.graph_objects as go
import plotly.offline as py
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics.scorer import make_scorer

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values. Must not contain zeros: each error is divided by the
        matching observed value, so a zero yields ``inf``/``nan``.
    y_pred : array-like
        Predicted values, holding the same number of elements as ``y_true``.

    Returns
    -------
    numpy.float64
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # A column vector (n, 1) paired with a flat vector (n,) would otherwise be
    # broadcast into an (n, n) matrix and silently produce a wrong average.
    # When both hold the same number of elements, align them element-wise.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_pred = y_pred.reshape(y_true.shape)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Wrap MAPE as a scikit-learn scorer. MAPE is an error (lower is better), so
# greater_is_better=False makes the scorer report the negated value;
# train_linear_model takes np.abs of the cross-validation scores to undo that.
# NOTE(review): make_scorer is imported above from `sklearn.metrics.scorer`;
# newer scikit-learn releases only expose it as `sklearn.metrics.make_scorer`
# -- confirm the import against the installed version.
my_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

def generate_dataset(data, n_in=1, n_out=1, dropnan=True):
    """Frame a (multivariate) series as a supervised-learning table.

    Every column of ``data`` is repeated once per requested lag and once per
    requested forecast step, each copy shifted by the matching number of rows.

    Parameters
    ----------
    data : DataFrame, Series, list, list of rows, or ndarray
        Observations ordered in time, one row per time step. Anything that
        ``pd.DataFrame`` accepts works; 1-D input is treated as one variable.
    n_in : int, default 1
        Number of lagged steps to emit (columns ``varK(t-n_in)`` ... ``varK(t-1)``).
    n_out : int, default 1
        Number of forecast steps to emit (columns ``varK(t)``, ``varK(t+1)``, ...).
    dropnan : bool, default True
        Drop the rows that contain NaN; shifting leaves NaN in the first
        ``n_in`` rows and the last ``n_out - 1`` rows.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``var<K>(t-i)``, ``var<K>(t)`` and ``var<K>(t+i)``,
        where ``K`` is the 1-based position of the source column; the original
        column names are not carried over.
    """
    df = pd.DataFrame(data)
    # Count the variables on the frame itself. Inspecting `data` (list vs.
    # `.shape[1]`) mislabelled lists of rows and crashed on Series / 1-D arrays.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names += ['var%d(t-%d)' % (j + 1, lag) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, step) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

def _first_target_column(values):
    """Return the first (or only) column of `values` as a 1-D float array."""
    arr = np.asarray(values, dtype=float)
    return arr if arr.ndim == 1 else arr[:, 0]


def train_linear_model(X, y, test_size=0.3, cv=25):
    """Fit a linear regression on the leading part of (X, y) and report on the rest.

    The rows are split in their given order (no shuffling): the first
    ``1 - test_size`` share trains the model, the trailing share is the hold-out
    set. The function prints hold-out RMSE and MAPE, the per-fold and mean
    cross-validated MAPE, RMSE as a percentage of the target mean and RMSE
    divided by the target standard deviation, then shows a plotly figure with
    the train, actual and predicted series.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, rows ordered in time.
    y : pandas.DataFrame or pandas.Series
        Single target; a one-column DataFrame (any column name) or a Series.
    test_size : float, default 0.3
        Share of trailing rows held out for evaluation.
    cv : int or cross-validation splitter, default 25
        Passed straight to ``cross_val_score``.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        The model fitted on the training share.
    """
    # shuffle=False keeps the chronological order; random_state is not passed
    # because it has no effect without shuffling.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=False)

    regr = LinearRegression()
    regr = regr.fit(X_train, y_train)

    predictions = regr.predict(X_test)
    actual = y_test

    # calculate RMSE
    rmse = math.sqrt(mean_squared_error(actual, predictions))
    print('Test RMSE: %.3f' % rmse)

    pct = mean_absolute_percentage_error(actual, predictions)
    print('MAPE : %.3f' % pct + '%')

    # The scorer reports negated MAPE, hence np.abs. Cross-validation refits
    # clones of the model on folds of the FULL (X, y), hold-out rows included.
    # NOTE(review): with an integer `cv` the folds are contiguous and
    # unshuffled, so early folds are scored by models trained on later rows;
    # consider passing a TimeSeriesSplit if that matters.
    cv_scores = np.abs(cross_val_score(regr, X, y, scoring=my_scorer, cv=cv))
    print('CV Score :', cv_scores)
    print('CV Score Mean:', np.mean(cv_scores))

    # Reduce the target to plain scalars before formatting; '%.3f' % Series
    # depends on float(Series), which pandas has deprecated.
    target = _first_target_column(y)
    rmse_percentage = (rmse / target.mean()) * 100
    print('RMSE Percentage : %.3f' % rmse_percentage + '%')

    # ddof=1 matches pandas' default (sample) standard deviation. The value is
    # a plain ratio, so it is printed without a percent sign.
    rmse_std = rmse / target.std(ddof=1)
    print('RMSE/STD : %.3f' % rmse_std)

    # The hold-out series is drawn right after the training series on a shared
    # positional x-axis.
    split = len(y_train)
    test_axis = np.arange(split, split + len(actual))

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=np.arange(0, split),
                             y=_first_target_column(y_train),
                             mode='lines+markers', name='Train'))
    fig.add_trace(go.Scatter(x=test_axis,
                             y=_first_target_column(actual),
                             mode='lines+markers', name='Actual'))
    fig.add_trace(go.Scatter(x=test_axis,
                             y=_first_target_column(predictions),
                             mode='lines+markers', name='Prediction'))
    fig.update_layout(title="Mean Abs. Pct Error " + '%' + str(round(pct, 2)))
    fig.show()

    return regr

  from tqdm.autonotebook import tqdm


In [2]:
# Load the training CSV and promote its `date` column, parsed to datetimes,
# to the index so later cells can resample by time.
train = (
    pd.read_csv('DailyDelhiClimateTrain.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)
train.head()

Unnamed: 0_level_0,meantemp,humidity,wind_speed,meanpressure
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-01-01,10.0,84.5,0.0,1015.666667
2013-01-02,7.4,92.0,2.98,1017.8
2013-01-03,7.166667,87.0,4.633333,1018.666667
2013-01-04,8.666667,71.333333,1.233333,1017.166667
2013-01-05,6.0,86.833333,3.7,1016.5


In [12]:
# Load the test CSV the same way as the training one: parse `date` and use it
# as the index.
# NOTE(review): the first row shown below has meanpressure 59.0 while the
# other rows are around 1015 -- looks like a bad reading; confirm before use.
test = (
    pd.read_csv('DailyDelhiClimateTest.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)
test.head()

Unnamed: 0_level_0,meantemp,humidity,wind_speed,meanpressure
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-01,15.913043,85.869565,2.743478,59.0
2017-01-02,18.5,77.222222,2.894444,1018.277778
2017-01-03,17.111111,81.888889,4.016667,1018.333333
2017-01-04,18.7,70.05,4.545,1015.7
2017-01-05,18.388889,74.944444,3.3,1014.333333


# Generate a Supervised dataset

In [3]:
# Three lagged rows of every variable as inputs, plus the current row (t).
reframed = generate_dataset(train, n_in=3, n_out=1)

# Check Feature Importance

In [4]:
# Contiguous, unshuffled folds. `random_state` is intentionally not passed:
# it has no effect when shuffle=False, and current scikit-learn raises a
# ValueError if both are given.
cv = KFold(n_splits=4, shuffle=False)

# Every column except the target is a candidate feature.
# NOTE(review): that includes the same-step columns var2(t), var3(t), var4(t),
# which would not be known ahead of time in a true forecast -- confirm intent.
ds = Dataset(df=reframed, target="var1(t)", features=[col for col in reframed.columns if col != 'var1(t)'])

model = LinearRegression()

lofo_imp = LOFOImportance(ds, cv=cv, scoring="neg_mean_squared_error", model=model)

# get the mean and standard deviation of the importances in pandas format
importance_df = lofo_imp.get_importance()

# plot the means and standard deviations of the importances
plot_importance(importance_df, figsize=(12, 20))

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [10]:
# Names of the first ten rows of the importance table (presumably ranked
# most-important-first by lofo -- verify against its documentation).
features = importance_df["feature"].iloc[:10].values
features

array(['var2(t)', 'var2(t-1)', 'var3(t-3)', 'var2(t-3)', 'var3(t-1)',
       'var3(t-2)', 'var3(t)', 'var2(t-2)', 'var1(t-3)', 'var1(t-2)'],
      dtype=object)

# Train Model

In [11]:
# Target: var1 at time t, kept as a one-column DataFrame (2-D target).
y = reframed[['var1(t)']]

# Inputs: only the feature columns selected from the importance ranking.
X = reframed[features]

# Last expression of the cell, so the fitted estimator is echoed as output.
train_linear_model(X, y)

Test RMSE: 1.667
MAPE : 5.526%
CV Score : [11.17205199  4.5544546   4.28398549  3.48753229  3.82713638  4.56913034
 11.73482011  6.44183124  3.56148704  3.33453225  2.43200194  9.34036127
 12.8690418   8.57927603  4.4626313   5.10306109  2.75852749  4.60211584
  9.50168648  8.67119116  4.76353999  4.34313845  2.98583539  2.55476039
  5.78217858]
CV Score Mean: 5.828652356515857
RMSE Percentage : 6.531%
RMSE/STD : 0.228%


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Resampling

In [7]:
# Aggregate the daily rows into weekly means. `date` is already the index, so
# resample() needs no `on=` argument; it returns a new frame and leaves
# `train` untouched.
resampled = train.resample('W').mean()
display(resampled.head())

Unnamed: 0_level_0,meantemp,humidity,wind_speed,meanpressure
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-01-06,7.705556,84.077778,2.337778,1017.3
2013-01-13,12.343537,65.145238,8.82517,1016.632653
2013-01-20,13.642857,81.663265,4.242857,1017.139456
2013-01-27,12.329592,67.748299,4.770544,1020.337075
2013-02-03,15.736905,67.667347,2.337194,1017.020578
