In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from lofo import LOFOImportance, Dataset, plot_importance
from sklearn.linear_model import LinearRegression, Ridge
import plotly.graph_objects as go
import plotly.offline as py
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics.scorer import make_scorer
import pandas_profiling

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) between two arrays.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100`` over every element.

    Parameters
    ----------
    y_true : array-like
        Observed values. Must not contain zeros: a zero makes the
        corresponding ratio ``inf``/``nan`` and the result is no longer finite.
    y_pred : array-like
        Predicted values, holding the same number of elements as ``y_true``.

    Returns
    -------
    float
        The error expressed in percent (``50.0`` means 50 %).
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    # A column vector (n, 1) minus a flat vector (n,) would silently broadcast
    # to an (n, n) matrix and average n*n unrelated pairs. When both inputs
    # hold the same number of elements but are laid out differently, flatten
    # them so the comparison stays element-by-element.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_true, y_pred = y_true.ravel(), y_pred.ravel()
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Scorer wrapping the MAPE function for scikit-learn model evaluation.
# greater_is_better=False marks MAPE as a loss, so the scorer sign-flips it:
# scores produced through my_scorer are the negated MAPE values.
my_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

def generate_dataset(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table of shifted columns.

    Each output row holds the observations of the ``n_in`` previous time
    steps, followed by the observations of the current step and of the
    ``n_out - 1`` steps after it.

    Parameters
    ----------
    data : list, array-like, pandas.Series or pandas.DataFrame
        Observations ordered in time, one row per time step. Anything
        ``pandas.DataFrame`` accepts works. Original column names are not
        kept: variables are numbered by column position (``var1``, ``var2``...).
    n_in : int, default 1
        Number of lagged steps (t-n_in, ..., t-1) added as columns.
    n_out : int, default 1
        Number of steps (t, ..., t+n_out-1) added as columns.
    dropnan : bool, default True
        Drop every row containing a NaN; shifting introduces NaNs in the
        first ``n_in`` and last ``n_out - 1`` rows.

    Returns
    -------
    pandas.DataFrame
        Columns named ``var<j>(t-<i>)``, ``var<j>(t)`` and ``var<j>(t+<i>)``.
    """
    frame = pd.DataFrame(data)
    # Count the variables on the constructed frame, not on the raw input:
    # a Series / 1-D array has no shape[1], and a list of lists has more than
    # one column, so inspecting `data` directly mislabels or crashes.
    n_vars = frame.shape[1]

    shifted, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        shifted.append(frame.shift(lag))
        names.extend('var%d(t-%d)' % (j + 1, lag) for j in range(n_vars))
    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        shifted.append(frame.shift(-step))
        if step == 0:
            names.extend('var%d(t)' % (j + 1) for j in range(n_vars))
        else:
            names.extend('var%d(t+%d)' % (j + 1, step) for j in range(n_vars))

    # put it all together
    agg = pd.concat(shifted, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

def train_linear_model(X, y, test_size=0.3, cv=25):
    """Fit a linear regression on an ordered hold-out split and report on it.

    The rows are split without shuffling, so the leading rows are used for
    training and the trailing rows for testing. The function prints the test
    RMSE and MAPE, the cross-validated MAPE scores and two normalised RMSE
    figures, then shows a Plotly chart with the training targets, the test
    targets and the test predictions.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per sample.
    y : pandas.DataFrame
        Targets aligned with ``X``. Its first column is the one that is
        plotted and used to normalise the RMSE.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``.
    cv : int or cross-validation splitter, default 25
        Forwarded to ``cross_val_score``.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        The model fitted on the training part of the split.
    """
    # shuffle=False preserves row order: earlier rows train, later rows test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=False)

    regr = LinearRegression().fit(X_train, y_train)
    predictions = regr.predict(X_test)
    actual = y_test

    # calculate RMSE
    rmse = math.sqrt(mean_squared_error(actual, predictions))
    print('Test RMSE: %.3f' % rmse)

    pct = mean_absolute_percentage_error(actual, predictions)
    print('MAPE : %.3f' % pct + '%')

    # my_scorer is declared with greater_is_better=False, so it returns the
    # negated MAPE; np.abs turns the fold scores back into positive errors.
    cv_scores = np.abs(cross_val_score(regr, X, y, scoring=my_scorer, cv=cv))
    print('CV Score :', cv_scores)
    print('CV Score Mean:', np.mean(cv_scores))

    # Normalise against the target column as a Series so mean()/std() are
    # plain scalars (a DataFrame would give one-element Series that only
    # format through implicit float conversion).
    target = y.iloc[:, 0]

    rmse_percentage = (rmse / target.mean()) * 100
    print('RMSE Percentage : %.3f' % rmse_percentage + '%')

    # RMSE divided by the standard deviation is a unit-less ratio, not a
    # percentage, so no '%' suffix is printed.
    rmse_std = rmse / target.std()
    print('RMSE/STD : %.3f' % rmse_std)

    # Test points are drawn right after the training points on the x axis.
    train_positions = np.arange(len(y_train))
    test_positions = np.arange(len(y_train), len(y_train) + len(actual))

    # Actuals and predictions are both taken from column 0 so that the two
    # curves always describe the same target.
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=train_positions,
                             y=y_train.iloc[:, 0],
                             mode='lines+markers', name='Train'))
    fig.add_trace(go.Scatter(x=test_positions,
                             y=actual.iloc[:, 0],
                             mode='lines+markers', name='Actual'))
    fig.add_trace(go.Scatter(x=test_positions,
                             y=predictions[:, 0],
                             mode='lines+markers', name='Prediction'))
    fig.update_layout(title="Mean Abs. Pct Error " + '%' + str(round(pct, 2)))
    fig.show()

    return regr

  from tqdm.autonotebook import tqdm


ImportError: No module named 'pandas_profiling'

In [None]:
# Load the training file and index it by the parsed 'date' column, which is
# then removed from the regular columns.
train = pd.read_csv('DailyDelhiClimateTrain.csv')
train = train.set_index(pd.to_datetime(train['date'])).drop(columns=['date'])
train.head()

In [None]:
# Load the test file and index it by the parsed 'date' column, which is
# then removed from the regular columns.
test = pd.read_csv('DailyDelhiClimateTest.csv')
test = test.set_index(pd.to_datetime(test['date'])).drop(columns=['date'])
test.head()

# Generate a Supervised Dataset

In [None]:
# Build the lagged table: three previous time steps as inputs and the
# current time step as output.
reframed = generate_dataset(train, n_in=3, n_out=1)

# Check Feature Importance

In [None]:
# Contiguous, unshuffled folds. random_state is deliberately not passed: it
# only has an effect when shuffle=True, and current scikit-learn releases
# raise a ValueError when it is set while shuffle=False.
cv = KFold(n_splits=4, shuffle=False)

# every column except the target is a candidate feature
ds = Dataset(df=reframed, target="var1(t)", features=[col for col in reframed.columns if col != 'var1(t)'])

model = LinearRegression()

lofo_imp = LOFOImportance(ds, cv=cv, scoring="neg_mean_squared_error", model=model)

# get the mean and standard deviation of the importances in pandas format
importance_df = lofo_imp.get_importance()

# plot the means and standard deviations of the importances
plot_importance(importance_df, figsize=(12, 20))

In [None]:
# Keep the first ten feature names of the importance table as model inputs.
features = importance_df['feature'].iloc[:10].values
features

# Train Model

In [None]:
# Target: the current-step value of the first variable, as a one-column frame.
y = reframed['var1(t)'].to_frame()

# Inputs: only the columns selected in `features`.
X = reframed.loc[:, features]

train_linear_model(X, y)

# Resampling

In [None]:
# Weekly means. The frame is already indexed by date, so resample() needs no
# `on=` argument; it returns a new frame and leaves `train` untouched.
resampled = train.resample('W').mean()
display(resampled.head())