In [None]:
import pandas as pd 
from sklearn.linear_model import RidgeCV, LinearRegression, Ridge, Lasso, LassoLarsCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

In [None]:
# Read the transformed Mannheim CSV (the path is relative to the working directory).
raw_data_path = '../data/output/mannheim_transformed.csv'
raw_data = pd.read_csv(raw_data_path)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts) as a sanity check.
raw_data.info()

In [None]:
# Columns that are left out of the modelling frame.
col_to_drop = [
    'bike_number',
    'start_position',
    'end_time',
    'end_position',
    'end_position_name',
]
# drop() returns a new frame, so raw_data itself stays untouched.
prediction_data = raw_data.drop(labels=col_to_drop, axis='columns')

In [None]:
# Parse the trip start timestamps so the .dt accessor can be used on them.
prediction_data['start_time'] = pd.to_datetime(prediction_data['start_time'])

In [None]:
# Derive calendar features from the trip start timestamp.
start_times = prediction_data['start_time']

# Hour of the day (0-23). The .dt accessor replaces strftime('%-H'): the '-' flag
# is a platform-specific extension and fails on platforms that lack it (e.g. Windows).
prediction_data['HOUR'] = start_times.dt.hour.astype('int')

# Week of the year in the %W convention: weeks start on Monday and the days
# before the first Monday of the year fall into week 0.
prediction_data['WEEK_OF_YEAR'] = start_times.dt.strftime('%W').astype('int')

# Day of the week in the %w convention: 0 = Sunday ... 6 = Saturday.
prediction_data['DAY_OF_WEEK'] = start_times.dt.strftime('%w').astype('int')

# Season label per trip, looked up from the calendar month in one vectorized pass
# instead of a Python-level loop over every row.
# NOTE(review): September is mapped to SUMMER — confirm this is intended.
month_to_season = {
    12: 'WINTER', 1: 'WINTER', 2: 'WINTER',
    3: 'SPRING', 4: 'SPRING', 5: 'SPRING',
    6: 'SUMMER', 7: 'SUMMER', 8: 'SUMMER', 9: 'SUMMER',
    10: 'FALL', 11: 'FALL',
}
prediction_data['season'] = start_times.dt.month.map(month_to_season)

In [None]:
# (Disabled) Sine/cosine transformation of column HOUR to retain its cyclical nature; one-hot dummies are used instead
#prediction_data['HOUR_SIN'] = np.sin(prediction_data.HOUR * (2. * np.pi / 24))
#prediction_data['HOUR_COS'] = np.cos(prediction_data.HOUR * (2. * np.pi / 24))

In [None]:
# (Disabled) Sine/cosine transformation of column WEEK_OF_YEAR to retain its cyclical nature; one-hot dummies are used instead
#prediction_data['WEEK_OF_YEAR_SIN'] = np.sin(prediction_data.WEEK_OF_YEAR * (2. * np.pi / 52))
#prediction_data['WEEK_OF_YEAR_COS'] = np.cos(prediction_data.WEEK_OF_YEAR * (2. * np.pi / 52))

In [None]:
# (Disabled) Sine/cosine transformation of column DAY_OF_WEEK to retain its cyclical nature; one-hot dummies are used instead
#prediction_data['DAY_OF_WEEK_SIN'] = np.sin(prediction_data.DAY_OF_WEEK * (2. * np.pi / 7))
#prediction_data['DAY_OF_WEEK_COS'] = np.cos(prediction_data.DAY_OF_WEEK * (2. * np.pi / 7))

In [None]:
# One-hot encode the calendar features. Hour, week and day dummies get a prefix so
# their column names are unique strings: without it all three produce overlapping
# integer labels (0, 1, 2, ...), which collide once the frames are concatenated and
# mix int and str feature names, something recent scikit-learn versions reject.
hour_dummies = pd.get_dummies(prediction_data['HOUR'], prefix='HOUR')
week_dummies = pd.get_dummies(prediction_data['WEEK_OF_YEAR'], prefix='WEEK_OF_YEAR')
day_dummies = pd.get_dummies(prediction_data['DAY_OF_WEEK'], prefix='DAY_OF_WEEK')
# Season labels are already distinct strings, so they are encoded without a prefix.
seasonal_dummies = pd.get_dummies(prediction_data['season'])

In [None]:
# Dropping individual time columns since their transformation will be used
# Remove the raw calendar columns, the timestamp and the start station name;
# the calendar information is carried by the dummy frames built from these columns.
encoded_source_columns = [
    'WEEK_OF_YEAR',
    'DAY_OF_WEEK',
    'HOUR',
    'start_time',
    'season',
    'start_position_name',
]
prediction_data = prediction_data.drop(columns=encoded_source_columns)

In [None]:
# Append every set of dummy columns exactly once. Concatenating seasonal_dummies
# a second time would duplicate the season columns in the feature matrix.
prediction_data = pd.concat(
    [prediction_data, seasonal_dummies, hour_dummies, week_dummies, day_dummies],
    axis=1,
)

In [None]:
from scipy import stats

In [None]:
# Absolute z-score of every trip duration, used as the outlier criterion.
duration_zscores = stats.zscore(prediction_data['duration'])
z = np.abs(duration_zscores)

In [None]:
# Keep only the rows whose duration z-score is below 3 in absolute value.
within_three_sigma = z < 3
prediction_data = prediction_data[within_three_sigma]

In [None]:
# Target: trip duration as an (n, 1) column vector.
duration = prediction_data['duration']
y = duration.values.reshape(-1, 1)

# Features: every remaining column.
X = prediction_data.drop(columns=['duration'])

In [None]:
# Split into train and test parts; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [None]:
# Standardize the training target; the fitted scaler is kept so predictions can
# later be mapped back to the original duration scale.
target_transformer = StandardScaler()
y_train = target_transformer.fit_transform(y_train)

In [None]:
# Lasso (LARS) with 5-fold cross-validation.
# The `normalize` argument was removed in scikit-learn 1.2, where passing it raises
# TypeError. It is therefore only passed (as False, the original setting) on
# versions whose estimator still exposes it.
lasso_params = {'cv': 5}
if 'normalize' in LassoLarsCV().get_params():
    lasso_params['normalize'] = False

# The estimator expects a 1-D target; flattening the (n, 1) column vector avoids
# a DataConversionWarning without changing the fitted model.
model = LassoLarsCV(**lasso_params).fit(X_train, np.ravel(y_train))

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# In-sample predictions, mapped back from standardized units to the original target scale.
scaled_train_pred = model.predict(X_train).reshape(-1, 1)
y_pred = target_transformer.inverse_transform(scaled_train_pred)

In [None]:
# Residual = actual - predicted, both on the original target scale.
train_actuals = target_transformer.inverse_transform(y_train.reshape(-1, 1))
reg_residuals = train_actuals - y_pred

In [None]:
# Two stacked panels sharing the x axis: fit on top, residuals below.
fix, axs = plt.subplots(2, 1, figsize=(17, 15), sharex=True)

# Top panel: training actuals against the linear model's in-sample predictions,
# both on the original target scale.
pd.DataFrame({'ACTUALS': target_transformer.inverse_transform(y_train.reshape(-1, 1)).reshape(-1),
              'LinearRegression': y_pred.reshape(-1)}).plot(ax=axs[0])
axs[0].set_title('LinearRegression Fit')
axs[0].set_ylabel('duration')

# Bottom panel: residuals (actual - predicted) of the linear model.
pd.DataFrame({'LinearRegression': reg_residuals.reshape(-1)}).plot(ax=axs[1])
axs[1].set_title('LinearRegression Residuals')
axs[1].set_ylabel('residual')
# The x axis is the row position within the training arrays.
axs[1].set_xlabel('training sample');

In [None]:
# Mean absolute error of the linear model on the training data, on the original target scale.
# NOTE(review): this is an in-sample error; X_test / y_test are not used in the cells visible here.
train_actual_durations = target_transformer.inverse_transform(y_train)
mean_absolute_error(train_actual_durations, y_pred)

In [None]:
# Random forest with otherwise default settings; the seed is fixed for
# reproducibility and n_jobs=-1 lets it use all available cores.
rf_model = RandomForestRegressor(
    random_state=123,
    n_jobs=-1,
)

In [None]:
# Fit on a flattened 1-D target: passing the (n, 1) column vector makes
# scikit-learn emit a DataConversionWarning, while the fitted forest is the same.
rf_model.fit(X_train, np.ravel(y_train))

In [None]:
# Calculating the fit of the model
# In-sample random forest predictions, mapped back to the original target scale.
rf_scaled_pred = rf_model.predict(X_train).reshape(-1, 1)
rf_predictions = target_transformer.inverse_transform(rf_scaled_pred)

# Residuals: actual minus predicted, both on the original target scale.
rf_train_actuals = target_transformer.inverse_transform(y_train.reshape(-1, 1))
rf_residuals = rf_train_actuals - rf_predictions

In [None]:
# Two stacked panels sharing the x axis: fit on top, residuals below.
fix, axs = plt.subplots(2, 1, figsize=(17, 15), sharex=True)

# Top panel: training actuals against the random forest's in-sample predictions,
# both on the original target scale.
pd.DataFrame({'ACTUALS': target_transformer.inverse_transform(y_train.reshape(-1, 1)).reshape(-1),
              'RandomForestRegression': rf_predictions.reshape(-1)}).plot(ax=axs[0])
axs[0].set_title('RandomForest Fit')
axs[0].set_ylabel('duration')

# Bottom panel: residuals (actual - predicted) of the random forest.
pd.DataFrame({'RandomForestRegression': rf_residuals.reshape(-1)}).plot(ax=axs[1])
axs[1].set_title('RandomForest Residuals')
axs[1].set_ylabel('residual')
# The x axis is the row position within the training arrays.
axs[1].set_xlabel('training sample');

In [None]:
# Mean absolute error of the random forest on the training data, on the original target scale.
# NOTE(review): this is an in-sample error; X_test / y_test are not used in the cells visible here.
rf_train_actual_durations = target_transformer.inverse_transform(y_train.reshape(-1, 1))
mean_absolute_error(rf_train_actual_durations, rf_predictions)