In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoLarsCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [None]:
# Load the transformed Mannheim bookings; 'start_time' is parsed as datetime
raw_data = pd.read_csv(
    '../data/output/mannheim_transformed.csv',
    parse_dates=['start_time'],
)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts)
raw_data.info()

In [None]:
# Number of distinct start position names among rows where is_station is True
raw_data.loc[raw_data['is_station'].eq(True), 'start_position_name'].nunique()

In [None]:
# A booking is flagged as "false" when its duration is at most 180 or when it
# ends at the same position it started from; only those rows are set to 1.
is_false_booking = (
    raw_data['duration'].le(180)
    | raw_data["start_position"].eq(raw_data["end_position"])
)
raw_data.loc[is_false_booking, 'false_booking'] = 1

# Replace every remaining NaN in the frame with 0.0 -- this also turns the
# unflagged rows of 'false_booking' into 0.0.
raw_data.fillna(0.0, inplace=True)
false_bookings_series = raw_data['false_booking']

# Columns removed before the feature frame is built; 'false_booking' is kept
# separately in false_bookings_series.
col_to_drop = [
    'bike_number',
    'start_position',
    'end_time',
    'end_position',
    'end_position_name',
    'false_booking',
]

prediction_data = raw_data.drop(columns=col_to_drop)

In [None]:
# Derive calendar features from the booking start time.
start_times = prediction_data.start_time

# Hour of the day (0-23). The `.dt.hour` accessor replaces strftime('%-H'):
# '%-H' is a platform-specific extension that is not accepted by every
# C library (it fails on Windows), while the accessor works everywhere.
prediction_data['HOUR'] = start_times.dt.hour.astype('int')

# Week of the year with Monday as the first day of the week ('%W', 0-53);
# days before the first Monday of a year fall into week 0.
prediction_data['WEEK_OF_YEAR'] = start_times.dt.strftime('%W').astype('int')

# Day of the week numbered Sunday = 0 ... Saturday = 6, i.e. the numbering
# strftime('%w') produces. `.dt.dayofweek` counts from Monday = 0, hence the
# shift by one modulo 7.
prediction_data['DAY_OF_WEEK'] = ((start_times.dt.dayofweek + 1) % 7).astype('int')

# Season of the start month, looked up per row in one vectorised step
# instead of appending to a list in a Python loop.
SEASON_BY_MONTH = {
    12: 'WINTER', 1: 'WINTER', 2: 'WINTER',
    3: 'SPRING', 4: 'SPRING', 5: 'SPRING',
    6: 'SUMMER', 7: 'SUMMER', 8: 'SUMMER',
    9: 'FALL', 10: 'FALL', 11: 'FALL',
}
prediction_data['season'] = start_times.dt.month.map(SEASON_BY_MONTH)

In [None]:
# Creating fig and subplots
#fig, axes = plt.subplots(2, 5, figsize=(20, 7), sharex=True, sharey=True, dpi=500)

# Plotting the relationship between the actual count and its lagged values
#for i, ax in enumerate(axes.flatten()[:10]):
#    pd.plotting.lag_plot(raw_data.sort_values(by=['start_time'])['duration'], lag=i + 1, ax=ax)
#    ax.set_title('Lag ' + str(i + 1))
#plt.tight_layout();

In [None]:
# Distribution of booking duration split by the 'weekend' column
sns.boxplot(data=prediction_data, x='weekend', y='duration')

In [None]:
# Distribution of booking duration split by the 'is_station' column
sns.boxplot(data=prediction_data, x='is_station', y='duration')

In [None]:
# Distribution of booking duration per season
sns.boxplot(data=prediction_data, x='season', y='duration')

In [None]:
# Distribution of booking duration per hour of the day
sns.boxplot(data=prediction_data, x='HOUR', y='duration')

In [None]:
# Distribution of booking duration per week of the year
sns.boxplot(data=prediction_data, x='WEEK_OF_YEAR', y='duration')

In [None]:
# Distribution of booking duration per day of the week
sns.boxplot(data=prediction_data, x='DAY_OF_WEEK', y='duration')

In [None]:
# Encode the hour as a point on the unit circle (period 24) so the cyclical
# nature of the feature is retained.
hour_angle = prediction_data.HOUR * (2. * np.pi / 24)
prediction_data['HOUR_SIN'] = np.sin(hour_angle)
prediction_data['HOUR_COS'] = np.cos(hour_angle)

In [None]:
# Encode the week of the year as a point on the unit circle (period 52) so
# the cyclical nature of the feature is retained.
week_angle = prediction_data.WEEK_OF_YEAR * (2. * np.pi / 52)
prediction_data['WEEK_OF_YEAR_SIN'] = np.sin(week_angle)
prediction_data['WEEK_OF_YEAR_COS'] = np.cos(week_angle)

In [None]:
# Encode the day of the week as a point on the unit circle (period 7) so the
# cyclical nature of the feature is retained.
weekday_angle = prediction_data.DAY_OF_WEEK * (2. * np.pi / 7)
prediction_data['DAY_OF_WEEK_SIN'] = np.sin(weekday_angle)
prediction_data['DAY_OF_WEEK_COS'] = np.cos(weekday_angle)

In [None]:
# One-hot encode the start position name for station starts only. No
# drop_first is needed here because floating (non-station) starts still exist.
station_rows = raw_data['is_station'] == True
station_dummies = pd.get_dummies(
    prediction_data.loc[station_rows, 'start_position_name']
)

In [None]:
# One-hot encode the season; the first category is dropped
seasonal_dummies = pd.get_dummies(prediction_data.season, drop_first=True)

In [None]:
# Remove the raw time columns, the season and the start position name: the
# sine/cosine encodings and the dummy frames built from them are used instead.
transformed_columns = [
    'WEEK_OF_YEAR',
    'DAY_OF_WEEK',
    'HOUR',
    'start_time',
    'season',
    'start_position_name',
]
prediction_data = prediction_data.drop(columns=transformed_columns)

In [None]:
# Attach the season and station dummy columns side by side (aligned on index)
prediction_data = pd.concat(
    [prediction_data, seasonal_dummies, station_dummies],
    axis='columns',
)

In [None]:
# station_dummies only covers station starts, so the concat leaves NaN in
# those columns for every other row; all NaN values are replaced with 0.0.
prediction_data = prediction_data.fillna(0.0)

In [None]:
# Re-attach the false_booking flag by index; it becomes the last column
prediction_data = pd.merge(
    prediction_data,
    false_bookings_series,
    left_index=True,
    right_index=True,
)

# Linear Regression

In [None]:
# Target: booking duration as a column vector of shape (n, 1)
y_duration = prediction_data['duration'].to_numpy().reshape(-1, 1)
# Features: every remaining column, including 'false_booking'.
# NOTE(review): 'false_booking' is derived from the duration itself
# (duration <= 180), so it carries target information -- confirm intended.
X_duration = prediction_data.drop(columns='duration')

In [None]:
# Reproducible train/test split (fixed random_state) for the duration models
(X_train_duration,
 X_test_duration,
 y_train_duration,
 y_test_duration) = train_test_split(X_duration, y_duration, random_state=0)

In [None]:
# Lasso fitted with LARS; the regularisation strength is picked by 5-fold
# cross validation. y_train_duration is a column vector of shape (n, 1), so
# it is flattened to the 1-D target the estimator expects (the same way the
# random forest fits in this notebook do), avoiding a conversion warning.
# NOTE(review): `normalize` has been deprecated/removed in recent
# scikit-learn releases -- confirm against the pinned version.
model = LassoLarsCV(cv=5, normalize=False).fit(X_train_duration, y_train_duration.ravel())

In [None]:
# Linear-model predictions for the held-out rows, as a column vector
y_pred_lin = np.reshape(model.predict(X_test_duration), (-1, 1))

In [None]:
# Residuals of the linear model: actual minus predicted duration
reg_residuals = np.subtract(y_test_duration.reshape(-1, 1), y_pred_lin)

In [None]:
# Two stacked panels sharing the x axis: fit on top, residuals below
fig, axs = plt.subplots(2, 1, figsize=(17, 15), sharex=True)

# Upper panel: held-out durations against the linear model's predictions
pd.DataFrame({'ACTUALS': y_test_duration.reshape(-1),
              'LinearRegression': y_pred_lin.reshape(-1)}).plot(ax=axs[0])
axs[0].set_title('LinearRegression Fit')

# Lower panel: residuals of the linear model (actual minus predicted)
pd.DataFrame({'LinearRegression': reg_residuals.reshape(-1)}).plot(ax=axs[1])

axs[1].set_title('LinearRegression Residuals');

In [None]:
# Mean absolute error of the linear model on the held-out rows
mean_absolute_error(y_true=y_test_duration, y_pred=y_pred_lin)

# Random Forest Regression

In [None]:
# Random forest regressor with default hyperparameters, a fixed seed and all
# available cores
rf_model = RandomForestRegressor(random_state=123, n_jobs=-1)

In [None]:
# Fit the forest on the training rows; the (n, 1) target is flattened to 1-D
rf_model.fit(X_train_duration, np.ravel(y_train_duration))

In [None]:
# Predict durations for the held-out rows with the fitted forest
y_pred_duration_rf = np.ravel(rf_model.predict(X_test_duration))

# Residuals: actual minus predicted duration
rf_residuals = np.ravel(y_test_duration) - y_pred_duration_rf

In [None]:
# Two stacked panels sharing the x axis: fit on top, residuals below
fig, axs = plt.subplots(2, 1, figsize=(17, 15), sharex=True)

# Upper panel: held-out durations against the random forest's predictions
pd.DataFrame({'ACTUALS': y_test_duration.reshape(-1),
              'RandomForestRegression': y_pred_duration_rf.reshape(-1)}).plot(ax=axs[0])
axs[0].set_title('RandomForest Fit')

# Lower panel: residuals of the random forest (actual minus predicted)
pd.DataFrame({'RandomForestRegression': rf_residuals.reshape(-1)}).plot(ax=axs[1])

axs[1].set_title('RandomForest Residuals');

In [None]:
# Mean absolute error of the random forest on the held-out rows
mean_absolute_error(y_true=y_test_duration.reshape(-1), y_pred=y_pred_duration_rf)

# Booking Filter

In [None]:
# Target: the false_booking flag as a column vector of shape (n, 1)
y_bookings = prediction_data['false_booking'].to_numpy().reshape(-1, 1)
# Features: everything except the duration and the flag itself
X_bookings = prediction_data.drop(columns=['duration', 'false_booking'])

In [None]:
# Classifier that predicts the false_booking flag. The seed is fixed so the
# reported accuracy and every result built on these predictions are
# reproducible across kernel restarts.
rfc = RandomForestClassifier(n_jobs=-1, random_state=123)

In [None]:
# Train/test split for the booking filter, seeded with the same random_state
# as the duration split
(X_train_bookings,
 X_test_bookings,
 y_train_bookings,
 y_test_bookings) = train_test_split(X_bookings, y_bookings, random_state=0)

In [None]:
# Fit the classifier; the (n, 1) label array is flattened to 1-D
rfc.fit(X_train_bookings, np.ravel(y_train_bookings))

In [None]:
# Predicted false_booking flag for the held-out booking rows
y_pred_bookings = rfc.predict(X_test_bookings)

In [None]:
# Share of held-out bookings whose false_booking flag is predicted correctly
bookings_accuracy = accuracy_score(y_true=y_test_bookings, y_pred=y_pred_bookings)
print(bookings_accuracy)

In [None]:
# Append the classifier's predicted false_booking flag to the held-out
# booking features. A DataFrame is built instead of a positional NumPy
# concatenation so the column names survive: the regressor was fitted on a
# DataFrame whose last column is 'false_booking', and it now receives the
# same named columns in the same order.
X_test_final = X_test_bookings.assign(false_booking=y_pred_bookings)

In [None]:
# Predict durations from the features that carry the *predicted*
# false_booking flag
rf_predictions_final = np.ravel(rf_model.predict(X_test_final))

# Residuals: actual minus predicted duration (this overwrites the
# rf_residuals computed for the earlier random forest evaluation)
rf_residuals = np.ravel(y_test_duration) - rf_predictions_final

In [None]:
# Two stacked panels sharing the x axis: fit on top, residuals below
fig, axs = plt.subplots(2, 1, figsize=(17, 15), sharex=True)

# Upper panel: held-out durations against the random forest's predictions
# made with the predicted false_booking flag
pd.DataFrame({'ACTUALS': y_test_duration.reshape(-1),
              'RandomForestRegression': rf_predictions_final.reshape(-1)}).plot(ax=axs[0])
axs[0].set_title('RandomForest Fit (predicted false_booking)')

# Lower panel: residuals of those predictions (actual minus predicted)
pd.DataFrame({'RandomForestRegression': rf_residuals.reshape(-1)}).plot(ax=axs[1])

axs[1].set_title('RandomForest Residuals');

In [None]:
# Mean absolute error when the false_booking feature is predicted
mean_absolute_error(y_true=y_test_duration.reshape(-1), y_pred=rf_predictions_final)

# Grid Search

In [None]:
# Predicted false_booking flag for the training rows of the booking filter
y_train_pred_bookings = rfc.predict(X_train_bookings)

In [None]:
# Append the classifier's predicted false_booking flag to the training
# booking features. A DataFrame is built instead of a positional NumPy
# concatenation so the column names are kept and 'false_booking' is an
# explicitly named last column.
X_train_final = X_train_bookings.assign(false_booking=y_train_pred_bookings)

In [None]:
# Candidate hyperparameter values for the randomised search.

# Number of trees in the forest: 5 evenly spaced values from 500 to 1000
n_estimators = [int(x) for x in np.linspace(start=500, stop=1000, num=5)]
# Number of features to consider at every split. 1.0 means "all features",
# which is what 'auto' stood for on a regressor; the 'auto' spelling is
# deprecated and rejected by newer scikit-learn releases.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree: 50..100 in steps of 10, plus unlimited
max_depth = [int(x) for x in np.linspace(50, 100, num=6)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [5, 10, 15]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 4, 6]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# Assemble the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [None]:
# Base estimator whose hyperparameters are tuned. It is seeded so repeated
# runs of the search grow the same forests for the same parameters.
rf = RandomForestRegressor(random_state=123)
# Randomised search over `random_grid`: 100 sampled combinations, each scored
# with 2-fold cross validation, using all available cores.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=2, verbose=1, random_state=42, n_jobs=-1)

# Fit the search. y_train_duration is a column vector of shape (n, 1); it is
# flattened to 1-D so the forests do not emit a conversion warning per fit.
rf_random.fit(X_train_final, y_train_duration.ravel())

In [None]:
#rf_random.best_params_

In [None]:
#rf_mode_opt = RandomForestRegressor(n_jobs=-1, 
#                                 random_state=123,
#                                 n_estimators=800,
#                                 min_samples_split=10,
#                                 min_samples_leaf=4,
#                                 max_features='sqrt',
#                                 max_depth=80,
#                                 bootstrap=True)

In [None]:
#rf_mode_opt.fit(X_train, y_train)

In [None]:
# Calculating the fit of the model
#rf_opt_predictions = rf_mode_opt.predict(X_train).reshape(-1)



# Calculating the residuals
#rf_opt_residuals = y_train.reshape(-1) - rf_opt_predictions

In [None]:
# Disabled plotting code for the tuned random forest. Every code line of
# this cell is commented out, including the continuation line of the
# DataFrame literal, so the cell runs without an IndentationError.

# Creating fit and subplots
#fix, axs = plt.subplots(2, 1, figsize=(17, 15), sharex=True)

# Plotting actuals and fit of the tuned random forest regression
#pd.DataFrame({'ACTUALS': y_train.reshape(-1),
#              'OPT_RandomForestRegression': rf_opt_predictions.reshape(-1)}).plot(ax=axs[0])


# Plotting residuals of the tuned random forest regression
#pd.DataFrame({'RandomForestRegression': rf_opt_residuals.reshape(-1)}).plot(ax=axs[1])

#axs[1].set_title('OPT_RandomForest Residuals');

In [None]:
#mean_absolute_error(y_train.reshape(-1), rf_opt_predictions)

In [None]:
# Creating fig
#fig = plt.figure(figsize=(20, 10))

# Deriving the feature importances of the optimized and default model
#default_importances = list(rf_model.feature_importances_)
#optimized_importances = list(rf_mode_opt.feature_importances_)

# Creating an array with range of number of variables
#x_values = np.arange(len(default_importances))

# Creating bar plots of optimized and default importances
#plt.bar(x_values, default_importances,orientation='vertical', color='blue', width=.5, align='center', label='Default')
#plt.bar(x_values + 0.5, optimized_importances, orientation='vertical', color='red', width=.5, align='center', label='Optimized')

#plt.xticks(x_values + 0.25, list(X.columns), rotation='vertical')

#plt.ylabel('Importance')
#plt.xlabel('Variable')
#plt.title('Variable Importances')

#plt.legend(loc=1)
#plt.show();