In [16]:
# Import the required libraries. Some of them (e.g. xgboost) may need to be installed
# explicitly via pip/conda before first use, which can take some time.
import math
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydotplus
import seaborn as sns
import xgboost
from IPython.display import Image
from sklearn import cross_validation
from sklearn import ensemble
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.externals.six import StringIO
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
# sklearn.model_selection supersedes the legacy cross_validation / grid_search modules.
# It is imported last so that these names take precedence over the legacy ones above.
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
%matplotlib inline

# Read the training data from the local disk.
# NOTE(review): this is an absolute, machine-specific path; adjust it before running elsewhere.
train_csv_path = 'F:/Warpath to Data Science/regression/bike-sharing/train.csv'
train = pd.read_csv(train_csv_path)

In [None]:
# Quick look at the first two rows of the raw frame.
# If the 'datetime' column needs to be a real datetime dtype, convert it explicitly
# with pd.to_datetime(train['datetime']).
preview = train.head(2)
print("training data: ", preview)

In [18]:
# Summarise column dtypes and non-null counts to check for missing values
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
datetime      10886 non-null object
season        10886 non-null int64
holiday       10886 non-null int64
workingday    10886 non-null int64
weather       10886 non-null int64
temp          10886 non-null float64
atemp         10886 non-null float64
humidity      10886 non-null int64
windspeed     10886 non-null float64
casual        10886 non-null int64
registered    10886 non-null int64
count         10886 non-null int64
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.6+ KB


The `info()` output above shows 10886 non-null entries in every column, so there are no missing values.

## Basic Feature Engineering
Extracting the year, month, hour and weekday columns from the datetime column.

In [None]:
# Derive calendar features (year, month, hour, weekday) from the 'datetime' column
timestamps = pd.DatetimeIndex(train['datetime'])
for part in ('year', 'month', 'hour', 'weekday'):
    # Each attribute of the DatetimeIndex becomes a column of the same name
    train[part] = getattr(timestamps, part)

In [26]:
# Encode the year as 0/1 (2011 -> 0, 2012 -> 1) so it is in range with the other columns.
# `replace` (rather than `map`) leaves already-encoded values untouched, so re-running
# this cell does not turn the column into NaN.
# NOTE(review): the original cell also re-mapped `test` and `test1`, but neither frame is
# created in the visible part of this notebook, so those lines raised NameError on a
# fresh kernel. Re-add them only after the corresponding frames are loaded.
train['year'] = train['year'].replace({2011: 0, 2012: 1})

In [None]:
# Preview the first rows to confirm the newly created year/month/hour/weekday columns are present
train.head()

In [None]:
# Cast the calendar and flag-like columns to the pandas 'category' dtype
categoryVariableList = ["hour","weekday","month","season","weather","holiday","workingday", 'year']
for column_name in categoryVariableList:
    categorical_column = train[column_name].astype("category")
    train[column_name] = categorical_column

In [None]:
# Log-transform the target because it contains natural outliers.
# np.log1p(x) computes log(x + 1) directly, so zero counts map to 0 and stay finite.
train['log_count'] = np.log1p(train['count'])

In [None]:
# Build the independent (input) variables and the target.
# The raw timestamp and every count-derived column are removed from the inputs.
dropFeatures = ['datetime', 'casual', 'registered', 'count', 'log_count']
x_train = train.drop(dropFeatures, axis=1)
# Select the target as a 1-D Series. A single-column DataFrame has shape (n, 1), which
# makes scikit-learn warn about a column-vector y and broadcasts badly against the
# 1-D predictions when computing the error metric.
y_train = train['log_count']

In [None]:
# Hold out 30% of the rows for validation; random_state fixes the shuffle so the split
# is reproducible. The target here is log_count (total rentals).
# train_test_split comes from the imports cell at the top of the notebook.
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train, test_size=.30, random_state=0)

In [None]:
#defining a function for rmsle which is a criterion used on kaggle to evaluate the model performance
def rmsle(y, y_, convertExp):
    """Root mean squared logarithmic error between actual and predicted values.

    Parameters
    ----------
    y : array-like
        Actual values. Any shape; flattened before comparison.
    y_ : array-like
        Predicted values with the same number of elements as ``y``.
    convertExp : bool
        If True, both inputs are treated as ``log(value + 1)`` and are mapped back
        with ``exp(v) - 1`` before the error is computed.

    Returns
    -------
    float
        ``sqrt(mean((log(y + 1) - log(y_ + 1)) ** 2))``. Non-finite logarithms are
        replaced via ``np.nan_to_num`` before differencing.

    Raises
    ------
    ValueError
        If ``y`` and ``y_`` do not contain the same number of elements.
    """
    # Flatten both inputs so an (n, 1) column vector and an (n,) vector are compared
    # element by element instead of being broadcast into an n x n matrix.
    actual = np.ravel(np.asarray(y, dtype=float))
    predicted = np.ravel(np.asarray(y_, dtype=float))
    if actual.shape != predicted.shape:
        raise ValueError("y and y_ must contain the same number of elements")
    if convertExp:
        actual = np.expm1(actual)
        predicted = np.expm1(predicted)
    log_actual = np.nan_to_num(np.log1p(actual))
    log_predicted = np.nan_to_num(np.log1p(predicted))
    return np.sqrt(np.mean((log_actual - log_predicted) ** 2))

In [None]:
# Random forest baseline: 1000 trees with a fixed seed for reproducibility
forest = RandomForestRegressor(n_estimators=1000, random_state=1)
# Fit on a flattened 1-D target; scikit-learn warns when y is an (n, 1) column vector
forest.fit(X_train, np.ravel(Y_train))
y_train_pred = forest.predict(X_train)
predictions = forest.predict(X_test)

# RMSLE on the hold-out set. The target is flattened so it lines up element-wise with
# the 1-D predictions; both are in log(count + 1) space, hence convertExp=True.
Y_test1 = np.ravel(Y_test)
print("RMSLE Value For Random Forest: ", rmsle(Y_test1, predictions, True))

In [None]:
# Gradient boosting model (an earlier run recorded a test score of about 0.41).
# NOTE(review): `alpha` is only used by the 'huber' and 'quantile' losses, so it has no
# effect with the default loss. If a smaller step size was intended, set `learning_rate`.
# random_state makes repeated runs reproducible.
gbm = GradientBoostingRegressor(n_estimators=4000, alpha=0.01, random_state=0)
# Fit on a flattened 1-D target; scikit-learn warns when y is an (n, 1) column vector
gbm.fit(X_train, np.ravel(Y_train))
predictions = gbm.predict(X=X_test)
# Flatten the hold-out target so it lines up element-wise with the 1-D predictions
Y_test1 = np.ravel(Y_test)
print("RMSLE Value For Gradient Boost: ", rmsle(Y_test1, predictions, True))

In [None]:
# Single decision tree regressor; random_state makes repeated runs reproducible
clf = DecisionTreeRegressor(random_state=0)
# Fit on a flattened 1-D target so every model in this notebook is trained the same way
clf.fit(X_train, np.ravel(Y_train))
predictions = clf.predict(X=X_test)
# Flatten the hold-out target so it lines up element-wise with the 1-D predictions
Y_test1 = np.ravel(Y_test)
print("RMSLE Value For DecisionTree: ", rmsle(Y_test1, predictions, True))

In [None]:
# XGBoost regressor
xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)
# XGBoost rejects pandas 'category' columns unless categorical support is explicitly
# enabled, so pass the features as plain floats.
xgb.fit(X_train.astype(float), np.ravel(Y_train))
# Positional argument: the keyword name of predict's first parameter differs between
# xgboost versions.
predictions = xgb.predict(X_test.astype(float))
# Flatten the hold-out target so it lines up element-wise with the 1-D predictions
Y_test1 = np.ravel(Y_test)
print("RMSLE Value For XG Boost Model: ", rmsle(Y_test1, predictions, True))

## Evaluating Model Performance
We'll apply k-fold cross-validation to evaluate the models fitted above and confirm that the hold-out results are reliable.

In [None]:
# 10-fold cross-validation of the random forest on the training split.
# The scorer returns negated MSE (higher is better), so the mean below is <= 0.
scores = cross_val_score(estimator=forest, X=X_train, y=np.ravel(Y_train), cv=10, n_jobs=-1,
                         scoring='neg_mean_squared_error')
scores.mean()
#scores.std()

In [None]:
# 10-fold cross-validation of the gradient boosting model on the training split.
# The scorer returns negated MSE (higher is better), so the mean below is <= 0.
scores = cross_val_score(estimator=gbm, X=X_train, y=np.ravel(Y_train), cv=10, n_jobs=-1,
                         scoring='neg_mean_squared_error')
scores.mean()
#scores.std()

In [None]:
# 10-fold cross-validation of the decision tree on the training split.
# The scorer returns negated MSE (higher is better), so the mean below is <= 0.
scores = cross_val_score(estimator=clf, X=X_train, y=np.ravel(Y_train), cv=10, n_jobs=-1,
                         scoring='neg_mean_squared_error')
scores.mean()
#scores.std()

In [None]:
# 10-fold cross-validation of the XGBoost model on the training split.
# (The original cell referenced the undefined name `xbg`; the fitted model is `xgb`.)
# Features are cast to float because XGBoost rejects pandas 'category' columns by default.
# The scorer returns negated MSE (higher is better), so the mean below is <= 0.
scores = cross_val_score(estimator=xgb, X=X_train.astype(float), y=np.ravel(Y_train), cv=10,
                         n_jobs=-1, scoring='neg_mean_squared_error')
scores.mean()
#scores.std()

## Improving the Model Performance
Hyperparameter tuning with grid search.
Here we apply it to the random forest; feel free to apply it to the other models as well. Grid search evaluates every combination of the supplied parameter values with cross-validation and reports the combination that scores best on the chosen criterion (MSE here). It can also help us decide whether a linear or a non-linear ML model is needed. Once we have the best parameters, we can run the search again on a finer grid of nearby values to refine them further and build the best model.

In [None]:
# Grid search over the random forest's number of trees and the number of features
# considered at each split. GridSearchCV comes from the imports cell at the top.
# max_features=None means "use all features", which is what the legacy "auto" value
# meant for regressors; current scikit-learn no longer accepts "auto".
parameters = [{'n_estimators': [500, 1000, 1200, 1500], 'max_features': [None, "sqrt", "log2"]}]
# 'neg_mean_squared_error' is the supported name of the MSE scorer (higher is better).
# n_jobs=-1 runs the candidate fits in parallel, as the cross-validation cells do.
grid_search = GridSearchCV(estimator=forest, param_grid=parameters,
                           scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
# Fit on a flattened 1-D target; scikit-learn warns when y is an (n, 1) column vector
grid_search = grid_search.fit(X_train, np.ravel(Y_train))

best_score = grid_search.best_score_
best_parameters = grid_search.best_params_
# Display the winning parameter combination and its (negated MSE) score
best_parameters, best_score