In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import metrics

In [2]:
# Load the engineered feature table from the parent directory.
# NOTE(review): the loaded frame carries an 'Unnamed: 0' column — presumably a
# row index written out by an earlier to_csv; consider index_col=0 — confirm.
df = pd.read_csv(filepath_or_buffer='../final_features.csv')

In [3]:
# Show every column label in the loaded frame; the modelling cells below
# choose their feature names from this set.
df.columns

Index(['Unnamed: 0', 'date', 'start_station_id', 'start_hour',
       'total_bike_trips', 'start_station_name', 'city', 'zip_code',
       'mean_temperature_f', 'mean_humidity', 'mean_visibility_miles',
       'mean_wind_speed_mph', 'precipitation_inches', 'weekday', 'isHoliday',
       'isRushHour', 'Hour 0', 'Hour 1', 'Hour 2', 'Hour 3', 'Hour 4',
       'Hour 5', 'Hour 6', 'Hour 7', 'Hour 8', 'Hour 9', 'Hour 10', 'Hour 11',
       'Hour 12', 'Hour 13', 'Hour 14', 'Hour 15', 'Hour 16', 'Hour 17',
       'Hour 18', 'Hour 19', 'Hour 20', 'Hour 21', 'Hour 22', 'Hour 23',
       'Station 3', 'Station 4', 'Station 5', 'Station 7', 'Station 8',
       'Station 14', 'Station 22', 'Station 27', 'Station 28', 'Station 30',
       'Station 32', 'Station 61', 'Station 64', 'Station 69', 'Station 70',
       'Station 83', 'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday',
       'Tuesday', 'Wednesday', '94041', '94063', '94107', '95113'],
      dtype='object')

In [4]:
# Preview the first five rows (head's default row count, spelled out).
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,date,start_station_id,start_hour,total_bike_trips,start_station_name,city,zip_code,mean_temperature_f,mean_humidity,...,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,94041,94063,94107,95113
0,0,2013-08-29,3,13,1,San Jose Civic Center,San Jose,95113,72.0,69.0,...,0,0,0,1,0,0,0,0,0,1
1,1,2013-08-29,3,18,1,San Jose Civic Center,San Jose,95113,72.0,69.0,...,0,0,0,1,0,0,0,0,0,1
2,2,2013-08-29,3,20,1,San Jose Civic Center,San Jose,95113,72.0,69.0,...,0,0,0,1,0,0,0,0,0,1
3,3,2013-08-29,3,21,2,San Jose Civic Center,San Jose,95113,72.0,69.0,...,0,0,0,1,0,0,0,0,0,1
4,4,2013-08-29,3,22,4,San Jose Civic Center,San Jose,95113,72.0,69.0,...,0,0,0,1,0,0,0,0,0,1


In [139]:
# Keep only rows with no missing value in any column; `df` itself is left
# untouched so this cell can be re-run safely.
df_final = df.dropna(axis=0, how='any')

def _exact_match_rate(y_true, y_pred):
    """Return the fraction of predictions equal to the target after rounding.

    Predictions are rounded to the nearest integer with ``np.rint`` and a
    prediction counts as a hit only when the rounded value minus the target
    is exactly zero.
    """
    residual = np.rint(y_pred) - y_true
    return (len(residual) - np.count_nonzero(residual)) / len(residual)


def _print_errors(header, y_true, y_pred):
    """Print ``header`` followed by MAE, MSE and RMSE for one data split."""
    # MSE is computed once and reused for the RMSE line.
    mse = metrics.mean_squared_error(y_true, y_pred)
    print(header)
    print("\nMAE:")
    print(metrics.mean_absolute_error(y_true, y_pred))
    print("\nMSE:")
    print(mse)
    print("\nRMSE:")
    print(np.sqrt(mse))


def lin_reg(feature_cols, data=None, target="total_bike_trips",
            test_size=0.33, random_state=42):
    """Fit an ordinary linear regression and print a train/test report.

    The rows of ``data`` are split into train and test partitions, a
    ``LinearRegression`` is fitted on the training partition, and the
    following are printed in order: train predictions, test predictions,
    rounded exact-match "accuracy" for both partitions, the intercept, the
    coefficients (in the order of ``feature_cols``), then MAE/MSE/RMSE for
    the test partition and for the training partition.

    Args:
        feature_cols: Column labels to use as predictors.
        data: Frame holding the predictors and the target. When None, the
            notebook-level ``df_final`` is used (the original behaviour).
        target: Label of the column to predict.
        test_size: Passed to ``train_test_split``.
        random_state: Seed passed to ``train_test_split`` so the split is
            repeatable between calls.

    Returns:
        None. All results are printed.
    """
    frame = df_final if data is None else data
    X = frame[feature_cols]
    y = frame[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    linreg = LinearRegression()
    linreg.fit(X_train, y_train)  # fit on the training partition only

    y_pred_train = linreg.predict(X_train)
    print("\ny_pred_train:")
    print(y_pred_train)
    y_pred_test = linreg.predict(X_test)
    print("\ny_pred_test:")
    print(y_pred_test)

    # "Accuracy" here is the share of rows whose rounded prediction hits the
    # target exactly; it is not a standard regression metric.
    print("\nTraining Accuracy:")
    print(_exact_match_rate(y_train, y_pred_train))
    print("\nTest Accuracy:")
    print(_exact_match_rate(y_test, y_pred_test))

    print("\ny-intercept:")
    print(linreg.intercept_)
    # Coefficients come back in the same order as feature_cols; pair them
    # with zip(feature_cols, linreg.coef_) if named output is needed.
    print("\nCoefficients in same order as passed:")
    print(linreg.coef_)

    _print_errors("\nTESTING ERRORS:", y_test, y_pred_test)
    _print_errors("\n\n\nTRAINING ERRORS:", y_train, y_pred_train)

In [140]:
# First run: humidity, visibility and precipitation as the only predictors.
feature_cols = [
    "mean_humidity",
    "mean_visibility_miles",
    "precipitation_inches",
]
lin_reg(feature_cols)


y_pred_train:
[2.78369948 2.46038145 3.06390844 ... 2.82680855 3.02079937 2.9776903 ]

y_pred_test:
[2.69748134 2.82680855 2.86991762 ... 2.76214495 2.99924484 2.89147216]

Training Accuracy:
0.10288155003249869

Test Accuracy:
0.10060324242805077

y-intercept:
1.4053760152665542

Coefficients in same order as passed:
[ 0.02155454  0.00419423 -0.52917649]

TESTING ERRORS:

MAE:
2.2813227541147096

MSE:
16.717997847515015

RMSE:
4.088764831524921



TRAINING ERRORS:

MAE:
2.217584250564252

MSE:
15.086158017517587

RMSE:
3.884090371955522


In [141]:
# Hour and zip code as raw numeric columns, the five weather measurements,
# the holiday / rush-hour flags, and the seven weekday indicator columns.
feature_cols = (
    ['start_hour', 'zip_code']
    + ['mean_temperature_f', 'mean_humidity', 'mean_visibility_miles',
       'mean_wind_speed_mph', 'precipitation_inches']
    + ['isHoliday', 'isRushHour']
    + ['Friday', 'Monday', 'Saturday', 'Sunday',
       'Thursday', 'Tuesday', 'Wednesday']
)
lin_reg(feature_cols)


y_pred_train:
[2.01160411 4.40554226 4.65637155 ... 4.89596368 2.91229229 2.43171739]

y_pred_test:
[2.89840357 2.89549002 1.45480555 ... 1.0173021  3.61004612 3.6184508 ]

Training Accuracy:
0.18453062614132285

Test Accuracy:
0.18059570189770013

y-intercept:
140.5527237581106

Coefficients in same order as passed:
[-1.02637802e-01 -1.44772706e-03 -1.13627780e-02  2.79840512e-03
 -1.47110022e-01  1.36682179e-01 -1.42962512e+00 -4.84463943e-01
  1.87438262e+00  1.44070548e-01  3.51501744e-01 -7.62035374e-01
 -8.27864704e-01  3.34365465e-01  3.67167452e-01  3.92794869e-01]

TESTING ERRORS:

MAE:
2.1535418245675824

MSE:
14.387318990854956

RMSE:
3.793061954523674



TRAINING ERRORS:

MAE:
2.0944436023858617

MSE:
13.038534586389275

RMSE:
3.610891106969203


In [142]:
# Hour, zip code, holiday / rush-hour flags and weekday indicators only
# (no weather columns).
feature_cols = (
    ['start_hour', 'zip_code']
    + ['isHoliday', 'isRushHour']
    + ['Friday', 'Monday', 'Saturday', 'Sunday',
       'Thursday', 'Tuesday', 'Wednesday']
)
lin_reg(feature_cols)


y_pred_train:
[1.9872484  3.64012672 3.92223096 ... 3.74968338 2.42545404 2.4239983 ]

y_pred_test:
[3.64012672 2.9046302  1.5160636  ... 0.77134423 3.84398934 3.84398934]

Training Accuracy:
0.18768764121452228

Test Accuracy:
0.18493150684931506

y-intercept:
157.9908424704842

Coefficients in same order as passed:
[-1.01931310e-01 -1.64302020e-03 -6.27698918e-01  1.82626054e+00
  1.74102015e-01  3.75948982e-01 -7.33832688e-01 -8.36483791e-01
  2.77489071e-01  3.87045725e-01  3.55730685e-01]

TESTING ERRORS:

MAE:
2.168058372992692

MSE:
14.626594181218968

RMSE:
3.8244730592879024



TRAINING ERRORS:

MAE:
2.107060816048972

MSE:
13.245487632627315

RMSE:
3.6394350705332434


In [143]:
# Widest model: station id, weather, holiday / rush-hour flags and every
# one-hot indicator column (hour, station, weekday, zip code). The raw
# 'start_hour' and 'zip_code' columns are left out in favour of the indicators.
feature_cols = (
    ['start_station_id', 'mean_temperature_f', 'mean_humidity',
     'mean_visibility_miles', 'mean_wind_speed_mph', 'precipitation_inches',
     'isHoliday', 'isRushHour']
    # 'Hour 0' ... 'Hour 23'
    + ['Hour {}'.format(hour) for hour in range(24)]
    + ['Station {}'.format(station_id)
       for station_id in (3, 4, 5, 7, 8, 14, 22, 27, 28, 30,
                          32, 61, 64, 69, 70, 83)]
    + ['Friday', 'Monday', 'Saturday', 'Sunday',
       'Thursday', 'Tuesday', 'Wednesday']
    + ['94041', '94063', '94107', '95113']
)
lin_reg(feature_cols)


y_pred_train:
[2.04246273 6.04439856 5.53422438 ... 4.87165945 2.42120807 4.40350941]

y_pred_test:
[3.88869007 4.06255679 0.54804536 ... 0.16310152 4.30191956 3.95205618]

Training Accuracy:
0.18781144572719677

Test Accuracy:
0.18398894055548573

y-intercept:
-2.1469376745463884

Coefficients in same order as passed:
[ 0.00832089  0.02402521  0.00739357  0.02005234 -0.02345736 -0.81076202
 -0.68047552  2.01449136 -0.63342169  0.71609851 -0.09784174 -0.53351545
 -0.91910557 -0.11050199  0.26241301  0.61952945  3.28968721  1.40218048
 -1.55481118  0.11122695  0.28663086  0.21443271  0.08037082  0.23171127
 -0.81324774  0.36402685 -0.09574606 -1.19712765 -0.2811876  -0.45018922
 -0.38169063 -0.50992159  0.22462277 -0.19497768 -0.1140814  -0.09817234
 -0.20213348  0.45382306  0.17043448 -0.01375224  0.21653996 -0.54107017
 -0.1999333  -0.21238957 -0.8415554   0.67391748  2.01502162 -1.33629379
  0.15534606  0.43216224 -0.87677381 -0.82747637  0.30747453  0.42884183
  0.38042553 -0.53821