In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import metrics
from sklearn.metrics import r2_score
# import autosklearn.regression

In [3]:
df = pd.read_csv('../final_features.csv')

In [4]:
df.columns

Index(['Unnamed: 0', 'date', 'start_station_id', 'start_hour',
       'total_bike_trips', 'start_station_name', 'city', 'zip_code',
       'mean_temperature_f', 'mean_humidity', 'mean_visibility_miles',
       'mean_wind_speed_mph', 'precipitation_inches', 'weekday', 'isHoliday',
       'isRushHour', 'Hour 0', 'Hour 1', 'Hour 2', 'Hour 3', 'Hour 4',
       'Hour 5', 'Hour 6', 'Hour 7', 'Hour 8', 'Hour 9', 'Hour 10', 'Hour 11',
       'Hour 12', 'Hour 13', 'Hour 14', 'Hour 15', 'Hour 16', 'Hour 17',
       'Hour 18', 'Hour 19', 'Hour 20', 'Hour 21', 'Hour 22', 'Hour 23',
       'Station 3', 'Station 4', 'Station 5', 'Station 7', 'Station 8',
       'Station 14', 'Station 22', 'Station 27', 'Station 28', 'Station 30',
       'Station 32', 'Station 61', 'Station 64', 'Station 69', 'Station 70',
       'Station 83', 'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday',
       'Tuesday', 'Wednesday', '94041', '94063', '94107', '95113'],
      dtype='object')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,start_station_id,start_hour,total_bike_trips,start_station_name,city,zip_code,mean_temperature_f,mean_humidity,...,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,94041,94063,94107,95113
0,0,2013-08-29,3,13,1,San Jose Civic Center,San Jose,95113,72.0,69.0,...,0,0,0,1,0,0,0,0,0,1
1,1,2013-08-29,3,18,1,San Jose Civic Center,San Jose,95113,72.0,69.0,...,0,0,0,1,0,0,0,0,0,1
2,2,2013-08-29,3,20,1,San Jose Civic Center,San Jose,95113,72.0,69.0,...,0,0,0,1,0,0,0,0,0,1
3,3,2013-08-29,3,21,2,San Jose Civic Center,San Jose,95113,72.0,69.0,...,0,0,0,1,0,0,0,0,0,1
4,4,2013-08-29,3,22,4,San Jose Civic Center,San Jose,95113,72.0,69.0,...,0,0,0,1,0,0,0,0,0,1


In [6]:
df_final = df.dropna()

def lin_reg(feature_cols, linreg):
    X = df_final[feature_cols]
    y = df_final["total_bike_trips"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    linreg.fit(X_train, y_train) # Fit model to training data
    y_pred_train = linreg.predict(X_train) # X_train predictions
    print("\ny_pred_train:")
    print(y_pred_train)
    y_pred_test = linreg.predict(X_test) # X_test predictions
    print("\ny_pred_test:")
    print(y_pred_test)
    # Demonstrates accuracy 
    # rounded_y_pred_test = np.rint(y_pred_test)
    # rounded_y_pred_train = np.rint(y_pred_train)
    diff_test = y_pred_test - y_test
    diff_train = y_pred_train - y_train
    accuracy_train = (len(diff_train) - np.count_nonzero(diff_train))/len(diff_train)
    accuracy_test = (len(diff_test) - np.count_nonzero(diff_test))/len(diff_test)
    print("\nTraining Accuracy:")
    print(accuracy_train)
    print("\nTest Accuracy:")
    print(accuracy_test)
    # Prints y-intercept 
    print("\ny-intercept:")
    print(linreg.intercept_)
    # Prints the beta coefficients in same order as passed
    print("\nCoefficients in same order as passed:")
    print(linreg.coef_)
    # Zip can pair feature names and coefficients together
    zip(feature_cols, linreg.coef_)
    
    print("\nTESTING ERRORS:")
    # prints result of MAE
    print("\nMAE:")
    print(metrics.mean_absolute_error(y_test, y_pred_test))
    # prints result of MSE
    print("\nMSE:")
    print(metrics.mean_squared_error(y_test, y_pred_test))
    # prtins result of RMSE 
    print("\nRMSE:")
    print(np.sqrt(metrics.mean_squared_error(y_test, y_pred_test)))
    # prints R^2 score
    print("\nR^2 score:")
    print(r2_score(y_test, y_pred_test))
    print("\n\n\nTRAINING ERRORS:")
    # prints result of MAE
    print("\nMAE:")
    print(metrics.mean_absolute_error(y_train, y_pred_train))
    # prints result of MSE
    print("\nMSE:")
    print(metrics.mean_squared_error(y_train, y_pred_train))
    # prtins result of RMSE 
    print("\nRMSE:")
    print(np.sqrt(metrics.mean_squared_error(y_train, y_pred_train)))
    # prints R^2 score
    print("\nR^2 score:")
    print(r2_score(y_train, y_pred_train))

In [7]:
feature_cols = ["mean_humidity", "mean_visibility_miles", "precipitation_inches"]

### Ordinary Least Squares Linear regression

In [8]:
lin_reg(feature_cols, LinearRegression())


y_pred_train:
[2.78369948 2.46038145 3.06390844 ... 2.82680855 3.02079937 2.9776903 ]

y_pred_test:
[2.69748134 2.82680855 2.86991762 ... 2.76214495 2.99924484 2.89147216]

Training Accuracy:
0.0

Test Accuracy:
0.0

y-intercept:
1.4053760152665538

Coefficients in same order as passed:
[ 0.02155454  0.00419423 -0.52917649]

TESTING ERRORS:

MAE:
2.2813227541147096

MSE:
16.71799784751501

RMSE:
4.08876483152492

R^2 score:
0.0027324852735813865



TRAINING ERRORS:

MAE:
2.217584250564252

MSE:
15.086158017517587

RMSE:
3.884090371955522

R^2 score:
0.003103453412406254


## Ridge regression

In [9]:
lin_reg(feature_cols, Ridge(alpha=0.5))


y_pred_train:
[2.78369828 2.46039282 3.06389634 ... 2.82680567 3.02078894 2.97768155]

y_pred_test:
[2.69748349 2.82680567 2.86991306 ... 2.76214458 2.99923524 2.89146676]

Training Accuracy:
0.0

Test Accuracy:
0.0

y-intercept:
1.4052556959120233

Coefficients in same order as passed:
[ 0.0215537   0.00421134 -0.5286834 ]

TESTING ERRORS:

MAE:
2.2813230186008946

MSE:
16.718001837219454

RMSE:
4.088765319411161

R^2 score:
0.0027322472784176988



TRAINING ERRORS:

MAE:
2.2175838475819343

MSE:
15.086158021551057

RMSE:
3.884090372474752

R^2 score:
0.0031034531458735692


In [10]:
feature_cols = ['start_hour', 'zip_code', 'mean_temperature_f',
       'mean_humidity', 'mean_visibility_miles', 'mean_wind_speed_mph',
       'precipitation_inches', 'isHoliday', 'isRushHour', 'Friday',
       'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']
lin_reg(feature_cols, Ridge(alpha=0.5))


y_pred_train:
[2.01150666 4.40540379 4.65618152 ... 4.89576575 2.91226296 2.43174216]

y_pred_test:
[2.89836614 2.89540912 1.45499906 ... 1.01738142 3.60995993 3.61834522]

Training Accuracy:
0.0

Test Accuracy:
0.0

y-intercept:
140.55453859368095

Coefficients in same order as passed:
[-1.02638745e-01 -1.44775152e-03 -1.13599653e-02  2.79680806e-03
 -1.47051070e-01  1.36670186e-01 -1.42819620e+00 -4.84198911e-01
  1.87427004e+00  1.44035394e-01  3.51450115e-01 -7.61923843e-01
 -8.27719356e-01  3.34310697e-01  3.67116358e-01  3.92730634e-01]

TESTING ERRORS:

MAE:
2.153522197712855

MSE:
14.387339987929774

RMSE:
3.7930647223491682

R^2 score:
0.14176165566264498



TRAINING ERRORS:

MAE:
2.094423632880394

MSE:
13.03853462788152

RMSE:
3.610891112714633

R^2 score:
0.1384108446958543


In [11]:
lin_reg(feature_cols, Lasso(alpha=0.01))


y_pred_train:
[1.96366579 4.32915353 4.53442533 ... 4.75893414 2.87133391 2.49536057]

y_pred_test:
[2.8816648  2.85299946 1.58612657 ... 1.11699    3.57057427 3.56647324]

Training Accuracy:
0.0

Test Accuracy:
0.0

y-intercept:
141.67598070897327

Coefficients in same order as passed:
[-1.02669143e-01 -1.46080017e-03 -9.81354610e-03  2.28278097e-03
 -1.06070251e-01  1.30076770e-01 -7.47911986e-01 -1.35412789e-01
  1.84359261e+00 -1.03473700e-01  0.00000000e+00 -9.52226069e-01
 -1.00187741e+00  0.00000000e+00  7.43028866e-04  2.68382370e-02]

TESTING ERRORS:

MAE:
2.1440083874060454

MSE:
14.416174831890872

RMSE:
3.7968638152942584

R^2 score:
0.14004159005210404



TRAINING ERRORS:

MAE:
2.085817529540428

MSE:
13.05428847955913

RMSE:
3.61307188961957

R^2 score:
0.13736982680948273


In [12]:
feature_cols = ['start_hour', 'zip_code', 'isHoliday', 'isRushHour', 'Friday',
       'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']
lin_reg(feature_cols, LinearRegression())


y_pred_train:
[1.9872484  3.64012672 3.92223096 ... 3.74968338 2.42545404 2.4239983 ]

y_pred_test:
[3.64012672 2.9046302  1.5160636  ... 0.77134423 3.84398934 3.84398934]

Training Accuracy:
0.0

Test Accuracy:
0.0

y-intercept:
157.99084247048418

Coefficients in same order as passed:
[-1.01931310e-01 -1.64302020e-03 -6.27698918e-01  1.82626054e+00
  1.74102015e-01  3.75948982e-01 -7.33832688e-01 -8.36483791e-01
  2.77489071e-01  3.87045725e-01  3.55730685e-01]

TESTING ERRORS:

MAE:
2.168058372992678

MSE:
14.626594181218982

RMSE:
3.824473059287904

R^2 score:
0.12748958571110658



TRAINING ERRORS:

MAE:
2.107060816048957

MSE:
13.245487632627322

RMSE:
3.6394350705332443

R^2 score:
0.1247353459043492


In [13]:
lin_reg(feature_cols, Ridge(alpha=0.5))


y_pred_train:
[1.98716446 3.64005215 3.92215456 ... 3.74959391 2.42548466 2.4240362 ]

y_pred_test:
[3.64005215 2.90455397 1.51623862 ... 0.77143648 3.84391648 3.84391648]

Training Accuracy:
0.0

Test Accuracy:
0.0

y-intercept:
157.99179989079525

Coefficients in same order as passed:
[-1.01932167e-01 -1.64302951e-03 -6.27368800e-01  1.82616049e+00
  1.74068427e-01  3.75880496e-01 -7.33729156e-01 -8.36346764e-01
  2.77449053e-01  3.86990813e-01  3.55687131e-01]

TESTING ERRORS:

MAE:
2.168046548016035

MSE:
14.62660447323683

RMSE:
3.8244744048348434

R^2 score:
0.12748897176826757



TRAINING ERRORS:

MAE:
2.107049437433307

MSE:
13.245487642836862

RMSE:
3.6394350719358712

R^2 score:
0.12473534522970064


In [19]:
# Model that includes all feature columns 
feature_cols = ['start_station_id', 'mean_temperature_f', 'mean_humidity', 'mean_visibility_miles',
       'mean_wind_speed_mph', 'precipitation_inches', 'isHoliday',
       'isRushHour', 'Hour 0', 'Hour 1', 'Hour 2', 'Hour 3', 'Hour 4',
       'Hour 5', 'Hour 6', 'Hour 7', 'Hour 8', 'Hour 9', 'Hour 10', 'Hour 11',
       'Hour 12', 'Hour 13', 'Hour 14', 'Hour 15', 'Hour 16', 'Hour 17',
       'Hour 18', 'Hour 19', 'Hour 20', 'Hour 21', 'Hour 22', 'Hour 23',
       'Station 3', 'Station 4', 'Station 5', 'Station 7', 'Station 8',
       'Station 14', 'Station 22', 'Station 27', 'Station 28', 'Station 30',
       'Station 32', 'Station 61', 'Station 64', 'Station 69', 'Station 70',
       'Station 83', 'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday',
       'Tuesday', 'Wednesday', '94041', '94063', '94107', '95113']
lin_reg(feature_cols, Ridge(alpha=0.01))


y_pred_train:
[2.04246341 6.04439473 5.53422092 ... 4.87165741 2.42120551 4.4035032 ]

y_pred_test:
[3.88868977 4.06254488 0.54804992 ... 0.163107   4.30191908 3.95205593]

Training Accuracy:
0.0

Test Accuracy:
0.0

y-intercept:
-2.146921528445793

Coefficients in same order as passed:
[ 0.00832156  0.0240252   0.00739351  0.02005259 -0.02345728 -0.81074495
 -0.68046839  2.01446175 -0.63342241  0.71596879 -0.09784847 -0.53330496
 -0.91881963 -0.11053088  0.26237931  0.61952377  3.2896694   1.40217026
 -1.55480506  0.11119414  0.28659692  0.21439898  0.08033796  0.2316776
 -0.81324749  0.36402286 -0.09574824 -1.19712377 -0.2812168  -0.45021514
 -0.38171641 -0.50994075  0.22462441 -0.19497227 -0.11407571 -0.0981686
 -0.20212967  0.45381256  0.17042515 -0.01375327  0.2165379  -0.5410643
 -0.19993532 -0.21239063 -0.84155625  0.67390901  2.01500948 -1.33627249
  0.15534563  0.43216106 -0.87677104 -0.82747424  0.30747369  0.42884054
  0.38042434 -0.53821499 -1.16584734  1.63497163  0.06909

In [20]:
lin_reg(feature_cols, Ridge(alpha=0.1))


y_pred_train:
[2.04246957 6.04436024 5.53418981 ... 4.87163913 2.4211825  4.40344733]

y_pred_test:
[3.88868711 4.06243765 0.54809097 ... 0.16315636 4.3019148  3.95205368]

Training Accuracy:
0.0

Test Accuracy:
0.0

y-intercept:
-2.1467770852565002

Coefficients in same order as passed:
[ 0.00832754  0.0240252   0.00739299  0.02005486 -0.02345659 -0.81059141
 -0.68040419  2.01419602 -0.633428    0.71480386 -0.09790795 -0.53141828
 -0.9162541  -0.11078988  0.26207699  0.61947281  3.28950928  1.40207843
 -1.55474988  0.11089978  0.28629237  0.21409631  0.08004321  0.23137553
 -0.81324519  0.3639871  -0.09576773 -1.1970888  -0.28147872 -0.45044749
 -0.38194748 -0.51011218  0.22463911 -0.19492359 -0.11402453 -0.09813505
 -0.20209539  0.45371807  0.17034121 -0.01376245  0.21651935 -0.54101141
 -0.19995353 -0.21240014 -0.8415639   0.67383282  2.01490032 -1.33608088
  0.15534183  0.43215047 -0.87674609 -0.827455    0.30746613  0.42882901
  0.38041366 -0.53820805 -1.16573967  1.6347691   0.0

In [21]:
lin_reg(feature_cols, Lasso(alpha=0.01))


y_pred_train:
[2.13695344 6.0093052  5.45326349 ... 4.73476311 2.20197871 4.30002867]

y_pred_test:
[3.87193461 3.58121709 0.7029242  ... 0.1974065  4.26723233 3.94052867]

Training Accuracy:
0.0

Test Accuracy:
0.0

y-intercept:
-1.4845880542931353

Coefficients in same order as passed:
[-0.00342803  0.02318571  0.00427678  0.0158583  -0.01928019 -0.20594272
 -0.30797248  1.84330592 -0.          0.         -0.         -0.
 -0.         -0.          0.          0.54806524  3.22051272  1.33953922
 -1.27347593  0.          0.02377171  0.         -0.          0.
 -0.61457537  0.33612465 -0.         -0.9156356  -0.08581201 -0.11655268
 -0.         -0.          0.         -0.          0.          0.
 -0.          0.1031781  -0.         -0.          0.         -0.12982748
 -0.          0.         -0.51456004  0.89224323  2.23795834 -0.
 -0.08819997  0.05118196 -1.04789891 -0.9845022   0.          0.05288033
  0.0122465  -0.14598457 -0.52461129  2.10612798  0.        ]

TESTING ERRORS:

MAE:
