# Linear Predictive Analytics

## Preparation

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import datetime
#import plotly.express as px
#import plotly.graph_objects as go

In [3]:
Rides = pd.read_csv('/Users/jacquelinehuttebrauker/Desktop/chicago_2018.csv', sep=",")

Rides["start_time"] = pd.to_datetime(Rides["start_time"])
Rides["end_time"] = pd.to_datetime(Rides["end_time"])

Rides.sort_values("start_time", inplace = True)
Rides.head(8)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type
3215937,2018-01-01 00:12:00,2018-01-01 00:17:23,69,159,Damen Ave & Pierce Ave,Claremont Ave & Hirsch St,3304,Subscriber
3215938,2018-01-01 00:41:35,2018-01-01 00:47:52,253,325,Winthrop Ave & Lawrence Ave,Clark St & Winnemac Ave (Temp),5367,Subscriber
3215939,2018-01-01 00:44:46,2018-01-01 01:33:10,98,509,LaSalle St & Washington St,Troy St & North Ave,4599,Subscriber
3215940,2018-01-01 00:53:10,2018-01-01 01:05:37,125,364,Rush St & Hubbard St,Larrabee St & Oak St,2302,Subscriber
3215941,2018-01-01 00:53:37,2018-01-01 00:56:40,129,205,Blue Island Ave & 18th St,Paulina St & 18th St,3696,Subscriber
3215942,2018-01-01 00:56:15,2018-01-01 01:00:41,304,299,Broadway & Waveland Ave,Halsted St & Roscoe St,6298,Subscriber
3215943,2018-01-01 00:57:26,2018-01-01 01:02:40,164,174,Franklin St & Lake St,Canal St & Madison St,1169,Subscriber
3215944,2018-01-01 01:00:29,2018-01-01 01:13:43,182,142,Wells St & Elm St,McClurg Ct & Erie St,6351,Subscriber


The following cell is just for double checking the aggregated, hourly rental counts.

In [4]:
date1 = datetime.datetime(year=2018, month=1, day=1, hour = 5)
date2 = datetime.datetime(year=2018, month=1, day=1, hour = 6)

Rides[(Rides["start_time"] >= date1) & (Rides["start_time"] <= date2)]

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type
3215973,2018-01-01 05:32:22,2018-01-01 05:40:04,206,339,Halsted St & Archer Ave,Emerald Ave & 31st St,540,Subscriber
3215974,2018-01-01 05:46:51,2018-01-01 05:58:48,72,36,Wabash Ave & 16th St,Franklin St & Jackson Blvd,3775,Subscriber
3215975,2018-01-01 05:48:40,2018-01-01 05:53:01,303,296,Broadway & Cornelia Ave,Broadway & Belmont Ave,4784,Subscriber


Computing the hourly demand for 2018. The result is stored in a DataFrame of 24 by 365 = 8760 rows, neglecting time change

In [5]:
date_index = date1 + pd.to_timedelta(np.arange(8760), 'H')

Features = pd.DataFrame(index = date_index)

Features["demand"] = 0

Features = Rides.set_index("start_time")
Features = Features.resample('H').count()

Features.drop(columns = "start_station_id", inplace = True)
Features.drop(columns = "end_station_id", inplace = True)
Features.drop(columns = "start_station_name", inplace = True)
Features.drop(columns = "end_station_name", inplace = True)
Features.drop(columns = "bike_id", inplace = True)
Features.drop(columns = "user_type", inplace = True)

Features.rename(columns = {"end_time": "Rides"}, inplace = True)

The resulting df can be double checked with one of the cells above or with the sorted Rides df. It seems reasonable though:

In [6]:
Features.describe()

Unnamed: 0,Rides
count,8760.0
mean,411.310731
std,479.667282
min,0.0
25%,55.0
50%,225.0
75%,611.25
max,2829.0


In [7]:
Features.head()

Unnamed: 0_level_0,Rides
start_time,Unnamed: 1_level_1
2018-01-01 00:00:00,7
2018-01-01 01:00:00,15
2018-01-01 02:00:00,10
2018-01-01 03:00:00,2
2018-01-01 04:00:00,2


In [8]:
x = Features.iloc[0]["Rides"]
x

7

In [9]:
Features["Rides_last_hour"] = 0
value = Features.iloc[0]["Rides"]
for i in Features.index:     
        Features.loc[i,"Rides_last_hour"] = value
        value = Features.loc[i]["Rides"]

There seems to be some erroneous data in the weather data set as there are rows which exhibit the same date, leading pandas to crash. (e.g. for index 1662, if duplicates were not removed.)

In [10]:
Weather = pd.read_csv('/Users/jacquelinehuttebrauker/Desktop/weather_hourly_chicago.csv', sep=",")

Weather["avg_tmp"] = (Weather["max_temp"]+Weather["min_temp"])/2
Weather["is_raining"] = Weather["precip"] == 1
Weather.drop(columns = ["max_temp", "min_temp", "precip"], inplace=True)

Weather["date_time"] = pd.to_datetime(Weather["date_time"])

Weather.set_index("date_time", inplace = True)
Features = Features.join(Weather, on="start_time")

Lots of missing values for weather, imputation methods:

* numerical values: linear interpolation (time series data)
* categorical (is_raining): backwards-fill

In [11]:
Features.interpolate(inplace=True)
Features.fillna(method="bfill",inplace=True)

In [12]:
Features[Features["is_raining"].isnull()]

Unnamed: 0_level_0,Rides,Rides_last_hour,avg_tmp,is_raining
start_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [13]:
len(Features[Features["is_raining"] == True])

1023

In [14]:
Features.describe()

Unnamed: 0,Rides,Rides_last_hour,avg_tmp
count,9361.0,9361.0,9361.0
mean,405.986754,405.733255,10.460314
std,474.935071,475.283195,11.81407
min,0.0,0.0,-22.8
25%,54.0,54.0,1.1
50%,222.0,221.0,9.4
75%,605.0,603.0,21.1
max,2829.0,2829.0,36.1


In [15]:
Features.reset_index(inplace=True)

In [16]:
Features["is_workday"] = Features["start_time"].apply(lambda x: x.weekday() < 5)
Features["hour"] = Features["start_time"].apply(lambda x: x.hour)
Features["month"] = Features["start_time"].apply(lambda x: x.month)

In [17]:
def workday(d):
    
    if d == True:
        return 1
    elif d == False:
        return 0
    
Features["is_workday"] = Features["is_workday"].apply(lambda d: workday(d))

In [18]:
def rain(r):
    
    if r == True:
        return 1
    elif r == False:
        return 0
    
Features["is_raining"] = Features["is_raining"].apply(lambda r: rain(r))

In [19]:
def getSeason(month):
    
    Winter = [12, 1, 2]
    Spring = [3, 4, 5]
    Summer = [6, 7, 8]
    Fall = [9, 10, 11]
    
    if month in Winter:
        return 1
    elif month in Spring:
        return 2
    elif month in Summer:
        return 3
    elif month in Fall:
        return 4
    
Features["season"] = Features["month"].apply(lambda month: getSeason(month))

In [20]:
seasons = pd.get_dummies(Features["season"],prefix="season_")
seasons.drop(columns="season__4", inplace=True)

In [21]:
Features[list(seasons.columns)] = seasons

In [22]:
hours = pd.get_dummies(Features["hour"],prefix="hour_")
hours.drop(columns="hour__23", inplace=True)

In [23]:
Features[list(hours.columns)] = hours

In [24]:
Features.drop(columns=["season","month","hour"], inplace=True)

In [25]:
Features

Unnamed: 0,start_time,Rides,Rides_last_hour,avg_tmp,is_raining,is_workday,season__1,season__2,season__3,hour__0,...,hour__13,hour__14,hour__15,hour__16,hour__17,hour__18,hour__19,hour__20,hour__21,hour__22
0,2018-01-01 00:00:00,7,7,-13.90,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2018-01-01 01:00:00,15,7,-15.00,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2018-01-01 02:00:00,10,15,-15.60,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2018-01-01 03:00:00,2,10,-16.70,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2018-01-01 04:00:00,2,2,-17.20,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9356,2018-12-31 19:00:00,41,45,3.30,1,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9357,2018-12-31 20:00:00,28,41,3.60,1,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
9358,2018-12-31 21:00:00,22,28,3.45,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9359,2018-12-31 22:00:00,25,22,3.30,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


Re-Scaling the data

In [26]:
Features_rescaled = pd.DataFrame()

Features_rescaled["Rides"] = Features["Rides"]
Features_rescaled["Rides_last_hour"] = (Features["Rides_last_hour"] - Features["Rides_last_hour"].min()) / (Features["Rides_last_hour"].max() - Features["Rides_last_hour"].min())
Features_rescaled["avg_tmp"] = (Features["avg_tmp"] - Features["avg_tmp"].min()) / (Features["avg_tmp"].max() - Features["avg_tmp"].min())
Features_rescaled['is_raining'] = Features['is_raining']
Features_rescaled['is_workday'] = Features['is_workday']
Features_rescaled['hour__0'] = Features['hour__0']
Features_rescaled['hour__1'] = Features['hour__1']
Features_rescaled['hour__2'] = Features['hour__2']
Features_rescaled['hour__3'] = Features['hour__3']
Features_rescaled['hour__4'] = Features['hour__4']
Features_rescaled['hour__5'] = Features['hour__5']
Features_rescaled['hour__6'] = Features['hour__6']
Features_rescaled['hour__7'] = Features['hour__7']
Features_rescaled['hour__8'] = Features['hour__8']
Features_rescaled['hour__9'] = Features['hour__9']
Features_rescaled['hour__10'] = Features['hour__10']
Features_rescaled['hour__11'] = Features['hour__11']
Features_rescaled['hour__12'] = Features['hour__12']
Features_rescaled['hour__13'] = Features['hour__13']
Features_rescaled['hour__14'] = Features['hour__14']
Features_rescaled['hour__15'] = Features['hour__15']
Features_rescaled['hour__16'] = Features['hour__16']
Features_rescaled['hour__17'] = Features['hour__17']
Features_rescaled['hour__18'] = Features['hour__18']
Features_rescaled['hour__19'] = Features['hour__19']
Features_rescaled['hour__20'] = Features['hour__20']
Features_rescaled['hour__21'] = Features['hour__21']
Features_rescaled['hour__22'] = Features['hour__22']
Features_rescaled['season__1'] = Features['season__1']
Features_rescaled['season__2'] = Features['season__2']
Features_rescaled['season__3'] = Features['season__3']

Features_rescaled.head()

Unnamed: 0,Rides,Rides_last_hour,avg_tmp,is_raining,is_workday,hour__0,hour__1,hour__2,hour__3,hour__4,...,hour__16,hour__17,hour__18,hour__19,hour__20,hour__21,hour__22,season__1,season__2,season__3
0,7,0.002474,0.151104,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,15,0.002474,0.132428,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,10,0.005302,0.122241,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,2,0.003535,0.103565,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,2,0.000707,0.095076,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [None]:
Features.cov()

In [None]:
Features_corr = Features.corr()
Features_corr

[Source](https://stackoverflow.com/questions/39409866/correlation-heatmap) for the following code:

In [None]:
Features_rescaled.var()

# Modeling

In [28]:
from sklearn.model_selection import train_test_split

In [37]:
# split the data with 70-30% split as above

X = Features_rescaled
y = Features["Rides"]
X,y
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)
X

Unnamed: 0,Rides,Rides_last_hour,avg_tmp,is_raining,is_workday,hour__0,hour__1,hour__2,hour__3,hour__4,...,hour__16,hour__17,hour__18,hour__19,hour__20,hour__21,hour__22,season__1,season__2,season__3
0,7,0.002474,0.151104,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,15,0.002474,0.132428,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,10,0.005302,0.122241,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,2,0.003535,0.103565,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,2,0.000707,0.095076,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9356,41,0.015907,0.443124,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
9357,28,0.014493,0.448217,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
9358,22,0.009897,0.445671,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
9359,25,0.007777,0.443124,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0


In [38]:
x_train, x_test, y_train, y_test

(      Rides  Rides_last_hour   avg_tmp  is_raining  is_workday  hour__0  \
 5862   1265         0.496642  0.830221           0           0        0   
 4995     47         0.021562  0.801358           0           0        0   
 2595      5         0.002474  0.519525           1           1        0   
 6232    184         0.080594  0.877759           0           0        1   
 1030     18         0.010604  0.331070           0           1        0   
 ...     ...              ...       ...         ...         ...      ...   
 5734   1787         0.355603  0.867572           0           1        0   
 5191    944         0.413574  0.769100           0           0        0   
 5390    256         0.137858  0.840407           0           1        0   
 860      94         0.018735  0.292020           0           0        0   
 7270    363         0.117002  0.461800           0           1        0   
 
       hour__1  hour__2  hour__3  hour__4  ...  hour__16  hour__17  hour__18  \
 5862 

fangt hier an...

Feature is_raining & is_workday should be 0 or 1 --> all values needs to be between 0 and 1
Feature season --> which seperation is the best? (spring/summer/autumn/winter or winter/non winter)
Feature hour -> maybe us of 1 to 23 and rescale would be better 


In [None]:
#required modules

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import datetime

### linear regression

In [68]:
# define X and y vectors
#target
a = y_train

#features: rides last hour, Temperature, is_raining, Workday
b = x_train[["Rides_last_hour", "avg_tmp", "is_raining", "is_workday"]]

# make predictions (models have already been fitted above!)
lin_mod = LinearRegression()
lin_mod.fit(b, a)

v = x_test[["Rides_last_hour", "avg_tmp", "is_raining", "is_workday"]]
y_pred = lin_mod.predict(v)
print(lin_mod.coef_, lin_mod.intercept_)

# report test metrics

print("MAE:", mean_absolute_error(y_pred,y_test))
print("RMSE:", mean_squared_error(y_pred,y_test)**0.5)
print("R2:", r2_score(y_test,y_pred))

[2305.9429653   127.16371779   -8.47542446   14.82864265] -7.8383894772289295
MAE: 161.04890145145026
RMSE: 247.60036327723626
R2: 0.7382763584675394


In [None]:
plt.figure(figsize = (8,6))
plt.scatter(x_train, y_train, marker='x', color="C0")
plt.scatter(x_test, y_test, marker='x', color="C2")
plt.xlabel("Temperature (°C)")
plt.ylabel("Demand (GW)")
plt.legend(['Training set', 'Holdout set'])
plt.show()

In [65]:
# define X and y vectors
#target
a = y_train

#features: inkluding rides last hour, Temperature, is_raining, Workday, season
c = x_train[["Rides_last_hour", "avg_tmp", "is_raining", "is_workday","season__2","season__2","season__3"]]

# make predictions (models have already been fitted above!)
lin_mod = LinearRegression()
lin_mod.fit(c, a)

v = x_test[["Rides_last_hour", "avg_tmp", "is_raining", "is_workday","season__2","season__2","season__3"]]
y_pred = lin_mod.predict(v)
print(lin_mod.coef_, lin_mod.intercept_)

# report test metrics

print("MAE:", mean_absolute_error(y_pred,y_test))
print("RMSE:", mean_squared_error(y_pred,y_test)**0.5)
print("R2:", r2_score(y_test,y_pred))

[ 2.30028825e+03  7.01481068e+01 -7.94736207e+00  1.56218485e+01
  1.67946563e+00  1.67946563e+00  4.39972006e+01] 12.760069191768878
MAE: 160.18501239019142
RMSE: 247.08636992241543
R2: 0.739361854315987


In [64]:
# define X and y vectors
#target
a = y_train

#features: inkluding rides last hour, Temperature, is_raining, Workday, hour
e = x_train[["Rides_last_hour", "avg_tmp", "is_raining", "is_workday","hour__0","hour__1","hour__2","hour__3","hour__4","hour__5","hour__6","hour__7","hour__8","hour__9","hour__10","hour__11","hour__0","hour__1","hour__12","hour__13","hour__14","hour__15", "hour__16","hour__17","hour__18","hour__19","hour__20","hour__21","hour__22"]]

# make predictions (models have already been fitted above!)
lin_mod = LinearRegression()
lin_mod.fit(e, a)

v = x_test[["Rides_last_hour", "avg_tmp", "is_raining", "is_workday","hour__0","hour__1","hour__2","hour__3","hour__4","hour__5","hour__6","hour__7","hour__8","hour__9","hour__10","hour__11","hour__0","hour__1","hour__12","hour__13","hour__14","hour__15", "hour__16","hour__17","hour__18","hour__19","hour__20","hour__21","hour__22"]]
y_pred = lin_mod.predict(v)
print(lin_mod.coef_, lin_mod.intercept_)

# report test metrics

print("MAE:", mean_absolute_error(y_pred,y_test))
print("RMSE:", mean_squared_error(y_pred,y_test)**0.5)
print("R2:", r2_score(y_test,y_pred))

[ 2.23888378e+03  2.07626821e+02 -1.31109026e+01  1.49526886e+01
  4.66652256e+14 -2.20593119e+14  3.38360323e+01  3.84565418e+01
  4.80393977e+01  1.18272171e+02  2.67202726e+02  4.46447572e+02
  3.36182144e+02 -1.56987853e+02  7.90615104e+01  2.20431835e+02
 -4.66652256e+14  2.20593119e+14  2.15250082e+02  1.48506276e+02
  1.45358076e+02  2.56738858e+02  4.76318851e+02  4.84067354e+02
 -1.34147589e+02 -7.08804045e+01 -3.85261467e+01  6.35931494e+00
  3.46637161e+00] -165.28306122684597
MAE: 114.24411619966307
RMSE: 170.93906633285263
R2: 0.8752550267443393


In [63]:
# define X and y vectors
#target
y_train

#features
x_train

# make predictions (models have already been fitted above!)
lin_mod = LinearRegression()
lin_mod.fit(x_train, y_train)

v = x_test
y_pred = lin_mod.predict(v)
print(lin_mod.coef_, lin_mod.intercept_)

# report test metrics

print("MAE:", mean_absolute_error(y_pred,y_test))
print("RMSE:", mean_squared_error(y_pred,y_test)**0.5)
print("R2:", r2_score(y_test,y_pred))

[ 1.00000000e+00 -1.21500182e-14 -3.82474067e-14 -1.88156413e-14
  6.06004007e-15 -1.16908070e-14  1.56624320e-14  2.48827732e-14
 -1.19117519e-15 -5.94978096e-15 -8.44934126e-15 -1.62928423e-14
 -2.27152931e-14 -1.60613267e-14  1.27856118e-14 -2.30183167e-15
 -1.11579327e-14 -1.01214529e-14 -4.54415023e-15 -6.00814763e-15
 -1.21779446e-14 -2.00192486e-14 -1.29801316e-13  1.74876993e-14
  6.33163154e-15  3.82513833e-15 -1.67771634e-15 -5.51142620e-16
 -9.83620264e-16 -8.48684830e-16 -1.66205929e-14] 5.115907697472721e-13
MAE: 4.524289790939133e-13
RMSE: 6.096190732950222e-13
R2: 1.0


In [62]:
# define X and y vectors
#target
a = y_train

#features: Season, hour, workday --> all time related features
f = x_train[['is_workday',"season__2","season__2","season__3","hour__0","hour__1","hour__2","hour__3","hour__4","hour__5","hour__6","hour__7","hour__8","hour__9","hour__10","hour__11","hour__0","hour__1","hour__12","hour__13","hour__14","hour__15", "hour__16","hour__17","hour__18","hour__19","hour__20","hour__21","hour__22"]]

# make predictions (models have already been fitted above!)
lin_mod = LinearRegression()
lin_mod.fit(f, a)

v = x_test[['is_workday',"season__2","season__2","season__3","hour__0","hour__1","hour__2","hour__3","hour__4","hour__5","hour__6","hour__7","hour__8","hour__9","hour__10","hour__11","hour__0","hour__1","hour__12","hour__13","hour__14","hour__15", "hour__16","hour__17","hour__18","hour__19","hour__20","hour__21","hour__22"]]
y_pred = lin_mod.predict(v)
print(lin_mod.coef_, lin_mod.intercept_)

# report test metrics

print("MAE:", mean_absolute_error(y_test,y_pred))
print("RMSE:", mean_squared_error(y_test,y_pred)**0.5)
print("R2:", r2_score(y_test,y_pred))

[ 9.51850014e+01  7.52417258e+13 -7.52417258e+13  3.88259609e+02
  1.34157500e+15  9.22370236e+14 -8.74597445e+01 -8.48911421e+01
 -9.99130884e+01 -1.83686726e+01  1.85100263e+02  5.16355999e+02
  6.79860803e+02  3.22960951e+02  2.68877336e+02  3.53077292e+02
 -1.34157500e+15 -9.22370236e+14  4.25046986e+02  4.06062817e+02
  4.10042793e+02  5.12004789e+02  8.29818880e+02  1.08790608e+03
  6.97522317e+02  4.13954738e+02  2.40927369e+02  1.36703363e+02
  5.80457961e+01] -69.15153742807553
MAE: 224.9483103859258
RMSE: 318.35484737726904
R2: 0.5673234749210571


In [60]:
# define X and y vectors
#target
a = y_train

#features: inkluding rides last hour, Temperature, Weather, Workday, season
g = x_train[["avg_tmp", "is_raining", "is_workday","season__2","season__2","season__3"]]

# make predictions (models have already been fitted above!)
lin_mod = LinearRegression()
lin_mod.fit(g, a)

v = x_test[["avg_tmp", "is_raining", "is_workday","season__2","season__2","season__3"]]
y_pred = lin_mod.predict(v)
print(lin_mod.coef_, lin_mod.intercept_)

# report test metrics

print("MAE:", mean_absolute_error(y_test,y_pred))
print("RMSE:", mean_squared_error(y_test,y_pred)**0.5)
print("R2:", r2_score(y_test,y_pred))

[ 877.43857379 -102.46038139   90.43521574   -4.2831514    -4.2831514
   99.05203435] -169.0147541844861
MAE: 307.30690106869156
RMSE: 427.7233640812781
R2: 0.2189724615179156


In [69]:
# define X and y vectors
#target
a = y_train

#features: all features without rides_last_hour
h = x_train[["avg_tmp", "is_raining", "is_workday","season__2","season__2","season__3","hour__0","hour__1","hour__2","hour__3","hour__4","hour__5","hour__6","hour__7","hour__8","hour__9","hour__10","hour__11","hour__0","hour__1","hour__12","hour__13","hour__14","hour__15", "hour__16","hour__17","hour__18","hour__19","hour__20","hour__21","hour__22"]]

# make predictions (models have already been fitted above!)
lin_mod = LinearRegression()
lin_mod.fit(h, a)

v = x_test[["avg_tmp", "is_raining", "is_workday","season__2","season__2","season__3","hour__0","hour__1","hour__2","hour__3","hour__4","hour__5","hour__6","hour__7","hour__8","hour__9","hour__10","hour__11","hour__0","hour__1","hour__12","hour__13","hour__14","hour__15", "hour__16","hour__17","hour__18","hour__19","hour__20","hour__21","hour__22"]]
y_pred = lin_mod.predict(v)
print(lin_mod.coef_, lin_mod.intercept_)

# report test metrics

print("MAE:", mean_absolute_error(y_test,y_pred))
print("RMSE:", mean_squared_error(y_test,y_pred)**0.5)
print("R2:", r2_score(y_test,y_pred))

[ 9.22321148e+02 -9.62409894e+01  8.73448711e+01  1.39760572e+15
 -1.39760572e+15  8.80440923e+01 -6.90265405e+13  1.03122028e+14
 -4.16732538e+01 -4.09944388e+01 -4.45076936e+01  4.20074694e+01
  2.42807939e+02  5.90057018e+02  7.48323149e+02  3.92086474e+02
  3.39827847e+02  4.33971710e+02  6.90265405e+13 -1.03122028e+14
  4.92386101e+02  4.77252679e+02  4.55988721e+02  5.49850571e+02
  8.55123089e+02  1.09372846e+03  6.93940290e+02  4.00629744e+02
  2.36716937e+02  1.24152574e+02  5.14278691e+01] -519.636216927225
MAE: 205.10049801049195
RMSE: 283.9656279868107
R2: 0.6557515851366675


In [76]:
# define X and y vectors
#target
a = y_train

#features: Temperature, Weather, Workday
i = x_train[["avg_tmp", "is_raining", "is_workday"]]

# make predictions (models have already been fitted above!)
lin_mod = LinearRegression()
lin_mod.fit(i, a)

v = x_test[["avg_tmp", "is_raining", "is_workday"]]
y_pred = lin_mod.predict(v)
print(lin_mod.coef_, lin_mod.intercept_)

# report test metrics

print("MAE:", mean_absolute_error(y_test,y_pred))
print("RMSE:", mean_squared_error(y_test,y_pred)**0.5)
print("R2:", r2_score(y_test,y_pred))

[1013.68827104 -103.61128681   88.87436286] -222.2563771896493
MAE: 310.8391253023433
RMSE: 429.9283677285106
R2: 0.21089898325593015


In [78]:
#target
a = y_train

#features: Temperature, Weather, Workday, Season
j = x_train[["avg_tmp", "is_raining", "is_workday","season__2","season__2","season__3"]]

# make predictions (models have already been fitted above!)
lin_mod = LinearRegression()
lin_mod.fit(j, a)

v = x_test[["avg_tmp", "is_raining", "is_workday","season__2","season__2","season__3"]]
y_pred = lin_mod.predict(v)
print(lin_mod.coef_, lin_mod.intercept_)

# report test metrics

print("MAE:", mean_absolute_error(y_test,y_pred))
print("RMSE:", mean_squared_error(y_test,y_pred)**0.5)
print("R2:", r2_score(y_test,y_pred))

[ 877.43857379 -102.46038139   90.43521574   -4.2831514    -4.2831514
   99.05203435] -169.0147541844861
MAE: 307.30690106869156
RMSE: 427.7233640812781
R2: 0.2189724615179156


In [79]:
#target
a = y_train

#features: Temperature, is_weather, Workday, hour
k = x_train[["avg_tmp", "is_raining", "is_workday","hour__0","hour__1","hour__2","hour__3","hour__4","hour__5","hour__6","hour__7","hour__8","hour__9","hour__10","hour__11","hour__0","hour__1","hour__12","hour__13","hour__14","hour__15", "hour__16","hour__17","hour__18","hour__19","hour__20","hour__21","hour__22"]]

# make predictions (models have already been fitted above!)
lin_mod = LinearRegression()
lin_mod.fit(k, a)

v = x_test[["avg_tmp", "is_raining", "is_workday","hour__0","hour__1","hour__2","hour__3","hour__4","hour__5","hour__6","hour__7","hour__8","hour__9","hour__10","hour__11","hour__0","hour__1","hour__12","hour__13","hour__14","hour__15", "hour__16","hour__17","hour__18","hour__19","hour__20","hour__21","hour__22"]]
y_pred = lin_mod.predict(v)
print(lin_mod.coef_, lin_mod.intercept_)

# report test metrics

print("MAE:", mean_absolute_error(y_test,y_pred))
print("RMSE:", mean_squared_error(y_test,y_pred)**0.5)
print("R2:", r2_score(y_test,y_pred))

[ 1.04858996e+03 -9.68977602e+01  8.64376232e+01 -7.83023752e+14
 -4.50228652e+14 -3.64393531e+01 -3.71373642e+01 -3.62312983e+01
  4.94362333e+01  2.49610502e+02  5.99412870e+02  7.57629749e+02
  3.99137247e+02  3.48787475e+02  4.42661817e+02  7.83023752e+14
  4.50228652e+14  4.99593634e+02  4.86032711e+02  4.60629046e+02
  5.52128596e+02  8.56826397e+02  1.09275189e+03  6.93153115e+02
  3.97150359e+02  2.34771106e+02  1.21174605e+02  5.02816760e+01] -576.5983054768978
MAE: 206.82287676122942
RMSE: 286.02256649621984
R2: 0.6507463144536985


In [80]:
a = y_train

#features: Temperature, is_raining, hour
l = x_train[["avg_tmp", "is_raining","hour__0","hour__1","hour__2","hour__3","hour__4","hour__5","hour__6","hour__7","hour__8","hour__9","hour__10","hour__11","hour__0","hour__1","hour__12","hour__13","hour__14","hour__15", "hour__16","hour__17","hour__18","hour__19","hour__20","hour__21","hour__22"]]

# make predictions (models have already been fitted above!)
lin_mod = LinearRegression()
lin_mod.fit(l, a)

v = x_test[["avg_tmp", "is_raining","hour__0","hour__1","hour__2","hour__3","hour__4","hour__5","hour__6","hour__7","hour__8","hour__9","hour__10","hour__11","hour__0","hour__1","hour__12","hour__13","hour__14","hour__15", "hour__16","hour__17","hour__18","hour__19","hour__20","hour__21","hour__22"]]
y_pred = lin_mod.predict(v)
print(lin_mod.coef_, lin_mod.intercept_)

# report test metrics

print("MAE:", mean_absolute_error(y_test,y_pred))
print("RMSE:", mean_squared_error(y_test,y_pred)**0.5)
print("R2:", r2_score(y_test,y_pred))

[ 1.05206832e+03 -9.54326137e+01  1.11189339e+15  3.44683224e+14
 -3.56230097e+01 -3.91483316e+01 -3.37956661e+01  5.11673254e+01
  2.50686895e+02  5.99776239e+02  7.57867597e+02  4.00646906e+02
  3.48127235e+02  4.43824684e+02 -1.11189339e+15 -3.44683224e+14
  5.02060836e+02  4.87196188e+02  4.62115818e+02  5.54697300e+02
  8.57324551e+02  1.09463280e+03  6.93340572e+02  3.98152495e+02
  2.35285352e+02  1.22165480e+02  5.08728541e+01] -517.6850187182663
MAE: 207.40850188202185
RMSE: 289.4484123509762
R2: 0.6423298121967642


Assumptions:
    
    - last hour not useful for a linear regression 
    - hour not that useful like the other features
    - best regression model with season, temperature, is_raining and is_workday