In [30]:
# data normalisation with sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams ["figure.figsize"] = 10,6
import seaborn as sns
from scipy import stats
import math
import datetime
import urllib
import zipfile
import statsmodels.tsa.stattools as sts
from statsmodels.tsa.seasonal import seasonal_decompose
import plotly.express as px
import glob
ohe = OneHotEncoder
le = LabelEncoder()
from sklearn.compose import ColumnTransformer
le = LabelEncoder()

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression as lr

from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import cohen_kappa_score

In [2]:
bike_station = pd.read_csv('stations.csv')


In [3]:
bike_station.tail(5)

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
1226762,300,2011-12-31 23:41:19,2011-12-31 23:46:20,31201,15th & P St NW,31214,17th & Corcoran St NW,W01459,Member
1226763,387,2011-12-31 23:46:43,2011-12-31 23:53:10,31223,Convention Center / 7th & M St NW,31201,15th & P St NW,W01262,Member
1226764,261,2011-12-31 23:47:27,2011-12-31 23:51:49,31107,Lamont & Mt Pleasant NW,31602,Park Rd & Holmead Pl NW,W00998,Member
1226765,2060,2011-12-31 23:55:12,2012-01-01 00:29:33,31205,21st & I St NW,31222,New York Ave & 15th St NW,W00042,Member
1226766,468,2011-12-31 23:55:56,2012-01-01 00:03:45,31221,18th & M St NW,31111,10th & U St NW,W01319,Member


In [5]:
# Convert start date to datetime format
bike_station["Start date"] = bike_station["Start date"].astype("datetime64")

In [6]:
# Check to see if the type conversion worked.
bike_station.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1226767 entries, 0 to 1226766
Data columns (total 9 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   Duration              1226767 non-null  int64         
 1   Start date            1226767 non-null  datetime64[ns]
 2   End date              1226767 non-null  object        
 3   Start station number  1226767 non-null  int64         
 4   Start station         1226767 non-null  object        
 5   End station number    1226767 non-null  int64         
 6   End station           1226767 non-null  object        
 7   Bike number           1226767 non-null  object        
 8   Member type           1226767 non-null  object        
dtypes: datetime64[ns](1), int64(3), object(5)
memory usage: 84.2+ MB


In [7]:
# Here I am creating a new dataframe after first selecting all the rows with the "Casual" and then grouping them by Hour and then counting it using .count()
# renaming the column to "Casual"
casual = bike_station[bike_station['Member type'] == 'Casual'].groupby(pd.Grouper(key = 'Start date', freq = 'H')).count().rename(columns={'Member type': 'Casual'}).reset_index()
casual = casual[['Start date', "Casual"]]
casual.head()

Unnamed: 0,Start date,Casual
0,2011-01-01 00:00:00,3
1,2011-01-01 01:00:00,8
2,2011-01-01 02:00:00,5
3,2011-01-01 03:00:00,3
4,2011-01-01 04:00:00,0


In [8]:
# Here I am doing the same as above but for registered users. 
registered = bike_station[bike_station['Member type'] == 'Member'].groupby(pd.Grouper(key = 'Start date', freq = 'H')).count().rename(columns={'Member type': 'Registered'}).reset_index()
registered = registered[['Start date', "Registered"]]
registered.head()

Unnamed: 0,Start date,Registered
0,2011-01-01 00:00:00,13
1,2011-01-01 01:00:00,30
2,2011-01-01 02:00:00,26
3,2011-01-01 03:00:00,9
4,2011-01-01 04:00:00,1


In [9]:
# Now that we have our two dataframes, Registered and Casual, we need to join them into one df using merge.
df_user_count= pd.merge(registered,casual, how='outer', on = "Start date")
df_user_count.head()

Unnamed: 0,Start date,Registered,Casual
0,2011-01-01 00:00:00,13,3
1,2011-01-01 01:00:00,30,8
2,2011-01-01 02:00:00,26,5
3,2011-01-01 03:00:00,9,3
4,2011-01-01 04:00:00,1,0


In [10]:
# Now we create a total by adding the casual and registered user for each hour.
df_user_count = df_user_count.assign(Total = df_user_count['Registered'] + df_user_count['Casual'])
df_user_count.head()

Unnamed: 0,Start date,Registered,Casual,Total
0,2011-01-01 00:00:00,13,3,16
1,2011-01-01 01:00:00,30,8,38
2,2011-01-01 02:00:00,26,5,31
3,2011-01-01 03:00:00,9,3,12
4,2011-01-01 04:00:00,1,0,1


In [11]:
# # Now that we have our two dataframes, Registered and Casual, we need to join them into one df using merge.
# df_user_count= pd.merge(registered,casual, how='outer', on = "Start date")

# # But when we merge these two dataframes, Registered and Casual, we are merging them on the start date. 
# # So for instance at 1 am on 2011-01-01 there are 8 casual and 30 registered users.
# # But at 4 am on 2011-01-01 there is 0 
# df_user_count['Registered'] = df_user_count['Registered'].fillna(0)
# df_user_count['Casual'] = df_user_count['Casual'].fillna(0).astype(int)
# df_user_count = df_user_count.assign(Total = df_user_count['Registered'] + df_user_count['Casual'])

In [12]:
# Here I want to break up the start date to granular components that I may use for the machine learning model.

df_user_count['Year'] = df_user_count['Start date'].dt.year
df_user_count['Quarter'] = df_user_count['Start date'].dt.quarter
df_user_count['Month'] = df_user_count['Start date'].dt.month
df_user_count['Date'] = df_user_count['Start date'].dt.day
df_user_count['Day_of_Week'] = df_user_count['Start date'].dt.dayofweek
df_user_count['Hour'] = df_user_count['Start date'].dt.hour
df_user_count["Weekend"] = df_user_count["Date"] >= 5

In [13]:
# The weekend is boolean and shows as true or false. So that needs to be converted to integer
df_user_count.head()

Unnamed: 0,Start date,Registered,Casual,Total,Year,Quarter,Month,Date,Day_of_Week,Hour,Weekend
0,2011-01-01 00:00:00,13,3,16,2011,1,1,1,5,0,False
1,2011-01-01 01:00:00,30,8,38,2011,1,1,1,5,1,False
2,2011-01-01 02:00:00,26,5,31,2011,1,1,1,5,2,False
3,2011-01-01 03:00:00,9,3,12,2011,1,1,1,5,3,False
4,2011-01-01 04:00:00,1,0,1,2011,1,1,1,5,4,False


In [14]:
# Converting Weekend from True or False to 1 or 0

df_user_count["Weekend"] = df_user_count["Weekend"].astype(int)

In [15]:
# Now we can safely drop the Start Date column since we've extracted everything we can regarding time. 
df_user_count = df_user_count[["Registered", "Casual", "Total", "Year", "Quarter", "Month", "Date", "Day_of_Week", "Hour", "Weekend"]]
df_user_count.head()

Unnamed: 0,Registered,Casual,Total,Year,Quarter,Month,Date,Day_of_Week,Hour,Weekend
0,13,3,16,2011,1,1,1,5,0,0
1,30,8,38,2011,1,1,1,5,1,0
2,26,5,31,2011,1,1,1,5,2,0
3,9,3,12,2011,1,1,1,5,3,0
4,1,0,1,2011,1,1,1,5,4,0


# Now we can start with the Machine Learning model.

In [23]:
# split the data here
X_data = df_user_count[["Month", "Date","Day_of_Week"]]
y_data = df_user_count[["Total"]]

In [34]:
# train the linear regression model here.

X_train, X_test, y_train, y_test = train_test_split(X_data,y_data,test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test


(      Month  Date  Day_of_Week
 8415     12    17            5
 5049      7    30            5
 8395     12    16            4
 1535      3     5            5
 5518      8    18            3
 ...     ...   ...          ...
 5734      8    27            5
 5191      8     5            4
 5390      8    13            5
 860       2     5            5
 7270     10    30            6
 
 [7008 rows x 3 columns],
       Month  Date  Day_of_Week
 6056      9    10            5
 5556      8    20            5
 5990      9     7            2
 7674     11    16            2
 3319      5    19            3
 ...     ...   ...          ...
 8307     12    13            1
 100       1     5            2
 6605     10     3            0
 1783      3    16            2
 6013      9     8            3
 
 [1752 rows x 3 columns],
       Total
 8415    225
 5049    174
 8395    211
 1535     44
 5518    135
 ...     ...
 5734      0
 5191    266
 5390    253
 860      67
 7270     71
 
 [7008 rows x 1 co

In [25]:
# predict the results here

# data normalisation with sklearn
from sklearn.preprocessing import MinMaxScaler

# fit scaler on training data
norm = MinMaxScaler().fit(X_train)

# transform training data
X_train_norm = norm.transform(X_train)

# transform testing data
X_test_norm = norm.transform(X_test)


In [33]:
pred_lr = lr.predict(X_test)

TypeError: predict() missing 1 required positional argument: 'X'

In [31]:
lr.fit(X_train, y_train)

TypeError: fit() missing 1 required positional argument: 'y'

In [19]:
# evaluate the model here.



In [28]:
#display(x_data.head())
#display(y_data.head())

# Split Data 

In [None]:
#bike_station.isnull().any()

In [36]:
reg = lr().fit(X_train, y_train)

In [37]:
y_predicted = reg.predict(X_test)
y_predicted

array([[158.20515973],
       [149.50365125],
       [159.09407625],
       ...,
       [167.36419741],
       [113.41301476],
       [158.79777074]])

In [38]:
reg.score(y_predicted, y_test)



ValueError: X has 1 features, but LinearRegression is expecting 3 features as input.

In [39]:
display(y_predicted)
display(y_test.head())


array([[158.20515973],
       [149.50365125],
       [159.09407625],
       ...,
       [167.36419741],
       [113.41301476],
       [158.79777074]])

Unnamed: 0,Total
6056,119
5556,403
5990,24
7674,188
3319,254


In [40]:
reg.r2_score(y_test, y_predicted)

AttributeError: 'LinearRegression' object has no attribute 'r2_score'

In [41]:
reg.coef_

array([[ 7.42151056, -0.12799979, -0.16830572]])

In [42]:
reg.intercept_

array([93.53309117])

In [41]:
#reg.predict #(np.array([[3, 5]])) ex.