# Dublin Bikes Linear Regression Modelling

### The CSV files referenced here are available to download [here.](https://drive.google.com/drive/folders/1cVK3-9skev3Xg-FhSA6Yl8HMqaucgfTz?usp=sharing)

## Merge Weather and Bikes Availability Data Frames

In [1]:
#Import package pandas for data analysis
import pandas as pd

#Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt

#For showing plots directly in the notebook run the command below
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

import pickle

In [2]:
# Import weather and bike_availability csvs
weather_df = pd.read_csv('weather.csv')
bike_df = pd.read_csv('dublin_bikes_availablity.csv')

In [3]:
#This function is used repeatedly to compute all metrics
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print('\n==============================================================================')
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    #print("MSE: ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE: ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

## Changing data types

### Weather

In [4]:
# Select columns containing categorical data
weather_categorical_columns = weather_df[['weather_description', 'weather_icon', 'base', 'sys_country']].columns

# Convert data type to category for these columns
for column in weather_categorical_columns:
    weather_df[column] = weather_df[column].astype('category')

In [5]:
weather_datetime_columns = weather_df[['created_at', 'dt', 'sys_sunset', 'sys_sunrise']].columns

# Convert data type to datetime for these columns

for column in weather_datetime_columns:
    weather_df[column] = weather_df[column].astype('datetime64[ns]')

In [6]:
weather_df.dtypes

id                              int64
coord_lon                     float64
coord_lat                     float64
weather_id                      int64
weather_main                   object
weather_description          category
weather_icon                 category
base                         category
main_temp                     float64
main_pressure                   int64
main_humidity                   int64
main_temp_min                 float64
main_temp_max                 float64
visibility                      int64
wind_speed                    float64
wind_deg                        int64
clouds_all                      int64
dt                     datetime64[ns]
sys_type                        int64
sys_id                          int64
sys_message                   float64
sys_country                  category
sys_sunrise            datetime64[ns]
sys_sunset             datetime64[ns]
city_id                         int64
city_name                      object
cod         

### Bike Availability

In [7]:
# changing the feature status to datatype categorical 

bike_df['status'] = bike_df['status'].astype('category')

In [8]:
# changing features create_at and last_updated to datatype datetime64

bike_df['created_at'] = pd.to_datetime(bike_df.created_at)
bike_df['last_update'] = pd.to_datetime(bike_df.last_update)

# Let view our updated datatypes!

bike_df.dtypes

id                                int64
number                            int64
bike_stands                       int64
available_bike_stands             int64
available_bikes                   int64
status                         category
last_update              datetime64[ns]
created_at               datetime64[ns]
dtype: object

In [9]:
# Let's create a new df with only the features we want from weather, 'weather_main', 'main_temp', 'main_pressure', 'main_humidity' and 'wind_speed'
weather_df.head()
slim_weather_df = weather_df[['main_temp', 'main_pressure', 'main_humidity', 'wind_speed', 'dt']]

In [10]:
slim_weather_df.head()

Unnamed: 0,main_temp,main_pressure,main_humidity,wind_speed,dt
0,284.58,1011,81,4.6,2019-02-20 18:30:00
1,284.57,1011,81,4.6,2019-02-20 18:30:00
2,284.57,1011,81,4.6,2019-02-20 18:30:00
3,284.57,1011,81,4.6,2019-02-20 18:30:00
4,284.15,1011,87,6.2,2019-02-20 19:00:00


In [11]:
bike_df.head()

Unnamed: 0,id,number,bike_stands,available_bike_stands,available_bikes,status,last_update,created_at
0,1,42,30,0,30,OPEN,2019-02-20 19:08:32,2019-02-20 19:10:01
1,2,30,20,17,3,OPEN,2019-02-20 19:02:00,2019-02-20 19:10:01
2,3,54,33,33,0,OPEN,2019-02-20 19:02:15,2019-02-20 19:10:01
3,4,108,40,26,13,OPEN,2019-02-20 19:01:08,2019-02-20 19:10:01
4,5,56,40,34,6,OPEN,2019-02-20 19:08:18,2019-02-20 19:10:01


### Using merge_asof to match the nearest key values

In [12]:
bike_df.sort_values('last_update', inplace=True)
slim_weather_df.sort_values('dt', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [13]:
slim_weather_df.head()

Unnamed: 0,main_temp,main_pressure,main_humidity,wind_speed,dt
0,284.58,1011,81,4.6,2019-02-20 18:30:00
1,284.57,1011,81,4.6,2019-02-20 18:30:00
2,284.57,1011,81,4.6,2019-02-20 18:30:00
3,284.57,1011,81,4.6,2019-02-20 18:30:00
10,284.15,1011,87,6.2,2019-02-20 19:00:00


In [14]:
bike_df.head()

Unnamed: 0,id,number,bike_stands,available_bike_stands,available_bikes,status,last_update,created_at
70,71,94,40,0,40,OPEN,2019-02-20 18:59:04,2019-02-20 19:10:01
21,22,87,38,21,17,OPEN,2019-02-20 18:59:34,2019-02-20 19:10:01
52,53,104,40,40,0,OPEN,2019-02-20 18:59:40,2019-02-20 19:10:01
40,41,77,29,25,4,OPEN,2019-02-20 19:00:15,2019-02-20 19:10:01
68,69,59,20,17,3,OPEN,2019-02-20 19:00:29,2019-02-20 19:10:01


In [15]:
merged_dataframe = pd.merge_asof(bike_df, slim_weather_df, left_on='last_update', right_on='dt')

In [16]:
# Compare size of dataframes
merged_dataframe.shape

(1019825, 13)

In [17]:
bike_df.shape

(1019825, 8)

In [18]:
merged_dataframe.head()

Unnamed: 0,id,number,bike_stands,available_bike_stands,available_bikes,status,last_update,created_at,main_temp,main_pressure,main_humidity,wind_speed,dt
0,71,94,40,0,40,OPEN,2019-02-20 18:59:04,2019-02-20 19:10:01,284.57,1011,81,4.6,2019-02-20 18:30:00
1,22,87,38,21,17,OPEN,2019-02-20 18:59:34,2019-02-20 19:10:01,284.57,1011,81,4.6,2019-02-20 18:30:00
2,53,104,40,40,0,OPEN,2019-02-20 18:59:40,2019-02-20 19:10:01,284.57,1011,81,4.6,2019-02-20 18:30:00
3,41,77,29,25,4,OPEN,2019-02-20 19:00:15,2019-02-20 19:10:01,284.15,1011,87,6.2,2019-02-20 19:00:00
4,69,59,20,17,3,OPEN,2019-02-20 19:00:29,2019-02-20 19:10:01,284.15,1011,87,6.2,2019-02-20 19:00:00


In [19]:
df = merged_dataframe.drop(['id', 'created_at', 'dt'], axis=1)

In [20]:
df.head()

Unnamed: 0,number,bike_stands,available_bike_stands,available_bikes,status,last_update,main_temp,main_pressure,main_humidity,wind_speed
0,94,40,0,40,OPEN,2019-02-20 18:59:04,284.57,1011,81,4.6
1,87,38,21,17,OPEN,2019-02-20 18:59:34,284.57,1011,81,4.6
2,104,40,40,0,OPEN,2019-02-20 18:59:40,284.57,1011,81,4.6
3,77,29,25,4,OPEN,2019-02-20 19:00:15,284.15,1011,87,6.2
4,59,20,17,3,OPEN,2019-02-20 19:00:29,284.15,1011,87,6.2


## Training the model

In [21]:
df.dtypes

number                            int64
bike_stands                       int64
available_bike_stands             int64
available_bikes                   int64
status                         category
last_update              datetime64[ns]
main_temp                       float64
main_pressure                     int64
main_humidity                     int64
wind_speed                      float64
dtype: object

Let's take `available_bikes` to be our target feature

### Preparing the data

In [22]:
# Create new column weekday
df['weekday'] = df['last_update'].dt.dayofweek

In [23]:
# Create new column hour
df['hour'] = df['last_update'].dt.hour

In [24]:
# Use one hot encoding to peform binarisation of categorical data (weather first, then hour)
hour_dummies = pd.get_dummies(df['hour'], prefix='hour')
df = pd.concat([df, hour_dummies], axis=1)
df = df.drop('hour', axis = 1)
df = df.drop('last_update', axis = 1)

In [25]:
df.head().T

Unnamed: 0,0,1,2,3,4
number,94,87,104,77,59
bike_stands,40,38,40,29,20
available_bike_stands,0,21,40,25,17
available_bikes,40,17,0,4,3
status,OPEN,OPEN,OPEN,OPEN,OPEN
main_temp,284.57,284.57,284.57,284.15,284.15
main_pressure,1011,1011,1011,1011,1011
main_humidity,81,81,81,87,87
wind_speed,4.6,4.6,4.6,6.2,6.2
weekday,2,2,2,2,2


In [26]:
# Create 7 dataframes based on day of week
df_monday = df[df.weekday == 0]
df_tuesday = df[df.weekday == 1]
df_wednesday = df[df.weekday == 2]
df_thursday = df[df.weekday == 3]
df_friday = df[df.weekday == 4]
df_saturday = df[df.weekday == 5]
df_sunday = df[df.weekday == 6]

In [27]:
# Then group_by number
df_monday_grouped = df_monday.groupby('number')
df_tuesday_grouped = df_tuesday.groupby('number')
df_wednesday_grouped = df_wednesday.groupby('number')
df_thursday_grouped = df_thursday.groupby('number')
df_friday_grouped = df_friday.groupby('number')
df_saturday_grouped = df_saturday.groupby('number')
df_sunday_grouped = df_sunday.groupby('number')


In [28]:
df_sunday_grouped.get_group(2).head()

Unnamed: 0,number,bike_stands,available_bike_stands,available_bikes,status,main_temp,main_pressure,main_humidity,wind_speed,weekday,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
101772,2,20,17,3,OPEN,277.41,1032,93,2.6,6,...,0,0,0,0,0,0,0,0,0,0
101773,2,20,17,3,OPEN,277.41,1032,93,2.6,6,...,0,0,0,0,0,0,0,0,0,0
101936,2,20,17,3,OPEN,277.41,1032,93,2.6,6,...,0,0,0,0,0,0,0,0,0,0
102138,2,20,16,4,OPEN,277.05,1032,94,1.5,6,...,0,0,0,0,0,0,0,0,0,0
102139,2,20,16,4,OPEN,277.05,1032,94,1.5,6,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Get a list of all station numbers as a list
stations = list(sorted(df.number.unique()))

## Training with Continuous and Categorical Features

In [30]:
categ_features = hour_dummies.columns.values.tolist()
cont_features = ['main_temp', 'main_pressure', 'main_humidity', 'wind_speed']

features = categ_features + cont_features
print("\nCont features: ", cont_features)
print("Categ features: ", categ_features)
print("Features: ", features)


Cont features:  ['main_temp', 'main_pressure', 'main_humidity', 'wind_speed']
Categ features:  ['hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23']
Features:  ['hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'main_temp', 'main_pressure', 'main_humidity', 'wind_speed']


In [31]:
## Training the models - Sunday

test_gt_60 = 0

for station in stations:
    # Train the model
    X = df_sunday_grouped.get_group(station)[features]
    y = df_sunday_grouped.get_group(station).available_bikes
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model = LinearRegression(fit_intercept=False).fit(X_train, y_train)
    
    # Write to a pickle file
    with open('pickles/Sunday_' + str(station) + '.pkl', 'wb') as handle:
        pickle.dump(model, handle, pickle.HIGHEST_PROTOCOL)
    
    # Predicted price on training set
    train_predictions = model.predict(X_train)
#     print('Training Set Predictions for Station: ', station)
#     printMetrics(y_train, train_predictions)
#     print()
    
    # Predicted price on test set
    test_predictions = model.predict(X_test)
#     print('Test Set Predictions for Station: ', station)
#     printMetrics(y_test, test_predictions)
#     print()
    
    if metrics.r2_score(y_test, test_predictions) > 0.6:
        test_gt_60+=1

print('Percentage of Models with R^2 greater than 0.6:', test_gt_60 / 113 * 100)
    

Percentage of Models with R^2 greater than 0.6: 23.893805309734514


In [32]:
## Training the models - Monday

test_gt_60 = 0

for station in stations:
    # Train the model
    X = df_monday_grouped.get_group(station)[features]
    y = df_monday_grouped.get_group(station).available_bikes
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model = LinearRegression(fit_intercept=False).fit(X_train, y_train)
    
    # Write to a pickle file
    with open('pickles/Monday_' + str(station) + '.pkl', 'wb') as handle:
        pickle.dump(model, handle, pickle.HIGHEST_PROTOCOL)
    
    # Predicted price on training set
    train_predictions = model.predict(X_train)
#     print('Training Set Predictions for Station: ', station)
#     printMetrics(y_train, train_predictions)
#     print()
    
    # Predicted price on test set
    test_predictions = model.predict(X_test)
#     print('Test Set Predictions for Station: ', station)
#     printMetrics(y_test, test_predictions)
#     print()
    if metrics.r2_score(y_test, test_predictions) > 0.6:
        test_gt_60+=1

print('Percentage of Models with R^2 greater than 0.6:', test_gt_60 / 113 * 100)

Percentage of Models with R^2 greater than 0.6: 38.93805309734513


In [33]:
## Training the models - Tuesday

test_gt_60 = 0

for station in stations:
    # Train the model
    X = df_tuesday_grouped.get_group(station)[features]
    y = df_tuesday_grouped.get_group(station).available_bikes
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model = LinearRegression(fit_intercept=False).fit(X_train, y_train)
    
    # Write to a pickle file
    with open('pickles/Tuesday_' + str(station) + '.pkl', 'wb') as handle:
        pickle.dump(model, handle, pickle.HIGHEST_PROTOCOL)
    
    # Predicted price on training set
    train_predictions = model.predict(X_train)
#     print('Training Set Predictions for Station: ', station)
#     printMetrics(y_train, train_predictions)
#     print()
    
    # Predicted price on test set
    test_predictions = model.predict(X_test)
#     print('Test Set Predictions for Station: ', station)
#     printMetrics(y_test, test_predictions)
#     print()
    if metrics.r2_score(y_test, test_predictions) > 0.6:
        test_gt_60+=1

print('Percentage of Models with R^2 greater than 0.6:', test_gt_60 / 113 * 100)

Percentage of Models with R^2 greater than 0.6: 72.56637168141593


In [34]:
## Training the models - Wednesday

test_gt_60 = 0

for station in stations:
    # Train the model
    X = df_wednesday_grouped.get_group(station)[features]
    y = df_wednesday_grouped.get_group(station).available_bikes
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model = LinearRegression(fit_intercept=False).fit(X_train, y_train)
    
    # Write to a pickle file
    with open('pickles/Wednesday_' + str(station) + '.pkl', 'wb') as handle:
        pickle.dump(model, handle, pickle.HIGHEST_PROTOCOL)
    
    # Predicted price on training set
    train_predictions = model.predict(X_train)
#     print('Training Set Predictions for Station: ', station)
#     printMetrics(y_train, train_predictions)
#     print()
    
    # Predicted price on test set
    test_predictions = model.predict(X_test)
#     print('Test Set Predictions for Station: ', station)
#     printMetrics(y_test, test_predictions)
#     print()
    if metrics.r2_score(y_test, test_predictions) > 0.6:
        test_gt_60+=1

print('Percentage of Models with R^2 greater than 0.6:', test_gt_60 / 113 * 100)

Percentage of Models with R^2 greater than 0.6: 75.22123893805309


In [35]:
## Training the models - Thursday

test_gt_60 = 0

for station in stations:
    # Train the model
    X = df_thursday_grouped.get_group(station)[features]
    y = df_thursday_grouped.get_group(station).available_bikes
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model = LinearRegression(fit_intercept=False).fit(X_train, y_train)
    
    # Write to a pickle file
    with open('pickles/Thursday_' + str(station) + '.pkl', 'wb') as handle:
        pickle.dump(model, handle, pickle.HIGHEST_PROTOCOL)
    
    # Predicted price on training set
    train_predictions = model.predict(X_train)
#     print('Training Set Predictions for Station: ', station)
#     printMetrics(y_train, train_predictions)
#     print()
    
    # Predicted price on test set
    test_predictions = model.predict(X_test)
#     print('Test Set Predictions for Station: ', station)
#     printMetrics(y_test, test_predictions)
#     print()
    if metrics.r2_score(y_test, test_predictions) > 0.6:
        test_gt_60+=1

print('Percentage of Models with R^2 greater than 0.6:', test_gt_60 / 113 * 100)

Percentage of Models with R^2 greater than 0.6: 71.68141592920354


In [36]:
## Training the models - Friday

test_gt_60 = 0

for station in stations:
    # Train the model
    X = df_friday_grouped.get_group(station)[features]
    y = df_friday_grouped.get_group(station).available_bikes
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model = LinearRegression(fit_intercept=False).fit(X_train, y_train)
    
    # Write to a pickle file
    with open('pickles/Friday_' + str(station) + '.pkl', 'wb') as handle:
        pickle.dump(model, handle, pickle.HIGHEST_PROTOCOL)
    
    # Predicted price on training set
    train_predictions = model.predict(X_train)
#     print('Training Set Predictions for Station: ', station)
#     printMetrics(y_train, train_predictions)
#     print()
    
    # Predicted price on test set
    test_predictions = model.predict(X_test)
#     print('Test Set Predictions for Station: ', station)
#     printMetrics(y_test, test_predictions)
#     print()
    if metrics.r2_score(y_test, test_predictions) > 0.6:
        test_gt_60+=1

print('Percentage of Models with R^2 greater than 0.6:', test_gt_60 / 113 * 100)

Percentage of Models with R^2 greater than 0.6: 73.45132743362832


In [37]:
## Training the models - Saturday

test_gt_60 = 0

for station in stations:
    # Train the model
    X = df_saturday_grouped.get_group(station)[features]
    y = df_saturday_grouped.get_group(station).available_bikes
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model = LinearRegression(fit_intercept=False).fit(X_train, y_train)
    
    # Write to a pickle file
    with open('pickles/Saturday_' + str(station) + '.pkl', 'wb') as handle:
        pickle.dump(model, handle, pickle.HIGHEST_PROTOCOL)
    
    # Predicted price on training set
    train_predictions = model.predict(X_train)
#     print('Training Set Predictions for Station: ', station)
#     printMetrics(y_train, train_predictions)
#     print()
    
    # Predicted price on test set
    test_predictions = model.predict(X_test)
#     print('Test Set Predictions for Station: ', station)
#     printMetrics(y_test, test_predictions)
#     print()
    if metrics.r2_score(y_test, test_predictions) > 0.6:
        test_gt_60+=1

print('Percentage of Models with R^2 greater than 0.6:', test_gt_60 / 113 * 100)

Percentage of Models with R^2 greater than 0.6: 23.008849557522122


For Saturday's, Sunday's and Monday's our linear regression model is less accurate than for the other days of the week.

## Vectorizing input for predicting bike availability

In [38]:
def initDF():
    # Initialize the hours list
    hours = [0] * 24
    # Initialize weather continuous features
    weather_data = [0] * 4
    # Combine to form initial vector
    data = [hours + weather_data]
    
    df = pd.DataFrame(data, columns=['hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9',
                                  'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',
                                  'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'main_temp', 'main_pressure', 'main_humidity', 'wind_speed'])
    
    return df

In [39]:
X = initDF()

In [40]:
X.head().T

Unnamed: 0,0
hour_0,0
hour_1,0
hour_2,0
hour_3,0
hour_4,0
hour_5,0
hour_6,0
hour_7,0
hour_8,0
hour_9,0


In [41]:
inputDict = {'hour': '10', 'main_temp': 281.11, 'main_pressure': 1025, 'main_humidity': 81, 'wind_speed': 10.6}

In [42]:
def setValuesDF(df, dict):
    df['hour_' + dict['hour']][0] = 1
    df['main_temp'][0] = dict['main_temp']
    df['main_pressure'][0] = dict['main_pressure']
    df['main_humidity'][0] = dict['main_humidity']
    df['wind_speed'][0] = dict['wind_speed']

In [44]:
setValuesDF(X, inputDict)

In [45]:
X.head().T

Unnamed: 0,0
hour_0,0
hour_1,0
hour_2,0
hour_3,0
hour_4,0
hour_5,0
hour_6,0
hour_7,0
hour_8,0
hour_9,0


In [46]:
# Make a prediction with the input dataframe

with open('pickles/Monday_5.pkl', 'rb') as handle:
    model = pickle.load(handle) 
    
result = model.predict(X)

In [47]:
result

array([6.25731233])