# Creating a BenchMark Model using Linear regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

In [2]:
#loading data
data=pd.read_csv('C:\\NYC_trip.csv')
data.dtypes

id                     object
vendor_id               int64
pickup_datetime        object
dropoff_datetime       object
passenger_count         int64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
store_and_fwd_flag     object
trip_duration           int64
dtype: object

In [3]:
#Converting datetime data types
data.pickup_datetime=pd.to_datetime(data.pickup_datetime)
data.dropoff_datetime=pd.to_datetime(data.dropoff_datetime)
data.dtypes

id                            object
vendor_id                      int64
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
pickup_longitude             float64
pickup_latitude              float64
dropoff_longitude            float64
dropoff_latitude             float64
store_and_fwd_flag            object
trip_duration                  int64
dtype: object

In [4]:
#Creating new columns
data['Pickup_day']=data.pickup_datetime.dt.dayofweek
data['Dropoff_day']=data.dropoff_datetime.dt.dayofweek
data['Pickup_hour']=data.pickup_datetime.dt.hour
data['Dropoff_hour']=data.dropoff_datetime.dt.hour
data.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,Pickup_day,Dropoff_day,Pickup_hour,Dropoff_hour
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,400,0,0,16,16
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,1100,4,4,23,23
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,1635,6,6,17,18
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.96167,40.75972,-73.956779,40.780628,N,1141,1,1,9,10
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.01712,40.708469,-73.988182,40.740631,N,848,2,2,6,6


In [5]:
data.shape

(729322, 15)

In [6]:
data.isna().sum()

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
Pickup_day            0
Dropoff_day           0
Pickup_hour           0
Dropoff_hour          0
dtype: int64

In [7]:
data.dtypes

id                            object
vendor_id                      int64
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
pickup_longitude             float64
pickup_latitude              float64
dropoff_longitude            float64
dropoff_latitude             float64
store_and_fwd_flag            object
trip_duration                  int64
Pickup_day                     int64
Dropoff_day                    int64
Pickup_hour                    int64
Dropoff_hour                   int64
dtype: object

In [8]:
#changing datatypes
data['store_and_fwd_flag']=1*(data.store_and_fwd_flag.values=='Y')

In [9]:
data.dtypes

id                            object
vendor_id                      int64
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
pickup_longitude             float64
pickup_latitude              float64
dropoff_longitude            float64
dropoff_latitude             float64
store_and_fwd_flag             int32
trip_duration                  int64
Pickup_day                     int64
Dropoff_day                    int64
Pickup_hour                    int64
Dropoff_hour                   int64
dtype: object

In [10]:
#segeragating data
x=data.drop(['trip_duration','id', 'pickup_datetime', 'dropoff_datetime'], axis=1)   #independent variables
y=data['trip_duration']                   #dependent variable (target)

In [11]:
#dividing data into train and test dataset
from sklearn.model_selection import train_test_split as tts
trainx, testx, trainy, testy=tts(x,y, random_state=56)

In [12]:
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_absolute_error as mae

In [14]:
lr=LR()                                  #creating instance
lr.fit(trainx, trainy)                   #fitting data
#prediction over train dataset
train_prediction=lr.predict(trainx)
k=mae(train_prediction, trainy)
print("Train prediction mean absoulute error", k)
#prediction over test dataset
test_prediction=lr.predict(testx)
k=mae(test_prediction, testy)
print("Test prediction mean absolute error", k)

Train prediction mean absoulute error 603.9268434646589
Test prediction mean absolute error 597.4548441988155


We can definitely see that the predictions for the test dataset are much better than that of train data set

In [16]:
#Coefficient of all the parameter used in prediction
lr.coef_
#Checking the assumptions in LR benchmark model for train data
assump=pd.DataFrame({
    'fitted values':trainy,
    'predicted values':train_prediction
})
assump['residual']=assump['fitted values']-assump['predicted values']
assump.head()

Unnamed: 0,fitted values,predicted values,residual
15861,801,1070.696993,-269.696993
275904,378,1251.208302,-873.208302
237361,243,861.265379,-618.265379
142741,334,798.502602,-464.502602
21032,655,1065.190948,-410.190948


In [18]:
#Checking the assumptions in our LR benchmark model for test data
assum=pd.DataFrame({
    'Fitted values':testy,
    'Predicted values':test_prediction
})
assum['residual']=assum['Fitted values']-assum['Predicted values']
assum.head()

Unnamed: 0,Fitted values,Predicted values,residual
14908,2406,649.038122,1756.961878
87154,1457,914.855753,542.144247
666581,163,972.92292,-809.92292
543467,2160,1169.359371,990.640629
105193,402,761.103586,-359.103586


# BenchMark Model Complete