In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

In [2]:
#loading data
#NOTE: absolute, machine-specific location of the trip CSV
csv_path='C:\\NYC_trip.csv'
data=pd.read_csv(csv_path)
#show the dtypes pandas inferred on load
data.dtypes

id                     object
vendor_id               int64
pickup_datetime        object
dropoff_datetime       object
passenger_count         int64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
store_and_fwd_flag     object
trip_duration           int64
dtype: object

In [3]:
#changing datatypes: parse both timestamp columns into pandas datetimes
for ts_col in ('pickup_datetime', 'dropoff_datetime'):
    data[ts_col]=pd.to_datetime(data[ts_col])
data.dtypes

id                            object
vendor_id                      int64
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
pickup_longitude             float64
pickup_latitude              float64
dropoff_longitude            float64
dropoff_latitude             float64
store_and_fwd_flag            object
trip_duration                  int64
dtype: object

In [4]:
#creating new data columns: day of week and hour of day for both ends of the trip
pickup_ts=data['pickup_datetime']
dropoff_ts=data['dropoff_datetime']
data['Pickup_day']=pickup_ts.dt.dayofweek
data['Dropoff_day']=dropoff_ts.dt.dayofweek
data['Pickup_hour']=pickup_ts.dt.hour
data['Dropoff_hour']=dropoff_ts.dt.hour
#preview the first rows including the new columns
data.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,Pickup_day,Dropoff_day,Pickup_hour,Dropoff_hour
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,400,0,0,16,16
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,1100,4,4,23,23
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,1635,6,6,17,18
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.96167,40.75972,-73.956779,40.780628,N,1141,1,1,9,10
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.01712,40.708469,-73.988182,40.740631,N,848,2,2,6,6


In [5]:
#(rows, columns) of the frame at this point
data.shape

(729322, 15)

In [6]:
#number of missing values in each column
data.isna().sum()

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
Pickup_day            0
Dropoff_day           0
Pickup_hour           0
Dropoff_hour          0
dtype: int64

In [7]:
#column dtypes, now including the derived day/hour columns
data.dtypes

id                            object
vendor_id                      int64
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
pickup_longitude             float64
pickup_latitude              float64
dropoff_longitude            float64
dropoff_latitude             float64
store_and_fwd_flag            object
trip_duration                  int64
Pickup_day                     int64
Dropoff_day                    int64
Pickup_hour                    int64
Dropoff_hour                   int64
dtype: object

In [8]:
#changing datatypes: encode store_and_fwd_flag as 1 for 'Y' and 0 for anything else
#NOTE: not safe to re-run; once the column holds integers nothing equals 'Y' any more
is_flag_set=data['store_and_fwd_flag'].values=='Y'
data['store_and_fwd_flag']=1*is_flag_set

In [9]:
#confirm store_and_fwd_flag is now an integer column
data.dtypes

id                            object
vendor_id                      int64
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
pickup_longitude             float64
pickup_latitude              float64
dropoff_longitude            float64
dropoff_latitude             float64
store_and_fwd_flag             int32
trip_duration                  int64
Pickup_day                     int64
Dropoff_day                    int64
Pickup_hour                    int64
Dropoff_hour                   int64
dtype: object

In [10]:
#segregating variables
#columns that are not model inputs: the target itself, the row id and the raw timestamps
non_feature_cols=['trip_duration', 'id', 'pickup_datetime', 'dropoff_datetime']
#NOTE(review): Dropoff_day / Dropoff_hour remain in x; they come from the drop-off time,
#which may leak information about trip_duration - confirm this is intended
x=data.drop(columns=non_feature_cols)    #independent variables
y=data['trip_duration']                  #dependent variable

In [11]:
#importing train test split
from sklearn.model_selection import train_test_split as tts

In [12]:
#scaling the data
from sklearn.preprocessing import MinMaxScaler as MMS
scale=MMS()
scaled_x=scale.fit_transform(x)

In [13]:
X=pd.DataFrame(scaled_x)

In [14]:
#splitting the data
train_x, test_x, train_y, test_y=tts(X,y,random_state=67)

In [15]:
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.metrics import mean_squared_error as mse

In [16]:
#KNN regressor with 5 neighbours, fitted on the training split
reg=KNN(n_neighbors=5)
reg.fit(train_x, train_y)

#predictions for the rows the model was fitted on and for the held-out rows
train_predictio=reg.predict(train_x)
test_predictio=reg.predict(test_x)

#mean squared error on the training split (arguments in y_true, y_pred order)
k=mse(train_y, train_predictio)
print("MSE for train prediction", k)

#mean squared error on the test split; k ends up holding this value
k=mse(test_y, test_predictio)
print("MSE for test prediction", k)

MSE for train prediction 4663653.463088276
MSE for test prediction 597151.5261518887


In [17]:
#KNN regressor with 12 neighbours, fitted on the training split
reg=KNN(n_neighbors=12)
reg.fit(train_x, train_y)

#predictions for the rows the model was fitted on and for the held-out rows
train_predictio=reg.predict(train_x)
test_predictio=reg.predict(test_x)

#mean squared error on the training split (arguments in y_true, y_pred order)
k=mse(train_y, train_predictio)
print("MSE for train prediction", k)

#mean squared error on the test split; k ends up holding this value
k=mse(test_y, test_predictio)
print("MSE for test prediction", k)

MSE for train prediction 6790594.126306303
MSE for test prediction 1313741.1963829594


In [None]:
#to find the best value of K
def elbow(k, fit_x=None, fit_y=None, eval_x=None, eval_y=None):
    """Return the evaluation MSE of a KNN regressor for each neighbour count.

    Parameters
    ----------
    k : iterable of int
        Neighbour counts to try, in order.
    fit_x, fit_y : optional
        Features and targets the regressor is fitted on. When omitted, the
        notebook-level ``train_x`` / ``train_y`` are used.
    eval_x, eval_y : optional
        Features and targets the error is measured on. When omitted, the
        notebook-level ``test_x`` / ``test_y`` are used.

    Returns
    -------
    list
        One mean squared error per entry of ``k``, in the same order
        (an empty list when ``k`` is empty).
    """
    # Only arguments that were not supplied fall back to the notebook globals,
    # so a plain elbow(K) call behaves exactly as before.
    fit_x = train_x if fit_x is None else fit_x
    fit_y = train_y if fit_y is None else fit_y
    eval_x = test_x if eval_x is None else eval_x
    eval_y = test_y if eval_y is None else eval_y

    testmse = []
    for n_neighbors in k:
        # a fresh model per candidate so no state carries over between fits
        reg = KNN(n_neighbors=n_neighbors)
        reg.fit(fit_x, fit_y)
        testmse.append(mse(eval_y, reg.predict(eval_x)))
    return testmse

#defining k range: every neighbour count from 1 to 99
#NOTE: this fits and scores one model per value, so the cell can take a long time
K=range(1,100)
test=elbow(K)

#plotting the elbow curve on an explicit figure/axes pair
fig, ax=plt.subplots()
ax.plot(K, test)
ax.set_xlabel('K-neighbours')
ax.set_ylabel('Test MSE')
ax.set_title('Elbow Curve test')