In [1]:
import math
from datetime import datetime
import random
# performing linear algebra 
import numpy as np 

# data processing 
import pandas as pd 

# visualisation 
import matplotlib.pyplot as plt 

# logistic regression
from sklearn import linear_model 

# To split train and test set
from sklearn.model_selection import train_test_split 
# Confusion Matrix
from sklearn.metrics import confusion_matrix
#from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),    
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

#    Prepare Dataset
   

## Fetch  data set 

In [2]:
data = pd.read_csv(".\\Data\\train.csv") 

#print (data)

# Handle data imbalance for Incorrect

In [3]:
incorrectLabeledData = data.loc[data['label'] == "incorrect"]
#incorrectLabeledData
incorrectLabeledData = incorrectLabeledData.reset_index(drop=True)
#print (incorrectLabeledData)

correctLabeledData = (data.loc[data['label'] == "correct"])

correctLabeledDataLength = correctLabeledData[correctLabeledData.columns[0]].count()
print(correctLabeledDataLength)
moreRowsToBalance = correctLabeledDataLength - incorrectLabeledData[incorrectLabeledData.columns[0]].count()
print(moreRowsToBalance)

indices = []
for x in range(1,moreRowsToBalance):
    indexToAdd = random.randrange(0,1680)
    indices.append(indexToAdd)
#print (toBeAdded)
#print (indices)
toBeAdded = incorrectLabeledData.iloc[indices, :]
#print (toBeAdded)

data.append(toBeAdded)

15495
13814


Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
0,189123628,10.5,834.0,56.0,0.000000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,correct
1,189125358,10.5,791.0,47.0,0.000000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,correct
2,189125719,10.5,1087.0,80.0,0.000000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,correct
3,189127273,10.5,598.0,271.0,15.663800,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,correct
4,189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,correct
5,189129552,10.5,3407.0,182.0,0.000000,112.0,11/1/2019 5:38,11/1/2019 6:35,7.13402,79.8969,6.91865,79.8649,1065.02,correct
6,189132829,10.5,1246.0,487.0,0.000000,133.0,11/1/2019 6:29,11/1/2019 6:49,6.84371,79.9051,6.85069,79.8624,266.62,correct
7,189135103,10.5,1333.0,295.0,17.198500,212.0,11/1/2019 6:50,11/1/2019 7:12,6.90760,79.9524,6.90634,79.9042,318.05,correct
8,189139296,10.5,360.0,80.0,4.664000,3.0,11/1/2019 7:00,11/1/2019 7:06,7.26706,80.6064,7.27422,80.6124,100.32,correct
9,189138671,10.5,1539.0,588.0,33.986400,43.0,11/1/2019 7:02,11/1/2019 7:28,6.85137,79.9537,6.84779,79.9274,257.89,correct


### Calculate distance from longitude and latitude

In [4]:
def transform_data(dataframe):
    dataValues = dataframe.copy()
    R = 6373.0 # Radius of earth
    distance_col = []
    actual_duration_col = []
    total_rep_duration_col = []
    rep_time_diff_col = []

    for index,row in dataframe.iterrows():  
        #Calculate the distance 
        lat1 = math.radians(float(row["pick_lat"]))
        lon1 = math.radians(float(row["pick_lon"]))
        lat2 = math.radians(float(row["drop_lat"]))
        lon2 = math.radians(float(row["drop_lon"]))

        dlon = lon2 - lon1
        dlat = lat2 - lat1

        #Harversine formula
        a = math.sin(dlat/2)**2 + math.cos(lat1)*math.cos(lat2) * math.sin(dlon / 2)**2
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

        distance = R * c
        distance_col.append(distance)

        #Calculate actual duration
        start_time = datetime. strptime(row["pickup_time"], '%m/%d/%Y %H:%M')
        end_time = datetime. strptime(row["drop_time"], '%m/%d/%Y %H:%M')
        difference = end_time - start_time
        actual_duration_col.append(difference.total_seconds())
        sub_total = float(row["duration"]) + float(row["meter_waiting"]) + float(row["meter_waiting_till_pickup"])
        total_rep_duration_col.append(sub_total)
        #print(difference)

        #Calculate reported time difference
        #rep_time_diff_col.append(difference.total_seconds() - (float(row["duration"]) + float(row["meter_waiting"])))
        rep_time_diff_col.append(difference.total_seconds() - sub_total)
    #print(distance_col)
    #print (actual_duration_col)
    dataValues['Distance'] = distance_col
    dataValues = dataValues.drop(["pick_lat"], axis = 1) 
    dataValues = dataValues.drop(["pick_lon"], axis = 1) 
    dataValues = dataValues.drop(["drop_lat"], axis = 1) 
    dataValues = dataValues.drop(["drop_lon"], axis = 1)

    dataValues['ActualDuration'] = actual_duration_col
    dataValues= dataValues.drop(["pickup_time"], axis = 1)
    dataValues= dataValues.drop(["drop_time"], axis = 1)

    dataValues['TotalReportedTime'] = total_rep_duration_col

    dataValues['ReportedDurationDiff'] = rep_time_diff_col
    #dataValues = dataValues.drop(["ActualDuration"], axis = 1)
    dataValues = dataValues.drop(["duration"], axis = 1)
    dataValues = dataValues.drop(["meter_waiting"], axis = 1)

    dataValues = dataValues.drop(["additional_fare"], axis = 1)
    dataValues = dataValues.drop(["meter_waiting_fare"], axis = 1)
    dataValues = dataValues.drop(["tripid"], axis = 1)
    dataValues = dataValues.drop(["meter_waiting_till_pickup"], axis = 1)
    dataValues = dataValues.drop(["fare"], axis = 1)
    
    return dataValues

### Adding new distance column to data set 

In [5]:
#data['Distance'] = distance_col
#data = data.drop(["pick_lat"], axis = 1) 
#data = data.drop(["pick_lon"], axis = 1) 
#data = data.drop(["drop_lat"], axis = 1) 
#data = data.drop(["drop_lon"], axis = 1) 

#data['ActualDuration'] = actual_duration_col
#data= data.drop(["pickup_time"], axis = 1)
#data= data.drop(["drop_time"], axis = 1)

#data['TotalReportedTime'] = total_rep_duration_col

#data['ReportedDurationDiff'] = rep_time_diff_col
#data = data.drop(["ActualDuration"], axis = 1)
#data = data.drop(["duration"], axis = 1)
#data = data.drop(["meter_waiting"], axis = 1)

#data = data.drop(["additional_fare"], axis = 1)
#data = data.drop(["meter_waiting_fare"], axis = 1)
#data = data.drop(["tripid"], axis = 1)
#data = data.drop(["meter_waiting_till_pickup"], axis = 1)
#data = data.drop(["fare"], axis = 1)
data = transform_data(data)

### Calculate duration from start and end times 

In [6]:
#print (data)

In [7]:
data.label = [1 if each.strip() == "correct" else 0 for each in data.label] 

In [8]:
#Remove data rows that contain NaN values
data.dropna(subset=["Distance","ReportedDurationDiff","ActualDuration", "TotalReportedTime"], inplace = True)
# Create train set
y = data.label.values 
x_data = data.drop(['label'], axis = 1) 

In [14]:
x_data

Unnamed: 0,Distance,ActualDuration,TotalReportedTime,ReportedDurationDiff
0,5.094369,840.0,954.0,-114.0
1,3.169052,780.0,972.0,-192.0
2,6.307375,1080.0,1228.0,-148.0
3,0.862217,600.0,937.0,-337.0
5,24.214638,3420.0,3701.0,-281.0
6,4.779123,1200.0,1866.0,-666.0
7,5.324215,1320.0,1840.0,-520.0
8,1.035627,360.0,443.0,-83.0
9,2.931635,1560.0,2170.0,-610.0
10,14.385516,0.0,118.0,-118.0


In [10]:
#x = (x_data - np.min(x_data))/(np.max(x_data) - np.min(x_data)).values 

In [12]:
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.15, random_state = 42) 
x_train, x_test, y_train, y_test = train_test_split(x_data, y, test_size = 0.15, random_state = 42) 

#x_train = x_train.T 
#x_test = x_test.T 
#y_train = y_train.T 
#y_test = y_test.T 
print("X train : " , x_train.shape)
print("X test : " , x_test.shape)
print("Y train : " , y_train.shape)
print("Y test : " , y_test.shape)

X train :  (14427, 4)
X test :  (2547, 4)
Y train :  (14427,)
Y test :  (2547,)


## Classifiers

In [13]:
#fitting the model to the training set
#clf = svm.SVC(gamma='scale')
for name, clf in zip(names, classifiers):
    #classes = ["correct","incorrect"]
    #clf = svm.SVC(kernel='linear')
    print ()
    print("Classifier : {}".format(name))
    clf.fit(x_train, y_train)
    
    y_pred = clf.predict(x_test)
    #con = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    
    accuracy = (tp + tn ) / (tp + tn + fp + fn)
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    f1_score = 2*precision*recall/(precision + recall)
    
    
    print ("Accuracy : {}".format(accuracy) )
    print ("Precision : {}".format(precision) )
    print ("Recall : {}".format(recall) )
    print ("F1 score : {}".format(f1_score) )
    #print(y_pred)
    #Prepare test data
    testData = pd.read_csv(".\\Data\\test.csv")
    outData = pd.DataFrame()

    outData["tripid"] = testData["tripid"]
    testData = transform_data(testData)
    #Predict for test data
    y_pred = clf.predict(testData)
    outData["prediction"] = y_pred
    outData.to_csv("ExportedResults-"+name+".csv",index=False)


Classifier : Nearest Neighbors
Accuracy : 0.8979191205339615
Precision : 0.914763458401305
Recall : 0.9777680906713164
F1 score : 0.9452170248630425

Classifier : Linear SVM
Accuracy : 0.8994895956026698
Precision : 0.9076
Recall : 0.9891020052310375
F1 score : 0.9465999165623696

Classifier : RBF SVM
Accuracy : 0.8979191205339615
Precision : 0.9010252365930599
Recall : 0.9960767218831735
F1 score : 0.9461697722567288

Classifier : Decision Tree
Accuracy : 0.9057714958775029
Precision : 0.9137792103142627
Recall : 0.988666085440279
F1 score : 0.949748743718593

Classifier : Random Forest
Accuracy : 0.90616411464468
Precision : 0.9085487077534792
Recall : 0.9960767218831735
F1 score : 0.9503015179871076

Classifier : Neural Net
Accuracy : 0.8924224577934825
Precision : 0.9092382495948136
Recall : 0.978204010462075
F1 score : 0.9424611507769843

Classifier : AdaBoost
Accuracy : 0.9002748331370239
Precision : 0.9054054054054054
Recall : 0.993025283347864
F1 score : 0.9471933471933472

Cl

