In [3]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [4]:
def convert_dt(original_dt, dt_format):
    original = original_dt
    date_objects = []

    for d in original:
        dt_obj = datetime.strptime(d, dt_format)
        date_objects.append(dt_obj)
        
    return date_objects

In [5]:
def metrics(y_pred, y_test):
    
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='macro')
    precision = precision_score(y_test,y_pred, average='macro')
    
    print('Accuracy: {}'.format(accuracy))
    print('Recall: {}'.format(recall))
    print('Precision: {}'.format(precision))

In [6]:
taxi_df = pd.read_csv('Taxi_df.csv')
taxi_df = taxi_df.drop('Unnamed: 0', axis = 1)
 
pudt = taxi_df['pickup_datetime']
pudt = convert_dt(pudt, '%Y-%m-%d %H:%M:%S')

taxi_df['pickup_datetime'] = pudt
taxi_df.head()

Unnamed: 0,hack_license,pickup_datetime,payment_type,total_amount,tip_amount,tip_percentage,passenger_count,trip_time_in_secs,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,tip_rate,TAVG,precipitation,rush_hour
0,1A6CAB3092DCB1821793159CE85E889B,2013-01-01 16:57:00,CSH,21.0,0.0,0.0,5,960,6.49,-73.862747,40.769035,-73.924576,40.739918,Low,32.0,0.000755,True
1,1A6CAB3092DCB1821793159CE85E889B,2013-01-01 13:53:00,CSH,8.5,0.0,0.0,5,360,2.09,-73.989174,40.774166,-74.000671,40.750278,Low,32.0,0.000755,False
2,1A6CAB3092DCB1821793159CE85E889B,2013-01-01 15:16:00,CRD,12.38,2.38,19.224556,5,540,2.25,-74.004433,40.722157,-73.991508,40.748821,Mid,32.0,0.000755,False
3,1A6CAB3092DCB1821793159CE85E889B,2013-01-01 17:53:00,CSH,4.0,0.0,0.0,5,60,0.53,-73.984528,40.728703,-73.987152,40.732422,Low,32.0,0.000755,True
4,1A6CAB3092DCB1821793159CE85E889B,2013-01-01 17:57:00,CRD,8.5,1.0,11.764706,5,420,1.39,-73.988556,40.737137,-73.975616,40.750626,Low,32.0,0.000755,True


In [5]:
target = 'tip_rate'
feats = ['payment_type','total_amount','passenger_count','trip_time_in_secs',
         'trip_distance','TAVG','precipitation','rush_hour']

In [6]:
le = preprocessing.LabelEncoder()
le.fit(taxi_df[target])

LabelEncoder()

In [7]:
taxi_df[target] = le.transform(taxi_df[target])

In [8]:
le2 = preprocessing.LabelEncoder()
le2.fit(taxi_df['payment_type'])

LabelEncoder()

In [9]:
taxi_df['payment_type'] = le2.transform(taxi_df['payment_type'])

In [10]:
taxi_df.head()

Unnamed: 0,hack_license,pickup_datetime,payment_type,total_amount,tip_amount,tip_percentage,passenger_count,trip_time_in_secs,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,tip_rate,TAVG,precipitation,rush_hour
0,1A6CAB3092DCB1821793159CE85E889B,2013-01-01 16:57:00,1,21.0,0.0,0.0,5,960,6.49,-73.862747,40.769035,-73.924576,40.739918,1,32.0,0.000755,True
1,1A6CAB3092DCB1821793159CE85E889B,2013-01-01 13:53:00,1,8.5,0.0,0.0,5,360,2.09,-73.989174,40.774166,-74.000671,40.750278,1,32.0,0.000755,False
2,1A6CAB3092DCB1821793159CE85E889B,2013-01-01 15:16:00,0,12.38,2.38,19.224556,5,540,2.25,-74.004433,40.722157,-73.991508,40.748821,2,32.0,0.000755,False
3,1A6CAB3092DCB1821793159CE85E889B,2013-01-01 17:53:00,1,4.0,0.0,0.0,5,60,0.53,-73.984528,40.728703,-73.987152,40.732422,1,32.0,0.000755,True
4,1A6CAB3092DCB1821793159CE85E889B,2013-01-01 17:57:00,0,8.5,1.0,11.764706,5,420,1.39,-73.988556,40.737137,-73.975616,40.750626,1,32.0,0.000755,True


In [11]:
X_train, X_test, y_train, y_test = train_test_split(taxi_df[feats], taxi_df[target], test_size = 0.3)

## DecisionTree Classifier

In [12]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion = 'entropy', min_samples_split = 2)

In [13]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [14]:
y_predict = tree.predict(X_test)

In [15]:
metrics(y_predict, y_test)

Accuracy: 0.9272055083916905
Recall: 0.8482958614775726
Precision: 0.8466043002972466


## Naive Bayes

In [16]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [17]:
gnb_model = gnb.fit(X_train, y_train)

In [18]:
y_pred = gnb_model.predict(X_test)

In [19]:
metrics(y_pred, y_test)

Accuracy: 0.03894996283400493
Recall: 0.33473133191822374
Precision: 0.29651123033771926


  'precision', 'predicted', average, warn_for)


## RandomForest Classifier

In [20]:
from sklearn.ensemble import RandomForestClassifier

nTrees = 100
max_depth = 5
min_node_size = 5
verbose = 0

clf = RandomForestClassifier(n_estimators=nTrees, max_depth=max_depth, random_state=0, verbose=verbose, min_samples_leaf=min_node_size)

In [21]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [22]:
y_pred = clf.predict(X_test)

In [23]:
metrics(y_pred, y_test)

Accuracy: 0.8001427956652714
Recall: 0.5797752784651665
Precision: 0.5394337244724247


  'precision', 'predicted', average, warn_for)


## SVM

In [27]:
from sklearn import svm
clf = svm.SVC(kernel='rbf')

In [None]:
svm_model = clf.fit(X_train, y_train) 

In [None]:
accuracy = round(float((target_test_pred == target_test).sum())/len(target_test)*100,2)
print("Accuracy=%.2f%%"%accuracy)

## K Nearest Neighbors

In [24]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)


In [25]:
knn.fit(X_train, y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [26]:
y_pred = knn.predict(X_test)

In [27]:
metrics(y_pred, y_test)

Accuracy: 0.7749911975274832
Recall: 0.5633980388122798
Precision: 0.5585610298301598
