<a href="https://colab.research.google.com/github/Malintha1996/parallel-linked-list/blob/master/rideshare_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook live under that mount point. Requires the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot  as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import roc_curve, roc_auc_score,f1_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from geopy.distance import vincenty
from sklearn.model_selection import GridSearchCV
import calendar

%matplotlib inline

  import pandas.util.testing as tm


## Load Dataset

In [0]:
# --- training data ---------------------------------------------------------
# Read the raw training CSV and keep an untouched copy before any column is
# popped off the feature frame.
x_train = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/data/ml-rideshare/train.csv'
)
train_set = x_train.copy(deep=True)

# Split the trip identifier and the target away from the features.
tripid_train = x_train.pop('tripid')
y_train = x_train.pop('label').to_frame(name='label')

# --- test data -------------------------------------------------------------
x_test = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/data/ml-rideshare/test.csv'
)
tripid_test = x_test.pop('tripid')

x_train.head()

In [0]:
from scipy import stats

# Outlier screen: keep the rows whose numeric features all lie within
# Z_SCORE_LIMIT standard deviations of their column mean.
Z_SCORE_LIMIT = 3

# z-scores are only defined for numbers. The raw frame still holds text
# columns (e.g. the unparsed timestamps and the label), and handing those to
# stats.zscore fails, so restrict the computation to the numeric columns.
numeric_train = train_set.select_dtypes(include=[np.number])

# nan_policy='omit' stops a single missing value from turning the z-scores of
# its entire column into NaN.
z_scores = stats.zscore(numeric_train, nan_policy='omit')
abs_z_scores = np.abs(z_scores)

# A missing value has no z-score; it is not evidence of an outlier, so it must
# not cause the row to be dropped.
within_limit = (abs_z_scores < Z_SCORE_LIMIT) | np.isnan(abs_z_scores)
filtered_entries = within_limit.all(axis=1)

new_df = train_set[filtered_entries]
print(new_df.shape)
print(x_train.shape)

## Analyze training set

Training dataset

In [0]:
# Summary statistics and missing-value counts for the training features,
# followed by the summary of the training labels.
for heading, report in (
    ("\nTrain set description:\n", x_train.describe()),
    ("\nTrain set null count:\n", x_train.isnull().sum()),
    ("\nTrain set lables description:\n", y_train.describe()),
):
    print(heading, report)

# Horizontal bar chart of how often each label value occurs.
label_counts = y_train['label'].value_counts()
label_axes = label_counts.plot.barh(title="Label")
print(label_axes)

Test dataset

In [0]:
# Summary statistics of the test-set features.
test_summary = x_test.describe()
print("Test set description:\n", test_summary)

## Preprocess data

In [0]:
# Encode target variable
# LabelEncoder is imported by name from sklearn.preprocessing; the module
# name `preprocessing` itself is never imported, so it cannot be used here.
le = LabelEncoder()
y_train['label'] = le.fit_transform(y_train['label'])

# Swap the two integer codes, so the class the encoder numbered 0 becomes 1
# and the class it numbered 1 becomes 0.
# NOTE: this cell is not idempotent - running it a second time swaps the
# codes back. Re-run the data-loading cell first if it has to be repeated.
y_train['label'] = y_train['label'].replace({0: 1, 1: 0})

In [0]:
# Layout of the raw pickup_time / drop_time strings.
TIMESTAMP_FORMAT = "%m/%d/%Y  %H:%M"


def add_datetime_features(frame, source_col, prefix):
    """Replace a raw timestamp column with calendar features, in place.

    Parses ``frame[source_col]`` with ``TIMESTAMP_FORMAT``, removes that column
    from ``frame`` and appends, in this order, ``<prefix>_day``,
    ``<prefix>_hour``, ``<prefix>_day_of_week`` (English day name),
    ``<prefix>_month`` and ``<prefix>_year``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to modify; it is mutated, nothing is returned.
    source_col : str
        Name of the column holding the timestamp strings.
    prefix : str
        Prefix for the generated column names.
    """
    stamps = pd.to_datetime(frame.pop(source_col), format=TIMESTAMP_FORMAT)

    # The vectorised .dt accessor replaces the row-wise lambdas and leaves
    # missing timestamps as missing values instead of raising.
    frame[prefix + '_day'] = stamps.dt.day
    frame[prefix + '_hour'] = stamps.dt.hour
    frame[prefix + '_day_of_week'] = stamps.dt.day_name()
    frame[prefix + '_month'] = stamps.dt.month
    frame[prefix + '_year'] = stamps.dt.year


# create new features for day, hour, day_of_week, month and year
# The same steps run on the training and the test frame so both end up with
# an identical set of engineered columns.
for frame in (x_train, x_test):
    add_datetime_features(frame, 'pickup_time', 'pickup')
    add_datetime_features(frame, 'drop_time', 'drop')

# create new feature distance
def distance_calc (row):
    """Return the distance in metres between a row's pick-up and drop-off points.

    ``row`` must provide ``pick_lat``, ``pick_lon``, ``drop_lat`` and
    ``drop_lon`` in decimal degrees, the unit geopy expects for coordinates.
    """
    # NOTE(review): `vincenty` was removed in geopy 2.0 (replaced by
    # `geodesic`); confirm the geopy version pinned for this notebook.
    start = (row['pick_lat'], row['pick_lon'])
    stop = (row['drop_lat'], row['drop_lon'])

    return vincenty(start, stop).meters


COORDINATE_COLUMNS = ['pick_lat', 'pick_lon', 'drop_lat', 'drop_lon']

for frame in (x_train, x_test):
    # The distance has to be computed while the coordinates are still in
    # degrees: geopy would read radian values as (very small) degrees and
    # return a wrong distance without any error.
    frame['distance'] = frame.apply(distance_calc, axis=1)

    # Only afterwards convert the stored coordinates to radians.
    frame[COORDINATE_COLUMNS] = np.radians(frame[COORDINATE_COLUMNS])

In [0]:
# Working copy of the training features with the target attached, used by the
# exploratory count tables and plots.
x_temp = x_train.copy()
x_temp['label'] = y_train['label']

# Number of rides per pickup weekday, one column per label value.
counts = (x_temp[['pickup_day_of_week', 'label']]
              .groupby(['pickup_day_of_week', 'label'])
              .size()
              .unstack('label')
         )
print(counts)
ax = counts.plot.barh()
ax.invert_yaxis()
# Place the legend outside the plotting area, to the right of the bars.
# (Import statements that had been pasted into this call by mistake made the
# cell a syntax error; they are removed.)
print(ax.legend(
    loc='center right',
    bbox_to_anchor=(1.3, 0.5),
    title='label'
))

In [0]:
# Number of rides per drop-off weekday, one column per label value.
grouped_sizes = x_temp.groupby(['drop_day_of_week', 'label']).size()
counts = grouped_sizes.unstack(level='label')
print(counts)

# Horizontal bars, first weekday row at the top.
ax = counts.plot(kind='barh')
ax.invert_yaxis()

# Legend sits outside the axes on the right-hand side.
legend = ax.legend(loc='center right', bbox_to_anchor=(1.3, 0.5), title='label')
print(legend)

In [0]:
# Rides per pickup month, one column per label value.
counts1 = x_temp.groupby(['pickup_month', 'label']).size().unstack(level='label')
print(counts1)

# Rides per drop-off month, one column per label value.
counts2 = x_temp.groupby(['drop_month', 'label']).size().unstack(level='label')
print(counts2)

# Number of distinct values in a handful of training features.
for column in (
    'pickup_month',
    'pickup_day',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'meter_waiting',
    'duration',
):
    print(x_train[column].nunique())

In [0]:
# Number of rides per additional_fare value, one column per label value.
grouped_sizes = x_temp.groupby(['additional_fare', 'label']).size()
counts = grouped_sizes.unstack(level='label')
print(counts)

# Horizontal bars, first row of the table at the top.
ax = counts.plot(kind='barh')
ax.invert_yaxis()

# Legend sits outside the axes on the right-hand side.
legend = ax.legend(loc='center right', bbox_to_anchor=(1.3, 0.5), title='label')
print(legend)

## Model creation and training

In [0]:
# Preprocessing pipeline
#
# Feature selection: only the columns named here reach the estimator; the
# ColumnTransformer below drops every other column.
numeric_columns = [
    'additional_fare', 'duration', 'meter_waiting',
    'pickup_day', 'pickup_hour', 'drop_hour',
    'meter_waiting_fare', 'meter_waiting_till_pickup',
    'distance', 'fare',
]
categorical_columns = []

# Numeric features: fill missing values with the column mean, then standardise.
numeric_preprocessing_steps = Pipeline(steps=[
    ('simple_imputer', SimpleImputer(strategy='mean')),
    ('standard_scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent value,
# then one-hot encode.
categorical_preprocessing_steps = Pipeline(steps=[
    ('categorical_imputer', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder()),
])

# Route each column group through its own sub-pipeline.
column_routes = [
    ("numeric", numeric_preprocessing_steps, numeric_columns),
    ("categorical", categorical_preprocessing_steps, categorical_columns),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder="drop")


In [0]:
# Split training and test data
# A stratified 80/20 split keeps the label proportions equal in both parts.
RANDOM_SEED = 6
X_train, X_test, Y_train, Y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    shuffle=True,
    stratify=y_train,
    random_state=RANDOM_SEED,
)

# Model definition
# The forest is seeded as well; without a seed every run builds different
# trees and the reported scores cannot be reproduced.
estimator = RandomForestClassifier(random_state=RANDOM_SEED)
#estimator =  GradientBoostingClassifier()
#estimator =  DecisionTreeClassifier(max_depth=11,min_samples_split=7,min_samples_leaf=6,random_state=0)

# Preprocessing followed by the classifier, fitted and applied as one unit.
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    #("feature_selection",SelectFromModel(LinearSVC(loss='l2', penalty='l1', dual=False,max_iter=1000))),
    ("estimator", estimator),
])

In [0]:
#hyper-parameter tuning for model selection

# NOTE: `weights` is not referenced by the grid below; it is kept so the
# name stays defined for any code that may rely on it.
weights = np.linspace(0.05, 0.95, 20)

# The `estimator__` prefix addresses the "estimator" step of full_pipeline.
param_grid={
        # Weight applied to class 0; class 1 is spelled out with weight 1,
        # which is what an omitted class receives anyway.
        'estimator__class_weight': [{0: w, 1: 1} for w in [1, 2, 4, 6, 10]],
        "estimator__max_depth": [3, None],
        # An integer min_samples_split must be at least 2; a value of 1 is
        # rejected by scikit-learn and every fit using it fails.
        "estimator__min_samples_split": [2, 3, 10],
        "estimator__min_samples_leaf": [1, 3, 10]
}

# Exhaustive cross-validated search on the training split; the best pipeline
# is then scored on the held-out split.
g_search = GridSearchCV(estimator=full_pipeline, param_grid = param_grid)
g_fit = g_search.fit(X_train, Y_train['label'])
best_clf = g_fit.best_estimator_
print(best_clf.score(X_test,Y_test['label']))

In [0]:
# Confusion matrix on the held-out split: rows are the predicted classes,
# columns the actual classes.
pred = best_clf.predict(X_test)
pd.crosstab(
    index=pred,
    columns=Y_test['label'],
    rownames=['predicted'],
    colnames=['actual'],
)

(3436,)
(3436, 1)


actual,0,1
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,178,19
1,158,3081


In [0]:
# Show the first rows of the test-set feature frame.
x_test.head()

## Generate output

In [0]:
# Generate the best model for the whole dataset.
# The search grid is named `param_grid`; no variable called `parameters`
# is defined anywhere in this notebook.
g_search = GridSearchCV(full_pipeline, param_grid)
g_fit = g_search.fit(x_train, y_train['label'])
best_clf = g_fit.best_estimator_

# generate the predictions for the test dataset
pred = best_clf.predict(x_test)
cls_lbls = pd.Series(pred)
prediction = pd.DataFrame(cls_lbls,columns=['prediction'])

# Pair every trip id with its predicted class. Both objects are matched on
# their row index.
output = pd.DataFrame()
output['tripid'] = tripid_test
output['prediction'] = prediction['prediction']

# Write the submission file without the index column.
output.to_csv('/content/drive/My Drive/Colab Notebooks/data/ml-rideshare/output-new-1.csv',index = False)