In [11]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin,clone
from sklearn.pipeline import make_pipeline, FeatureUnion

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
import seaborn as sns
import pickle
import cloudpickle
import os

In [12]:
def _print_scores(split_name, y_true, y_pred):
    """Print accuracy, precision, recall and F1 score for one data split.

    Args:
        split_name: Prefix used on every printed line (e.g. "Train", "Test").
        y_true: Ground-truth labels.
        y_pred: Labels predicted for the same rows as ``y_true``.
    """
    # Compute everything first so a failing metric never leaves a partial report.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1score = f1_score(y_true, y_pred)

    print(split_name + " Accuracy   :  ", accuracy)
    print(split_name + " Precision  :  ", precision)
    print(split_name + " Recall     :  ", recall)
    print(split_name + " f1score    :  ", f1score)


def build_and_train(df_features,df_labels):
    """Merge features with labels, balance the classes and fit an XGBoost classifier.

    Steps performed:
      1. Sort both frames by ``bookingID`` and left-merge labels onto features.
      2. Replace the ``second`` column with ``hours`` (``second / 3600``).
      3. Balance the two label classes with ``downSampling``.
      4. Drop ``label`` and ``bookingID`` from the inputs, split 70/30
         (``random_state=70``) and fit an ``XGBClassifier``.
      5. Print accuracy / precision / recall / F1 for the train and test splits.

    Args:
        df_features: DataFrame that must contain ``bookingID`` and ``second``;
            every remaining column is used as a model input.
        df_labels: DataFrame that must contain ``bookingID``; a ``label``
            column must be present after the merge.

    Returns:
        The fitted ``XGBClassifier``.
    """
    df_sort_labels = df_labels.sort_values(['bookingID'])
    df_sort_features = df_features.sort_values(['bookingID'])
    # Left merge keeps every feature row; rows without a matching label get NaN.
    df_merge = pd.merge(df_sort_features, df_sort_labels, on='bookingID', how='left')
    df_merge['hours'] = df_merge['second'] / (60 * 60)
    df_merge = df_merge.drop(['second'], axis=1)
    print(df_merge.label.value_counts())
    print(df_merge.columns)

    df_downsampled = downSampling(df_merge)

    # bookingID is only a join key and label is the target: neither is a model input.
    X = df_downsampled.drop(['label', 'bookingID'], axis=1)
    Y = df_downsampled['label']

    # NOTE(review): the split is made on individual rows after bookingID has been
    # dropped, so if several rows share a bookingID they can end up in both the
    # train and the test set — confirm this is intended.
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=70)

    # NOTE(review): `silent` appears to be deprecated in favour of `verbosity` in
    # newer XGBoost releases — confirm against the installed version before changing.
    model = XGBClassifier(silent=0,
                          scale_pos_weight=1,
                          learning_rate=0.3,
                          colsample_bytree=0.8,
                          subsample=0.8,
                          objective='binary:logistic',
                          n_estimators=300,
                          reg_alpha=0.3,
                          max_depth=6,
                          min_child_weight=3,
                          n_jobs=-1,
                          gamma=5)

    model.fit(X_train, y_train)

    _print_scores("Train", y_train, model.predict(X_train))
    _print_scores("Test", y_test, model.predict(X_test))

    return model
    

In [13]:
def downSampling(df_merge):
    """Balance the binary ``label`` column by downsampling the larger class.

    Rows whose ``label`` is neither 0 nor 1 (including NaN) are not carried
    over into the result. The larger class is sampled without replacement
    (``random_state=0``) down to the size of the smaller one; on a tie,
    label 0 is treated as the class to resample.

    Args:
        df_merge: DataFrame with a ``label`` column.

    Returns:
        DataFrame holding all rows of the smaller class followed by the
        sampled rows of the larger class. Prints the label counts before and
        after balancing.
    """
    print(df_merge.label.value_counts())
    df_merge_label0 = df_merge[df_merge.label == 0]
    df_merge_label1 = df_merge[df_merge.label == 1]

    # Do not assume label 0 is the majority: asking resample(replace=False) for
    # more rows than the frame holds raises, so always shrink the larger class.
    if len(df_merge_label0) >= len(df_merge_label1):
        df_majority, df_minority = df_merge_label0, df_merge_label1
    else:
        df_majority, df_minority = df_merge_label1, df_merge_label0

    df_majority_downsample = resample(df_majority,
                                      replace=False,                # sample without replacement
                                      n_samples=len(df_minority),   # match the smaller class
                                      random_state=0)               # reproducible results

    # Combine the untouched smaller class with the downsampled larger class.
    df_down = pd.concat([df_minority, df_majority_downsample])
    print("After downsampling")
    print(df_down.label.value_counts())
    return df_down

In [14]:
def listDir(dir):
    """Read every file in a directory as CSV and concatenate them into one DataFrame.

    Files are processed in sorted name order so the row order of the result
    does not depend on the order in which the operating system lists them.
    The name and absolute path of each file are printed as it is read.

    Args:
        dir: Path of the directory to read. (The name shadows the built-in
            ``dir``; it is kept so existing callers are unaffected.)

    Returns:
        A DataFrame with the rows of all files stacked and a fresh 0..n-1 index.

    Raises:
        ValueError: If the directory contains no entries.
    """
    # Collect frames locally: the function must not depend on (or leak into)
    # a list defined by the caller at module level.
    frames = []
    for fileName in sorted(os.listdir(dir)):
        filePath = os.path.abspath(os.path.join(dir, fileName))
        print('File Name:' + fileName)
        print('Folder Path: ' + filePath)
        frames.append(pd.read_csv(filePath, index_col=None, header=0))

    if not frames:
        raise ValueError('No files found in directory: ' + str(dir))

    return pd.concat(frames, axis=0, ignore_index=True)

In [None]:
if __name__ == '__main__':

    # Raw strings take single backslashes; doubling them inside r'...' puts
    # literal '\\' pairs into the path.
    Feature_Path = r'C:\Users\bindu\Desktop\Safety\safety\features'
    Label_Path = r'C:\Users\bindu\Desktop\Safety\safety\labels'
    Model_Dir = r'E:\pickle'
    filename = 'Safety_Model_v3.pkl'

    # Make sure the output directory exists before the (long) training run,
    # so a missing directory fails here rather than after the model is fitted.
    os.makedirs(Model_Dir, exist_ok=True)

    # listDir may accumulate frames in the module-level `li`; start each
    # directory load from an empty list.
    li = []
    df_features = listDir(Feature_Path)
    li = []
    df_labels = listDir(Label_Path)

    grid = build_and_train(df_features, df_labels)

    # Persist the fitted model with cloudpickle.
    with open(os.path.join(Model_Dir, filename), 'wb') as file:
        cloudpickle.dump(grid, file)
    print("model dumped with filename", filename)

File Name:part-00000-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
Folder Path: C:\Users\bindu\Desktop\Safety\safety\features\part-00000-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
File Name:part-00001-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
Folder Path: C:\Users\bindu\Desktop\Safety\safety\features\part-00001-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
File Name:part-00002-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
Folder Path: C:\Users\bindu\Desktop\Safety\safety\features\part-00002-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
File Name:part-00003-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
Folder Path: C:\Users\bindu\Desktop\Safety\safety\features\part-00003-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
File Name:part-00004-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
Folder Path: C:\Users\bindu\Desktop\Safety\safety\features\part-00004-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
File Name:part-00005-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
Folder Path: C:\Us