In [1]:
import os

We have already raw data exists in data/raw folder. Now process the raw data.

In [3]:
get_processed_data_script_file=os.path.join(os.path.pardir,'src','data','get_processed_data.py')

In [59]:
%%writefile $get_processed_data_script_file

import os
import logging
import numpy as np
import pandas as pd

def read_data():
    '''
    This method reads raw data and assign to Data Frame
    '''
    #set the path for raw data
    raw_data_path = os.path.join(os.path.pardir, 'data','raw')
    fraud_data_file_path = os.path.join(raw_data_path, 'fraud_data.csv')
    fraud_ip_country_file_path = os.path.join(raw_data_path, 'IpAddress_to_Country.xlsx')
    
    #Read the data
    df_fraud = pd.read_csv(fraud_data_file_path,index_col=0)
    df_ip_country = pd.read_excel(fraud_ip_country_file_path)
    s = pd.Series(df_ip_country['country'].values, pd.IntervalIndex.from_arrays(df_ip_country['lower_bound_ip_address'], df_ip_country['upper_bound_ip_address']))
    df_fraud['country'] = df_fraud['ip_address'].map(s)
    return df_fraud

def processed_data(df_fraud):
    '''
    This method process all the data manipulations on data set including feature negineering
    '''
    df_fraud.country.fillna('UnKnown',inplace=True)
    df_fraud['signup_time'] = df_fraud.signup_time.apply(pd.to_datetime)#pd.to_datetime(df_fraud.signup_time)
    df_fraud['purchase_time'] = df_fraud.purchase_time.apply(pd.to_datetime)#pd.to_datetime(df_fraud.purchase_time)

    # it is very suspicious if a user signup and then immediately purchase
    df_fraud['time_diff'] = (df_fraud.purchase_time - df_fraud.signup_time).apply(lambda x: x.seconds)
    
    # Count the number of unique user ids associated each device
    df_fraud['userids_per_ipaddress'] = df_fraud.groupby('ip_address')['user_id'].transform('count')

    # Count the number of unique user ids associated each ip address
    df_fraud['userids_per_deviceid'] = df_fraud.groupby('device_id')['user_id'].transform('count')
    
    # Adding age_bin transformation to df_fraud data frame
    df_fraud['age_bin']= pd.qcut(df_fraud.age,3,labels=['Age18-30','Age30-50','Age41+'])

    # Adding purchase_bin transformation to df_fraud data frame
    df_fraud['purchase_bin']=pd.qcut(df_fraud['purchase_value'],4,labels=['low','medium','high','very_high'])
    
    # Add column for the average of the userids_per_deviceid,userids_per_ipaddress
    df_fraud["mean_number_of_ip_device_userids"] = (df_fraud.userids_per_deviceid + df_fraud.userids_per_ipaddress) * 0.5
    
    # day of the week
    df_fraud['signup_time_dow'] = pd.to_datetime(df_fraud['signup_time']).dt.dayofweek
    df_fraud['purchase_time_dow'] = pd.to_datetime(df_fraud['purchase_time']).dt.dayofweek
    
    # week of the year
    df_fraud['signup_time_week'] = pd.to_datetime(df_fraud['signup_time']).dt.week
    df_fraud['purchase_time_week'] = pd.to_datetime(df_fraud['purchase_time']).dt.week
    
    #Hour of the day
    df_fraud['signup_hour_of_day'] = pd.to_datetime(df_fraud['purchase_time']).dt.hour
    df_fraud['purchase_hour_of_day'] = pd.to_datetime(df_fraud['signup_time']).dt.hour
    
    return df_fraud


def write_data(df):
    processed_data_path = os.path.join(os.path.pardir,'data','processed')
    write_fraud_data_path = os.path.join(processed_data_path,'df_fraud_data.csv')
    df.to_csv(write_fraud_data_path)

if __name__=='__main__':
    df=read_data()
    df = processed_data(df)
    write_data(df)
    

Overwriting ..\src\data\get_processed_data.py


In [30]:
!python $get_processed_data_script_file

In [31]:
get_visualized_data_script_file=os.path.join(os.path.pardir,'src','visualization','visualize.py')

In [61]:
%%writefile $get_visualized_data_script_file

import os
import logging
import numpy as np
import pandas as pd

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns

import  warnings
warnings.simplefilter('ignore')


def hist_visualization(df_fraud):
    '''
    Histogram for distribution
    '''
    f, hist_fig=plt.subplots(1,2, figsize=(20,5))

    sns.distplot(df_fraud['purchase_value'], hist=True, kde=True, 
                 bins=int(30), color = 'orange', 
                 hist_kws={'edgecolor':'skyblue'},
                 kde_kws={'linewidth': 2}, ax=hist_fig[0])
    hist_fig[0].set_title('Purchase Histogram - Right skewed 0.67')
    hist_fig[0].set_xlabel('Bins')
    hist_fig[0].set_ylabel('Frequency')

    sns.distplot(df_fraud.age, hist=True, kde=True, 
                 bins=int(30), color = 'green', 
                 hist_kws={'edgecolor':'white'},
                 kde_kws={'linewidth': 2}, ax=hist_fig[1])
    hist_fig[1].set_title('Age : Histogram - Right skewed 0.42')
    hist_fig[1].set_xlabel('Bins')
    hist_fig[1].set_ylabel('Frequency')
    plt.show()

def outliers_visualization(df_fraud):
    '''
    Out liers detection
    '''
    f, sub_fig=plt.subplots(1,3, figsize=(20,5))
    sub_fig[0].scatter(df_fraud['purchase_value'],df_fraud.age,c='c', alpha=0.5)
    sub_fig[0].set_title('Purchase vs Age')
    sub_fig[0].set_xlabel('Purchase Value')
    sub_fig[0].set_ylabel('Age')

    sub_fig[1].boxplot(df_fraud.age)
    sub_fig[1].set_title('Age : Box plot')

    sub_fig[2].boxplot(df_fraud['purchase_value'])
    sub_fig[2].set_title('Purchase Value : Box plot')

    plt.tight_layout
    plt.show()

def other_visualization(df_fraud):
    '''
    Other visualizations - age, sex, class, purchase value, source, browser
    '''
    f, rel_fig=plt.subplots(3,3, figsize=(20,15))
    sns.countplot(x='sex', hue='class', data=df_fraud,ax=rel_fig[0,0])

    sns.catplot(x="class", y="age", kind="box", hue='sex', data=df_fraud, ax=rel_fig[0,1]);

    sns.catplot(x="class", y="purchase_value", kind="box", hue='sex', data=df_fraud, ax=rel_fig[0,2]);

    #sns.catplot(x="class", y="purchase_value", kind="box", hue='sex', data=df_fraud, ax=rel_fig[1,0]);

    sns.scatterplot(x="purchase_value", y="age", hue="class", data=df_fraud, ax=rel_fig[1,0]);

    sns.countplot(x='source', hue='class', data=df_fraud,ax=rel_fig[1,1])

    sns.countplot(x='browser', hue='class', data=df_fraud,ax=rel_fig[1,2])

    sns.factorplot(x="class", y="userids_per_deviceid", data=df_fraud, ax=rel_fig[2,0])

    sns.factorplot(x="class", y="userids_per_ipaddress", data=df_fraud, ax=rel_fig[2,1])

    h=sns.factorplot(x="class", y="time_diff", data=df_fraud, ax=rel_fig[2,2])

    plt.close(2)
    plt.close(3)
    plt.close(4)
    plt.close(5)
    plt.close(6)
    plt.tight_layout()
    plt.show()

def signup_purchase_visualization(df_fraud):
    '''
    signup hour of day, sign up week, purchase hour, purchase week
    '''
    f, time_fig=plt.subplots(2,3, figsize=(20,10))
    sns.countplot(x='signup_hour_of_day', hue='class', data=df_fraud, ax=time_fig[0,0])
    sns.countplot(x='signup_time_dow', hue='class', data=df_fraud, ax=time_fig[0,1])
    sns.countplot(x='signup_time_week', hue='class', data=df_fraud, ax=time_fig[0,2])

    sns.countplot(x='purchase_hour_of_day', hue='class', data=df_fraud, ax=time_fig[1,0])
    sns.countplot(x='purchase_time_dow', hue='class', data=df_fraud, ax=time_fig[1,1])
    sns.countplot(x='purchase_time_week', hue='class', data=df_fraud, ax=time_fig[1,2])

    plt.close(2)
    plt.close(3)
    plt.tight_layout()
    plt.show()

def read_data():
    '''
    This method reads raw data and assign to Data Frame
    '''
    #set the path for raw data
    processed_data_path = os.path.join(os.path.pardir, 'data','processed')
    fraud_data_file_path = os.path.join(processed_data_path, 'df_fraud_data.csv')
    
    #Read the processed data
    df_fraud = pd.read_csv(fraud_data_file_path,index_col=0)
    #print(df_fraud.head(2))
    return df_fraud

if __name__=='__main__':
    df=read_data()
    hist_visualization(df)
    outliers_visualization(df)
    other_visualization(df)
    signup_purchase_visualization(df)

Overwriting ..\src\visualization\visualize.py


In [53]:
!python $get_visualized_data_script_file

Figure(2000x500)
Figure(2000x500)
Figure(2000x1500)
Figure(2000x1000)


In [54]:
get_feature_engineering_data_script_file=os.path.join(os.path.pardir,'src','features','build_features.py')

In [60]:
%%writefile $get_feature_engineering_data_script_file

import os
import logging
import numpy as np
import pandas as pd

def feature_engineering(df_fraud):
    col_list = ['source','browser','sex','age_bin','purchase_bin','country','time_diff','mean_number_of_ip_device_userids','class']
    df_fraud = df_fraud[col_list]
    # Feature Engineering - create dummy variables for categorical features - onehot encoding
    final_fraud = pd.get_dummies(df_fraud,['source','browser','sex','age_bin','purchase_bin','country'])
    
    return final_fraud

def read_data():
    '''
    This method reads data and assign to Data Frame
    '''
    #set the path for raw data
    processed_data_path = os.path.join(os.path.pardir, 'data','processed')
    fraud_data_file_path = os.path.join(processed_data_path, 'df_fraud_data.csv')
    
    #Read the processed data
    df_fraud = pd.read_csv(fraud_data_file_path,index_col=0)
    #print(df_fraud.head(2))
    return df_fraud

def write_data(df):
    processed_features_data_path = os.path.join(os.path.pardir,'data','processed')
    write_fraud_data_feature_engineered_path = os.path.join(processed_features_data_path,'df_fraud_feature_engineered_data.csv')
    df.to_csv(write_fraud_data_feature_engineered_path)

if __name__=='__main__':
    df = read_data()
    df = feature_engineering(df)
    write_data(df)

Overwriting ..\src\features\build_features.py


In [57]:
!python $get_feature_engineering_data_script_file

In [63]:
process_dataset_data_script_file=os.path.join(os.path.pardir,'src','data','dataset_split.py')

In [79]:
%%writefile $process_dataset_data_script_file

import os
import logging
import numpy as np
import pandas as pd


def process_no_target_dataset(df):
    # Data Set with no class feature
    X = df[ [col for col in df.columns if col != "class"] ]
    return X

def process_target_dataset(df):
    # Data set with only class feature
    Y = df[ [col for col in df.columns if col == "class"] ]#df["class"]
    return Y

def write_data(df,csv_file):
    processed_dataset_data_path = os.path.join(os.path.pardir,'data','processed')
    write_fraud_data_feature_engineered_path = os.path.join(processed_dataset_data_path,csv_file)
    df.to_csv(write_fraud_data_feature_engineered_path)

def read_data():
    '''
    This method reads data and assign to Data Frame
    '''
    #set the path for raw data
    processed_data_path = os.path.join(os.path.pardir, 'data','processed')
    fraud_data_featured_file_path = os.path.join(processed_data_path, 'df_fraud_feature_engineered_data.csv')
    
    #Read the processed data
    df_fraud = pd.read_csv(fraud_data_featured_file_path,index_col=0)
    #print(df_fraud.head(2))
    return df_fraud
    
if __name__=='__main__':
    df = read_data()
    df_X = process_no_target_dataset(df)
    df_Y = process_target_dataset(df)
    write_data(df_X,'df_fraud_no_target_data.csv')
    write_data(df_Y,'df_fraud_target_data.csv')


Overwriting ..\src\data\dataset_split.py


In [80]:
!python $process_dataset_data_script_file

In [81]:
feature_imp_data_script_file=os.path.join(os.path.pardir,'src','features','feature_imp.py')

In [84]:
%%writefile $feature_imp_data_script_file

import os
import logging
import numpy as np
import pandas as pd

# Modeling imports
from sklearn import linear_model
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.feature_selection import SelectFromModel
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score,precision_recall_curve 
from sklearn.dummy import DummyClassifier


import  warnings
warnings.simplefilter('ignore')


def feature_importances(X,Y):
    '''
    Feature importance using Extra tree classifier
    '''
    model = ExtraTreesClassifier()
    model.fit(X, Y)
    feature_imp=model.feature_importances_
    print(feature_imp)

def read_data():
    '''
    This method reads data and assign to Data Frame
    '''
    #set the path for raw data
    processed_data_path = os.path.join(os.path.pardir, 'data','processed')
    fraud_data_no_target_file_path = os.path.join(processed_data_path, 'df_fraud_no_target_data.csv')
    fraud_data_target_file_path = os.path.join(processed_data_path, 'df_fraud_target_data.csv')
    
    #Read the processed data
    df_fraud_x = pd.read_csv(fraud_data_no_target_file_path,index_col=0)
    df_fraud_y = pd.read_csv(fraud_data_target_file_path, index_col=0)
    #print(df_fraud.head(2))
    return df_fraud_x,df_fraud_y
    
if __name__=='__main__':
    X, Y = read_data()
    feature_importances(X,Y)


Overwriting ..\src\features\feature_imp.py


In [85]:
!python $feature_imp_data_script_file

[4.30429516e-01 4.29584452e-01 4.65513131e-03 2.91400038e-03
 4.18905061e-03 4.40818956e-03 5.05775245e-03 3.14822164e-03
 2.25021219e-03 4.92235268e-03 4.63087563e-03 4.29692988e-03
 5.89738067e-03 5.77768689e-03 5.94916464e-03 7.03874786e-03
 6.22864143e-03 6.69758414e-03 5.83401415e-03 5.20533915e-05
 3.26747844e-06 4.15977375e-04 4.58395112e-05 5.28781183e-07
 8.09509672e-04 1.23360431e-04 1.22356726e-03 5.54557289e-04
 9.43725231e-05 2.47302517e-07 1.35846039e-06 9.76843337e-05
 3.25364533e-06 8.20330561e-05 5.09780375e-04 9.90559739e-07
 2.46505776e-07 7.34865840e-08 2.22299589e-06 1.71425744e-04
 8.27582620e-05 1.88091699e-06 1.54703904e-03 3.11856566e-08
 3.07763734e-07 1.06132442e-04 1.63499174e-07 5.84922676e-06
 2.99891885e-07 1.77682251e-03 9.11495710e-08 3.41486665e-07
 5.37492535e-04 2.13595115e-03 7.51547029e-04 1.13682112e-06
 4.54526117e-07 1.88627247e-04 7.24264232e-07 1.94547707e-04
 2.01080621e-06 8.18379594e-07 4.52924636e-05 4.43932864e-04
 5.52306623e-04 3.623445

  from numpy.core.umath_tests import inner1d


In [86]:
train_test_split_data_script_file=os.path.join(os.path.pardir,'src','data','train_test_split.py')

In [93]:
%%writefile $train_test_split_data_script_file

import os
import logging
import numpy as np
import pandas as pd

# Modeling imports
from sklearn import linear_model
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.feature_selection import SelectFromModel
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score,precision_recall_curve 
from sklearn.dummy import DummyClassifier


import  warnings
warnings.simplefilter('ignore')


def train_test_split_data(X,Y):
    '''
    Train Test Split
    '''
    X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=1)
    
    return X_train, X_test, Y_train, Y_test

def read_data():
    '''
    This method reads data and assign to Data Frame
    '''
    #set the path for raw data
    processed_data_path = os.path.join(os.path.pardir, 'data','processed')
    fraud_data_no_target_file_path = os.path.join(processed_data_path, 'df_fraud_no_target_data.csv')
    fraud_data_target_file_path = os.path.join(processed_data_path, 'df_fraud_target_data.csv')
    
    #Read the processed data
    df_fraud_x = pd.read_csv(fraud_data_no_target_file_path,index_col=0)
    df_fraud_y = pd.read_csv(fraud_data_target_file_path, index_col=0)
    #print(df_fraud.head(2))
    return df_fraud_x,df_fraud_y

def write_data(df,csv_file):
    processed_dataset_data_path = os.path.join(os.path.pardir,'data','processed')
    write_train_test_split_path = os.path.join(processed_dataset_data_path,csv_file)
    df.to_csv(write_train_test_split_path)
    
if __name__=='__main__':
    X, Y = read_data()
    X_train,X_test,Y_train,Y_test = train_test_split_data(X,Y)
    write_data(X_train, 'X_train_data.csv')
    write_data(X_test, 'X_test_data.csv')
    write_data(Y_train, 'Y_train_data.csv')
    write_data(Y_test, 'Y_test_data.csv')


Overwriting ..\src\data\train_test_split.py


In [90]:
!python $train_test_split_data_script_file

  from numpy.core.umath_tests import inner1d


In [91]:
build_model_data_script_file=os.path.join(os.path.pardir,'src','models','build_model.py')

In [133]:
%%writefile $build_model_data_script_file

import os
import logging
import numpy as np
import pandas as pd

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling imports
from sklearn import linear_model
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.feature_selection import SelectFromModel
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score,precision_recall_curve 
from sklearn.dummy import DummyClassifier
import pickle

import  warnings
warnings.simplefilter('ignore')

def read_data():
    '''
    This method reads data and assign to Data Frame
    '''
    #set the path for raw data
    processed_data_path = os.path.join(os.path.pardir, 'data','processed')
    x_train_file_path = os.path.join(processed_data_path, 'X_train_data.csv')
    x_test_file_path = os.path.join(processed_data_path, 'X_test_data.csv')
    y_train_file_path = os.path.join(processed_data_path, 'Y_train_data.csv')
    y_test_file_path = os.path.join(processed_data_path, 'Y_test_data.csv')
    
    #Read the processed data
    X_train = pd.read_csv(x_train_file_path,index_col=0)
    X_test = pd.read_csv(x_test_file_path,index_col=0)
    Y_train = pd.read_csv(y_train_file_path,index_col=0)
    Y_test = pd.read_csv(y_test_file_path,index_col=0)
    
    return X_train, X_test, Y_train, Y_test

def baseline_model(X_train, X_test, Y_train, Y_test):
    model_dummy = DummyClassifier(strategy='most_frequent',random_state=0)
    model_dummy.fit(X_train,Y_train)
    print('score for baseline model: {0:.2f}'.format(model_dummy.score(X_test,Y_test)))
    # Performance Metrics
    print('accuracy for Baseline model: {0:.2f}'.format(accuracy_score(Y_test,model_dummy.predict(X_test))))

    # Confusion Matrix
    print('Confusion matrix for Baseline model: \n {0}'.format(confusion_matrix(Y_test,model_dummy.predict(X_test))))

    # Precision and Recall scores
    print('Precision for baseline Model: {0:.2f}'.format(precision_score(Y_test, model_dummy.predict(X_test))))
    print('Recall for baseline Model: {0:.2f}'.format(recall_score(Y_test, model_dummy.predict(X_test))))

def random_forest(X_train,X_test,Y_train,Y_test):
    rf = RandomForestClassifier()
    rf_model = rf.fit(X_train, Y_train)

    # Predicting the results
    y_pred = rf_model.predict(X_test)
    
    #Evaluating
    # Evaluating
    model_test_score = rf_model.score(X_test,Y_test)
    conf_matrix = confusion_matrix(Y_test, y_pred)
    print ('Random Forest MODEL TEST SCORE: {0:.5f}'.format(model_test_score))
    print("Random Forest ACCURACY: {0:.2f}".format(accuracy_score(Y_test, y_pred)))
    print("Random Forest ROC-AUC: {0:.2f}".format(roc_auc_score(Y_test, y_pred)))
    print("Random Forest PRECISION: {0:.2f}".format(precision_score(Y_test, y_pred)))
    print("Random Forest RECALL: {0:.2f}".format(recall_score(Y_test, y_pred)))
    print("Random Forest Confusion Matrix:\n",conf_matrix)
    print ('\nRandom Forest True Negatives: ', conf_matrix[0,0])
    print ('Random Forest False Negatives: ', conf_matrix[1,0])
    print ('Random Forest True Positives: ', conf_matrix[1,1])
    print ('Random Forest False Positives: ', conf_matrix[0,1])
    
    return rf_model

def XGBoost(X_train,X_test,Y_train,Y_test):
    xgb = XGBClassifier()

    # Fitting the model
    xgb_model = xgb.fit(X_train, Y_train)

    # Predicting results
    y_pred = xgb_model.predict(X_test)
    
    #Evaluation
    model_train_score = xgb_model.score(X_train,Y_train)
    model_test_score = xgb_model.score(X_test,Y_test)
    conf_matrix = confusion_matrix(Y_test, y_pred)
    print ('XGBoost MODEL TEST SCORE: {0:.5f}'.format(model_train_score))
    print ('XGBoost MODEL TEST SCORE: {0:.5f}'.format(model_test_score))
    print("XGBoost ACCURACY: {0:.2f}".format(accuracy_score(Y_test, y_pred)))
    print("XGBoost ROC-AUC: {0:.2f}".format(roc_auc_score(Y_test, y_pred)))
    print("XGBoost PRECISION: {0:.2f}".format(precision_score(Y_test, y_pred)))
    print("XGBoost RECALL: {0:.2f}".format(recall_score(Y_test, y_pred)))
    print("XGBoost Confusion Matrix:\n",conf_matrix)
    print ('\nXGBoost True Negatives: ', conf_matrix[0,0])
    print ('XGBoost False Negatives: ', conf_matrix[1,0])
    print ('XGBoost True Positives: ', conf_matrix[1,1])
    print ('XGBoost False Positives: ', conf_matrix[0,1])
    return xgb_model

def model_persistance_xgb(xgb):
    #Create file path
    model_file_path = os.path.join(os.path.pardir,'models','xgb_model.pkl')
    
    #Open file to write
    model_file_pickle = open(model_file_path,'wb')
    
    #model persist
    pickle.dump(xgb,model_file_pickle)
    
    model_file_pickle.close()
    

def model_persistance_rf(rf):
    #Create file path
    model_file_path = os.path.join(os.path.pardir,'models','rf_model.pkl')
    
    #Open file to write
    model_file_pickle = open(model_file_path,'wb')
    
    #model persist
    pickle.dump(rf,model_file_pickle)
    
    model_file_pickle.close()

def validation_auc_roc(xgb_model,X_test,Y_test):
    y_predd=xgb_model.predict_proba(X_test)
    p = plot_validation_roc(Y_test,y_predd)
    roc_auc = roc_auc_score(Y_test, y_predd[:,1])
    plt.figure(figsize=(10,5))
    plt.plot(p.FPR,p.TPR, color='orange', label='ROC curve (area = %0.2f)'%roc_auc)
    plt.xlim([-0.02, 1])
    plt.ylim([0, 1.02])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.plot([0, 1], [0, 1], color='skyblue', lw=2, linestyle='--',label='Random guess')
    plt.legend(loc="lower right",frameon=False)

def plot_validation_roc(Y_test, y_predd):
    fpr,tpr,thresholds = roc_curve(Y_test,y_predd[:,1])
    return pd.DataFrame({'FPR':fpr,'TPR':tpr,'Threshold':thresholds})

if __name__=='__main__':
    X_train,X_test,Y_train,Y_test = read_data()
    #baseline_model(X_train,X_test,Y_train,Y_test)
    rf=random_forest(X_train,X_test,Y_train,Y_test)
    xgb_model = XGBoost(X_train,X_test,Y_train,Y_test)
    validation_auc_roc(xgb_model,X_test,Y_test)
    model_persistance_rf(rf)
    model_persistance_xgb(xgb_model)
    

Overwriting ..\src\models\build_model.py


In [134]:
!python $build_model_data_script_file

Random Forest MODEL TEST SCORE: 0.94154
Random Forest ACCURACY: 0.94
Random Forest ROC-AUC: 0.77
Random Forest PRECISION: 0.76
Random Forest RECALL: 0.55
Random Forest Confusion Matrix:
 [[21349   395]
 [ 1008  1248]]

Random Forest True Negatives:  21349
Random Forest False Negatives:  1008
Random Forest True Positives:  1248
Random Forest False Positives:  395
XGBoost MODEL TEST SCORE: 0.95640
XGBoost MODEL TEST SCORE: 0.95592
XGBoost ACCURACY: 0.96
XGBoost ROC-AUC: 0.77
XGBoost PRECISION: 1.00
XGBoost RECALL: 0.53
XGBoost Confusion Matrix:
 [[21744     0]
 [ 1058  1198]]

XGBoost True Negatives:  21744
XGBoost False Negatives:  1058
XGBoost True Positives:  1198
XGBoost False Positives:  0


  from numpy.core.umath_tests import inner1d


In [132]:
predict_data_script_file=os.path.join(os.path.pardir,'src','models','predict_model.py')

In [143]:
%%writefile $predict_data_script_file

import os
import logging
import numpy as np
import pandas as pd

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling imports
from sklearn import linear_model
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.feature_selection import SelectFromModel
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score,precision_recall_curve 
from sklearn.dummy import DummyClassifier
import pickle

import  warnings
warnings.simplefilter('ignore')

def read_data():
    '''
    This method reads data and assign to Data Frame
    '''
    #set the path for raw data
    processed_data_path = os.path.join(os.path.pardir, 'data','processed')
    x_train_file_path = os.path.join(processed_data_path, 'X_train_data.csv')
    x_test_file_path = os.path.join(processed_data_path, 'X_test_data.csv')
    y_train_file_path = os.path.join(processed_data_path, 'Y_train_data.csv')
    y_test_file_path = os.path.join(processed_data_path, 'Y_test_data.csv')
    
    #Read the processed data
    X_train = pd.read_csv(x_train_file_path,index_col=0)
    X_test = pd.read_csv(x_test_file_path,index_col=0)
    Y_train = pd.read_csv(y_train_file_path,index_col=0)
    Y_test = pd.read_csv(y_test_file_path,index_col=0)
    
    return X_train, X_test, Y_train, Y_test

def predict(X_test,Y_test):
    model_file_path = os.path.join(os.path.pardir,'models','xgb_model.pkl')
    model_file_pickle = open(model_file_path, 'rb')
    xgb_model = pickle.load(model_file_pickle)
    predicted_results = xgb_model.predict(X_test)
    return predicted_results


def write_data(df):
    predicted_results_data_path = os.path.join(os.path.pardir,'data','processed')
    write_predicted_results_path = os.path.join(predicted_results_data_path,'predicted_results_X_test.csv')
    df.to_csv(write_predicted_results_path)
    
if __name__=='__main__':
    X_train,X_test,Y_train,Y_test = read_data()
    predicted_results = predict(X_test,Y_test)
    write_data(pd.DataFrame(predicted_results))
    

Overwriting ..\src\models\predict_model.py


In [144]:
!python $predict_data_script_file

  from numpy.core.umath_tests import inner1d
