In [26]:
import pandas as pd
import numpy as np

from imblearn.under_sampling import NearMiss, RandomUnderSampler, InstanceHardnessThreshold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


In [2]:
result = pd.read_csv('result.csv')
result.drop('Unnamed: 0', axis =1, inplace= True)

In [3]:
result.head()

Unnamed: 0,employment_type_contract,employment_type_full-time,employment_type_other,employment_type_part-time,employment_type_temporary,required_experience_associate,required_experience_entry level,required_experience_executive,required_experience_internship,required_experience_other,...,Country_GR,Country_OTHER,Country_US,telecommuting,has_company_logo,has_questions,fraudulent,salary_range,0,1
0,0,0,1,0,0,0,0,0,1,0,...,0,0,1,0,1,0,0,0.0,-5.284934,-2.366045
1,0,1,0,0,0,0,0,0,0,1,...,0,1,0,0,1,0,0,1.0,3.447525,0.811648
2,0,0,1,0,0,0,0,0,0,1,...,0,0,1,0,1,0,0,0.0,0.357541,4.447133
3,0,1,0,0,0,1,0,0,0,0,...,0,0,1,0,1,0,0,0.0,9.69858,-3.118088
4,0,1,0,0,0,1,0,0,0,0,...,0,0,1,0,1,1,0,0.0,6.679002,-4.354322


In [4]:
threshold = InstanceHardnessThreshold(random_state =42)

X_rus, y_rus = threshold.fit_resample(result.drop('fraudulent', axis = 1), result['fraudulent'])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_rus, y_rus, test_size=0.2, random_state=42, stratify=y_rus)

# KNN tuning

In [6]:
#create new a knn model
knn = KNeighborsClassifier()

#create a dictionary of all values we want to test for n_neighbors
params_knn = {'n_neighbors': np.arange(1, 25), 
              'weights': ['uniform', 'distance'], 
              'algorithm': ['auto', 'kd_tree', 'brute'],
              'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
            }

#knn_gs = GridSearchCV(knn, params_knn, cv=5)

#knn_gs.fit(X_train, y_train)

#knn_best = knn_gs.best_estimator_

#print(knn_gs.best_params_)


"""The code was executed in  google colab, the result is 

{'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 21, 'weights': 'distance'}"""

"The code was executed in  google colab, the result is \n\n{'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 21, 'weights': 'distance'}"

In [7]:
""" Without Tuning"""

knn = KNeighborsClassifier()
y_pred = knn.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[2331   10]
 [  23  150]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2341
           1       0.94      0.87      0.90       173

    accuracy                           0.99      2514
   macro avg       0.96      0.93      0.95      2514
weighted avg       0.99      0.99      0.99      2514



In [8]:
""" With Tuning """

knn = KNeighborsClassifier(algorithm =  'auto', metric = 'manhattan', n_neighbors= 21, weights = 'distance')
y_pred = knn.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[2337    4]
 [  24  149]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2341
           1       0.97      0.86      0.91       173

    accuracy                           0.99      2514
   macro avg       0.98      0.93      0.95      2514
weighted avg       0.99      0.99      0.99      2514



# MLP tuning

In [9]:
mlp = MLPClassifier(max_iter = 10000, hidden_layer_sizes = (300,))

params_mlp = {
              'activation': ['identity', 'logistic', 'tanh', 'relu'], 
              'solver': ['lbfgs', 'sgd', 'adam'],
              'learning_rate': ['constant', 'invscaling', 'adaptive', 'minkowski'],
            }

#mlp_gs = GridSearchCV(mlp, params_mlp, cv=5)

#mlp_gs.fit(X_train, y_train)

#mlp_best = mlp_gs.best_estimator_

#print(knn_gs.best_params_)


"""
The code was executed in  google colab, the result is

{'activation': 'tanh', 'learning_rate': 'constant', 'solver': 'lbfgs'}
"""

"\nThe code was executed in  google colab, the result is\n\n{'activation': 'tanh', 'learning_rate': 'adaptive', 'solver': 'lbfgs'}\n"

In [10]:
""" Without Tuning """

mlp = MLPClassifier(max_iter = 10000,hidden_layer_sizes = (300,))
y_pred = mlp.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[2335    6]
 [  16  157]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2341
           1       0.96      0.91      0.93       173

    accuracy                           0.99      2514
   macro avg       0.98      0.95      0.96      2514
weighted avg       0.99      0.99      0.99      2514



In [28]:
""" With  Tuning """

mlp = MLPClassifier(max_iter = 10000,hidden_layer_sizes = (300,),activation='tanh',learning_rate='constant',solver='lbfgs')
y_pred = mlp.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[2335    6]
 [  14  159]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2341
           1       0.96      0.92      0.94       173

    accuracy                           0.99      2514
   macro avg       0.98      0.96      0.97      2514
weighted avg       0.99      0.99      0.99      2514



# Random Forest Tuning

In [16]:
rfc = RandomForestClassifier(random_state = 42)

params_rfc = { 
                'n_estimators' : np.arange(50,250),
                'criterion' : ['gini','entropy'],
                'max_features' : ['sqrt','log2']
            }

#rfc_gs = GridSearchCV(rfc, params_rfc, cv=5)

#rfc_gs.fit(X_train, y_train)

#rfc_best = rfc_gs.best_estimator_

#print(rfc_gs.best_params_)

"""
The code was executed in  google colab, the result is
'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 163
"""

"\nThe code was executed in  google colab, the result is\n'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 163\n"

In [17]:
""" Without Tuning """

rfc = RandomForestClassifier(random_state = 42)
y_pred = rfc.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[2340    1]
 [  23  150]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2341
           1       0.99      0.87      0.93       173

    accuracy                           0.99      2514
   macro avg       0.99      0.93      0.96      2514
weighted avg       0.99      0.99      0.99      2514





In [20]:
""" With Tuning """

rfc = RandomForestClassifier(random_state = 42, criterion = 'entropy', max_features = 'sqrt', n_estimators = 163)
y_pred = rfc.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[2340    1]
 [  19  154]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2341
           1       0.99      0.89      0.94       173

    accuracy                           0.99      2514
   macro avg       0.99      0.94      0.97      2514
weighted avg       0.99      0.99      0.99      2514



# Decision Tree

In [None]:
dtc = DecisionTreeClassifier(random_state = 42)

params_dtc = { 
                'class_weight' : ['balanced', None],
                'criterion' : ['gini','entropy'],
                'spliter' : ['random','best']
                'max_features' : ['sqrt','log2']
            }

#dtc_gs = GridSearchCV(dtc, params_dtc, cv=5)

#dtc_gs.fit(X_train, y_train)

#dtc_best = dtc_gs.best_estimator_

#print(dtc_gs.best_params_)

"""
The code was executed in  google colab, the result is
{'class_weight': 'balanced', 'criterion': 'gini', 'max_features': 'sqrt', 'splitter': 'best'}"""

In [37]:
""" Without Tuning """

dtc = DecisionTreeClassifier(random_state = 42)
y_pred = dtc.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[2327   14]
 [  21  152]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2341
           1       0.92      0.88      0.90       173

    accuracy                           0.99      2514
   macro avg       0.95      0.94      0.94      2514
weighted avg       0.99      0.99      0.99      2514



In [36]:
dtc = DecisionTreeClassifier(random_state = 42, class_weight= 'balanced', criterion='gini', max_features='sqrt', splitter='best')
y_pred = dtc.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



[[2333    8]
 [  22  151]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2341
           1       0.95      0.87      0.91       173

    accuracy                           0.99      2514
   macro avg       0.97      0.93      0.95      2514
weighted avg       0.99      0.99      0.99      2514



# Final Result

In [23]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score

def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1'))
    rocauc       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc'))
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_pred, y_test).ravel()
    specificity = tn / (tn+fp)
    #logloss      = log_loss(y_test, y_pred)   # SVC & LinearSVC unable to use cvs

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc'       : [rocauc],
                             'specificity': [specificity]
                             #'logloss'      : [logloss]
                            })   # timetaken: to be used for comparison later
    return df_model

In [32]:
models = {#'gnb': GaussianNB(),
          #'bnb': BernoulliNB(),
          #'mnb': MultinomialNB(),
          #'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(algorithm =  'auto', metric = 'manhattan', n_neighbors= 21, weights = 'distance'),
          'decisiontree': DecisionTreeClassifier(random_state = 42, class_weight= None, criterion='entropy', max_features='sqrt', splitter='best'),
          'randomforest': RandomForestClassifier(random_state = 42, criterion = 'entropy', max_features = 'sqrt', n_estimators = 163),
          #'svc': SVC(probability=True),
          #'linearsvc': LinearSVC(),
          #'xgboost': GradientBoostingClassifier(),
          #'NN': MLPClassifier(),
           'MLP': MLPClassifier(max_iter = 10000,hidden_layer_sizes = (300,),activation='tanh',learning_rate='constant',solver='lbfgs')
         }

models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()])

In [33]:
models_df

Unnamed: 0,model,accuracy,precision,recall,f1score,rocauc,specificity
0,knn,0.983785,0.99266,0.786446,0.87387,0.979688,0.989835
0,decisiontree,0.982393,0.890846,0.829799,0.863942,0.906658,0.990618
0,randomforest,0.98866,0.983298,0.854249,0.903277,0.996181,0.991946
0,MLP,0.987466,0.935201,0.864415,0.899598,0.985741,0.99235
