In [11]:
import pandas as pd
import numpy as np

from imblearn.under_sampling import NearMiss, RandomUnderSampler, InstanceHardnessThreshold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
result = pd.read_csv('result2.csv')
result.drop('Unnamed: 0', axis =1, inplace= True)

In [3]:
result.head()

Unnamed: 0,employment_type_contract,employment_type_full-time,employment_type_other,employment_type_part-time,employment_type_temporary,required_experience_associate,required_experience_entry level,required_experience_executive,required_experience_internship,required_experience_other,...,Country_GR,Country_OTHER,Country_US,telecommuting,has_company_logo,has_questions,fraudulent,salary_range,0,1
0,0,0,1,0,0,0,0,0,1,0,...,0,0,1,0,1,0,0,0.0,24.71429,22.090221
1,0,1,0,0,0,0,0,0,0,1,...,0,1,0,0,1,0,0,1.0,22.080013,20.904112
2,0,0,1,0,0,0,0,0,0,1,...,0,0,1,0,1,0,0,0.0,19.897823,15.733032
3,0,1,0,0,0,1,0,0,0,0,...,0,0,1,0,1,0,0,0.0,14.740032,29.069805
4,0,1,0,0,0,1,0,0,0,0,...,0,0,1,0,1,1,0,0.0,24.179564,27.688435


In [4]:
threshold = InstanceHardnessThreshold(random_state =42)

X_rus, y_rus = threshold.fit_resample(result.drop('fraudulent', axis = 1), result['fraudulent'])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_rus, y_rus, test_size=0.3, random_state=42, stratify=y_rus)

# KNN tuning

In [6]:
#create new a knn model
knn = KNeighborsClassifier()

#create a dictionary of all values we want to test for n_neighbors
params_knn = {'n_neighbors': np.arange(1, 25), 
              'weights': ['uniform', 'distance'], 
              'algorithm': ['auto', 'kd_tree', 'brute'],
              'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
            }

#knn_gs = GridSearchCV(knn, params_knn, cv=5)

#knn_gs.fit(X_train, y_train)

#knn_best = knn_gs.best_estimator_

#print(knn_gs.best_params_)


"""The code was executed in  google colab, the result is 

{'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 21, 'weights': 'distance'}"""

"The code was executed in  google colab, the result is \n\n{'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 21, 'weights': 'distance'}"

In [12]:
""" Without Tuning"""

knn = KNeighborsClassifier()
y_pred = knn.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[3502    9]
 [  48  212]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3511
           1       0.96      0.82      0.88       260

    accuracy                           0.98      3771
   macro avg       0.97      0.91      0.94      3771
weighted avg       0.98      0.98      0.98      3771



In [8]:
""" With Tuning """

knn = KNeighborsClassifier(algorithm =  'auto', metric = 'manhattan', n_neighbors= 21, weights = 'distance')
y_pred = knn.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[3505    6]
 [  46  214]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3511
           1       0.97      0.82      0.89       260

    accuracy                           0.99      3771
   macro avg       0.98      0.91      0.94      3771
weighted avg       0.99      0.99      0.99      3771



# MLP tuning

In [10]:
mlp = MLPClassifier(max_iter = 10000, hidden_layer_sizes = (100,))

params_mlp = {
              'activation': ['identity', 'logistic', 'tanh', 'relu'], 
              'solver': ['lbfgs', 'sgd', 'adam'],
              'learning_rate': ['constant', 'invscaling', 'adaptive', 'minkowski'],
            }

#mlp_gs = GridSearchCV(mlp, params_mlp, cv=5)

#mlp_gs.fit(X_train, y_train)

#mlp_best = mlp_gs.best_estimator_

#print(knn_gs.best_params_)


"""
The code was executed in  google colab, the result is

{'activation': 'tanh', 'learning_rate': 'adaptive', 'solver': 'lbfgs'}
"""

In [16]:
""" Without Tuning """

mlp = MLPClassifier(max_iter = 10000)
y_pred = mlp.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[3499   12]
 [  56  204]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3511
           1       0.94      0.78      0.86       260

    accuracy                           0.98      3771
   macro avg       0.96      0.89      0.92      3771
weighted avg       0.98      0.98      0.98      3771



In [17]:
""" With  Tuning """

mlp = MLPClassifier(max_iter = 10000, hidden_layer_sizes = (100,),activation='tanh',learning_rate='adaptive',solver='lbfgs')
y_pred = knn.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[3502    9]
 [  48  212]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3511
           1       0.96      0.82      0.88       260

    accuracy                           0.98      3771
   macro avg       0.97      0.91      0.94      3771
weighted avg       0.98      0.98      0.98      3771



In [None]:
""" Now we will find the best hidde_layer_sizes """

In [None]:
mlp = MLPClassifier(max_iter = 10000,activation='tanh',learning_rate='adaptive',solver='lbfgs')

params_mlp = {
              'hidden_layer_sizes' : (np.arange(100,400),)
            }

#mlp_gs = GridSearchCV(mlp, params_mlp, cv=5)

#mlp_gs.fit(X_train, y_train)

#mlp_best = mlp_gs.best_estimator_

#print(mlp_gs.best_params_)

# Random Forest Tuning

In [None]:
rfc = RandomForestClassifier(random_state = 42)

params_rfc = { 
                'n_estimators' : np.arrage(50,250)
                'criterion' : ['gini','entropy']
                'max_features' : ['int','float','auto','log2']
            }

#rfc_gs = GridSearchCV(rfc, params_rfc, cv=5)

#rfc_gs.fit(X_train, y_train)

#rfc_best = rfc_gs.best_estimator_

#print(rfc_gs.best_params_)