<img src="https://bit.ly/2VnXWr2" width="100" align="left">

# Project | ML: Job offers' Fraud-Detection with NLP

## Introduction

To gain experience working on an ML project as a group, and to learn more about NLP, we worked with the Kaggle dataset [[Real or Fake] Fake Job Posting Prediction](https://www.kaggle.com/shivamb/real-or-fake-fake-jobposting-prediction), which holds around 18K job descriptions, of which about 900 are fake. The data consists of both textual information and meta-information about the jobs.

### Objectives

Our main goal was to create a **classification model using text data features and meta-features to predict which job descriptions are fraudulent**. We also wanted to find out whether there are **key traits/features** (words, entities, phrases) of job descriptions that are **intrinsically fraudulent**.

### Imports

In [1]:
# Data Analysis
import pandas as pd
import numpy as np

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Data Visualization for text
from PIL import Image
from os import path
import os
import random
from wordcloud import WordCloud, STOPWORDS

# Text Processing
import re
import itertools
import spacy
import string
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm
from collections import Counter

# Machine Learning packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore noise warning
import warnings
warnings.filterwarnings('ignore')

# Work with pickles
import pickle

# Fix imbalance
from imblearn.under_sampling import InstanceHardnessThreshold

# Model training and evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

Using TensorFlow backend.


## 5. Hyperparameter Tuning of the Models

Although the metrics of the different models are really good, we can still improve their performance. Therefore, the parameters of each model have to be fine-tuned.

<img src="images/danger-explosives-sign.jpg" width="200"> 

In [82]:
# Deliberately halt execution at this point: per the message below, the cells
# that follow take a long time to complete and should be run on purpose.
raise SystemExit("Stop right there! The following cells takes some time to complete.")

SystemExit: Stop right there! The following cells takes some time to complete.

### KNN Tuning

In [None]:
# Untuned KNN estimator that serves as the base model for the grid search.
knn = KNeighborsClassifier()

# Search space: neighbour count (1..24), vote weighting, neighbour-lookup
# algorithm and distance metric.
params_knn = dict(
    n_neighbors=np.arange(1, 25),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'kd_tree', 'brute'],
    metric=['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
)

# The grid search is kept commented out in this notebook; according to the
# note below it was executed in Google Colab instead:
#   knn_gs = GridSearchCV(knn, params_knn, cv=5)
#   knn_gs.fit(X_train, y_train)
#   knn_best = knn_gs.best_estimator_
#   print(knn_gs.best_params_)


"""The code was executed in  google colab, the result is 

{'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 21, 'weights': 'distance'}"""

In [None]:
""" Without Tuning"""

# Reference run: KNN with its default hyperparameters, trained on the
# training data and evaluated on X_test / y_test.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Report the confusion matrix first, then the per-class metrics.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
""" With Tuning """

# Same evaluation as the untuned run, but with the hyperparameters that the
# KNN grid search reported as best.
knn = KNeighborsClassifier(
    algorithm='auto',
    metric='manhattan',
    n_neighbors=21,
    weights='distance',
)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

### RandomForest Tuning

In [None]:
# Base random forest for the grid search; the seed is pinned to 42.
rfc = RandomForestClassifier(random_state=42)

# Search space: number of trees (50..249), split criterion and the rule for
# how many features are considered at each split.
params_rfc = dict(
    n_estimators=np.arange(50, 250),
    criterion=['gini', 'entropy'],
    max_features=['sqrt', 'log2'],
)

# The grid search is kept commented out in this notebook; according to the
# note below it was executed in Google Colab instead:
#   rfc_gs = GridSearchCV(rfc, params_rfc, cv=5)
#   rfc_gs.fit(X_train, y_train)
#   rfc_best = rfc_gs.best_estimator_
#   print(rfc_gs.best_params_)

"""
The code was executed in  google colab, the result is
'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 163
"""

In [None]:
""" Without Tuning """

# Reference run: random forest with default hyperparameters (seed pinned),
# trained on the training data and evaluated on X_test / y_test.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
""" With Tuning """

# Same evaluation as the untuned run, but with the hyperparameters that the
# random forest grid search reported as best.
rfc = RandomForestClassifier(
    random_state=42,
    criterion='entropy',
    max_features='sqrt',
    n_estimators=163,
)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

### Decision Tree

In [None]:
# Base decision tree for the grid search; the seed is pinned to 42.
dtc = DecisionTreeClassifier(random_state = 42)

# Search space for the grid search. The key must be spelled 'splitter' (the
# DecisionTreeClassifier parameter name, as used in the recorded result
# below), and every entry needs its trailing comma for the dict to parse.
params_dtc = { 
                'class_weight' : ['balanced', None],
                'criterion' : ['gini','entropy'],
                'splitter' : ['random','best'],
                'max_features' : ['sqrt','log2'],
            }

# The grid search is kept commented out in this notebook; according to the
# note below it was executed in Google Colab instead.
#dtc_gs = GridSearchCV(dtc, params_dtc, cv=5)

#dtc_gs.fit(X_train, y_train)

#dtc_best = dtc_gs.best_estimator_

#print(dtc_gs.best_params_)

"""
The code was executed in  google colab, the result is
{'class_weight': 'balanced', 'criterion': 'gini', 'max_features': 'sqrt', 'splitter': 'best'}"""

### MLP Tuning

In [None]:
# Base multi-layer perceptron for the grid search: one hidden layer of 300
# units and a generous iteration cap.
mlp = MLPClassifier(max_iter = 10000, hidden_layer_sizes = (300,))

# Search space for the grid search. 'learning_rate' only accepts the three
# schedules listed here; 'minkowski' is a KNN distance metric and was removed
# because it is not a valid learning-rate schedule.
params_mlp = {
              'activation': ['identity', 'logistic', 'tanh', 'relu'], 
              'solver': ['lbfgs', 'sgd', 'adam'],
              'learning_rate': ['constant', 'invscaling', 'adaptive'],
            }

# The grid search is kept commented out in this notebook; according to the
# note below it was executed in Google Colab instead.
#mlp_gs = GridSearchCV(mlp, params_mlp, cv=5)

#mlp_gs.fit(X_train, y_train)

#mlp_best = mlp_gs.best_estimator_

# Print the MLP search result (previously referenced the KNN search object).
#print(mlp_gs.best_params_)


"""
The code was executed in  google colab, the result is

{'activation': 'tanh', 'learning_rate': 'constant', 'solver': 'lbfgs'}
"""

In [None]:
""" Without Tuning """

# Reference run: MLP with one hidden layer of 300 units and otherwise default
# hyperparameters, trained on the training data and evaluated on X_test / y_test.
mlp = MLPClassifier(max_iter=10000, hidden_layer_sizes=(300,))
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
""" With  Tuning """

# Same evaluation as the untuned run, but with the hyperparameters that the
# MLP grid search reported as best.
mlp = MLPClassifier(
    max_iter=10000,
    hidden_layer_sizes=(300,),
    activation='tanh',
    learning_rate='constant',
    solver='lbfgs',
)
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

### Final results

In [None]:
def baseline_report(model, X_train, X_test, y_train, y_test, name, random_state=None):
    """Fit ``model`` and summarise its performance as a one-row DataFrame.

    Accuracy, precision, recall, F1 and ROC AUC are the mean scores of a
    shuffled 5-fold stratified cross-validation on the training data. All five
    metrics are computed on the same folds in a single pass. Specificity is
    computed from the confusion matrix of the predictions on the test data.

    Assumes a binary target: the confusion matrix is unpacked into exactly
    four cells (tn, fp, fn, tp).

    Parameters
    ----------
    model : estimator
        Classifier to evaluate; it is fitted in place on the training data.
    X_train, y_train
        Training features and labels (used for cross-validation and fitting).
    X_test, y_test
        Test features and labels (used for the specificity only).
    name : str
        Label stored in the ``model`` column of the result.
    random_state : int or None, optional
        Seed for the fold shuffling. ``None`` (the default) keeps the folds
        random; pass an int for a reproducible report.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``model``, ``accuracy``, ``precision``,
        ``recall``, ``f1score``, ``rocauc`` and ``specificity``.
    """
    # Local import so this cell does not depend on editing the import cell.
    from sklearn.model_selection import cross_validate

    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    # One cross-validation run for all metrics: every score is measured on the
    # same folds, and the model is fitted 5 times instead of 25.
    cv_scores = cross_validate(
        model, X_train, y_train,
        cv=strat_k_fold,
        scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
    )
    accuracy  = np.mean(cv_scores['test_accuracy'])
    precision = np.mean(cv_scores['test_precision'])
    recall    = np.mean(cv_scores['test_recall'])
    f1score   = np.mean(cv_scores['test_f1'])
    rocauc    = np.mean(cv_scores['test_roc_auc'])

    # Cross-validation works on clones, so fit the model itself before
    # predicting on the test data.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # confusion_matrix expects (y_true, y_pred); with the arguments swapped,
    # fp and fn trade places and tn / (tn + fp) is no longer the specificity.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc'       : [rocauc],
                             'specificity': [specificity]
                            })
    return df_model

In [None]:
# Final candidates, each configured with the best hyperparameters recorded
# for its grid search in this notebook.
models = {
          'knn': KNeighborsClassifier(algorithm =  'auto', metric = 'manhattan', n_neighbors= 21, weights = 'distance'),
          # Uses the recorded decision tree search result (class_weight='balanced',
          # criterion='gini'); the previous values did not match that result.
          'decisiontree': DecisionTreeClassifier(random_state = 42, class_weight='balanced', criterion='gini', max_features='sqrt', splitter='best'),
          'randomforest': RandomForestClassifier(random_state = 42, criterion = 'entropy', max_features = 'sqrt', n_estimators = 163),
          'MLP': MLPClassifier(max_iter = 10000,hidden_layer_sizes = (300,),activation='tanh',learning_rate='constant',solver='lbfgs'),
         }

# One report row per model. ignore_index gives the combined frame a unique
# 0..n-1 index instead of repeating 0 for every row.
models_df = pd.concat(
    [baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()],
    ignore_index=True,
)
models_df