# Imports

In [None]:
#ignore unless on colab
#mounts Google Drive at /content/drive inside the Colab runtime

from google.colab import drive
drive.mount('/content/drive')

In [None]:
#ignore unless on colab
#fetches the NLTK 'stopwords' and 'punkt' resources and installs the extra packages used by this notebook

import nltk
nltk.download('stopwords')
nltk.download('punkt')
!pip install contractions
!pip install translators --upgrade

In [None]:
#main imports

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt

from collections import defaultdict
from collections import Counter
import contractions
import csv
import pandas as pd
import numpy as np
import pickle

# Text Parsing

This section parses the initial dataset and applies data preprocessing techniques including stemming, stopword removal, substituting contractions, and tokenization. We then choose specific features to focus on in the dataset, extract them, and convert them to Bag of Words and TF-IDF feature vectorizations. The dataset is also duplicated and augmented using SMOTE to account for the imbalance between fake and real job postings.

In [None]:
#set up tools and variables to properly process and store raw text

ps = PorterStemmer() #stemmer applied to every kept token
stop_words = set(stopwords.words('english')) #set gives O(1) membership tests while filtering tokens
documents = [] #placeholder; not referenced again in the visible notebook
Y = [] #placeholder; replaced by the 'fraudulent' label column once the dataset is loaded

### Using Pandas Dataframe (should be faster with larger dataset)

In [None]:
#read and process job posting dataset into pandas dataframe

DATA_PATH = 'input/fake_job_postings.csv'
print('\nLoading the file: \n', DATA_PATH)
df = pd.read_csv(DATA_PATH)
print('Loaded.')

#label column: 1 = fraudulent posting, 0 = real posting
Y = df["fraudulent"]

#raw string so \w reaches the regex engine instead of being treated as an (invalid) string escape
tokenizer = RegexpTokenizer(r'\w+')


def stem_text(text):
    """Return the stemmed, lower-cased, stopword-free tokens of one text field.

    Contractions are expanded first, the text is then split on runs of word
    characters, stopwords are dropped and the remaining tokens are stemmed.
    """
    return [ps.stem(word.lower()) for word in tokenizer.tokenize(contractions.fix(text)) if word.lower() not in stop_words]


# fill in null
for col in ["title", "company_profile", "description", "requirements"]:
  df[col] = df[col].fillna("")
  print("stemming", col)

  #apply text processing to each column (column-wise apply avoids building a row object per record)
  df['stem_'+ col] = df[col].apply(stem_text)

In [None]:
#check columns for features
#head(0) returns zero rows, so only the column headers are displayed

df.head(0)

In [None]:
# combining stemmed tokens into main dataset
# each stem_* cell holds a list of tokens, so + concatenates the four lists per row
df['documents'] = df['stem_title'] + df['stem_company_profile'] + df['stem_description'] + df['stem_requirements']
# turn each token list into one space-separated string, the input format the vectorizers expect
df['documents'] = df['documents'].str.join(' ')
df['documents']

In [None]:
#vectorize word tokens for use in model
#bag-of-words counts over 1- to 3-grams; min_df=0.01 drops terms that appear in fewer than 1% of documents

vectorizer = CountVectorizer(stop_words='english', min_df=0.01, ngram_range=(1,3))
X = vectorizer.fit_transform(df['documents']) #vectorize feature dataset
print('Feature data shape: ', X.shape, '\nLabel data shape:', Y.shape)

### Load from pickle

In [None]:
#alternatively load dataset and vectorizers from previous experimentation
#NOTE: unpickling can execute arbitrary code - only load files that this notebook generated

df = pd.read_pickle('../generated/dataframe.pkl')
#context managers close each file handle as soon as its object has been loaded
with open('../generated/vectorizer.pkl', 'rb') as fh:
    vectorizer = pickle.load(fh)
with open('../generated/Tfidf_vect.pkl', 'rb') as fh:
    Tfidf_vect = pickle.load(fh)
with open('../generated/X.pkl', 'rb') as fh:
    X = pickle.load(fh)
Y = df["fraudulent"]
print('Feature data shape: ', X.shape, '\nLabel data shape:', Y.shape)

### Generate datasets

In [None]:
#use SMOTE to augment fake job class data for balanced dataset

sm = SMOTE(random_state=0)
X_res, Y_res = sm.fit_resample(X,Y)
print('Resampled data shape: ', X_res.shape, Y_res.shape)
print('Resampled data class balance: %s' % Counter(Y_res))

#split into training and testing datasets for later
#fixed random_state so the split, and every score computed from it, is reproducible across runs
#NOTE(review): resampling happens before the split, so synthetic training rows may be interpolated from
#rows that end up in the test part - scores on X_res_test are probably optimistic; consider splitting first
X_res_train, X_res_test, Y_res_train, Y_res_test = train_test_split(X_res, Y_res, test_size=0.3, random_state=0)

In [None]:
#Use tfidf vectorizer to convert raw dataset into tfidf feature model and associated labels
#adapted from https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

#fixed random_state keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(df['documents'], Y, test_size=0.5, random_state=0)

#encode labels
Encoder = LabelEncoder()
Train_Y_Tfidf = Encoder.fit_transform(Y_train)
#reuse the mapping learned on the training labels; re-fitting on the test labels could assign different codes
Test_Y_Tfidf = Encoder.transform(Y_test)

#convert dataset to tfidf feature representation
Tfidf_vect = TfidfVectorizer()
#NOTE(review): the vocabulary and IDF weights are fitted on all documents, including the test part
Tfidf_vect.fit(df['documents'])
Train_X_Tfidf = Tfidf_vect.transform(X_train)
Test_X_Tfidf = Tfidf_vect.transform(X_test)

In [None]:
#split base dataset into training and testing data
#fixed random_state keeps the split reproducible; this replaces the text split held in X_train/X_test with BOW features

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=0)

### Dump to pickle

In [None]:
#store time consuming cells in pickle files to load later

df.to_pickle('../generated/dataframe.pkl')
#write each object inside a context manager so the file is flushed and closed before moving on
for path, obj in [
    ('../generated/vectorizer.pkl', vectorizer),
    ('../generated/X.pkl', X),
    ('../generated/Tfidf_vect.pkl', Tfidf_vect),
]:
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)

#### Language translation data augmentation

##### Unused

Ends up taking too long, likely because the library uses an outside server that rate-limits requests.

Originally meant to translate the raw data to another language and back as a method of generating "new" data to augment the dataset, as a solution for the imbalance in class distribution.

In [None]:
import translators as ts
import translators.server as tss
import random

language_list = ['cy', 'zh', 'ru', 'fr', 'ja', 'es', 'it', 'de', 'ko', 'el', 'ar']

#extract feature relevant dataframe for fraudulent job postings
#.copy() so the new column is written to an independent frame instead of a slice of df
fake_df = df[df.fraudulent == 1].copy()
#join the raw text fields with a space so the last word of one field does not fuse with the first word of the next
fake_df['tr_documents'] = fake_df['title'] + ' ' + fake_df['company_profile'] + ' ' + fake_df['description'] + ' ' + fake_df['requirements']

#iterate over the values directly: the filtered frame keeps df's row labels, so positions 0..n-1 are not valid labels
translated_documents = []
for entry in fake_df['tr_documents']:
    language = random.choice(language_list)

    #translate to random different language and back to generate similar entry
    entry = tss.google(entry, from_language='en', to_language=language)
    entry = tss.google(entry, from_language=language, to_language='en')
    #apply the same preprocessing as the main dataset (contractions, tokenize, stopwords, stemming)
    entry = ' '.join([ps.stem(word.lower()) for word in tokenizer.tokenize(contractions.fix(entry)) if not word.lower() in stop_words])
    translated_documents.append(entry)

#store the back-translated text next to the fraudulent rows it was generated from
fake_df['tr_documents'] = translated_documents

# Analysis

Data analysis to attempt to find important features that strongly correlate with fraudulency.

For boolean features, we generated a correlation matrix to find correlation with fraudulency over the overall dataset and each class subset.

For non-boolean features, we found the top 10 most commonly occurring values associated with fraudulency, then generated a bar plot of each feature value's percent fraudulency to compare.

A function is also created to uniformly test each generated model for accuracy according to various metrics.

In [None]:
import seaborn as sns

In [None]:
#dataset has large amounts of null values
#count of missing entries per column

df.isna().sum()

In [None]:
#for each categorical feature: the 10 values seen most often in fraudulent postings, and the
#fraction of all postings carrying each of those values that are fraudulent
categorical_cols = ['location', 'department', 'salary_range', 'employment_type', 'required_experience', 'industry']
test_df = df[categorical_cols + ['fraudulent']]
fraud_test_df = test_df[test_df.fraudulent == 1] #fraudulent subset of the selected columns

#feature name -> (top-10 fraudulent value counts, fraudulent share of each value sorted high to low)
feature_fraud_cor = {}
for col in categorical_cols:
    top_fraud_counts = fraud_test_df[col].value_counts().nlargest(10)
    #occurrences of those same values across the whole dataset
    has_top_value = test_df[col].isin(top_fraud_counts.index)
    overall_counts = test_df.loc[has_top_value, col].value_counts()
    fraud_share = (top_fraud_counts / overall_counts).sort_values(ascending=False)
    feature_fraud_cor[col] = (top_fraud_counts, fraud_share)

In [None]:
#show the most common fraudulent locations, then bar-plot the fraudulent fraction of each
location_counts, location_fraud_share = feature_fraud_cor['location']
print(location_counts)
location_fraud_share.plot.bar(xlabel='location')

In [None]:
#show the most common fraudulent departments, then bar-plot the fraudulent fraction of each
department_counts, department_fraud_share = feature_fraud_cor['department']
print(department_counts)
department_fraud_share.plot.bar(xlabel='department')

In [None]:
#show the most common fraudulent salary ranges, then bar-plot the fraudulent fraction of each
salary_counts, salary_fraud_share = feature_fraud_cor['salary_range']
print(salary_counts)
salary_fraud_share.plot.bar(xlabel='salary_range')

In [None]:
#show the most common fraudulent employment types, then bar-plot the fraudulent fraction of each
employment_counts, employment_fraud_share = feature_fraud_cor['employment_type']
print(employment_counts)
employment_fraud_share.plot.bar(xlabel='employment_type')

In [None]:
#show the most common fraudulent experience levels, then bar-plot the fraudulent fraction of each
experience_counts, experience_fraud_share = feature_fraud_cor['required_experience']
print(experience_counts)
experience_fraud_share.plot.bar(xlabel='required_experience')

In [None]:
#show the most common fraudulent industries, then bar-plot the fraudulent fraction of each
industry_counts, industry_fraud_share = feature_fraud_cor['industry']
print(industry_counts)
industry_fraud_share.plot.bar(xlabel='industry')

In [None]:
#split the boolean feature columns by class to look for patterns related to fraudulency

boolean_cols = ['telecommuting', 'has_company_logo', 'has_questions', 'fraudulent']
boolean_df = df[boolean_cols] #boolean features plus the label
is_real = boolean_df.fraudulent == 0
real_df = boolean_df[is_real] #rows labelled as real jobs
fake_df = boolean_df[~is_real] #every other row, i.e. fake jobs

In [None]:
#correlation matrix of the numeric (boolean-valued) columns and the fraudulency label
#numeric/bool columns are selected explicitly: pandas >= 2.0 no longer drops text columns silently in corr() and raises instead
corr = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr)
print(corr)

In [None]:
#extra row added to prevent zero division since all fraudulency entries are 0
fake_nan_row = pd.Series({'telecommuting':1, 'has_company_logo':1, 'has_questions':1, 'fraudulent':1})
#rebuilt from boolean_df so re-running the cell does not keep stacking extra rows;
#pd.concat is used because DataFrame.append was removed in pandas 2.0
real_df = pd.concat([boolean_df[boolean_df.fraudulent == 0], fake_nan_row.to_frame().T], ignore_index=True)

real_corr = real_df.corr() #correlation matrix of boolean features and real jobs
sns.heatmap(real_corr)
print(real_corr)

In [None]:
#extra row added to prevent zero division since all fraudulency entries are 1
real_nan_row = pd.Series({'telecommuting':0, 'has_company_logo':0, 'has_questions':0, 'fraudulent':0})
#rebuilt from boolean_df so re-running the cell does not keep stacking extra rows;
#pd.concat is used because DataFrame.append was removed in pandas 2.0
fake_df = pd.concat([boolean_df[boolean_df.fraudulent != 0], real_nan_row.to_frame().T], ignore_index=True)

fake_corr = fake_df.corr() #correlation matrix of boolean features and fake jobs
sns.heatmap(fake_corr)
print(fake_corr)

In [None]:
#function to output model score according to various metrics
#takes a model and train test split of data to run model predictions on
#returns model predictions and scoring on each metric as dicts

def model_score(model, X_train, X_test, Y_train, Y_test, k=100):
    """Print and return a fitted binary classifier's predictions and metric scores.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_train, X_test : feature data for the training and testing split.
    Y_train, Y_test : true labels matching ``X_train`` / ``X_test``.
    k : number of leading test rows used for the "precision over k entries"
        score (default 100, the previously hard-coded value).

    Returns
    -------
    (predictions, scores) : tuple of dicts
        ``predictions`` holds the raw model outputs ('training', 'testing',
        'testing_proba', 'k'); ``scores`` holds one float per metric
        ('training', 'testing', 'auc', 'balanced', 'precision',
        'k_precision', 'recall', 'f1').
    """
    training_predictions = model.predict(X_train) #evaluate training accuracy

    train_accuracy = metrics.accuracy_score(Y_train, training_predictions)
    print('Training accuracy: %.2f' % train_accuracy)

    test_predictions = model.predict(X_test) #evaluate testing accuracy

    test_accuracy = metrics.accuracy_score(Y_test, test_predictions)
    print('Testing accuracy: %.2f' % test_accuracy)

    #probability of the positive class (column 1) for every test row
    test_predict_proba = model.predict_proba(X_test)[:,1] #evaluate testing probabilities

    test_auc_score = metrics.roc_auc_score(Y_test, test_predict_proba)
    print('AUC value: %.2f' % test_auc_score)

    bal_score = metrics.balanced_accuracy_score(Y_test, test_predictions) #evaluate balanced score (average of recall scores)
    print('Balanced score: %.2f' % bal_score)

    precision = metrics.precision_score(Y_test, test_predictions) #evaluate precision score
    print('Precision: %.2f' % precision)

    #precision on the first k test rows; sliced from the predictions already computed
    #instead of running the model a second time on the same rows
    k_predictions = test_predictions[:k]
    k_precision = metrics.precision_score(Y_test[:k], k_predictions)
    print('Precision over %d entries: %.2f' % (k, k_precision))

    recall = metrics.recall_score(Y_test, test_predictions) #evaluate recall score
    print('Recall: %.2f' % recall)

    f1 = metrics.f1_score(Y_test, test_predictions) #evaluate f1 score
    print('F1: %.2f' % f1)

    #save predictions and scores to return
    predictions = {
        'training': training_predictions,
        'testing': test_predictions,
        'testing:': test_predictions, #legacy misspelled key, kept so existing lookups keep working
        'testing_proba': test_predict_proba,
        'k': k_predictions
    }

    scores = {
        'training': train_accuracy,
        'testing': test_accuracy,
        'auc': test_auc_score,
        'balanced': bal_score,
        'precision': precision,
        'k_precision': k_precision,
        'recall': recall,
        'f1': f1
    }

    return predictions, scores

# Logistic Regression

This section focuses on the baseline logistic regression model, experimenting with hyperparameter tuning, different feature representations, and imbalanced vs balanced dataset to compare each adjustment's effectiveness on model accuracy.

## Hyperparameter Tuning

This section tunes hyperparameters for logistic regression model using grid search on created hyperparameter matrix.

In [None]:
#establish hyperparameter matrix to be searched through
#C takes 5 log-spaced values from 1e-1 to 1e3; every listed solver is tried with each C

c_space = np.logspace(-1, 3, 5).tolist()
solvers = ['lbfgs','newton-cg','liblinear','sag','saga']
param_grid = {'C': c_space, 'solver': solvers}
param_grid

In [None]:
#run gridsearchCV to find best combination of hyperparameters over base logistic regression model

#only 10% of the data is kept for tuning (test_size=0.9) to limit runtime; random_state makes the subset reproducible
#NOTE: this overwrites X_train/X_test/Y_train/Y_test - regenerate the splits before training
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.9, random_state=0)
logreg = linear_model.LogisticRegression(fit_intercept=True, max_iter=10000)
log_grid = GridSearchCV(logreg, param_grid=param_grid, cv=3)
log_grid.fit(X_train, Y_train)

In [None]:
#extract optimal hyperparameter combination
#look the values up by name instead of relying on the order of best_params_.values()

logreg_best_c = log_grid.best_params_['C']
logreg_best_solver = log_grid.best_params_['solver']
print('c:', logreg_best_c, '\nsolver:', logreg_best_solver, '\nscore:', log_grid.best_score_)

### Already tuned

Tuned hyperparameters previously, initializing to optimized values

In [None]:
#values found by an earlier grid search run; set here so the tuning cells can be skipped
logreg_best_c = 0.1
logreg_best_solver = 'liblinear'

## Training

Train multiple logistic regression models using different combinations of various features. We compare model accuracy while adjusting optimal hyperparameters, usage of BOW vs tfidf feature models, and imbalanced vs augmented balanced dataset.

In [None]:
#regenerate tfidf feature representation and BOW dataset

#fixed random_state keeps both splits reproducible
X_train, X_test, Y_train, Y_test = train_test_split(df['documents'], Y, test_size=0.3, random_state=0)
Encoder = LabelEncoder()
Train_Y_Tfidf = Encoder.fit_transform(Y_train)
#reuse the mapping learned on the training labels; re-fitting on the test labels could assign different codes
Test_Y_Tfidf = Encoder.transform(Y_test)
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(df['documents'])
Train_X_Tfidf = Tfidf_vect.transform(X_train)
Test_X_Tfidf = Tfidf_vect.transform(X_test)

#from here on X_train/X_test hold the BOW features; the tfidf split lives in the *_Tfidf variables
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [None]:
#fit logistic regression on the BOW training split with the tuned C and solver

logreg_model_tuned = linear_model.LogisticRegression(
    C=logreg_best_c,
    solver=logreg_best_solver,
    penalty='l2',
    fit_intercept=True,
    max_iter=10000,
)
logreg_model_tuned.fit(X_train, Y_train)

In [None]:
#fit logistic regression on the BOW training split with default C and solver

logreg_model = linear_model.LogisticRegression(
    penalty='l2',
    fit_intercept=True,
    max_iter=10000,
)
logreg_model.fit(X_train, Y_train)

In [None]:
#fit logistic regression on the tfidf training split with the tuned C and solver

logreg_model_tuned_tfidf = linear_model.LogisticRegression(
    C=logreg_best_c,
    solver=logreg_best_solver,
    penalty='l2',
    fit_intercept=True,
    max_iter=10000,
)
logreg_model_tuned_tfidf.fit(Train_X_Tfidf, Train_Y_Tfidf)

In [None]:
#fit logistic regression on the tfidf training split with default C and solver

logreg_model_tfidf = linear_model.LogisticRegression(
    penalty='l2',
    fit_intercept=True,
    max_iter=10000,
)
logreg_model_tfidf.fit(Train_X_Tfidf, Train_Y_Tfidf)

In [None]:
#fit logistic regression on the SMOTE-resampled training split with the tuned C and solver

logreg_res_model_tuned = linear_model.LogisticRegression(
    C=logreg_best_c,
    solver=logreg_best_solver,
    penalty='l2',
    fit_intercept=True,
    max_iter=10000,
)
logreg_res_model_tuned.fit(X_res_train, Y_res_train)

In [None]:
#fit logistic regression on the SMOTE-resampled training split with default C and solver

logreg_res_model = linear_model.LogisticRegression(
    penalty='l2',
    fit_intercept=True,
    max_iter=10000,
)
logreg_res_model.fit(X_res_train, Y_res_train)

### Load from pickle
Load previously trained models and scores if available

In [None]:
#load the six logistic regression models and their score dict from disk
#NOTE: unpickling can execute arbitrary code - only load files that this notebook generated
logistic_pickle_paths = [
    '../generated/logistic_model_tuned.pkl',
    '../generated/logistic_model.pkl',
    '../generated/logistic_model_tuned_tfidf.pkl',
    '../generated/logistic_model_tfidf.pkl',
    '../generated/logistic_res_model_tuned.pkl',
    '../generated/logistic_res_model.pkl',
    '../generated/logistic_scores.pkl',
]
loaded_objects = []
for path in logistic_pickle_paths:
    #context manager closes every file handle right after its object is read
    with open(path, 'rb') as fh:
        loaded_objects.append(pickle.load(fh))

#unpack in the same order as the paths above
(logreg_model_tuned, logreg_model, logreg_model_tuned_tfidf, logreg_model_tfidf,
 logreg_res_model_tuned, logreg_res_model, logistic_scores) = loaded_objects

## Scoring

Output scoring of each logistic regression model to compare model effectiveness. Various metrics are used for a more complete picture of how each model performs.

In [None]:
#output scoring of models using BOW dataset to train

print('\n\nModels trained on original data:')

#keep each model's predictions and scores so the scores can be stored later
#BOW models are scored on the BOW split, tfidf models on the tfidf split
print('\nTuned logistic regression:')
logreg_tuned_preds, logreg_tuned_scores = model_score(logreg_model_tuned, X_train, X_test, Y_train, Y_test)
print('\nBase logistic regression:')
logreg_preds, logreg_scores = model_score(logreg_model, X_train, X_test, Y_train, Y_test)
print('\nTuned logistic regression using tfidf:')
logreg_tuned_tfidf_preds, logreg_tuned_tfidf_scores = model_score(logreg_model_tuned_tfidf, Train_X_Tfidf, Test_X_Tfidf, Train_Y_Tfidf, Test_Y_Tfidf)
print('\nBase logistic regression using tfidf:')
logreg_tfidf_preds, logreg_tfidf_scores = model_score(logreg_model_tfidf, Train_X_Tfidf, Test_X_Tfidf, Train_Y_Tfidf, Test_Y_Tfidf)

In [None]:
#output scoring on models using augmented dataset to train
print('\n\nModels trained on SMOTE resampled data:')

#keep each model's predictions and scores so the scores can be stored later
#the same two SMOTE-trained models are scored twice: on the resampled split, then on the original BOW split
print('\nTuned logistic regression on resampled data:')
logreg_res_tuned_preds_res, logreg_res_tuned_scores_res = model_score(logreg_res_model_tuned, X_res_train, X_res_test, Y_res_train, Y_res_test)
print('\nBase logistic regression on resampled data:')
logreg_res_preds_res, logreg_res_scores_res = model_score(logreg_res_model, X_res_train, X_res_test, Y_res_train, Y_res_test)
print('\nTuned logistic regression on original data:')
logreg_res_tuned_preds, logreg_res_tuned_scores = model_score(logreg_res_model_tuned, X_train, X_test, Y_train, Y_test)
print('\nBase logistic regression on original data:')
logreg_res_preds, logreg_res_scores = model_score(logreg_res_model, X_train, X_test, Y_train, Y_test)

In [None]:
#save scores for each model to dict
#keys with a trailing '_res' were scored on the resampled split, the others on the original split

logistic_scores = {
    'logreg_tuned': logreg_tuned_scores,
    'logreg': logreg_scores,
    'logreg_tuned_tfidf': logreg_tuned_tfidf_scores,
    'logreg_tfidf': logreg_tfidf_scores,
    'logreg_res_tuned_res': logreg_res_tuned_scores_res,
    'logreg_res_preds_res': logreg_res_scores_res,
    'logreg_res_tuned': logreg_res_tuned_scores,
    'logreg_res': logreg_res_scores
}

## Dump to pickle

In [None]:
#dump models and scoring to pickle file to load later

#write each object inside a context manager so the file is flushed and closed before moving on
for path, obj in [
    ('../generated/logistic_model_tuned.pkl', logreg_model_tuned),
    ('../generated/logistic_model.pkl', logreg_model),
    ('../generated/logistic_model_tuned_tfidf.pkl', logreg_model_tuned_tfidf),
    ('../generated/logistic_model_tfidf.pkl', logreg_model_tfidf),
    ('../generated/logistic_res_model_tuned.pkl', logreg_res_model_tuned),
    ('../generated/logistic_res_model.pkl', logreg_res_model),
    ('../generated/logistic_scores.pkl', logistic_scores),
]:
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)

## Top 5 pos/neg terms

Extract the top 5 most impactful terms from logistic regression model weights.

In [None]:
#pair every model weight with its vocabulary term and rank the pairs by weight
K=5
weights = logreg_res_model_tuned.coef_[0]
terms = sorted(vectorizer.vocabulary_.keys())
#ascending (weight, term) pairs; the array stores both columns as strings
sorted_weights_terms = np.array(sorted(zip(weights, terms)))

#largest weights come first once the ranking is reversed
descending_weights_terms = sorted_weights_terms[::-1]
topK_pos_weights = descending_weights_terms[:K, 0]
topK_pos_terms = descending_weights_terms[:K, 1]
#smallest weights are at the front of the ascending ranking
topK_neg_weights = sorted_weights_terms[:K, 0]
topK_neg_terms = sorted_weights_terms[:K, 1]

#print weights to 2 decimal places
print(f'\nThe {K} *most positive* weights')
for i in range(K):
    print(f"{i+1}: {topK_pos_terms[i]} \t  {round(float(topK_pos_weights[i]), 2)}")

print(f'\nThe {K} *most negative* weights')
for i in range(K):
    print(f"{i+1}: {topK_neg_terms[i]} \t  {round(float(topK_neg_weights[i]), 2)}")

# SVM 

This section focuses on the support vector machine model, where we experiment with hyperparameter tuning, BOW and tfidf feature representations, and imbalanced vs balanced dataset to compare model accuracy with different adjustments.

Base models and tfidf adapted from https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

In [None]:
from sklearn import svm

## Hyperparameter Tuning

Tune hyperparameters on all combinations of user generated hyperparameter matrix.

In [None]:
#take subset of base dataset to reduce runtime
#fixed random_state makes the tuning subset reproducible

#first split keeps 30% of the documents ...
X_train_tuning, X_test_tuning, Y_train_tuning, Y_test_tuning = train_test_split(df['documents'], Y, test_size=0.7, random_state=0)
#... which is then halved into the tuning train and test parts
X_train_tuning, X_test_tuning, Y_train_tuning, Y_test_tuning = train_test_split(X_train_tuning, Y_train_tuning, test_size=0.5, random_state=0)

In [None]:
#convert smaller dataset to tfidf feature representation
#max_features=5000 caps the vocabulary size; this replaces any Tfidf_vect created earlier
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['documents'])
Train_X_Tfidf_Tuning = Tfidf_vect.transform(X_train_tuning)
Test_X_Tfidf_Tuning = Tfidf_vect.transform(X_test_tuning)

In [None]:
#generate hyperparameter matrix
#4 kernels x 6 log-spaced C values (1e-2..1e3) x 6 log-spaced gamma values (1e-3..1e2)
kernels=['linear', 'poly', 'rbf', 'sigmoid']
c_range = np.logspace(-2, 3, 6).tolist()
gamma_range = np.logspace(-3, 2, 6).tolist()
param_grid = dict(C=c_range, kernel=kernels, gamma=gamma_range)

In [None]:
#use grid search to score all combinations of hyperparameters
#refit=True retrains the best combination on the whole tuning training set afterwards
svm_grid = GridSearchCV(svm.SVC(), param_grid=param_grid, refit=True)#, verbose=3)
svm_grid.fit(Train_X_Tfidf_Tuning, Y_train_tuning)

In [None]:
#extract best hyperparameters from grid search
#look the values up by name instead of relying on the order of best_params_.values()
svm_best_c = svm_grid.best_params_['C']
svm_best_gamma = svm_grid.best_params_['gamma']
svm_best_kernel = svm_grid.best_params_['kernel']
print('c:', svm_best_c, '\ngamma:', svm_best_gamma, '\nkernel:', svm_best_kernel, '\nscore:', svm_grid.best_score_)

### Already tuned

Tuned hyperparameters previously, initializing to optimized values

In [None]:
#values found by an earlier grid search run; set here so the tuning cells can be skipped
svm_best_c = 100.0
svm_best_gamma = 0.01
svm_best_kernel = 'rbf'

## Training

Train multiple support vector machines using various combinations of optimized hyperparameters, tfidf and BOW feature sets, and balanced dataset to see most effective adjustments.

In [None]:
#regenerate tfidf and BOW feature sets if needed

#fixed random_state keeps both splits reproducible
X_train, X_test, Y_train, Y_test = train_test_split(df['documents'], Y, test_size=0.3, random_state=0)

Encoder = LabelEncoder()
Train_Y_Tfidf = Encoder.fit_transform(Y_train)
#reuse the mapping learned on the training labels; re-fitting on the test labels could assign different codes
Test_Y_Tfidf = Encoder.transform(Y_test)

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['documents'])

Train_X_Tfidf = Tfidf_vect.transform(X_train)
Test_X_Tfidf = Tfidf_vect.transform(X_test)

#from here on X_train/X_test hold the BOW features; the tfidf split lives in the *_Tfidf variables
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [None]:
#default-parameter SVM fitted on the BOW training split (probability=True enables predict_proba for scoring)

SVM = svm.SVC(probability=True, random_state=0).fit(X_train, Y_train)

#same data, but with the tuned C / gamma / kernel
SVM_tuned = svm.SVC(
    C=svm_best_c,
    gamma=svm_best_gamma,
    kernel=svm_best_kernel,
    probability=True,
    random_state=0,
)
SVM_tuned.fit(X_train, Y_train)

In [None]:
#default-parameter SVM fitted on the tfidf training split
SVM_tfidf = svm.SVC(probability=True, random_state=0).fit(Train_X_Tfidf, Train_Y_Tfidf)

#same data, but with the tuned C / gamma / kernel
SVM_tuned_tfidf = svm.SVC(
    C=svm_best_c,
    gamma=svm_best_gamma,
    kernel=svm_best_kernel,
    probability=True,
    random_state=0,
)
SVM_tuned_tfidf.fit(Train_X_Tfidf, Train_Y_Tfidf)

In [None]:
#default-parameter SVM fitted on the SMOTE-resampled training split
SVM_res = svm.SVC(probability=True, random_state=0).fit(X_res_train, Y_res_train)

#same data, but with the tuned C / gamma / kernel
SVM_res_tuned = svm.SVC(
    C=svm_best_c,
    gamma=svm_best_gamma,
    kernel=svm_best_kernel,
    probability=True,
    random_state=0,
)
SVM_res_tuned.fit(X_res_train, Y_res_train)

### Load from pickle

Load previously trained models and scores if available

In [None]:
#load the six SVM models and their score dict from disk
#NOTE: unpickling can execute arbitrary code - only load files that this notebook generated
svm_pickle_paths = [
    '../generated/svm_model_tuned.pkl',
    '../generated/svm_model.pkl',
    '../generated/svm_model_tuned_tfidf.pkl',
    '../generated/svm_model_tfidf.pkl',
    '../generated/svm_res_model_tuned.pkl',
    '../generated/svm_res_model.pkl',
    '../generated/svm_scores.pkl',
]
loaded_objects = []
for path in svm_pickle_paths:
    #context manager closes every file handle right after its object is read
    with open(path, 'rb') as fh:
        loaded_objects.append(pickle.load(fh))

#unpack in the same order as the paths above
(SVM_tuned, SVM, SVM_tuned_tfidf, SVM_tfidf,
 SVM_res_tuned, SVM_res, SVM_scores) = loaded_objects

## Scoring

Output scoring for each SVM model to compare model effectiveness. Various metrics are used for a more complete picture of how each model performs.

In [None]:
#output scoring of models that used BOW dataset to train
#model_score returns (predictions, scores); unpack so the *_scores names hold only the metric dict

print('\n\nModels trained using original data')
print('\nBase SVM:')
svm_preds, svm_scores = model_score(SVM, X_train, X_test, Y_train, Y_test)

print('\nTuned SVM:')
svm_tuned_preds, svm_tuned_scores = model_score(SVM_tuned, X_train, X_test, Y_train, Y_test)

print('\nBase SVM using tfidf')
svm_tfidf_preds, svm_tfidf_scores = model_score(SVM_tfidf, Train_X_Tfidf, Test_X_Tfidf, Train_Y_Tfidf, Test_Y_Tfidf)

print('\nTuned SVM using tfidf')
svm_tuned_tfidf_preds, svm_tuned_tfidf_scores = model_score(SVM_tuned_tfidf, Train_X_Tfidf, Test_X_Tfidf, Train_Y_Tfidf, Test_Y_Tfidf)

In [None]:
#output scoring of models that used augmented dataset to train, evaluated on the resampled split
#model_score returns (predictions, scores); unpack so the *_scores names hold only the metric dict

print('\n\nModels trained using SMOTE resampled data')
print('\nBase SVM on resampled data')
svm_res_preds_res, svm_res_scores_res = model_score(SVM_res, X_res_train, X_res_test, Y_res_train, Y_res_test)

print('\nTuned SVM on resampled data')
svm_res_tuned_preds_res, svm_res_tuned_scores_res = model_score(SVM_res_tuned, X_res_train, X_res_test, Y_res_train, Y_res_test)

In [None]:
#output scoring of models that used augmented dataset to train, evaluated on the original BOW split
#model_score returns (predictions, scores); unpack so the *_scores names hold only the metric dict
print('\nBase SVM on original data')
svm_res_preds, svm_res_scores = model_score(SVM_res, X_train, X_test, Y_train, Y_test)

print('\nTuned SVM on original data')
svm_res_tuned_preds, svm_res_tuned_scores = model_score(SVM_res_tuned, X_train, X_test, Y_train, Y_test)

In [None]:
#save model scores to dict
SVM_scores = {
    'svm_tuned': svm_tuned_scores,
    'svm': svm_scores,
    'svm_tuned_tfidf': svm_tuned_tfidf_scores,
    'svm_tfidf': svm_tfidf_scores,
    'svm_res_tuned_res': svm_res_tuned_scores_res,
    'svm_res_preds_res': svm_res_scores_res,
    'svm_res_tuned': svm_res_tuned_scores,
    'svm_res': svm_res_scores
}

## Dump to pickle

Save SVM models and scores to pickle files to use in later experiments.

In [None]:
#dump models and scores to pickle files

#write each object inside a context manager so the file is flushed and closed before moving on
for path, obj in [
    ('../generated/svm_model.pkl', SVM),
    ('../generated/svm_model_tuned.pkl', SVM_tuned),
    ('../generated/svm_model_tfidf.pkl', SVM_tfidf),
    ('../generated/svm_model_tuned_tfidf.pkl', SVM_tuned_tfidf),
    ('../generated/svm_res_model.pkl', SVM_res),
    ('../generated/svm_res_model_tuned.pkl', SVM_res_tuned),
    ('../generated/svm_scores.pkl', SVM_scores),
]:
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)

# Stacking Ensemble

This section focuses on the stacking ensemble, an ensemble learning technique that combines predictions from multiple models according to a meta-estimator that decides how to combine the model predictions. We choose the best performing logistic regression and SVM models from previous experimentation, and combine them with an XGBoost classifier, using an Extra Trees classifier to find the best weighting for each of the lower level models. The ensemble is trained on the tfidf feature representation and the augmented BOW representation to compare effectiveness.

In [None]:
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import ExtraTreesClassifier

## Training

Selects multiple lower level classifiers whose predictions are fed to an extra trees classifier, then trains the stacking model on the tfidf feature representation and the augmented BOW representation.

In [None]:
#base estimators of the stack, keyed by the name the ensemble refers to them with
lower_classifiers = {
    "logreg": logreg_res_model,
    "xgboost": XGBClassifier(objective='binary:logistic', eval_metric='aucpr'),
    "svm": SVM_res_tuned,
}

#meta-estimator that combines the base estimators' outputs
aggregate_classifier = ExtraTreesClassifier()

In [None]:
#regenerate tfidf feature representation if needed

#fixed random_state keeps both splits reproducible
X_train, X_test, Y_train, Y_test = train_test_split(df['documents'], Y, test_size=0.3, random_state=0)

Encoder = LabelEncoder()
Train_Y_Tfidf = Encoder.fit_transform(Y_train)
#reuse the mapping learned on the training labels; re-fitting on the test labels could assign different codes
Test_Y_Tfidf = Encoder.transform(Y_test)

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['documents'])

Train_X_Tfidf = Tfidf_vect.transform(X_train)
Test_X_Tfidf = Tfidf_vect.transform(X_test)

#the stacking model only sees the tfidf training part, split again 40% train / 60% test;
#from here on X_train/X_test hold tfidf features and encoded labels
X_train, X_test, Y_train, Y_test = train_test_split(Train_X_Tfidf, Train_Y_Tfidf, test_size=0.6, random_state=0)

In [None]:
#stacking ensemble fitted on the tfidf split; passthrough=True also feeds the raw features to the meta-estimator
stacking_model_tfidf = StackingClassifier(
    estimators=list(lower_classifiers.items()),
    final_estimator=aggregate_classifier,
    stack_method="predict_proba",
    passthrough=True,
    verbose=2,
)
stacking_model_tfidf.fit(X_train, Y_train)

In [None]:
#stacking ensemble fitted on the SMOTE-resampled BOW split; passthrough=True also feeds the raw features to the meta-estimator
stacking_res_model = StackingClassifier(
    estimators=list(lower_classifiers.items()),
    final_estimator=aggregate_classifier,
    stack_method="predict_proba",
    passthrough=True,
    verbose=2,
)
stacking_res_model.fit(X_res_train, Y_res_train)

### Load from pickle

Load previously trained models and scores if available

In [None]:
#load stacking models and scores
#NOTE: unpickling can execute arbitrary code - only load files that this notebook generated

#context managers close each file handle as soon as its object has been loaded
with open('../generated/ensemble_model_tfidf.pkl', 'rb') as fh:
    stacking_model_tfidf = pickle.load(fh)
with open('../generated/ensemble_res_model.pkl', 'rb') as fh:
    stacking_res_model = pickle.load(fh)
with open('../generated/ensemble_scores.pkl', 'rb') as fh:
    ensemble_scores = pickle.load(fh)

## Scoring
Output scoring for each stacking model to compare model effectiveness. Various metrics are used for a more complete picture of how each model performs.

In [None]:
#print scores for each model
#model_score returns (predictions, scores); unpack so the *_scores names hold only the metric dict

print('Stacking ensemble using tfidf:')
stacking_model_tfidf_preds, stacking_model_tfidf_scores = model_score(stacking_model_tfidf, X_train, X_test, Y_train, Y_test)

print('\nStacking ensemble using SMOTE resampled data:')
#this model was fitted on the resampled BOW features, so it must be scored on the matching resampled split
#(X_train/X_test hold tfidf features at this point, which have a different feature space)
stacking_res_model_preds, stacking_res_model_scores = model_score(stacking_res_model, X_res_train, X_res_test, Y_res_train, Y_res_test)

In [None]:
#save model scores to dict
ensemble_scores = {
    'ensemble_tfidf': stacking_model_tfidf_scores,
    'ensemble_res': stacking_res_model_scores
}

In [None]:
#inspect the fitted ensemble: estimators_ follows the order the base classifiers were registered in
#(logreg, xgboost, svm), so index 1 is the XGBoost model
print(stacking_res_model.estimators_[0])
print(stacking_res_model.estimators_[1])
print(stacking_res_model.estimators_[2])
print(stacking_res_model.final_estimator_)
print(stacking_res_model.stack_method_)
print(stacking_res_model.estimators_[1].feature_importances_)

## Dump to pickle
Save stacking models and scores to pickle files to use in later experiments.

In [None]:
#dump models and score to pickle files
#write each object inside a context manager so the file is flushed and closed before moving on
for path, obj in [
    ('../generated/ensemble_model_tfidf.pkl', stacking_model_tfidf),
    ('../generated/ensemble_res_model.pkl', stacking_res_model),
    ('../generated/ensemble_scores.pkl', ensemble_scores),
]:
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)

# Bidirectional LSTM


## Imports

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

In [None]:
#load previously trained model if available

In [None]:
#restore the LSTM model from the directory the save cell at the end of this section writes to
lstm_model = keras.models.load_model('../generated/lstm_model')

## Training


In [None]:
df['documents']
voc_size=5000 # maximum vocabulary size kept by the text encoder
sent_length = 50 # every document is encoded to exactly this many token ids

# NOTE: X and Y are rebound here - X becomes the raw document strings (previously the BOW matrix)
# and Y a plain list, so earlier sections need their own cells re-run before being used again
X = df['documents'].values
Y = list(Y)

encoder = tf.keras.layers.TextVectorization(max_tokens=voc_size, output_sequence_length=sent_length) # Layer for vectorizing our text data
tf_dataset = tf.data.Dataset.from_tensor_slices((X, Y)) # tensorflow dataset over all rows; not referenced again in the visible notebook

In [None]:
# 75% / 25% train/test split with a fixed seed so the split is reproducible
X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, test_size=0.25, random_state=32) # split our dataset into training and test data

In [None]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

# shuffled, batched and prefetched training pipeline
tf_dataset_tr = (
    tf.data.Dataset.from_tensor_slices((X_tr,Y_tr))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# build the encoder vocabulary from the training texts only (labels are dropped by the map)
encoder.adapt(tf_dataset_tr.map(lambda text, label: text))

# the testing pipeline is assembled the same way
tf_dataset_te = (
    tf.data.Dataset.from_tensor_slices((X_te,Y_te))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# class weights inversely proportional to class frequency, scaled so a balanced dataset would give 1.0 each
total = len(X_tr)
neg = sum(1 for label in Y_tr if label == 0)
pos = sum(1 for label in Y_tr if label == 1)
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)

class_weights = {0: weight_for_0, 1: weight_for_1}

In [None]:
## Creating model
embedding_vector_features=50 # size of each learned word embedding
lstm_model=Sequential() # create model

lstm_model.add(encoder) # add encoder layer: raw strings -> fixed-length sequences of token ids

lstm_model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length)) # embedding layer mapping token ids to dense vectors
lstm_model.add(Bidirectional(LSTM(100))) # LSTM layer that processes data in both directions
lstm_model.add(Dropout(0.3)) # prevent overfitting with dropout layer
lstm_model.add(Dense(1,activation='sigmoid')) # single sigmoid unit: probability of the fraudulent class
lstm_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

# fit for 12 epochs; batch_size is not passed because the tf.data datasets are already batched
# class_weight compensates for the class imbalance
# NOTE(review): the test split doubles as validation data here, so validation metrics are not an independent estimate
lstm_model.fit(tf_dataset_tr, validation_data=tf_dataset_te, epochs=12, class_weight=class_weights)
# summary() prints the table itself and returns None, so it is not wrapped in print()
lstm_model.summary()

In [None]:
# Model Performance
# threshold the sigmoid outputs at 0.5 to obtain hard 0/1 class predictions
Y_pred = (lstm_model.predict(X_te) > 0.5).astype("int32")

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# compare the thresholded LSTM predictions against the true test labels
print(confusion_matrix(Y_te,Y_pred))
print(classification_report(Y_te, Y_pred))

In [None]:
# persist the trained model to the path the load cell at the top of this section reads from
lstm_model.save('../generated/lstm_model')