<a href="https://colab.research.google.com/github/mtlmh34/AML_Project/blob/main/grp%20proj1.12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files

In [None]:
# Open the Colab upload widget; presumably used to upload the
# 'fake_job_postings.csv' file that is read below -- confirm.
files.upload()

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model, tree, model_selection, ensemble, svm, metrics, naive_bayes
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

In [None]:
# Load the job-postings dataset from the working directory into `df`
df=pd.read_csv('fake_job_postings.csv')

In [None]:
# Print column dtypes and non-null counts to spot sparsely populated columns
df.info()

In [None]:
# Columns that are not used as features:
#   - department, salary_range: more than 50% missing data
#   - industry: too many distinct values
unused_columns = ['department', 'salary_range', 'industry']
df = df.drop(columns=unused_columns)

In [None]:
#-------------------start cleaning for location column

# `location` is a comma-separated string whose first three parts are read as
# country, state and city.  Split it once and reuse the result instead of
# re-splitting the whole column for every derived field.
location_parts = df['location'].str.split(',', expand=True)

# to use country only?
df['country'] = location_parts[0]

# Strip surrounding whitespace, then turn blank entries into NaN.  After the
# strip a blank entry is always '', so one replacement is enough.  The result
# is assigned back to the column: an in-place replace on `df.state` acts on a
# temporary object when pandas Copy-on-Write is active and would leave `df`
# unchanged.
df['state'] = location_parts[1].str.strip().replace('', np.nan)

# The city part (location_parts[2]) has too many empty fields to be useful,
# so no `city` column is created.

# drop the raw column now that it has been split
df.drop(columns=['location'], inplace=True)

In [None]:
#-------------------start cleaning for text columns

# Build the NLTK helpers once at definition time.  Evaluating
# stopwords.words('english') inside the per-token filter repeated that call,
# plus a linear list scan, for every token of every document; a frozenset
# gives constant-time membership tests.
_TOKENISER = RegexpTokenizer(r'\b[A-Za-z]+\b')
_LEMMATISER = WordNetLemmatizer()
_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Turn raw text into a list of keyword tokens.

    Steps, in order:
      1. keep only alphabetic words (punctuation and digits are ignored),
      2. lowercase each word and lemmatise it as a verb (pos='v'),
      3. drop lemmas that are English stopwords.

    Parameters
    ----------
    text : str
        The document to tokenise.

    Returns
    -------
    list of str
        The remaining lemmas, in their original order (duplicates kept).
    """
    tokens = _TOKENISER.tokenize(text)

    # Lowercase and lemmatise lazily; stopwords are filtered after
    # lemmatisation, exactly as before.
    lemmas = (_LEMMATISER.lemmatize(token.lower(), pos='v') for token in tokens)

    return [lemma for lemma in lemmas if lemma not in _STOP_WORDS]


# Concatenate the free-text fields into one document per posting.
# Missing fields are replaced by an empty string first: str() of a missing
# value is the literal word "nan", which would otherwise end up in the text
# as a token.  (Filling the concatenated text afterwards has no effect,
# because the concatenation never produces a missing value.)
text_fields = ['title', 'company_profile', 'description', 'requirements', 'benefits']
filled_text = df[text_fields].fillna('').astype(str)
df['text'] = filled_text['title'].str.cat(
    [filled_text[field] for field in text_fields[1:]], sep=' ')

# Create an instance of TfidfVectorizer
# NOTE: ngram_range is not passed because scikit-learn ignores it when
# `analyzer` is a callable; n-grams would have to be produced inside
# preprocess_text itself.
vectoriser = TfidfVectorizer(analyzer=preprocess_text,   #TRY ADJUSTING min_df, max_df
                             min_df=0.05, max_df=0.5)

# Fit to the data and transform to feature matrix
tfidf_matrix = vectoriser.fit_transform(df['text'])

# Save mapping on which index refers to which words
col_map = {v: k for k, v in vectoriser.vocabulary_.items()}

# Convert sparse matrix to dataframe, naming all columns in one pass and
# reusing df's index so the later column-wise concat lines up row by row.
text_column = pd.DataFrame.sparse.from_spmatrix(
    tfidf_matrix,
    index=df.index,
    columns=[col_map[i] for i in range(tfidf_matrix.shape[1])])

In [None]:
#-------------------continue cleaning for text columns

# Binary indicator columns recording whether each text field was provided in
# the job ad.  Every flag is tied to its source column by name, so the
# pairing no longer depends on two lists staying in the same order.
presence_flags = {
    'has_requirements': 'requirements',
    'has_company_profile': 'company_profile',
    'has_description': 'description',
    'has_benefits': 'benefits',
}

# Test for missing values directly instead of filling a sentinel string and
# comparing against it; a posting whose text really is "unspecified" is then
# still counted as populated.
for flag_name, source_column in presence_flags.items():
    df[flag_name] = df[source_column].notna().astype('int64')

# The raw text columns are already encoded in `text_column`, and job_id is
# only an identifier, so none of them stay in df.
columns_to_remove = ['requirements', 'company_profile', 'description', 'benefits', 'title', 'text']
df.drop(columns=columns_to_remove + ['job_id'], inplace=True)

In [None]:
#-------------------cleaning categorical columns
category_columns = ['employment_type', 'required_experience', 'required_education', 'function', 'country', 'state']

# Missing categories become their own level.  Combined with the column-name
# prefix added below, the resulting dummy is named '<column>_unspecified'.
for column in category_columns:
    df[column] = df[column].fillna("unspecified")

# NOTE(review): 'function' is filled and dropped with the other categorical
# columns but is not one-hot encoded -- confirm this omission is intentional.
encoded_columns = ['employment_type', 'required_experience', 'required_education', 'country', 'state']

# One-hot encode with the source column as prefix.  Country and state share
# many two-letter codes; without a prefix they produce identically named
# columns, which previously had to be avoided by dropping a hard-coded list
# of state codes (losing those states and failing if a listed code was absent
# from the data).
categorical_features = pd.get_dummies(df[encoded_columns], prefix=encoded_columns)

df.drop(columns=category_columns, inplace=True)

In [None]:
# number of TF-IDF text columns and one-hot categorical columns,
# shown as a (text, categorical) pair (to delete this row later)
text_column.shape[1], categorical_features.shape[1]

In [None]:
# sanity check: after the drops above only binary columns should remain in df
# (to delete this row later)
df.columns

In [None]:
# Join the remaining df columns with the TF-IDF columns and the one-hot
# categorical columns side by side, keeping the existing row index.
feature_frames = [df, text_column, categorical_features]
df = pd.concat(feature_frames, axis='columns')

In [None]:
# Target is the `fraudulent` column; every other column is a feature.
y_data = df['fraudulent']
X_data = df.drop(columns=['fraudulent'])

In [None]:
# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=42)

# Rebalance the training portion only; the test set is left untouched.
smt = SMOTETomek(sampling_strategy=0.4, random_state=42)
X_resample, y_resampled = smt.fit_resample(X_train, y_train)

In [None]:
#------------------------------------- build random forest

In [None]:
# Baseline random forest with default hyper-parameters, trained on the
# resampled training data
clf = ensemble.RandomForestClassifier(random_state=42)
clf.fit(X_resample, y_resampled)

In [None]:
# Baseline random forest on the resampled training data: hard labels feed
# precision/recall/F1, the second predict_proba column feeds ROC AUC.
y_pred_train = clf.predict(X_resample)
y_pred_prob_train = clf.predict_proba(X_resample)[:, 1]

precision_train, recall_train, f1_train = (
    score(y_resampled, y_pred_train)
    for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
roc_auc_train = metrics.roc_auc_score(y_resampled, y_pred_prob_train)

# last expression of the cell, so the notebook displays the four metrics
precision_train, recall_train, f1_train, roc_auc_train

In [None]:
# Baseline random forest on the held-out test set: hard labels feed
# precision/recall/F1, the second predict_proba column feeds ROC AUC.
y_pred_test = clf.predict(X_test)
y_pred_prob_test = clf.predict_proba(X_test)[:, 1]

precision, recall, f1 = (
    score(y_test, y_pred_test)
    for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
roc_auc = metrics.roc_auc_score(y_test, y_pred_prob_test)

# last expression of the cell, so the notebook displays the four metrics
precision, recall, f1, roc_auc

In [None]:
def model_build(base_estimator, parameters, scoring='recall', n_iter=10, random_state=42):
    """Tune `base_estimator` with a randomized hyper-parameter search.

    The search is fitted on the notebook-level `X_resample` / `y_resampled`
    data, and the best score and parameters are printed.

    NOTE(review): cross-validation runs on data that has already been
    resampled, so validation folds contain resampled rows; the reported
    score may be optimistic.

    Parameters
    ----------
    base_estimator : estimator
        Unfitted scikit-learn estimator to tune.
    parameters : dict
        Parameter names mapped to the candidate values to sample from.
    scoring : str, default 'recall'
        Metric used to rank candidates.
    n_iter : int, default 10
        Number of parameter settings sampled.
    random_state : int or None, default 42
        Seed for the parameter sampling, so that repeated runs pick the same
        candidates (the search was previously unseeded).

    Returns
    -------
    estimator
        The best estimator found by the search.
    """
    search = model_selection.RandomizedSearchCV(
        base_estimator,
        parameters,
        n_iter=n_iter,
        scoring=scoring,
        random_state=random_state)

    search.fit(X_resample, y_resampled)

    print('best score:', search.best_score_)
    print('best parameters: ', search.best_params_)

    return search.best_estimator_

In [None]:
#@title
# Candidate hyper-parameters for the random forest search.
# n_estimators covers 80, 90, ..., 190 as plain Python ints.
random_grid = {'n_estimators': list(range(80, 200, 10)),
               'criterion': ['gini', 'entropy'],
               'min_samples_leaf': [1, 2, 3, 4, 5],
               'min_samples_split': [2, 3, 4, 5]}

optimised_rf = model_build(ensemble.RandomForestClassifier(random_state=2022), random_grid)

In [None]:
# Pair each feature name with its importance in the tuned random forest and
# show the ten most important ones.
rf = dict(Feature=optimised_rf.feature_names_in_,
          Importance=optimised_rf.feature_importances_)
df_rf = pd.DataFrame(rf).sort_values(by='Importance', ascending=False, ignore_index=True)
df_rf.head(10)

In [None]:
# Tuned random forest on the resampled training data: hard labels feed
# precision/recall/F1, the second predict_proba column feeds ROC AUC.
y_pred_train = optimised_rf.predict(X_resample)
y_pred_prob_train = optimised_rf.predict_proba(X_resample)[:, 1]

precision_train, recall_train, f1_train = (
    score(y_resampled, y_pred_train)
    for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
roc_auc_train = metrics.roc_auc_score(y_resampled, y_pred_prob_train)

# last expression of the cell, so the notebook displays the four metrics
precision_train, recall_train, f1_train, roc_auc_train

In [None]:
# Tuned random forest on the held-out test set: hard labels feed
# precision/recall/F1, the second predict_proba column feeds ROC AUC.
y_pred_test = optimised_rf.predict(X_test)
y_pred_prob_test = optimised_rf.predict_proba(X_test)[:, 1]

precision, recall, f1 = (
    score(y_test, y_pred_test)
    for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
roc_auc = metrics.roc_auc_score(y_test, y_pred_prob_test)

# last expression of the cell, so the notebook displays the four metrics
precision, recall, f1, roc_auc

In [None]:
# Confusion matrices from the most recent y_pred_train / y_pred_test;
# labels=[1,0] puts class 1 in the first row and column.
rf_matrix = metrics.confusion_matrix(y_resampled, y_pred_train, labels=[1,0])
rf_matrixt = metrics.confusion_matrix(y_test, y_pred_test, labels=[1,0])

report = f"""
Confusion Matrix for train dataset:
{rf_matrix}
Confusion Matrix for test dataset:
{rf_matrixt}
"""
print(report)

In [None]:
#---------------------------------- build Naive Bayes Model

In [None]:
# Multinomial Naive Bayes with default settings, trained on the resampled
# training data
mnb = naive_bayes.MultinomialNB()
mnb.fit(X_resample, y_resampled)

In [None]:
# Naive Bayes on the resampled training data: hard labels feed
# precision/recall/F1, the second predict_proba column feeds ROC AUC.
y_pred_train = mnb.predict(X_resample)
y_pred_prob_train = mnb.predict_proba(X_resample)[:, 1]

precision_train, recall_train, f1_train = (
    score(y_resampled, y_pred_train)
    for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
roc_auc_train = metrics.roc_auc_score(y_resampled, y_pred_prob_train)

# last expression of the cell, so the notebook displays the four metrics
precision_train, recall_train, f1_train, roc_auc_train

In [None]:
# Naive Bayes on the held-out test set: hard labels feed
# precision/recall/F1, the second predict_proba column feeds ROC AUC.
y_pred_test = mnb.predict(X_test)
y_pred_prob_test = mnb.predict_proba(X_test)[:, 1]

precision, recall, f1 = (
    score(y_test, y_pred_test)
    for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
roc_auc = metrics.roc_auc_score(y_test, y_pred_prob_test)

# last expression of the cell, so the notebook displays the four metrics
precision, recall, f1, roc_auc

In [None]:
# Confusion matrices from the most recent y_pred_train / y_pred_test;
# labels=[1,0] puts class 1 in the first row and column.
cf_matrixtr = metrics.confusion_matrix(y_resampled, y_pred_train, labels=[1,0])
cf_matrixte = metrics.confusion_matrix(y_test, y_pred_test, labels=[1,0])

report = f"""
Confusion Matrix for train dataset:
{cf_matrixtr}
Confusion Matrix for test dataset:
{cf_matrixte}
"""
print(report)

In [None]:
# match feature names and respective feature counts in each class
# (names are read from the fitted model so they always line up with
# feature_count_, even if X_resample is rebound later in the notebook)
f = {'Feature': mnb.feature_names_in_, 'fraud': mnb.feature_count_[1,:],
         'not_fraud': mnb.feature_count_[0,:]}
features = pd.DataFrame(data=f)

# calculate the P(feature|class) by using feature count in the class divided by respective class count
# 1 count is added to all feature counts to ensure ratio can be calculated later
features['P (f|fraud)'] = (features['fraud'] +1) / mnb.class_count_[1]
features['P (f|not_fraud)'] = (features['not_fraud'] +1) / mnb.class_count_[0]

# ratio of the P(feature|class) is used to evaluate the predictiveness of the token
features['Ratio'] = features['P (f|fraud)'] / features['P (f|not_fraud)']

# rank once by ratio: the head of the ranking leans towards fraud, the tail
# towards not fraud
ranked_features = features.sort_values(by=['Ratio'], ascending=False)

# print out the top 20 features for each class
print('The top 20 most predictive tokens for fraud job postings are:')
print(ranked_features.head(20)['Feature'].values)

print('The top 20 most predictive tokens for not fraud job postings are:')
print(ranked_features.tail(20)['Feature'].values)

In [None]:
# try naive bayes with only text columns
# NOTE(review): this rebinds X_train / X_test / X_resample / y_resampled, so
# every later cell that reads those names works on the text-only features.
X_train, X_test, y_train, y_test = train_test_split(text_column, y_data, test_size=0.2, random_state=42)
X_resample, y_resampled = smt.fit_resample(X_train, y_train)

# text_column holds only TF-IDF token columns, so there are no leading
# non-text columns to remove; dropping the first eight columns here would
# discard eight token features.
text_resample = X_resample
text_test = X_test

In [None]:
# Naive Bayes trained on the text-only features
mnb2 = naive_bayes.MultinomialNB()
mnb2.fit(text_resample, y_resampled)

# Train and test predictions both come from mnb2 on the text-only matrices.
# The earlier `mnb` model was fitted on a different feature set and cannot
# score these inputs.
y_pred_train = mnb2.predict(text_resample)
y_pred_prob_train = mnb2.predict_proba(text_resample)[:,1]
y_pred_test = mnb2.predict(text_test)
y_pred_prob_test = mnb2.predict_proba(text_test)[:,1]

precision = metrics.precision_score(y_test, y_pred_test)
recall = metrics.recall_score(y_test, y_pred_test)
f1 = metrics.f1_score(y_test, y_pred_test)
roc_auc = metrics.roc_auc_score(y_test, y_pred_prob_test)

# last expression of the cell, so the notebook displays the test metrics
precision, recall, f1, roc_auc

In [None]:
# Confusion matrices from the most recent y_pred_train / y_pred_test;
# labels=[1,0] puts class 1 in the first row and column.
cf_matrixtr = metrics.confusion_matrix(y_resampled, y_pred_train, labels=[1,0])
cf_matrixte = metrics.confusion_matrix(y_test, y_pred_test, labels=[1,0])

report = f"""
Confusion Matrix for train dataset:
{cf_matrixtr}
Confusion Matrix for test dataset:
{cf_matrixte}
"""
print(report)

In [None]:
# Score the text-only model on the text-only test matrix.  At this point
# X_test has been rebound to the text-only split, which the full-feature
# `mnb` model cannot score because it was fitted on a different feature set.
# NOTE(review): if the curve was meant for the full-feature model, X_test
# must be rebuilt from X_data first.
y_score = mnb2.predict_proba(text_test)[:, 1]

#calculate precision and recall
precision, recall, thresholds = precision_recall_curve(y_test, y_score)

#create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color='purple')

#add axis labels to plot
ax.set_title('Precision-Recall Curve')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')

#display plot
plt.show()

In [None]:
#--------------------- gradient boosting

In [None]:
# Baseline gradient boosting classifier with default hyper-parameters,
# trained on the current X_resample / y_resampled
gbt = ensemble.GradientBoostingClassifier(random_state=42)
gbt.fit(X_resample, y_resampled)

In [None]:
# Baseline gradient boosting on the resampled training data: hard labels feed
# precision/recall/F1, the second predict_proba column feeds ROC AUC.
y_pred_train = gbt.predict(X_resample)
y_pred_prob_train = gbt.predict_proba(X_resample)[:, 1]

precision_train, recall_train, f1_train = (
    score(y_resampled, y_pred_train)
    for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
roc_auc_train = metrics.roc_auc_score(y_resampled, y_pred_prob_train)

# last expression of the cell, so the notebook displays the four metrics
precision_train, recall_train, f1_train, roc_auc_train

In [None]:
# Baseline gradient boosting on the held-out test set: hard labels feed
# precision/recall/F1, the second predict_proba column feeds ROC AUC.
y_pred_test = gbt.predict(X_test)
y_pred_prob_test = gbt.predict_proba(X_test)[:, 1]

precision, recall, f1 = (
    score(y_test, y_pred_test)
    for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
roc_auc = metrics.roc_auc_score(y_test, y_pred_prob_test)

# last expression of the cell, so the notebook displays the four metrics
precision, recall, f1, roc_auc

In [None]:
# Pair each feature name with its importance in the baseline gradient
# boosting model.  The table data gets its own name so that `gbt` keeps
# referring to the fitted classifier and the evaluation cells can be re-run.
gbt_importance = {'Feature': gbt.feature_names_in_, 'Importance': gbt.feature_importances_}
df_gbt = pd.DataFrame(gbt_importance)
df_gbt = df_gbt.sort_values(by = 'Importance', ascending = False, ignore_index =True)
df_gbt.head(10)

In [None]:
# Confusion matrices from the most recent y_pred_train / y_pred_test;
# labels=[1,0] puts class 1 in the first row and column.
gb_matrixtr = metrics.confusion_matrix(y_resampled, y_pred_train, labels=[1,0])
gb_matrixte = metrics.confusion_matrix(y_test, y_pred_test, labels=[1,0])

report = f"""
Confusion Matrix for train dataset:
{gb_matrixtr}
Confusion Matrix for test dataset:
{gb_matrixte}
"""
print(report)

In [None]:
# Candidate hyper-parameters for the gradient boosting search.
# n_estimators covers 80, 100, ..., 180 as plain Python ints.
gradient_grid = {'n_estimators': list(range(80, 200, 20)),
                 'learning_rate': [0.001, 0.01, 0.1]}

optimised_gbt = model_build(ensemble.GradientBoostingClassifier(random_state=2022), gradient_grid)

In [None]:
# Pair each feature name with its importance in the tuned gradient boosting
# model.  The table data gets its own name instead of reusing `gbt`, which is
# the name of the baseline classifier.
optimised_gbt_importance = {'Feature': optimised_gbt.feature_names_in_,
                            'Importance': optimised_gbt.feature_importances_}
df_gbt = pd.DataFrame(optimised_gbt_importance)
df_gbt = df_gbt.sort_values(by = 'Importance', ascending = False, ignore_index =True)
df_gbt.head(10)

In [None]:
# Predict with the tuned model before building the matrices.  Without this
# step y_pred_train / y_pred_test still hold the baseline model's
# predictions and this cell would repeat the baseline confusion matrices.
y_pred_train = optimised_gbt.predict(X_resample)
y_pred_test = optimised_gbt.predict(X_test)

# labels=[1,0] puts class 1 in the first row and column
gb_matrixtr = metrics.confusion_matrix(y_resampled, y_pred_train, labels=[1,0])
gb_matrixte = metrics.confusion_matrix(y_test, y_pred_test, labels=[1,0])

report = """
Confusion Matrix for train dataset:
{}
Confusion Matrix for test dataset:
{}
""".format(gb_matrixtr, gb_matrixte)
print(report)

In [None]:
# Tuned gradient boosting on the held-out test set: hard labels feed
# precision/recall/F1, the second predict_proba column feeds ROC AUC.
y_pred_test = optimised_gbt.predict(X_test)
y_pred_prob_test = optimised_gbt.predict_proba(X_test)[:, 1]

precision, recall, f1 = (
    score(y_test, y_pred_test)
    for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
roc_auc = metrics.roc_auc_score(y_test, y_pred_prob_test)

# last expression of the cell, so the notebook displays the four metrics
precision, recall, f1, roc_auc

In [None]:
# display the tuned gradient boosting estimator returned by model_build
optimised_gbt

In [None]:
# Precision-recall curve of the tuned gradient boosting model on the test set
y_score = optimised_gbt.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_score)

# recall on the x-axis, precision on the y-axis
fig, ax = plt.subplots()
ax.plot(recall, precision, color='purple')
ax.set(title='Precision-Recall Curve', ylabel='Precision', xlabel='Recall')

plt.show()

In [None]:
# Baseline support vector classifier; probability=True is needed because the
# evaluation cell below calls predict_proba
clf_svm = svm.SVC(probability=True, random_state=42)
clf_svm.fit(X_resample, y_resampled)

In [None]:
# Baseline SVC on the held-out test set: hard labels feed
# precision/recall/F1, the second predict_proba column feeds ROC AUC.
y_pred_test = clf_svm.predict(X_test)
y_pred_prob_test = clf_svm.predict_proba(X_test)[:, 1]

precision, recall, f1 = (
    score(y_test, y_pred_test)
    for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
roc_auc = metrics.roc_auc_score(y_test, y_pred_prob_test)

# last expression of the cell, so the notebook displays the four metrics
precision, recall, f1, roc_auc

In [None]:
from sklearn.model_selection import GridSearchCV
def model_build2(base_estimator, parameters, scoring=None):
    """Tune `base_estimator` with an exhaustive grid search.

    The search is fitted on the notebook-level `X_resample` / `y_resampled`
    data, and the best score and parameters are printed.

    Parameters
    ----------
    base_estimator : estimator
        Unfitted scikit-learn estimator to tune.
    parameters : dict
        Parameter names mapped to the lists of values to try.
    scoring : str or None, default None
        Metric used to rank candidates.  None keeps the estimator's default
        scorer, as before; pass 'recall' to rank candidates the same way
        `model_build` does.

    Returns
    -------
    estimator
        The best estimator found by the search.
    """
    search = model_selection.GridSearchCV(
        base_estimator,
        parameters,
        scoring=scoring)

    search.fit(X_resample, y_resampled)

    print('best score:', search.best_score_)
    print('best parameters: ', search.best_params_)

    return search.best_estimator_

In [None]:
# Grid of candidate values for the SVC parameter C
svm_grid = {'C': [0.1, 1.0, 10]}

# Try every C value via model_build2 and keep the best estimator
optimised_svm = model_build2(svm.SVC(random_state=2022, probability=True), svm_grid)

In [None]:
# Rebind optimised_svm to a fresh, unfitted SVC with C fixed at 10; this
# replaces the estimator returned by the grid search above.
# NOTE(review): presumably C=10 was the best value found -- confirm.
optimised_svm = svm.SVC(random_state=2022, probability=True, C=10)

In [None]:
# Fit the SVC on the current X_resample / y_resampled
optimised_svm.fit(X_resample, y_resampled)

In [None]:
# Tuned SVC on the held-out test set: hard labels feed
# precision/recall/F1, the second predict_proba column feeds ROC AUC.
y_pred_test = optimised_svm.predict(X_test)
y_pred_prob_test = optimised_svm.predict_proba(X_test)[:, 1]

precision, recall, f1 = (
    score(y_test, y_pred_test)
    for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
roc_auc = metrics.roc_auc_score(y_test, y_pred_prob_test)

# last expression of the cell, so the notebook displays the four metrics
precision, recall, f1, roc_auc

In [None]:
# Precision-recall curve of the tuned SVC on the test set
y_score = optimised_svm.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_score)

# recall on the x-axis, precision on the y-axis
fig, ax = plt.subplots()
ax.plot(recall, precision, color='purple')
ax.set(title='Precision-Recall Curve', ylabel='Precision', xlabel='Recall')

plt.show()