<a href="https://colab.research.google.com/github/mtlmh34/AML_Project/blob/main/grp%20proj1.4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [51]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model, tree, model_selection, ensemble, svm, metrics, naive_bayes



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [52]:
df=pd.read_csv('fake_job_postings.csv')

In [53]:
#--------1. split the columns to country, state, and city

# to use country only?
df['country']=df.location.str.split(',',expand=True)[:][0]

df['state']=df.location.str.split(',',expand=True)[:][1]
df['city']=df.location.str.split(',',expand=True)[:][2]

In [54]:
#----------2. replacing different kinds of missing value to np.nan
df.state = df.state.str.strip()
df.state.fillna(value=np.nan, inplace=True)
df.state.replace('', np.nan, inplace=True)
df.state.replace(' ', np.nan, inplace=True)


#df['state'].sort_values().unique()

In [55]:
#----------3. clear the whitespaces and signs at the start/end

df.city = df.city.str.strip(' /:\\')

#---------4. replacing different kinds of missing value to np.nan
df.city.fillna(value=np.nan, inplace=True)
df.city.replace('', np.nan, inplace=True)
df.city.replace(' ', np.nan, inplace=True)
df['city']=df.city.str.lower()
#------

#print(df['city'].sort_values().unique().tolist())

In [56]:
df.salary_range.fillna(value=np.nan, inplace=True)
df.salary_range.replace('', np.nan, inplace=True)
df.salary_range.replace(' ', np.nan, inplace=True)

#---------4. spilt salary range into min and max

df['min_salary']=df.salary_range.str.split('-',expand=True)[:][0]
df['max_salary']=df.salary_range.str.split('-',expand=True)[:][1]

df.max_salary.fillna(value=np.nan, inplace=True)

In [57]:
#----------5. for entry of salary_range as date, max and min salary are grouped as null value
df.loc[df['max_salary'].isin(['Apr', 'Dec', 'Jun', 'Nov', 'Oct', 'Sep']),['max_salary', 'min_salary']]=np.nan
df.loc[df['min_salary'].isin(['Dec', 'Jun', 'Oct']),['max_salary', 'min_salary']]=np.nan

#convert them into numerical value
df[['min_salary','max_salary']] = df[['min_salary','max_salary']].astype(float)

#for regression model, need to impute NaN values to median/mean
# df['max_salary'].fillna(value=df['max_salary'].mean(), inplace=True)
# df['min_salary'].fillna(value=df['min_salary'].mean(), inplace=True)

In [58]:
df.drop(['location', 'salary_range'], axis = 1, inplace = True)

In [None]:
#-------------------end of processing for column salary_range and location

In [None]:
#-------------------start cleaning for text columns

In [59]:
# this function for preprocessing text is used linked from the next function

def preprocess_text(text):
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'(?u)\b\w\w+\b')
    tokens = tokeniser.tokenize(text)
    
    # Lowercase and lemmatise 
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]
    
    # Remove stopwords
    keywords= [lemma for lemma in lemmas if lemma not in stopwords.words('english')]
    return keywords

In [12]:
df['text'] = df.apply(lambda row: (str(row['title']) + ' ' + 
                                   str(row['company_profile']) + ' ' + 
                                   str(row['description']) + ' ' + 
                                   str(row['requirements']) + ' ' + 
                                   str(row['benefits'])), axis = 1)

# Fill empty columns with "Unspecified"
df['text'] = df['text'].fillna("Unspecified")
    
# Create an instance of TfidfVectorizer
vectoriser = TfidfVectorizer(analyzer=preprocess_text, ngram_range = (1,2),   #TRY ADJUSTING ngram_range and min_df, max_df
                                min_df= 0.01, max_df= 0.6)

# Fit to the data and transform to feature matrix
text_column = vectoriser.fit_transform(df['text'])

# Convert sparse matrix to dataframe
text_column = pd.DataFrame.sparse.from_spmatrix(text_column)

# Save mapping on which index refers to which words
col_map = {v:k for k, v in vectoriser.vocabulary_.items()}

# Rename each column using the mapping
for col in text_column.columns:
    text_column.rename(columns={col: col_map[col]}, inplace=True)
    

In [13]:
columns_to_remove = ['company_profile', 'description', 'requirements', 'benefits']

for i in columns_to_remove:
  df[i] = df[i].fillna("unspecified")

df['title'] = df['title'].fillna("Unspecified")
df['requirements'] = df['requirements'].fillna("Unspecified")
df['benefits'] = df['benefits'].fillna("Unspecified")
df['company_profile'] = df['company_profile'].fillna("Unspecified")
df['description'] = df['description'].fillna("Unspecified")

df['has_title'] = df['title'].apply(lambda x: 0 if x == "unspecified" else 1)
df['has_requirements'] = df['company_profile'].apply(lambda x: 0 if x == "unspecified" else 1)
df['has_comp_profile'] = df['description'].apply(lambda x: 0 if x == "unspecified" else 1)
df['has_description'] = df['requirements'].apply(lambda x: 0 if x == "unspecified" else 1)
df['has_benefits'] = df['benefits'].apply(lambda x: 0 if x == "unspecified" else 1)

In [14]:
df = pd.concat([df, text_column], axis =1, ignore_index=False)

df.drop(columns= ['title', 'text', 'job_id', 'company_profile', 'description', 'requirements', 'benefits'], axis = 1, inplace = True)

In [None]:
#-------------------end cleaning for text columns

In [16]:
df_trial = df.copy(deep = True)

In [None]:
# This column is not run because there are missing fields in this column and SMOTE cannot be performed when there are missing fields
#from sklearn.preprocessing import StandardScaler
#trial = pd.concat([df['min_salary'], df['max_salary']], axis = 1)

#scaler = StandardScaler()
#std_trial = scaler.fit_transform(trial)
#salary_df = pd.DataFrame(data = std_trial, columns={'min_salary', 'max_salary'})

# df_trial.drop(columns=['min_salary', 'max_salary'], axis=1, inplace=True)
# df_trial = pd.concat([df_trial, salary_df], axis = 1)

In [17]:
y_trial = df_trial['fraudulent']
df_trial.drop(columns='fraudulent', axis = 1, inplace = True)

# columns with missing fields need to be dropped for SMOTE to work
df_trial.dropna(axis = 1, inplace = True)

In [18]:
# Perform SMOTE and train-test-split

from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split
smt = SMOTETomek(random_state=42,sampling_strategy=0.4)
X_train, X_test, y_train, y_test = train_test_split(df_trial, y_trial, test_size=0.2, random_state=42)
X_resample, y_resampled = smt.fit_resample(X_train, y_train)

  "pandas.DataFrame with sparse columns found."


In [20]:
# to check the ratio of fraud-nonfraud after SMOTE
y_resampled.value_counts(), y_trial.value_counts()

(0    13619
 1     5447
 Name: fraudulent, dtype: int64, 0    17014
 1      866
 Name: fraudulent, dtype: int64)

In [None]:
# Build Decision Tree Model
from sklearn import tree
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(X_resample, y_resampled)



In [None]:
y_pred_train = clf.predict(X_resample)
y_pred_prob_train = clf.predict_proba(X_resample)[:,1]
    
precision_train = metrics.precision_score(y_resampled, y_pred_train)
recall_train = metrics.recall_score(y_resampled, y_pred_train)
f1_train = metrics.f1_score(y_resampled, y_pred_train)
roc_auc_train = metrics.roc_auc_score(y_resampled, y_pred_prob_train)



In [None]:
precision_train, recall_train, f1_train, roc_auc_train

(1.0, 1.0, 1.0, 1.0)

In [None]:
y_pred_test = clf.predict(X_test)
y_pred_prob_test = clf.predict_proba(X_test)[:,1]
    
precision = metrics.precision_score(y_test, y_pred_test)
recall = metrics.recall_score(y_test, y_pred_test)
f1 = metrics.f1_score(y_test, y_pred_test)
roc_auc = metrics.roc_auc_score(y_test, y_pred_prob_test)



In [None]:
precision, recall, f1, roc_auc

(0.6598984771573604,
 0.7182320441988951,
 0.6878306878306878,
 0.8492485699639541)

In [None]:
clf = ensemble.RandomForestClassifier(random_state=42)
clf.fit(X_resample, y_resampled)



In [None]:
y_pred_train = clf.predict(X_resample)
y_pred_prob_train = clf.predict_proba(X_resample)[:,1]
    
precision_train = metrics.precision_score(y_resampled, y_pred_train)
recall_train = metrics.recall_score(y_resampled, y_pred_train)
f1_train = metrics.f1_score(y_resampled, y_pred_train)
roc_auc_train = metrics.roc_auc_score(y_resampled, y_pred_prob_train)

precision_train, recall_train, f1_train, roc_auc_train



(1.0, 1.0, 1.0, 1.0)

In [None]:
y_pred_test = clf.predict(X_test)
y_pred_prob_test = clf.predict_proba(X_test)[:,1]
    
precision = metrics.precision_score(y_test, y_pred_test)
recall = metrics.recall_score(y_test, y_pred_test)
f1 = metrics.f1_score(y_test, y_pred_test)
roc_auc = metrics.roc_auc_score(y_test, y_pred_prob_test)

precision, recall, f1, roc_auc



(1.0, 0.6353591160220995, 0.7770270270270271, 0.9897021131172752)

In [None]:
# build Gradient Boosting Model
clf = ensemble.GradientBoostingClassifier(random_state=42)
clf.fit(X_resample, y_resampled)



In [None]:
y_pred_train = clf.predict(X_resample)
y_pred_prob_train = clf.predict_proba(X_resample)[:,1]
    
precision_train = metrics.precision_score(y_resampled, y_pred_train)
recall_train = metrics.recall_score(y_resampled, y_pred_train)
f1_train = metrics.f1_score(y_resampled, y_pred_train)
roc_auc_train = metrics.roc_auc_score(y_resampled, y_pred_prob_train)

precision_train, recall_train, f1_train, roc_auc_train



(0.975609756097561, 0.9473104461171287, 0.9612518628912071, 0.997252782397641)

In [None]:
y_pred_test = clf.predict(X_test)
y_pred_prob_test = clf.predict_proba(X_test)[:,1]
    
precision = metrics.precision_score(y_test, y_pred_test)
recall = metrics.recall_score(y_test, y_pred_test)
f1 = metrics.f1_score(y_test, y_pred_test)
roc_auc = metrics.roc_auc_score(y_test, y_pred_prob_test)

precision, recall, f1, roc_auc



(0.7714285714285715,
 0.7458563535911602,
 0.7584269662921349,
 0.9778012839811553)

In [None]:
# build SVM model
clf = svm.SVC(probability=True, random_state=42)
clf.fit(X_resample, y_resampled)



In [None]:
y_pred_train = clf.predict(X_resample)
y_pred_prob_train = clf.predict_proba(X_resample)[:,1]
    
precision_train = metrics.precision_score(y_resampled, y_pred_train)
recall_train = metrics.recall_score(y_resampled, y_pred_train)
f1_train = metrics.f1_score(y_resampled, y_pred_train)
roc_auc_train = metrics.roc_auc_score(y_resampled, y_pred_prob_train)

precision_train, recall_train, f1_train, roc_auc_train



(0.9935697225794599,
 0.9928400954653938,
 0.9932047750229568,
 0.9998695315631101)

In [None]:
y_pred_test = clf.predict(X_test)
y_pred_prob_test = clf.predict_proba(X_test)[:,1]

precision = metrics.precision_score(y_test, y_pred_test)
recall = metrics.recall_score(y_test, y_pred_test)
f1 = metrics.f1_score(y_test, y_pred_test)
roc_auc = metrics.roc_auc_score(y_test, y_pred_prob_test)

precision, recall, f1, roc_auc



(0.9294871794871795,
 0.8011049723756906,
 0.8605341246290801,
 0.9805181490492193)

In [21]:
# build Naive Bayes Model
mnb = naive_bayes.MultinomialNB()
mnb.fit(X_resample, y_resampled)

  "pandas.DataFrame with sparse columns found."


MultinomialNB()

In [22]:
y_pred_train = mnb.predict(X_resample)
y_pred_prob_train = mnb.predict_proba(X_resample)[:,1]
    
precision_train = metrics.precision_score(y_resampled, y_pred_train)
recall_train = metrics.recall_score(y_resampled, y_pred_train)
f1_train = metrics.f1_score(y_resampled, y_pred_train)
roc_auc_train = metrics.roc_auc_score(y_resampled, y_pred_prob_train)

precision_train, recall_train, f1_train, roc_auc_train

  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."


(0.8871431308679824,
 0.8500091793647879,
 0.8681792612038253,
 0.9767442387134693)

In [24]:
y_pred_test = mnb.predict(X_test)
y_pred_prob_test = mnb.predict_proba(X_test)[:,1]
    
precision = metrics.precision_score(y_test, y_pred_test)
recall = metrics.recall_score(y_test, y_pred_test)
f1 = metrics.f1_score(y_test, y_pred_test)
roc_auc = metrics.roc_auc_score(y_test, y_pred_prob_test)

precision, recall, f1, roc_auc

  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."


(0.446875, 0.7900552486187845, 0.5708582834331337, 0.9600940609768996)

In [25]:
from sklearn import metrics

cf_matrixtr = metrics.confusion_matrix(y_resampled, y_pred_train)
cf_matrixte = metrics.confusion_matrix(y_test, y_pred_test)

report = """
Confusion Matrix for train dataset:
{}
Confusion Matrix for test dataset:
{}
""".format(cf_matrixtr, cf_matrixte)
print(report)


Confusion Matrix for train dataset:
[[13030   589]
 [  817  4630]]
Confusion Matrix for test dataset:
[[3218  177]
 [  38  143]]



In [35]:
# match feature names and respective feature counts in each class
f = {'Feature': X_resample.columns, 'fraud': mnb.feature_count_[1,:],
         'not_fraud': mnb.feature_count_[0,:]}
features = pd.DataFrame(data=f)

# calculate the P(feature|class) by using feature count in the class divided by respective class count
# 1 count is added to all feature counts to ensure ratio can be calculated later
features['P (f|fraud)'] = (features['fraud'] +1) / mnb.class_count_[1]
features['P (f|not_fraud)'] = (features['not_fraud'] +1) / mnb.class_count_[0]

# ratio of the P(feature|class) is used to evaluate the predictiveness of the token 
features['Ratio'] = features['P (f|fraud)'] / features['P (f|not_fraud)']

# print out the top 10 features for each class
print('The top 10 most predictive tokens for fraud job postings are:')
print(features.sort_values(by=['Ratio'], ascending= False).head(20)['Feature'].values)

print('The top 10 most predictive tokens for not fraud job postings are:')
print(features.sort_values(by=['Ratio'], ascending= False).tail(20)['Feature'].values)

The top 10 most predictive tokens for fraud job postings are:
['clerical' 'clerk' 'earn' 'entry' 'figure' 'duration' 'oil' 'cash'
 'aptitude' 'gas' '400' 'referral' 'offshore' 'refine' 'minutes'
 'recovery' 'fee' 'suitable' 'guarantee' 'administrative']
The top 10 most predictive tokens for not fraud job postings are:
['necessarycanada' 'olds' 'teachers' 'european' 'interactive' 'airfare'
 'game' 'passport' 'teacher' 'cardsgabriel'
 'url_ed9094c60184b8a4975333957f05be37e69d3cdb68decc9dd9a4242733cfd7f7'
 'url_75db76d58f7994c7db24e8998c2fc953ab9a20ea9ac948b217693963f78d2e6b'
 'adkins' 'reimbursedexcellent' 'celta' '1500' 'tesol' 'tefl' 'asia'
 'abroad']


In [31]:
# try naive bayes with only text columns
X_train, X_test, y_train, y_test = train_test_split(df_trial, y_trial, test_size=0.2, random_state=42)
X_resample, y_resampled = smt.fit_resample(X_train, y_train)
text_resample = X_resample.drop(X_resample.columns[0:8], axis = 1)
text_test = X_test.drop(X_resample.columns[0:8], axis = 1)

Index(['telecommuting', 'has_company_logo', 'has_questions', 'has_title',
       'has_requirements', 'has_comp_profile', 'has_description',
       'has_benefits', '00', '000',
       ...
       'writtenlifting', 'xml', 'year', 'years', 'yes', 'yet', 'york', 'young',
       'zealand', 'zone'],
      dtype='object', length=2340)

In [77]:
mnb2 = naive_bayes.MultinomialNB()
mnb2.fit(text_resample, y_resampled)

y_pred_train = mnb.predict(X_resample)
y_pred_prob_train = mnb.predict_proba(X_resample)[:,1]
y_pred_test = mnb2.predict(text_test)
y_pred_prob_test = mnb2.predict_proba(text_test)[:,1]
    
precision = metrics.precision_score(y_test, y_pred_test)
recall = metrics.recall_score(y_test, y_pred_test)
f1 = metrics.f1_score(y_test, y_pred_test)
roc_auc = metrics.roc_auc_score(y_test, y_pred_prob_test)

precision, recall, f1, roc_auc

  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."


(0.4723127035830619,
 0.8011049723756906,
 0.5942622950819673,
 0.9501655831210996)

In [78]:
from sklearn import metrics

cf_matrixtr = metrics.confusion_matrix(y_resampled, y_pred_train)
cf_matrixte = metrics.confusion_matrix(y_test, y_pred_test)

report = """
Confusion Matrix for train dataset:
{}
Confusion Matrix for test dataset:
{}
""".format(cf_matrixtr, cf_matrixte)
print(report)


Confusion Matrix for train dataset:
[[13030   589]
 [  817  4630]]
Confusion Matrix for test dataset:
[[3233  162]
 [  36  145]]



In [60]:
# try min_df = 0.1
df['text'] = df.apply(lambda row: (str(row['title']) + ' ' + 
                                   str(row['company_profile']) + ' ' + 
                                   str(row['description']) + ' ' + 
                                   str(row['requirements']) + ' ' + 
                                   str(row['benefits'])), axis = 1)

# Fill empty columns with "Unspecified"
df['text'] = df['text'].fillna("Unspecified")
vectoriser = TfidfVectorizer(analyzer=preprocess_text, ngram_range = (1,2),   #TRY ADJUSTING ngram_range and min_df, max_df
                                min_df= 0.1, max_df= 0.6)

# Fit to the data and transform to feature matrix
text_column = vectoriser.fit_transform(df['text'])

# Convert sparse matrix to dataframe
text_column = pd.DataFrame.sparse.from_spmatrix(text_column)

# Save mapping on which index refers to which words
col_map = {v:k for k, v in vectoriser.vocabulary_.items()}

# Rename each column using the mapping
for col in text_column.columns:
    text_column.rename(columns={col: col_map[col]}, inplace=True)
    


In [73]:
text = pd.concat([df[['fraudulent']], text_column], axis =1, ignore_index=False)

#df.drop(columns= ['title', 'text', 'job_id', 'company_profile', 'description', 'requirements', 'benefits'], axis = 1, inplace = True)
#text

In [75]:
y_trial = text['fraudulent']
X_trial = text.drop(columns='fraudulent', axis = 1)

# columns with missing fields need to be dropped for SMOTE to work
X_trial.dropna(axis = 1, inplace = True)

In [88]:
text

Unnamed: 0,fraudulent,10,ability,able,account,achieve,across,activities,agency,also,...,well,wide,within,without,word,world,would,write,year,years
0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.072084,...,0.068278,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0,0.000000,0.025131,0.029912,0.036379,0.039085,0.000000,0.000000,0.0,0.027882,...,0.000000,0.0,0.000000,0.0,0.000000,0.114007,0.038363,0.000000,0.000000,0.000000
2,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.071080,0.598125,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,0.089962,0.000000,0.000000,0.000000,0.000000,0.000000
3,0,0.000000,0.031137,0.000000,0.495812,0.048426,0.000000,0.000000,0.0,0.000000,...,0.000000,0.0,0.075878,0.0,0.000000,0.141255,0.000000,0.064909,0.041394,0.026506
4,0,0.000000,0.000000,0.052781,0.000000,0.000000,0.058072,0.000000,0.0,0.000000,...,0.046601,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.037749
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,0,0.050844,0.030530,0.000000,0.132585,0.047482,0.119944,0.000000,0.0,0.067746,...,0.000000,0.0,0.037199,0.0,0.000000,0.069251,0.000000,0.031822,0.000000,0.000000
17876,0,0.000000,0.074580,0.044385,0.539810,0.000000,0.000000,0.000000,0.0,0.041373,...,0.078377,0.0,0.045436,0.0,0.000000,0.000000,0.056926,0.038868,0.000000,0.063488
17877,0,0.000000,0.034242,0.040757,0.099138,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.058299
17878,0,0.000000,0.000000,0.240262,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [79]:
smt = SMOTETomek(random_state=42,sampling_strategy=0.4)
X_train, X_test, y_train, y_test = train_test_split(X_trial, y_trial, test_size=0.2, random_state=42)
X_resample, y_resampled = smt.fit_resample(X_train, y_train)

ValueError: ignored

In [None]:
mnb2 = naive_bayes.MultinomialNB()
mnb2.fit(text_resample, y_resampled)

y_pred_train = mnb.predict(X_resample)
y_pred_prob_train = mnb.predict_proba(X_resample)[:,1]
y_pred_test = mnb2.predict(text_test)
y_pred_prob_test = mnb2.predict_proba(text_test)[:,1]
    
precision = metrics.precision_score(y_test, y_pred_test)
recall = metrics.recall_score(y_test, y_pred_test)
f1 = metrics.f1_score(y_test, y_pred_test)
roc_auc = metrics.roc_auc_score(y_test, y_pred_prob_test)

precision, recall, f1, roc_auc

  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."


(0.4723127035830619,
 0.8011049723756906,
 0.5942622950819673,
 0.9501655831210996)

In [None]:
from sklearn import metrics

cf_matrixtr = metrics.confusion_matrix(y_resampled, y_pred_train)
cf_matrixte = metrics.confusion_matrix(y_test, y_pred_test)

report = """
Confusion Matrix for train dataset:
{}
Confusion Matrix for test dataset:
{}
""".format(cf_matrixtr, cf_matrixte)
print(report)


Confusion Matrix for train dataset:
[[13030   589]
 [  817  4630]]
Confusion Matrix for test dataset:
[[3233  162]
 [  36  145]]

