#### 1. Loading Required Libraries

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


##### 2.1 Loading Dataset into DataFrame

In [4]:
df = pd.read_csv("../input/jobs/fake_job_postings.csv")
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


##### 2. Data Preparation


In [5]:
#Drop Unnecessary columns
columns = ['job_id','telecommuting','has_company_logo','has_questions','salary_range','employment_type']
for col in columns:
    del df[col]
    
df.head()

Unnamed: 0,title,location,department,company_profile,description,requirements,benefits,required_experience,required_education,industry,function,fraudulent
0,Marketing Intern,"US, NY, New York",Marketing,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,Internship,,,Marketing,0
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,Not Applicable,,Marketing and Advertising,Customer Service,0
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,,,,,0
3,Account Executive - Washington DC,"US, DC, Washington",Sales,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,Bill Review Manager,"US, FL, Fort Worth",,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [6]:
#Filling NaN values with Blank space
df.fillna('', inplace=True)

#### 3. Text Preprocessing

##### 3.1 Creating Target and Text Dataframe

In [7]:
def text_fraud_df(df):

    # replacing Na values with nothing as a string
    df.fillna('', inplace=True)

    # creating a new column named "text"
    # which is concatination of "title", "company_profile", "description", "requirements", benifits
    # ' ' space between two concatinations
    df['text'] = df['title'] + ' ' + df['company_profile'] + ' '+ df['description'] + df['requirements'] + ' ' + df['benefits'] 


    for col in df.columns:
        # remove all columns except "text" and Fradulent
        if col not in ['text','fraudulent']:
            del df[col]
    
    return df

In [8]:
df = text_fraud_df(df)
df.head()

Unnamed: 0,fraudulent,text
0,0,"Marketing Intern We're Food52, and we've creat..."
1,0,Customer Service - Cloud Video Production 90 S...
2,0,Commissioning Machinery Assistant (CMA) Valor ...
3,0,Account Executive - Washington DC Our passion ...
4,0,Bill Review Manager SpotSource Solutions LLC i...


##### 3.2 Lemmatization, Stopword and Punctuation Removal

In [None]:
import spacy

In [10]:
#Load spacy nlp model
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"]) 

# Text Preprocessing with varoius combination
def spacy_process(text):
  # Converts to lowercase
  text = text.strip().lower()

  # passing text to spacy's nlp object
  doc = nlp(text)
    
  # Lemmatization
  lemma_list = []
  for token in doc:
    lemma_list.append(token.lemma_)
  
  # Filter the stopword
  filtered_sentence =[] 
  for word in lemma_list:
    lexeme = nlp.vocab[word]
    if lexeme.is_stop == False:
      filtered_sentence.append(word)
    
  # Remove punctuation
  punctuations="?:!.,;$\'-_"
  for word in filtered_sentence:
    if word in punctuations:
      filtered_sentence.remove(word)

  return " ".join(filtered_sentence)

In [11]:
df.loc[:,"text"] = df.loc[:,"text"].apply(spacy_process)
df.head()

Unnamed: 0,fraudulent,text
0,0,marketing intern food52 create groundbreaking ...
1,0,customer service cloud video production 90 sec...
2,0,commission machinery assistant ( cma ) valor s...
3,0,account executive washington dc passion improv...
4,0,bill review manager spotsource solution llc gl...


In [12]:
#generate local clean csv 
df.to_csv('../input/jobs/clean_df.csv', index=False)

In [77]:
#prepare target csv
df = pd.read_csv('../input/jobs/clean_df.csv')


#### 4. Model Buliding

##### 4.1 Helper Functions

In [78]:
from numpy import sqrt, argmax
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score
import markdown

In [79]:
# Creating a Dataframe with word-vectors in TF-IDF form and Target values

def final_df(df, is_train, vectorizer, column):

    # TF-IDF form
    if is_train:
        x = vectorizer.fit_transform(df.loc[:,column])
    else:
        x = vectorizer.transform(df.loc[:,column])

    # TF-IDF form to Dataframe
    temp = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())

    # Droping the text column
    df.drop(df.loc[:,column].name, axis = 1, inplace=True)

    # Returning TF-IDF form with target
    return pd.concat([temp, df], axis=1)

In [80]:
# Training the model with various combination and returns y_test and y_pred

def train_model(df, input, target, test_size, over_sample, vectorizer, model):

    X = df.drop(target, axis=1)
    y = df[target]
    print("Splitted Data into X and Y.")

    X_train, x_test, Y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    print("Splitted Data into Train and Test.")
    
    # Training Preprocessing
    X_train = final_df(X_train, True, vectorizer, input)
    X_train.dropna(inplace=True)
    print("Vectorized Training Data.")

    if over_sample:
        sm = SMOTE(random_state = 2)
        X_train, Y_train = sm.fit_resample(X_train, Y_train.ravel())
        print("Oversampling Done for Training Data.")

    # Testing Preprocessing

    x_test_init = x_test.copy()
    #x_test_init.dropna(inplace=True)
    
    x_test = final_df(x_test, False, vectorizer, input)
    x_test.dropna(inplace=True)
    print("Vectorized Testing Data.")

    # fitting the model
    model = model.fit(X_train, Y_train)
    print("Model Fitted Successfully.")

    # calculating y_pred
    y_pred = model.predict(x_test)
    y_pred_prob = model.predict_proba(x_test)
    roc_auc = round(roc_auc_score(y_test, y_pred_prob[:, 1]), 2)

    print(f"\n\033[1mROC-AUC Score\033[0m \t\t: {roc_auc*100} %")

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob[:,1], pos_label=1)
    
    # calculate the g-mean for each threshold
    gmeans = sqrt(tpr * (1-fpr))
    
    # locate the index of the largest g-mean
    ix = argmax(gmeans)
    print('\033[1mBest Threshold\033[0m \t\t: %.3f \n\033[1mG-Mean\033[0m \t\t\t: %.3f' % (thresholds[ix], gmeans[ix]))
    best_threshold_num = round(thresholds[ix], 3)
    gmeans_num = round(gmeans[ix], 3)

    y_pred = (y_pred > thresholds[ix])

    accuracy = accuracy_score(y_test, y_pred)
    
    accuracy_num = f"{accuracy * 100:.1f}"
    
    print("\033[1mModel Accuracy\033[0m \t\t:", round(accuracy*100,2), "%")

    print("\033[1m\nClassification Report:\033[0m")
    print(classification_report(y_test, y_pred))

    return model, x_test_init, y_test, y_pred, gmeans_num, accuracy_num, roc_auc, best_threshold_num

In [81]:
def report_generator(report, model_fullname, model_name, gmeans_num, accuracy_num, roc_auc, best_threshold_num):
    #transfrom report from dictionary to df
    df_report = pd.DataFrame.from_dict(report).transpose()

    #adjust data display format
    df_report['support'] = df_report['support'].astype(int)
    df_report.iloc[:, 0:3] = df_report.iloc[:, 0:3].round(2)
    roc_auc = roc_auc *100

    #markdown file content
    markdown_content = f"""
## {model_fullname} Accuracy

**ROC-AUC Score:** {roc_auc}% &nbsp;&nbsp; **Best Threshold:** {best_threshold_num} &nbsp;&nbsp; **G-Mean:** {gmeans_num} &nbsp;&nbsp; **Model Accuracy:** {accuracy_num}%

                    Precision   Recall      F1-Score    Support

    0               {df_report.iloc[0,0]}        {df_report.iloc[0,1]}        {df_report.iloc[0,2]}        {df_report.iloc[0,3]}
    1               {df_report.iloc[1,0]}        {df_report.iloc[1,1]}        {df_report.iloc[1,2]}        {df_report.iloc[1,3]}

    Accuracy                                {df_report.iloc[2,2]}        {df_report.iloc[3,3]}
    Macro Avg       {df_report.iloc[3,0]}        {df_report.iloc[3,1]}        {df_report.iloc[3,2]}        {df_report.iloc[3,3]}
    Weighted Avg    {df_report.iloc[4,0]}        {df_report.iloc[4,1]}        {df_report.iloc[4,2]}        {df_report.iloc[3,3]}
"""
    with open(f'{model_name}_accuracy.md','w') as markdown_file:
        markdown_file.write(markdown_content)
    

#### 5. Training Models with Various Config

##### Model 1: CV LR

In [82]:
#global variable - total 2 targets 0/1
target_names = ['0','1']

In [83]:
#model name variables
model_fullname = "CV Logistic Regression (LR)"
model_name = "CV-LogisticRegression"

#cvlr model training
cv = CountVectorizer(ngram_range=(1, 1), max_features = 500)
model = LogisticRegression(max_iter=1000)

model, x_test_init, y_test, y_pred, gmeans_num, accuracy_num, roc_auc, best_threshold_num = train_model(
    df=df, 
    input='text', 
    target='fraudulent', 
    test_size=0.2,
    over_sample=True, 
    vectorizer=cv, 
    model=model)

Splitted Data into X and Y.
Splitted Data into Train and Test.


Vectorized Training Data.
Oversampling Done for Training Data.
Vectorized Testing Data.
Model Fitted Successfully.

[1mROC-AUC Score[0m 		: 92.0 %
[1mBest Threshold[0m 		: 0.121 
[1mG-Mean[0m 			: 0.868
[1mModel Accuracy[0m 		: 93.88 %
[1m
Classification Report:[0m
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      3395
           1       0.44      0.75      0.55       181

    accuracy                           0.94      3576
   macro avg       0.71      0.85      0.76      3576
weighted avg       0.96      0.94      0.95      3576



In [84]:
#fetch classification report and generate markdown report
report = classification_report(y_test, y_pred, target_names = target_names, output_dict = True)
report_generator(report, model_fullname, model_name, gmeans_num, accuracy_num, roc_auc, best_threshold_num)


In [85]:
#generate output csv
output = pd.concat([x_test_init, y_test,  pd.Series(y_pred.astype(int),index=y_test.index)], axis=1)
output.columns = ["Processed Text","Fraudulent_Truth","Fraudulent_Prediction"]
output.to_csv(f"../output/jobs/{model_name}.csv")

##### Model 2: CV-RFC

In [86]:
#model name
model_fullname = "CV Random Forest Classifier (RFC)"
model_name = "CV-RandomForestClassifier"

#model training
cv = CountVectorizer(ngram_range=(1, 1), max_features = 500)
rfc = RandomForestClassifier(n_jobs=3, oob_score=True, n_estimators=100, criterion="gini")

model, x_test_init, y_test, y_pred, gmeans_num, accuracy_num, roc_auc, best_threshold_num  = train_model(
    df=df, 
    input='text', 
    target='fraudulent', 
    test_size=0.2,
    over_sample=True, 
    vectorizer=cv, 
    model=rfc)

Splitted Data into X and Y.
Splitted Data into Train and Test.
Vectorized Training Data.
Oversampling Done for Training Data.
Vectorized Testing Data.
Model Fitted Successfully.

[1mROC-AUC Score[0m 		: 98.0 %
[1mBest Threshold[0m 		: 0.130 
[1mG-Mean[0m 			: 0.920
[1mModel Accuracy[0m 		: 97.65 %
[1m
Classification Report:[0m
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3395
           1       0.95      0.56      0.71       181

    accuracy                           0.98      3576
   macro avg       0.97      0.78      0.85      3576
weighted avg       0.98      0.98      0.97      3576



In [87]:
#fetch classification report and generate markdown report
report = classification_report(y_test, y_pred, target_names = target_names, output_dict = True)
report_generator(report, model_fullname, model_name, gmeans_num, accuracy_num, roc_auc, best_threshold_num)

In [88]:
#generate output
output = pd.concat([x_test_init, y_test,  pd.Series(y_pred.astype(int),index=y_test.index)], axis=1)
output.columns = ["Processed Text","Fraudulent_Truth","Fraudulent_Prediction"]
output.to_csv(f"../output/jobs/{model_name}.csv")

##### Model 3: TFIDF-LR


In [89]:
#model name
model_fullname = "TFIDF Logistic Regression (LR)"
model_name = "TFIDF-LogisticRegression"

#model training
tfidf = TfidfVectorizer(ngram_range=(1, 1), max_features = 500)
lr = LogisticRegression()

rfc, x_test_init, y_test, y_pred, gmeans_num, accuracy_num, roc_auc, best_threshold_num = train_model(
    df=df, 
    input='text', 
    target='fraudulent', 
    test_size=0.2,
    over_sample=True, 
    vectorizer=tfidf, 
    model=lr)

Splitted Data into X and Y.
Splitted Data into Train and Test.
Vectorized Training Data.
Oversampling Done for Training Data.
Vectorized Testing Data.
Model Fitted Successfully.

[1mROC-AUC Score[0m 		: 94.0 %
[1mBest Threshold[0m 		: 0.417 
[1mG-Mean[0m 			: 0.871
[1mModel Accuracy[0m 		: 92.59 %
[1m
Classification Report:[0m
              precision    recall  f1-score   support

           0       0.99      0.93      0.96      3395
           1       0.39      0.80      0.52       181

    accuracy                           0.93      3576
   macro avg       0.69      0.86      0.74      3576
weighted avg       0.96      0.93      0.94      3576



In [90]:
#fetch classification report and generate markdown report
report = classification_report(y_test, y_pred, target_names = target_names, output_dict = True)
report_generator(report, model_fullname, model_name, gmeans_num, accuracy_num, roc_auc, best_threshold_num)

In [91]:
#generate output csv
output = pd.concat([x_test_init, y_test,  pd.Series(y_pred.astype(int),index=y_test.index)], axis=1)
output.columns = ["Processed Text","Fraudulent_Truth","Fraudulent_Prediction"]
output.to_csv(f"../output/jobs/{model_name}.csv")

##### Model 4: TFIDF SVC

In [92]:
#model name
model_fullname = "TFIDF Support Vector Classifier (SVC)"
model_name = "TFIDF-SVC"

#model training
tfidf = TfidfVectorizer(ngram_range=(1, 1), max_features = 500)
svc = SVC(probability=True)

model, x_test_init, y_test, y_pred, gmeans_num, accuracy_num, roc_auc, best_threshold_num = train_model(
    df=df, 
    input='text', 
    target='fraudulent', 
    test_size=0.2,
    over_sample=True, 
    vectorizer=tfidf, 
    model=svc)

Splitted Data into X and Y.
Splitted Data into Train and Test.
Vectorized Training Data.
Oversampling Done for Training Data.
Vectorized Testing Data.
Model Fitted Successfully.

[1mROC-AUC Score[0m 		: 97.0 %
[1mBest Threshold[0m 		: 0.001 
[1mG-Mean[0m 			: 0.926
[1mModel Accuracy[0m 		: 98.27 %
[1m
Classification Report:[0m
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3395
           1       1.00      0.66      0.79       181

    accuracy                           0.98      3576
   macro avg       0.99      0.83      0.89      3576
weighted avg       0.98      0.98      0.98      3576



In [93]:
#fetch classification report and generate markdown report
report = classification_report(y_test, y_pred, target_names = target_names, output_dict = True)
report_generator(report, model_fullname, model_name, gmeans_num, accuracy_num, roc_auc, best_threshold_num)

In [94]:
output = pd.concat([x_test_init, y_test,  pd.Series(y_pred.astype(int),index=y_test.index)], axis=1)
output.columns = ["Processed Text","Fraudulent_Truth","Fraudulent_Prediction"]
output.to_csv(f"../output/jobs/{model_name}.csv")