In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import RandomOverSampler

In [2]:
df = pd.read_csv(r'M:\courses\AI All\Amit Machin Learning\fake_job_postings.csv')
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [3]:
df = df.drop_duplicates()

In [4]:
df = df.drop(["job_id", "title", "location", "department", "salary_range", "description", "requirements", "benefits", "telecommuting", "has_company_logo", "has_questions", "employment_type", "required_experience", "required_education"], axis=1)

In [5]:
def clean_text(text):
    text = str(text) # convert to string type
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = text.lower()
    return text

In [6]:
df['company_profile'] = df['company_profile'].apply(clean_text)
df['industry'] = df['industry'].apply(clean_text)
df['function'] = df['function'].apply(clean_text)

In [7]:
for col in df :
    null = []
    for i in range(len(df[col])) :
        if str(df[col][i]) == 'nan':
            null.append(1)
    print(f'column is {col}',len(null))

column is company_profile 3308
column is industry 4903
column is function 6455
column is fraudulent 0


In [8]:
mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
median = SimpleImputer(missing_values=np.nan, strategy='median')
for col in df.columns:
    print('number of null {} in {}'.format(df[col].isna().sum(), col))
    if df[col].dtype == 'object':
        df[col] = mode.fit_transform(df[[col]])
    else:
        df[col] = median.fit_transform(df[[col]])
    print('number of null {} in {}'.format(df[col].isna().sum(), col))
    print('########' * 30)

number of null 0 in company_profile
number of null 0 in company_profile
################################################################################################################################################################################################################################################
number of null 0 in industry
number of null 0 in industry
################################################################################################################################################################################################################################################
number of null 0 in function
number of null 0 in function
################################################################################################################################################################################################################################################
number of null 0 in fraudulent
number of null 0 in fraudulent
###########################

In [9]:
vectorizer = CountVectorizer(stop_words=stopwords.words('english'))
text_cols = ['company_profile', 'industry', 'function']
text_data = vectorizer.fit_transform(df[text_cols].apply(lambda x: ' '.join(x), axis=1)).toarray()

In [10]:
num_cols = [c for c in df.columns if c not in text_cols + ['fraudulent']]
num_data = df[num_cols].to_numpy()

In [11]:
x = np.concatenate([text_data, num_data], axis=1)
y = df['fraudulent']

In [12]:
oversample = RandomOverSampler(sampling_strategy='minority')
x, y = oversample.fit_resample(x, y)

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
lr = LogisticRegression(max_iter=1000)
lr.fit(x_train, y_train)

LogisticRegression(max_iter=1000)

In [14]:
y_pred = lr.predict(x_train)

print('Accuracy:', accuracy_score(y_train, y_pred))

print('Precision:', precision_score(y_train, y_pred))

print('Recall:', recall_score(y_train, y_pred))

print('F1 Score:', f1_score(y_train, y_pred))

# Cross-validation
from sklearn.model_selection import cross_val_score

scores = cross_val_score(lr, x, y, cv=10)

print('Cross-validation scores:', scores)

print('Mean:', np.mean(scores))

print('Standard Deviation:', np.std(scores))

# Confusion matrix

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_train, y_pred)

print('Confusion Matrix:\n', cm)

Accuracy: 0.9454485342737492
Precision: 0.9155180689844339
Recall: 0.9814025286680388
F1 Score: 0.9473161386454748
Cross-validation scores: [0.92535998 0.92741699 0.92007053 0.9280047  0.95562739 0.95944755
 0.964737   0.94886865 0.96237507 0.91798942]
Mean: 0.9409897256682772
Standard Deviation: 0.017894250227312152
Confusion Matrix:
 [[12386  1232]
 [  253 13351]]


In [15]:
from sklearn.metrics import classification_report
print(classification_report(y_train,y_pred))

              precision    recall  f1-score   support

         0.0       0.98      0.91      0.94     13618
         1.0       0.92      0.98      0.95     13604

    accuracy                           0.95     27222
   macro avg       0.95      0.95      0.95     27222
weighted avg       0.95      0.95      0.95     27222



In [16]:
y_pred = lr.predict(x_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))


# Cross-validation

scores = cross_val_score(lr, x, y, cv=10)
print('Cross-validation scores:', scores)
print('Mean:', np.mean(scores))
print('Standard Deviation:', np.std(scores))


# Confusion matrix

cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', cm)

Accuracy: 0.9447546282691742
Precision: 0.9133514986376022
Recall: 0.9829912023460411
F1 Score: 0.9468926553672318
Cross-validation scores: [0.92535998 0.92741699 0.92007053 0.9280047  0.95562739 0.95944755
 0.964737   0.94886865 0.96237507 0.91798942]
Mean: 0.9409897256682772
Standard Deviation: 0.017894250227312152
Confusion Matrix:
 [[3078  318]
 [  58 3352]]


In [17]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.98      0.91      0.94      3396
         1.0       0.91      0.98      0.95      3410

    accuracy                           0.94      6806
   macro avg       0.95      0.94      0.94      6806
weighted avg       0.95      0.94      0.94      6806



In [26]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
 

model.fit(x_train, y_train)
    
print('Training Score is {:.0%}:'.format(model.score(x_train, y_train)))

print('Test Score is {:.0%}:'.format(model.score(x_test, y_test)))  

y_pred = model.predict(x_test)

print(('Accuracy is {:.0%}'.format(accuracy_score(y_test, y_pred))))

print('F1 score is {:.0%}'.format(f1_score(y_test, y_pred)))

Training Score is 96%:
Test Score is 96%:
Accuracy is 96%
F1 score is 96%


In [27]:
y_pred = model.predict(x_train)

print('Accuracy:', accuracy_score(y_train, y_pred))

print('Precision:', precision_score(y_train, y_pred))

print('Recall:', recall_score(y_train, y_pred))

print('F1 Score:', f1_score(y_train, y_pred))

# Cross-validation

scores = cross_val_score(model, x, y, cv=10)

print('Cross-validation scores:', scores)

print('Mean:', np.mean(scores))

print('Standard Deviation:', np.std(scores))

# Confusion matrix

cm = confusion_matrix(y_train, y_pred)

print('Confusion Matrix:\n', cm)

print(classification_report(y_train,y_pred))

Accuracy: 0.9595180368819337
Precision: 0.9282680186352426
Recall: 0.9959570714495737
F1 Score: 0.9609219858156028
Cross-validation scores: [0.94446077 0.93917132 0.93652659 0.95033794 0.96444314 0.9753159
 0.97649133 0.9597414  0.97236919 0.96237507]
Mean: 0.958123266067237
Standard Deviation: 0.014033615252819764
Confusion Matrix:
 [[12571  1047]
 [   55 13549]]
              precision    recall  f1-score   support

         0.0       1.00      0.92      0.96     13618
         1.0       0.93      1.00      0.96     13604

    accuracy                           0.96     27222
   macro avg       0.96      0.96      0.96     27222
weighted avg       0.96      0.96      0.96     27222



In [28]:
y_pred = model.predict(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))

print('Precision:', precision_score(y_test, y_pred))

print('Recall:', recall_score(y_test, y_pred))

print('F1 Score:', f1_score(y_test, y_pred))

# Cross-validation

scores = cross_val_score(model, x, y, cv=10)

print('Cross-validation scores:', scores)

print('Mean:', np.mean(scores))

print('Standard Deviation:', np.std(scores))

# Confusion matrix

cm = confusion_matrix(y_test, y_pred)

print('Confusion Matrix:\n', cm)

Accuracy: 0.9570966794005289
Precision: 0.9245642701525054
Recall: 0.9956011730205279
F1 Score: 0.9587687094041231
Cross-validation scores: [0.94416691 0.93917132 0.93652659 0.95033794 0.96503086 0.9753159
 0.97707905 0.9597414  0.97236919 0.96296296]
Mean: 0.958270212522996
Standard Deviation: 0.014185832076684713
Confusion Matrix:
 [[3119  277]
 [  15 3395]]


In [29]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       1.00      0.92      0.96      3396
         1.0       0.92      1.00      0.96      3410

    accuracy                           0.96      6806
   macro avg       0.96      0.96      0.96      6806
weighted avg       0.96      0.96      0.96      6806

