In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the dataset
file_path = 'fake_job_postings.csv'
data = pd.read_csv(file_path)
print("Initial Data Loaded")
data.head()

Initial Data Loaded


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [3]:
# Step 1: Basic Data Cleaning
data.fillna('', inplace=True)  # Fill empty fields
print("\nAfter Cleaning Missing Values")
data.head()


After Cleaning Missing Values


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [4]:
# Extract and Standardize State Information with Error Handling
def extract_state(location):
    if isinstance(location, str):
        parts = location.split(',')
        if len(parts) >= 2:
            return parts[1].strip()  # Return the state code
    return None  # Return None if format is not as expected

In [5]:
# Ensure 'location' column is a string before applying the filter
data['location'] = data['location'].astype(str)

In [6]:
# Extract state information
data['State'] = data['location'].apply(extract_state)
data['State'] = data['State'].str.upper().str.strip()  # Standardize state format


In [7]:
# Define valid U.S. state codes and filter out international entries
valid_states = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 
    'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 
    'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'
]
data = data[data['State'].isin(valid_states)]
print("\nAfter Extracting and Filtering by Valid States")
data[['location', 'State']].head()  # Checkpoint 3: Location extraction and filtering



After Extracting and Filtering by Valid States


Unnamed: 0,location,State
0,"US, NY, New York",NY
2,"US, IA, Wever",IA
4,"US, FL, Fort Worth",FL
5,"US, MD,",MD
7,"US, CA, San Francisco",CA


In [8]:
# Create TF-IDF Matrix
# https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/03-TF-IDF-Scikit-Learn.html

data['text'] = data['description'].fillna('') + ' ' + data['requirements'].fillna('')
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df.to_csv('tfidf_matrix.csv', index=False)
print("\nTF-IDF Matrix Created")
tfidf_df.head()


TF-IDF Matrix Created


Unnamed: 0,00,000,10,100,12,15,1500,200,30,40,...,work,working,works,world,write,writing,written,year,years,york
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0855,0.124519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117957
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.060219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.028676,0.041762,0.077311,0.0,0.0,0.0,0.0,0.0,0.034144,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051972,0.058532,0.0,...,0.02131,0.06207,0.0,0.0,0.0,0.0,0.0,0.0,0.025374,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.08269,0.055984,0.070353,0.0,0.0


In [9]:
# False Negatives Analysis and Feature Engineering
false_negatives = data[(data['fraudulent'] == 1)]
print("\nFalse Negatives Sample")
false_negatives.head()  # Checkpoint 5: False negatives sample


False Negatives Sample


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,State,text
180,181,Sales Executive,"PK, SD, Karachi",Sales,,,Sales Executive,Sales Executive,Sales Executive,0,0,0,,,,,Sales,1,SD,Sales Executive Sales Executive
215,216,IC&E Technician Mt Poso,"US, CA, Bakersfield, CA / Mt. Poso",Oil & Energy,95000-115000,...,"IC&amp;E Technician | Bakersfield, CA Mt. Poso...","QualificationsKnowledge, Skills &amp; Abiliti...",BENEFITSWhat is offered:Competitive compensati...,0,1,1,Full-time,Mid-Senior level,High School or equivalent,Oil & Energy,Other,1,CA,"IC&amp;E Technician | Bakersfield, CA Mt. Poso..."
357,358,Financing Auto(car) sales,"US, IL, hazelcrest",hr,35000-73000,Looking for adventurous people to join a thriv...,If you have experience in financing for auto s...,prior car sales expprior car loan financing exp,profit sharingcar allowancecompany car,0,0,0,Contract,Associate,,Automotive,,1,IL,If you have experience in financing for auto s...
493,494,Admin Assistant/ Receptionist,"US, CA, Los Angeles",,23040-28800,,"A Newly established company seeks outgoing, fr...","Must have good knowledge of Outlook, Microsoft...",,0,0,0,Full-time,Entry level,High School or equivalent,Computer Software,Administrative,1,CA,"A Newly established company seeks outgoing, fr..."
537,538,Technology Consultant - Outside Sales,"US, OH,",,,,Apply below using link#URL_00962c0bdac3ecf40b2...,,,0,0,0,Full-time,,,Information Technology and Services,,1,OH,Apply below using link#URL_00962c0bdac3ecf40b2...


In [10]:
# Feature Engineering Based on Analysis
data['is_remote'] = data['location'].str.contains("remote|work from home", case=False, na=False).astype(int)
data['description_length'] = data['description'].apply(len)
print("\nFeature Engineering - Remote Flag and Description Length")
data[['is_remote', 'description_length']].head()  # Checkpoint 6: Feature engineering sample



Feature Engineering - Remote Flag and Description Length


Unnamed: 0,is_remote,description_length
0,0,905
2,0,355
4,0,1520
5,0,3418
7,0,2488


In [11]:
# Reclassification Based on Patterns
keywords = ['intern', 'assistant', 'remote']
data['fraud_suspect'] = data['title'].apply(lambda x: any(keyword in x.lower() for keyword in keywords)).astype(int)
data['fraudulent_corrected'] = np.where(
    (data['fraud_suspect'] == 1) | (data['description_length'] < 100),
    1,
    data['fraudulent']
)
print("\nAfter Reclassification - Fraud Suspect and Corrected Fraudulent Labels")
data[['title', 'fraud_suspect', 'fraudulent', 'fraudulent_corrected']].head()  # Checkpoint 7: Reclassification check



After Reclassification - Fraud Suspect and Corrected Fraudulent Labels


Unnamed: 0,title,fraud_suspect,fraudulent,fraudulent_corrected
0,Marketing Intern,1,0,1
2,Commissioning Machinery Assistant (CMA),1,0,1
4,Bill Review Manager,0,0,0
5,Accounting Clerk,0,0,0
7,Lead Guest Service Specialist,0,0,0


In [12]:
# False Negatives Analysis and Feature Engineering
false_negatives = data[(data['fraudulent'] == 1)]
false_negatives_count = len(false_negatives)  # Calculate the total number of false negatives
print(f"\nTotal Number of False Negatives: {false_negatives_count}")  # Display the count
print("False Negatives Sample")
false_negatives.head()


Total Number of False Negatives: 686
False Negatives Sample


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,...,required_education,industry,function,fraudulent,State,text,is_remote,description_length,fraud_suspect,fraudulent_corrected
180,181,Sales Executive,"PK, SD, Karachi",Sales,,,Sales Executive,Sales Executive,Sales Executive,0,...,,,Sales,1,SD,Sales Executive Sales Executive,0,15,0,1
215,216,IC&E Technician Mt Poso,"US, CA, Bakersfield, CA / Mt. Poso",Oil & Energy,95000-115000,...,"IC&amp;E Technician | Bakersfield, CA Mt. Poso...","QualificationsKnowledge, Skills &amp; Abiliti...",BENEFITSWhat is offered:Competitive compensati...,0,...,High School or equivalent,Oil & Energy,Other,1,CA,"IC&amp;E Technician | Bakersfield, CA Mt. Poso...",0,2246,0,1
357,358,Financing Auto(car) sales,"US, IL, hazelcrest",hr,35000-73000,Looking for adventurous people to join a thriv...,If you have experience in financing for auto s...,prior car sales expprior car loan financing exp,profit sharingcar allowancecompany car,0,...,,Automotive,,1,IL,If you have experience in financing for auto s...,0,151,0,1
493,494,Admin Assistant/ Receptionist,"US, CA, Los Angeles",,23040-28800,,"A Newly established company seeks outgoing, fr...","Must have good knowledge of Outlook, Microsoft...",,0,...,High School or equivalent,Computer Software,Administrative,1,CA,"A Newly established company seeks outgoing, fr...",0,352,1,1
537,538,Technology Consultant - Outside Sales,"US, OH,",,,,Apply below using link#URL_00962c0bdac3ecf40b2...,,,0,...,,Information Technology and Services,,1,OH,Apply below using link#URL_00962c0bdac3ecf40b2...,0,2133,0,1


In [13]:
# Save updated dataset with reclassifications and engineered features
corrected_data = data.copy()
corrected_data.to_csv('updated_job_postings.csv', index=False)

print("\nUpdated dataset created with adjusted fraud classification and saved as 'updated_job_postings.csv'.")
print("TF-IDF matrix saved as 'tfidf_matrix.csv'.")


Updated dataset created with adjusted fraud classification and saved as 'updated_job_postings.csv'.
TF-IDF matrix saved as 'tfidf_matrix.csv'.
