#  1. Setup, Imports, and Data Loading

In [5]:

import os
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import textstat
import nltk

# Download necessary NLTK components

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [7]:
# Load the raw dataset
df = pd.read_csv("/content/drive/MyDrive/Fake_Job_Posting_Detection/data/raw/fake_job_postings.csv")
df.drop('job_id', axis=1, inplace=True) # Drop ID column
print("Data loaded successfully and ready for processing.")

Data loaded successfully and ready for processing.


In [8]:
df.head(3)

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0


In [9]:
df.shape

(17880, 17)

# 2. Missing Value Handling
The high volume of missing data requires a strategic approach, treating 'missing' as a meaningful category.

In [10]:
df.dtypes

Unnamed: 0,0
title,object
location,object
department,object
salary_range,object
company_profile,object
description,object
requirements,object
benefits,object
telecommuting,int64
has_company_logo,int64


In [11]:


# Strategy: Impute all categorical NaN/Null values with 'MISSING' or a logical placeholder.
# This makes 'MISSING' a valid category for the model to learn from.

# 1. Impute categorical columns (most columns)
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].fillna('MISSING')

# 2. Check the numeric columns (mostly flags: 0/1) for NaNs
# 'fraudulent' is the target, 'telecommuting', 'has_company_logo', 'has_questions' are flags.
# No other numeric columns expected to have NaNs, but check if any exist and fill with 0
for col in ['telecommuting', 'has_company_logo', 'has_questions']:
    df[col] = df[col].fillna(0)

# 3. Handle 'salary_range' separately for potential feature creation (see step D)
df['salary_range'].replace('MISSING', np.nan, inplace=True)

print("Missing values in all text/categorical columns imputed with 'MISSING' category.")

Missing values in all text/categorical columns imputed with 'MISSING' category.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['salary_range'].replace('MISSING', np.nan, inplace=True)


# 3. Text Cleaning Pipeline
This is essential for preparing the text columns (title, description, requirements, etc.) for feature extraction.

In [12]:
#  Advanced Text Cleaning (Regex, Stopwords, Lemmatization)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # 1. Lowercasing
    text = str(text).lower()

    # 2. Remove URLs, HTML tags, and non-alphanumeric characters (keep spaces)
    text = re.sub(r'http\S+|www\S+|\S+\.\S+', ' ', text) # Remove URLs
    text = re.sub(r'<.*?>', ' ', text)                    # Remove HTML tags
    text = re.sub(r'[^a-z\s]', ' ', text)                 # Remove non-alpha (punctuation, numbers)

    # 3. Tokenize (split into words)
    words = text.split()

    # 4. Remove Stopwords and Lemmatize
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words and len(w) > 1]

    # 5. Join back into a string
    return ' '.join(words)

# Apply the cleaning function to all relevant text columns
text_cols = ['title', 'location', 'department', 'company_profile',
             'description', 'requirements', 'benefits', 'employment_type',
             'required_experience', 'required_education', 'industry', 'function']

for col in text_cols:
    df[col + '_cleaned'] = df[col].apply(clean_text)

print("Text cleaning and lemmatization applied to all relevant columns.")

Text cleaning and lemmatization applied to all relevant columns.


# 4. Feature Engineering
Created complex, meaningful features here that demonstrate strong analytical depth.

## 4.1. Feature Engineering from Text (Lexical, Readability)
We use the cleaned text to calculate metrics that capture the style of writing.

In [13]:


# Combine main text columns for a single comprehensive text feature
df['full_text'] = df['title_cleaned'] + ' ' + df['description_cleaned'] + ' ' + df['requirements_cleaned']

def get_lexical_features(text):
    if not text.strip(): # Handle empty strings
        return 0, 0, 0, 0

    # Lexical Features
    word_count = len(text.split())
    unique_words = len(set(text.split()))

    # Readability Scores (using raw text for best score accuracy)
    try:
        flesch_reading_ease = textstat.flesch_reading_ease(text)
    except:
        flesch_reading_ease = 50.0 # Default to average on error

    # Textstat requires the original (uncleaned) description for accurate punctuation/sentence count
    original_text = df.loc[df['full_text'] == text, 'description'].iloc[0] if not df[df['full_text'] == text].empty else text
    try:
        dale_chall_score = textstat.dale_chall_readability_score(str(original_text))
    except:
        dale_chall_score = 10.0 # Default on error

    return word_count, unique_words, flesch_reading_ease, dale_chall_score

# Apply feature extraction
df[['word_count', 'unique_word_count', 'flesch_score', 'dale_chall_score']] = df['full_text'].apply(
    lambda x: pd.Series(get_lexical_features(x))
)

df['lexical_diversity'] = df['unique_word_count'] / df['word_count'].replace(0, 1) # Avoid division by zero
print("Text-based features (Lexical Diversity, Readability Scores) engineered.")

Text-based features (Lexical Diversity, Readability Scores) engineered.


## 4.2. Metadata Consistency and Employer Credibility
These features capture structural red flags common in fake postings.

In [14]:
 # Metadata and Consistency Features

# 1. Metadata Consistency: Check if certain fields are present when they should be
df['has_company_info'] = df['company_profile'].apply(lambda x: 1 if x not in ['MISSING', ''] else 0)
df['has_benefits'] = df['benefits'].apply(lambda x: 1 if x not in ['MISSING', ''] else 0)
df['salary_explicit'] = df['salary_range'].apply(lambda x: 0 if pd.isna(x) else 1)

# 2. Employer Credibility Score (Frequency-based heuristic)
# Fake jobs often come from rare/newly created accounts (low frequency posting)

# Calculate post frequency for each company
company_counts = df['company_profile'].value_counts()
# Map counts back to the DataFrame. Use log-transform to reduce skew.
df['log_company_credibility_score'] = df['company_profile'].map(company_counts).apply(
    lambda x: np.log1p(x) if x > 1 else 0
)

print("Metadata consistency flags and log-transformed employer credibility score engineered.")

Metadata consistency flags and log-transformed employer credibility score engineered.


# 5. Categorical Encoding
We have convert categorical text features into numerical format for the ML models.

In [15]:
#  Categorical Encoding (Label Encoding for High Cardinality)

# Identify high-cardinality nominal columns where Label Encoding is preferred over One-Hot
# due to the sheer number of unique values, which could explode the feature space.
high_cardinality_cols = ['location', 'required_experience_cleaned', 'employment_type_cleaned',
                         'industry_cleaned', 'function_cleaned']

for col in high_cardinality_cols:
    le = LabelEncoder()
    # Fit and transform. Missing values are already imputed as 'MISSING' category.
    df[col + '_encoded'] = le.fit_transform(df[col])

print("High-cardinality features encoded using Label Encoding.")

# Other categorical columns (e.g., flags like has_company_logo) are already numerical (0/1).

High-cardinality features encoded using Label Encoding.


# 6. Final Feature Set Preparation and Saving

In [16]:
PROJECT_ROOT = '/content/drive/MyDrive/Fake_Job_Posting_Detection'
DATA_PATH = os.path.join(PROJECT_ROOT, 'data')
PROCESSED_FILE_PATH = os.path.join(DATA_PATH, 'processed_data.csv')

In [17]:


# 1. Define Features (X) and Target (y)
# Include all numerical and encoded features, plus the new engineered features.
feature_cols = [
    # Original Flags
    'telecommuting', 'has_company_logo', 'has_questions',
    # New Text Features
    'word_count', 'unique_word_count', 'flesch_score', 'dale_chall_score', 'lexical_diversity',
    # New Consistency/Credibility Features
    'has_company_info', 'has_benefits', 'salary_explicit', 'log_company_credibility_score',
    # Encoded Categorical Features
    'location_encoded', 'required_experience_cleaned_encoded',
    'employment_type_cleaned_encoded', 'industry_cleaned_encoded', 'function_cleaned_encoded'
]

X = df[feature_cols]
y = df['fraudulent']

# 2. Final Check for NaNs
print(f"Final check for NaNs in X features: {X.isnull().sum().sum()} (Should be 0)")

# 3. Save the processed feature set

processed_df = pd.concat([X, y], axis=1)
processed_df.to_csv(PROCESSED_FILE_PATH, index=False)

print(f"\nProcessed data saved to: {PROCESSED_FILE_PATH}")
print(f"Final feature set shape: {X.shape}")

Final check for NaNs in X features: 0 (Should be 0)

Processed data saved to: /content/drive/MyDrive/Fake_Job_Posting_Detection/data/processed_data.csv
Final feature set shape: (17880, 17)


In [18]:
processed_df.head(3)

Unnamed: 0,telecommuting,has_company_logo,has_questions,word_count,unique_word_count,flesch_score,dale_chall_score,lexical_diversity,has_company_info,has_benefits,salary_explicit,log_company_credibility_score,location_encoded,required_experience_cleaned_encoded,employment_type_cleaned_encoded,industry_cleaned_encoded,function_cleaned_encoded,fraudulent
0,0,1,0,161.0,136.0,-157.30795,15.116055,0.84472,1,0,0,3.73767,2536,5,0,83,23,0
1,0,1,0,318.0,216.0,-304.555755,11.790945,0.679245,1,1,0,3.218876,1074,0,2,75,8,0
2,0,1,0,131.0,99.0,-152.160534,11.094967,0.755725,1,0,0,5.141664,1868,7,3,83,24,0


# 7. Commit progress to GitHub

In [23]:
!find /content/drive -type d -name ".git"


/content/drive/MyDrive/Fake_Job_Posting_Detection/.git
/content/drive/.Encrypted/MyDrive/Fake_Job_Posting_Detection/.git


In [25]:
%cd /content/drive/MyDrive/Fake_Job_Posting_Detection


/content/drive/MyDrive/Fake_Job_Posting_Detection


In [26]:
!ls -a


data  .git  .gitignore	models	notebooks  README.md  src


In [27]:
!git add .

In [29]:
!git config --global user.email "muhammadriaz8685@gmail.com"
!git config --global user.name "mriaz72"


In [30]:
!git commit -m "Completed advanced preprocessing and feature engineering."

[main 66cf3a4] Completed advanced preprocessing and feature engineering.
 3 files changed, 17883 insertions(+), 1 deletion(-)
 create mode 100644 data/processed_data.csv
 create mode 100644 notebooks/02_preprocessing.ipynb


In [31]:
!git push


Enumerating objects: 10, done.
Counting objects:  10% (1/10)Counting objects:  20% (2/10)Counting objects:  30% (3/10)Counting objects:  40% (4/10)Counting objects:  50% (5/10)Counting objects:  60% (6/10)Counting objects:  70% (7/10)Counting objects:  80% (8/10)Counting objects:  90% (9/10)Counting objects: 100% (10/10)Counting objects: 100% (10/10), done.
Delta compression using up to 2 threads
Compressing objects: 100% (6/6), done.
Writing objects: 100% (7/7), 603.60 KiB | 3.09 MiB/s, done.
Total 7 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 1 local object.[K
To https://github.com/mriaz72/Fake_Job_Posting_Detection.git
   93b0d9b..66cf3a4  main -> main
