#  1. Setup, Imports, and Data Loading

In [29]:

import os
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
import textstat
import nltk

# Download necessary NLTK components

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [30]:
# Load the raw dataset
df = pd.read_csv("/content/drive/MyDrive/Fake_Job_Posting_Detection/data/raw/fake_job_postings.csv")
df.drop('job_id', axis=1, inplace=True) # Drop ID column
print("Data loaded successfully and ready for processing.")

Data loaded successfully and ready for processing.


In [18]:
df.head(3)

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0


In [31]:
df.shape

(17880, 17)

# 2. Missing Value Handling
**EDA shows:**

* Missing fields are informative, not random

* “Missing” itself is a fraud signal

In [32]:
df.dtypes

Unnamed: 0,0
title,object
location,object
department,object
salary_range,object
company_profile,object
description,object
requirements,object
benefits,object
telecommuting,int64
has_company_logo,int64


In [33]:
df['salary_range'].isna().sum()
df['salary_range'].value_counts(dropna=False).head(10)


Unnamed: 0_level_0,count
salary_range,Unnamed: 1_level_1
,15012
0-0,142
40000-50000,66
30000-40000,55
25000-30000,37
45000-67000,37
30000-50000,32
35000-45000,30
70000-90000,30
80000-100000,30


In [34]:
def normalize_salary_range(col):
    col = col.copy()

    # Keep NaN as NaN
    col = col.replace('0-0', np.nan)

    return col

In [35]:
df['salary_range'] = normalize_salary_range(df['salary_range'])


In [26]:
df['salary_range'].value_counts(dropna=False).head(5)


Unnamed: 0_level_0,count
salary_range,Unnamed: 1_level_1
,15154
40000-50000,66
30000-40000,55
45000-67000,37
25000-30000,37


In [38]:
df['salary_range'] = df['salary_range'].fillna('unknown')


In [39]:
df['salary_range'].isna().sum()

np.int64(0)

In [40]:
# Categorical/text columns
categorical_cols = df.select_dtypes(include='object').columns
df[categorical_cols] = df[categorical_cols].fillna('MISSING')

# Binary flags
binary_cols = ['telecommuting', 'has_company_logo', 'has_questions']
df[binary_cols] = df[binary_cols].fillna(0)

# Salary handled separately
df['salary_range'].replace('MISSING', np.nan, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['salary_range'].replace('MISSING', np.nan, inplace=True)


In [41]:
df.isnull().sum()

Unnamed: 0,0
title,0
location,0
department,0
salary_range,0
company_profile,0
description,0
requirements,0
benefits,0
telecommuting,0
has_company_logo,0


# 3. Text Cleaning Pipeline
This is essential for preparing the text columns (title, description, requirements, etc.) for feature extraction.

In [43]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text: str) -> str:
    text = str(text).lower()
    text = re.sub(r'http\\S+|www\\S+', ' ', text)
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-z0-9\\s]', ' ', text)  # KEEP numbers

    tokens = [
        lemmatizer.lemmatize(w)
        for w in text.split()
        if w not in stop_words and len(w) > 1
    ]
    return " ".join(tokens)


### Text Columns Cleaning

In [44]:
TEXT_COLS = [
    'title', 'location', 'department', 'company_profile',
    'description', 'requirements', 'benefits',
    'employment_type', 'required_experience',
    'required_education', 'industry', 'function'
]

for col in TEXT_COLS:
    df[f"{col}_clean"] = df[col].apply(clean_text)

In [45]:
(df[[c + "_clean" for c in ['description']]].head())


Unnamed: 0,description_clean
0,food52 fast growing james beard award winning ...
1,organised focused vibrant awesome passion cust...
2,client located houston actively seeking experi...
3,company esri environmental system research ins...
4,job title itemization review managerlocation f...


# 4. Unified Text Field


In [46]:
df['full_text'] = (
    df['title_clean'] + " " +
    df['description_clean'] + " " +
    df['requirements_clean']
).str.strip()

In [47]:
(df['full_text'].str.strip() == '').mean()


np.float64(0.0)

## 5.Lexical Feature Engineering


In [48]:

def extract_lexical_features(clean_text, raw_text):
    if not clean_text.strip():
        return 0, 0, 0.0, 0.0

    word_count = len(clean_text.split())
    unique_words = len(set(clean_text.split()))

    flesch = textstat.flesch_reading_ease(raw_text)
    dale = textstat.dale_chall_readability_score(raw_text)

    return word_count, unique_words, flesch, dale


In [49]:
df[['word_count', 'unique_word_count', 'flesch_score', 'dale_chall_score']] = df.apply(
    lambda r: pd.Series(
        extract_lexical_features(r['full_text'], r['description'])
    ),
    axis=1
)

df['lexical_diversity'] = df['unique_word_count'] / df['word_count'].replace(0, 1)


# 6. Metadata Consistency & Credibility

In [50]:
df['has_company_info'] = (df['company_profile'] != 'MISSING').astype(int)
df['has_benefits'] = (df['benefits'] != 'MISSING').astype(int)
df['salary_explicit'] = df['salary_range'].notnull().astype(int)

company_freq = df['company_profile'].value_counts()
df['log_company_credibility'] = df['company_profile'].map(company_freq).fillna(1)
df['log_company_credibility'] = np.log1p(df['log_company_credibility'])


In [51]:
df.groupby('fraudulent')[
    ['has_company_info', 'has_benefits', 'salary_explicit', 'log_company_credibility']
].mean()


Unnamed: 0_level_0,has_company_info,has_benefits,salary_explicit,log_company_credibility
fraudulent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.840073,0.597508,1.0,4.249956
1,0.322171,0.579677,1.0,6.282387


# 7. Categorical Encoding
We have convert categorical text features into numerical format for the ML models.

In [52]:
ENCODE_COLS = [
    'location_clean', 'required_experience_clean',
    'employment_type_clean', 'industry_clean', 'function_clean'
]

label_encoders = {}

for col in ENCODE_COLS:
    le = LabelEncoder()
    df[col + "_enc"] = le.fit_transform(df[col])
    label_encoders[col] = le


In [53]:
for col in ENCODE_COLS:
    print(col, df[col].nunique())


location_clean 2698
required_experience_clean 8
employment_type_clean 6
industry_clean 132
function_clean 38


# 8.Final Feature Matrix

In [54]:
FEATURES = [
    'telecommuting', 'has_company_logo', 'has_questions',
    'word_count', 'unique_word_count', 'lexical_diversity',
    'flesch_score', 'dale_chall_score',
    'has_company_info', 'has_benefits',
    'salary_explicit', 'log_company_credibility'
] + [c + "_enc" for c in ENCODE_COLS]

X = df[FEATURES]
y = df['fraudulent']


In [55]:
assert X.isnull().sum().sum() == 0
print("Final feature matrix shape:", X.shape)


Final feature matrix shape: (17880, 17)


# 9. Save Clean Output

In [56]:
OUTPUT_PATH = "/content/drive/MyDrive/Fake_Job_Posting_Detection/data/processed/processed_data.csv"
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

pd.concat([X, y], axis=1).to_csv(OUTPUT_PATH, index=False)


In [57]:
df_processed = pd.read_csv('/content/drive/MyDrive/Fake_Job_Posting_Detection/data/processed/processed_data.csv')
df_processed.head(5)

Unnamed: 0,telecommuting,has_company_logo,has_questions,word_count,unique_word_count,lexical_diversity,flesch_score,dale_chall_score,has_company_info,has_benefits,salary_explicit,log_company_credibility,location_clean_enc,required_experience_clean_enc,employment_type_clean_enc,industry_clean_enc,function_clean_enc,fraudulent
0,0,1,0,163.0,139.0,0.852761,-22.565968,15.116055,1,0,1,3.73767,2150,5,0,83,23,0
1,0,1,0,334.0,221.0,0.661677,41.551532,11.790945,1,1,1,3.218876,931,0,2,75,8,0
2,0,1,0,157.0,111.0,0.707006,13.950333,11.094967,1,0,1,5.141664,1592,7,3,83,24,0
3,0,1,0,352.0,257.0,0.730114,-15.455136,13.55608,1,1,1,3.871201,1437,6,2,22,32,0
4,0,1,1,197.0,159.0,0.807107,-54.455,16.28285,1,1,1,4.110874,1477,6,2,50,17,0


# 7. Commit progress to GitHub

In [58]:
!find /content/drive -type d -name ".git"


/content/drive/MyDrive/Fake_Job_Posting_Detection/.git
/content/drive/.Encrypted/MyDrive/Fake_Job_Posting_Detection/.git


In [59]:
%cd /content/drive/MyDrive/Fake_Job_Posting_Detection


/content/drive/MyDrive/Fake_Job_Posting_Detection


In [60]:
!ls -a


data  .git  .gitignore	models	notebooks  README.md  src


In [61]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   notebooks/01_exploratry_data_analysis.ipynb[m
	[31mmodified:   notebooks/02_preprocessing.ipynb[m
	[31mmodified:   notebooks/03_Model_Training_Evaluation.ipynb[m
	[31mmodified:   notebooks/04_RAG_Implementation.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mdata/[m

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
!git add .

In [None]:
!git config --global user.email "muhammadriaz8685@gmail.com"
!git config --global user.name "mriaz72"


In [None]:
!git commit -m "Completed advanced preprocessing and feature engineering."

[main 66cf3a4] Completed advanced preprocessing and feature engineering.
 3 files changed, 17883 insertions(+), 1 deletion(-)
 create mode 100644 data/processed_data.csv
 create mode 100644 notebooks/02_preprocessing.ipynb


In [None]:
!git push


Enumerating objects: 10, done.
Counting objects:  10% (1/10)Counting objects:  20% (2/10)Counting objects:  30% (3/10)Counting objects:  40% (4/10)Counting objects:  50% (5/10)Counting objects:  60% (6/10)Counting objects:  70% (7/10)Counting objects:  80% (8/10)Counting objects:  90% (9/10)Counting objects: 100% (10/10)Counting objects: 100% (10/10), done.
Delta compression using up to 2 threads
Compressing objects: 100% (6/6), done.
Writing objects: 100% (7/7), 603.60 KiB | 3.09 MiB/s, done.
Total 7 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 1 local object.[K
To https://github.com/mriaz72/Fake_Job_Posting_Detection.git
   93b0d9b..66cf3a4  main -> main
