In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Load the dataset
data = pd.read_csv("../data/fake_job_postings.csv")

# Inspect column names and structure
print(data.columns)
print(data.head())

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')
   job_id                                      title            location  \
0       1                           Marketing Intern    US, NY, New York   
1       2  Customer Service - Cloud Video Production      NZ, , Auckland   
2       3    Commissioning Machinery Assistant (CMA)       US, IA, Wever   
3       4          Account Executive - Washington DC  US, DC, Washington   
4       5                        Bill Review Manager  US, FL, Fort Worth   

  department salary_range                                    company_profile  \
0  Marketing          NaN  We're Food52, and we've created a groundbreaki...   
1    Success          NaN  90 Seconds, the wor

In [14]:
# Preprocess dataset: drop rows with missing 'function' or 'description' values
data = data.dropna(subset=['function', 'description'])

# Features and target variable
X = data[['function', 'description']]
y = data['fraudulent']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing for different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('function', OneHotEncoder(), ['function']),  # One-hot encode the 'function' column
        ('description', TfidfVectorizer(), 'description')  # Apply TF-IDF to the 'description' column
    ])

# Create a pipeline that first transforms the data and then fits the DecisionTreeClassifier with class weight
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Apply preprocessing steps
    ('classifier', DecisionTreeClassifier(class_weight='balanced'))  # Train the decision tree model with class balancing
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

# Save the trained model to a file
joblib.dump(pipeline, 'decision_tree_model_pipeline.pkl')


              precision    recall  f1-score   support

           0       0.98      0.96      0.97      2176
           1       0.45      0.61      0.52       109

    accuracy                           0.95      2285
   macro avg       0.72      0.78      0.74      2285
weighted avg       0.95      0.95      0.95      2285



['decision_tree_model_pipeline.pkl']

In [16]:
# Load the trained model pipeline
model = joblib.load('decision_tree_model_pipeline.pkl')

# Example prediction
example = pd.DataFrame([{'function': 'Administrative', 'description': 'This is a fake job posting example'}])

# Predict using the loaded model
prediction = model.predict(example)
print("Prediction (fake job):", "Fake" if prediction[0] == 1 else "Real")


Prediction (fake job): Fake
