In [1]:
# Import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score 
from imblearn.over_sampling import ADASYN  # Import ADASYN
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from sklearn.metrics import confusion_matrix

In [2]:
# Load the TF-IDF matrix and the target variable
try:
    tfidf_matrix = pd.read_csv('tfidf_matrix.csv')  # Replace with your TF-IDF matrix file name
    print("TF-IDF matrix loaded successfully.")
except FileNotFoundError:
    print("Error: The TF-IDF matrix file was not found.")
    exit()

TF-IDF matrix loaded successfully.


In [3]:
# Convert the DataFrame to a sparse matrix
# https://stackoverflow.com/questions/20459536/convert-pandas-dataframe-to-sparse-numpy-matrix-directly
X_tfidf = tfidf_matrix.values  # Assuming it's a DataFrame



In [4]:
# Load the dataset containing the target variable
try:
    data = pd.read_csv('updated_job_postings.csv')  # Replace with your actual file name
    print("Data loaded successfully.")
except FileNotFoundError:
    print("Error: The data file was not found.")
    exit()

Data loaded successfully.


In [5]:
# Define the target variable (y)
y = data['fraudulent']

In [6]:
# Initialize and fit TF-IDF Vectorizer on text data
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(data['text'])
print("TF-IDF vectorizer created and fitted.")

TF-IDF vectorizer created and fitted.


In [7]:
# Split the data into training and hold-out test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)


In [8]:
# Oversample the minority class using ADASYN
# https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.ADASYN.html
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)


In [9]:
# Train the model with Resampled Data
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_resampled, y_resampled)


In [10]:
# Generate probabilities for the positive class (fraudulent)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1] 


In [11]:
# Optimal Threshold Search
thresholds = np.arange(0.1, 1.0, 0.01)
scores = []

for threshold in thresholds:
    y_pred_threshold = (y_pred_proba >= threshold).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_threshold, average='binary', pos_label=1)
    scores.append((threshold, precision, recall, f1))



In [12]:
# Identify the optimal threshold based on F1-score
scores_df = pd.DataFrame(scores, columns=['Threshold', 'Precision', 'Recall', 'F1'])
optimal_threshold = scores_df.loc[scores_df['F1'].idxmax(), 'Threshold']
print(f'Optimal Threshold based on F1-score: {optimal_threshold:.4f}')


Optimal Threshold based on F1-score: 0.3500


In [13]:
# Apply Optimal Threshold to Final Predictions
y_pred_optimal = (y_pred_proba >= optimal_threshold).astype(int)

In [14]:
# Final Evaluation
accuracy = accuracy_score(y_test, y_pred_optimal)
final_report = precision_recall_fscore_support(y_test, y_pred_optimal, average='binary', pos_label=1)

print(f'Accuracy at Optimal Threshold: {accuracy}')
print("Final Classification Report at Optimal Threshold:")
print(f'Precision: {final_report[0]:.4f}, Recall: {final_report[1]:.4f}, F1-Score: {final_report[2]:.4f}')


Accuracy at Optimal Threshold: 0.9736580516898609
Final Classification Report at Optimal Threshold:
Precision: 0.9200, Recall: 0.6715, F1-Score: 0.7764


In [15]:
# Save the model and vectorizer for deployment
# https://docs.python.org/3/library/pickle.html
with open('best_rf_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)
    print("Trained model saved as 'best_rf_model.pkl'")

Trained model saved as 'best_rf_model.pkl'


In [16]:
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)
    print("TF-IDF vectorizer saved as 'tfidf_vectorizer.pkl'")

TF-IDF vectorizer saved as 'tfidf_vectorizer.pkl'
