In [1]:
# Import Dependencies
import pandas as pd
import scipy.sparse as sp
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np

  from pandas.core import (


In [2]:
# Load the TF-IDF matrix from CSV
try:
    tfidf_matrix = pd.read_csv('tfidf_matrix.csv')  # Replace with your actual TF-IDF matrix file name
    print("TF-IDF matrix loaded successfully.")
except FileNotFoundError:
    print("Error: The TF-IDF matrix file was not found.")
    exit()

TF-IDF matrix loaded successfully.


In [3]:
# Check the shape and type of the loaded TF-IDF matrix
print("Shape of TF-IDF matrix:", tfidf_matrix.shape)


Shape of TF-IDF matrix: (17880, 500)


In [4]:
# Convert the DataFrame to a sparse matrix (if needed)
X_tfidf = sp.csr_matrix(tfidf_matrix.values)

In [5]:
# Load the dataset that contains the target variable
try:
    data = pd.read_csv('fake_job_postings.csv') 
    print("Data loaded successfully.")
except FileNotFoundError:
    print("Error: The data file was not found.")
    exit()

Data loaded successfully.


In [6]:
# Define target variable (y)
y = data['fraudulent']  # Adjust to refer to the correct target column

In [7]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


In [8]:
# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [9]:
# Predict on test data
y_pred = rf_model.predict(X_test)

In [10]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.973434004474273
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      3395
           1       0.99      0.48      0.65       181

    accuracy                           0.97      3576
   macro avg       0.98      0.74      0.82      3576
weighted avg       0.97      0.97      0.97      3576



In [11]:
# Use SMOTE for resampling the training data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


In [12]:
# Define the parameter distribution for RandomizedSearchCV
param_distributions = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'class_weight': ['balanced', None]  # Adjusts weights inversely proportional to class frequencies
}

In [13]:
# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(RandomForestClassifier(random_state=42), 
                                   param_distributions=param_distributions, 
                                   n_iter=10,  # Number of parameter settings to sample
                                   cv=3,  # Number of cross-validation folds
                                   scoring='f1_weighted',
                                   random_state=42,
                                   n_jobs=-1)  # Use all available cores


In [14]:
# Fit the Randomized Search with resampled data
random_search.fit(X_resampled, y_resampled)


In [15]:
# Get the best model from Random Search
best_rf_model = random_search.best_estimator_

In [16]:
# Predict using the best model on the original test data
y_pred_proba = best_rf_model.predict_proba(X_test)[:, 1]

In [17]:
# Adjust the threshold for classification
threshold = 0.3  # Experiment with this value
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)  # This is the corrected variable name



In [18]:
optimized_accuracy = accuracy_score(y_test, y_pred_adjusted)
optimized_report = classification_report(y_test, y_pred_adjusted)

In [19]:
print(f"Optimized Accuracy: {optimized_accuracy}")
print("Optimized Classification Report:")
print(optimized_report)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

Optimized Accuracy: 0.9731543624161074
Optimized Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3395
           1       0.72      0.77      0.74       181

    accuracy                           0.97      3576
   macro avg       0.85      0.88      0.87      3576
weighted avg       0.97      0.97      0.97      3576

Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'max_depth': None, 'class_weight': 'balanced'}


In [20]:
# Perform k-fold cross-validation
k = 5  # You can change this value
cv_scores = cross_val_score(rf_model, X_resampled, y_resampled, cv=k, scoring='f1_weighted')



In [21]:
# Output the cross-validation results
print(f"Cross-Validation Scores (F1 Weighted): {cv_scores}")
print(f"Mean CV F1 Score: {np.mean(cv_scores)}")

Cross-Validation Scores (F1 Weighted): [0.99798091 0.99798091 0.99761379 0.99706259 0.99779695]
Mean CV F1 Score: 0.9976870314631843
