In [1]:
# Import Dependencies
import pandas as pd
import scipy.sparse as sp
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np

In [2]:
# Load the TF-IDF matrix from CSV
try:
    tfidf_matrix = pd.read_csv('tfidf_matrix.csv')  # Replace with your actual TF-IDF matrix file name
    print("TF-IDF matrix loaded successfully.")
except FileNotFoundError:
    print("Error: The TF-IDF matrix file was not found.")
    exit()

TF-IDF matrix loaded successfully.


In [3]:
# Check the shape and type of the loaded TF-IDF matrix
print("Shape of TF-IDF matrix:", tfidf_matrix.shape)


Shape of TF-IDF matrix: (10059, 1000)


In [4]:
# Convert the DataFrame to a sparse matrix (if needed)
X_tfidf = sp.csr_matrix(tfidf_matrix.values)

In [5]:
# Load the dataset that contains the target variable
try:
    data = pd.read_csv('updated_job_postings.csv') 
    print("Data loaded successfully.")
except FileNotFoundError:
    print("Error: The data file was not found.")
    exit()

Data loaded successfully.


In [6]:
# Define target variable (y)
y = data['fraudulent']  # Adjust to refer to the correct target column

In [7]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


In [8]:
# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [9]:
# Predict on test data
y_pred = rf_model.predict(X_test)

In [10]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.9676938369781312
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1880
           1       0.99      0.52      0.68       132

    accuracy                           0.97      2012
   macro avg       0.98      0.76      0.83      2012
weighted avg       0.97      0.97      0.96      2012



In [11]:
#Use SMOTE and RandomizedSearchCV

In [12]:
# Use SMOTE for resampling the training data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [13]:
# Define the parameter distribution for RandomizedSearchCV
param_distributions = {
    'n_estimators': [50, 100, 200],       
    'max_depth': [None, 10, 20, 30],      
    'min_samples_split': [2, 5, 10],      
    'class_weight': ['balanced', None]    
}

In [14]:
# Initialize RandomizedSearchCV with a try-except block for troubleshooting
try:
    random_search = RandomizedSearchCV(
        RandomForestClassifier(random_state=42),
        param_distributions=param_distributions,
        n_iter=10,
        cv=3,
        scoring='f1_weighted',
        random_state=42,
        n_jobs=1  # Set to 1 for easier debugging
    )
    random_search.fit(X_resampled, y_resampled)  # Fit with resampled data
    
    # Retrieve the best model
    best_rf_model = random_search.best_estimator_
    print("RandomizedSearchCV completed successfully with best parameters:", random_search.best_params_)

except Exception as e:
    print("An error occurred during RandomizedSearchCV:", e)


RandomizedSearchCV completed successfully with best parameters: {'n_estimators': 100, 'min_samples_split': 5, 'max_depth': None, 'class_weight': 'balanced'}


In [15]:
random_search.fit(X_resampled, y_resampled)


In [16]:
# Predict using the best model on the original test data
y_pred_proba = best_rf_model.predict_proba(X_test)[:, 1]

In [17]:
# Adjust the threshold for classification
threshold = 0.3  # Experiment with this value
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)  # This is the corrected variable name



In [18]:
optimized_accuracy = accuracy_score(y_test, y_pred_adjusted)
optimized_report = classification_report(y_test, y_pred_adjusted)

In [19]:
print(f"Optimized Accuracy: {optimized_accuracy}")
print("Optimized Classification Report:")
print(optimized_report)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

Optimized Accuracy: 0.9468190854870775
Optimized Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      1880
           1       0.57      0.74      0.65       132

    accuracy                           0.95      2012
   macro avg       0.78      0.85      0.81      2012
weighted avg       0.95      0.95      0.95      2012

Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'max_depth': None, 'class_weight': 'balanced'}
