In [2]:
!pip install scikit-optimize



In [19]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV

In [20]:
# Folder that holds the assignment CSV files. It is an absolute, machine-specific
# path, so it is kept in one place: edit DATA_DIR to run the notebook elsewhere.
DATA_DIR = Path(r"C:\Users\user\Desktop\ASU\CIS 508\508 HW 2")

# Training split (used for fitting and cross-validated tuning) and the separate
# test split (used only for the final metrics).
train_data = pd.read_csv(DATA_DIR / "Insurance Fraud - TRAIN-3000.csv")
test_data = pd.read_csv(DATA_DIR / "Insurance Fraud -TEST-12900.csv")

In [21]:
# Encode the FRAUDFOUND target as 1 (fraud) / 0 (no fraud).
# The mapping also accepts already-encoded 1/0 values, so re-running this cell
# is harmless; with a plain {'Yes': 1, 'No': 0} map a second run would turn
# every label into NaN.
fraud_label_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}

for split_name, frame in (('train', train_data), ('test', test_data)):
    encoded = frame['FRAUDFOUND'].map(fraud_label_codes)
    # Series.map yields NaN for any value missing from the mapping; fail here
    # with a clear message instead of passing NaN labels on to the models.
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = frame.loc[unmapped, 'FRAUDFOUND'].unique().tolist()
        raise ValueError(
            f"Unexpected FRAUDFOUND values in {split_name} data: {unexpected}"
        )
    frame['FRAUDFOUND'] = encoded

In [22]:
# Split each data set into the FRAUDFOUND target (y) and the remaining
# predictor columns (X).
target_column = 'FRAUDFOUND'

y_train = train_data[target_column]
y_test = test_data[target_column]

X_train = train_data.drop(columns=[target_column])
X_test = test_data.drop(columns=[target_column])

In [23]:
# One-hot encode each split independently, then force the test matrix onto the
# training columns: join='left' keeps exactly the training columns, and columns
# absent from the test split are created and filled with 0.
train_encoded = pd.get_dummies(X_train)
test_encoded = pd.get_dummies(X_test)
X_train, X_test = train_encoded.align(
    test_encoded,
    join='left',
    axis=1,
    fill_value=0,
)

In [24]:
# One random seed per tuning method, looked up by position in the search cells
random_seeds = [
    42,  # index 0: Random Search cells
    21,  # index 1: Grid Search cells (seeds the estimator only)
    84,  # index 2: Bayesian Search cells
]

In [25]:
# RandomForest parameters: one search space per tuning method

# Random search: n_estimators is drawn from a scipy integer distribution,
# the other two parameters are sampled from explicit candidate lists.
rf_param_space_random = dict(
    n_estimators=randint(50, 300),
    max_depth=[10, 20, 30, 40, 50, None],
    min_samples_split=[2, 5, 10],
)

# Grid search: every combination of these values is tried (5 * 4 * 3 = 60).
rf_param_grid = dict(
    n_estimators=[50, 100, 150, 200, 250],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
)

# Bayesian search: (low, high) integer ranges. Unlike the two spaces above,
# max_depth has no None (unlimited depth) option here.
rf_param_space_bayes = dict(
    n_estimators=(50, 300),
    max_depth=(10, 50),
    min_samples_split=(2, 10),
)

In [26]:
# DecisionTree parameters: one search space per tuning method

# Random search: all three parameters are sampled from explicit candidate lists.
dt_param_space_random = dict(
    max_depth=[10, 20, 30, 40, 50, None],
    min_samples_split=[2, 5, 10],
    criterion=['gini', 'entropy'],
)

# Grid search: every combination of these values is tried (4 * 3 * 2 = 24).
dt_param_grid = dict(
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    criterion=['gini', 'entropy'],
)

# Bayesian search: (low, high) integer ranges plus a categorical criterion.
# Unlike the two spaces above, max_depth has no None (unlimited depth) option.
dt_param_space_bayes = dict(
    max_depth=(10, 50),
    min_samples_split=(2, 10),
    criterion=['gini', 'entropy'],
)

In [27]:
# Column-oriented accumulator for the comparison table: each key is a column
# name and each tuning cell appends one value per column.
# NOTE: re-running a tuning cell appends a duplicate row; re-run this cell
# first to start from an empty table.
results = {
    column: []
    for column in ('Model', 'Tuning Method', 'Accuracy', 'Precision', 'Recall', 'F1 Score')
}

In [28]:
# Helper that scores a fitted model on a labelled evaluation set
def evaluate_model(model, X_test, y_test):
    """Predict on ``X_test`` and score the predictions against ``y_test``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test : array-like
        Feature matrix passed to ``model.predict``.
    y_test : array-like
        True labels that the predictions are compared with.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``, each computed by the matching
        scikit-learn metric function called with its default arguments.
    """
    predictions = model.predict(X_test)
    metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(score(y_test, predictions) for score in metric_functions)

In [29]:
# Random Forest - Random Search
print("Random Forest - Random Search")

# 10 sampled parameter settings, each scored by 5-fold CV accuracy on the
# training data; the same seed drives both the forest and the sampler.
rf_random = RandomForestClassifier(random_state=random_seeds[0])
random_search_rf = RandomizedSearchCV(
    estimator=rf_random,
    param_distributions=rf_param_space_random,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=random_seeds[0],
)
random_search_rf.fit(X_train, y_train)

# Score the best estimator found by the search on the test set and record one row.
accuracy, precision, recall, f1 = evaluate_model(random_search_rf.best_estimator_, X_test, y_test)
result_row = {
    'Model': 'Random Forest',
    'Tuning Method': 'Random Search',
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
}
for column, value in result_row.items():
    results[column].append(value)


Random Forest - Random Search


In [31]:
# Decision Tree - Random Search
print("Decision Tree - Random Search")

# 10 sampled parameter settings, each scored by 5-fold CV accuracy on the
# training data; the same seed drives both the tree and the sampler.
dt_random = DecisionTreeClassifier(random_state=random_seeds[0])
random_search_dt = RandomizedSearchCV(
    estimator=dt_random,
    param_distributions=dt_param_space_random,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=random_seeds[0],
)
random_search_dt.fit(X_train, y_train)

# Score the best estimator found by the search on the test set and record one row.
accuracy, precision, recall, f1 = evaluate_model(random_search_dt.best_estimator_, X_test, y_test)
result_row = {
    'Model': 'Decision Tree',
    'Tuning Method': 'Random Search',
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
}
for column, value in result_row.items():
    results[column].append(value)

Decision Tree - Random Search


In [32]:
# Random Forest - Grid Search
print("Random Forest - Grid Search")

# Exhaustive search over rf_param_grid: 60 parameter combinations, each scored
# by 5-fold CV accuracy on the training data (300 forest fits in total).
# n_jobs=-1 spreads those independent fits over all CPU cores. The forest's
# random_state is fixed, so the selected model is unchanged by the parallelism.
rf_grid = RandomForestClassifier(random_state=random_seeds[1])
grid_search_rf = GridSearchCV(
    estimator=rf_grid,
    param_grid=rf_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)

# Score the best estimator found by the search on the test set and record one row.
accuracy, precision, recall, f1 = evaluate_model(grid_search_rf.best_estimator_, X_test, y_test)
result_row = {
    'Model': 'Random Forest',
    'Tuning Method': 'Grid Search',
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
}
for column, value in result_row.items():
    results[column].append(value)

Random Forest - Grid Search


In [34]:
# Decision Tree - Grid Search
print("Decision Tree - Grid Search")

# Exhaustive search over dt_param_grid, each combination scored by 5-fold CV
# accuracy on the training data.
dt_grid = DecisionTreeClassifier(random_state=random_seeds[1])
grid_search_dt = GridSearchCV(
    estimator=dt_grid,
    param_grid=dt_param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search_dt.fit(X_train, y_train)

# Score the best estimator found by the search on the test set and record one row.
accuracy, precision, recall, f1 = evaluate_model(grid_search_dt.best_estimator_, X_test, y_test)
result_row = {
    'Model': 'Decision Tree',
    'Tuning Method': 'Grid Search',
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
}
for column, value in result_row.items():
    results[column].append(value)

Decision Tree - Grid Search


In [35]:
# Random Forest - Bayesian Search
print("Random Forest - Bayesian Search")

# 10 search iterations over rf_param_space_bayes, each scored by 5-fold CV
# accuracy on the training data; the same seed drives the forest and the search.
rf_bayes = RandomForestClassifier(random_state=random_seeds[2])
bayes_search_rf = BayesSearchCV(
    estimator=rf_bayes,
    search_spaces=rf_param_space_bayes,
    cv=5,
    scoring='accuracy',
    n_iter=10,
    random_state=random_seeds[2],
)
bayes_search_rf.fit(X_train, y_train)

# Score the best estimator found by the search on the test set and record one row.
accuracy, precision, recall, f1 = evaluate_model(bayes_search_rf.best_estimator_, X_test, y_test)
result_row = {
    'Model': 'Random Forest',
    'Tuning Method': 'Bayesian Search',
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
}
for column, value in result_row.items():
    results[column].append(value)

Random Forest - Bayesian Search


In [36]:
# Decision Tree - Bayesian Search
print("Decision Tree - Bayesian Search")

# 10 search iterations over dt_param_space_bayes, each scored by 5-fold CV
# accuracy on the training data; the same seed drives the tree and the search.
dt_bayes = DecisionTreeClassifier(random_state=random_seeds[2])
bayes_search_dt = BayesSearchCV(
    estimator=dt_bayes,
    search_spaces=dt_param_space_bayes,
    cv=5,
    scoring='accuracy',
    n_iter=10,
    random_state=random_seeds[2],
)
bayes_search_dt.fit(X_train, y_train)

# Score the best estimator found by the search on the test set and record one row.
accuracy, precision, recall, f1 = evaluate_model(bayes_search_dt.best_estimator_, X_test, y_test)
result_row = {
    'Model': 'Decision Tree',
    'Tuning Method': 'Bayesian Search',
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
}
for column, value in result_row.items():
    results[column].append(value)

Decision Tree - Bayesian Search


In [37]:
# Build the comparison table (one row per model / tuning-method pair), show it,
# and save a copy as an Excel workbook in the current working directory.
results_df = pd.DataFrame.from_dict(results)
print(results_df)

output_file = "model_comparison_results.xlsx"
results_df.to_excel(output_file, index=False)

           Model    Tuning Method  Accuracy  Precision    Recall  F1 Score
0  Random Forest    Random Search  0.951076   0.387584  0.463855  0.422303
1  Decision Tree    Random Search  0.891856   0.217828  0.696787  0.331899
2  Random Forest      Grid Search  0.963152   0.513580  0.835341  0.636086
3  Decision Tree      Grid Search  0.920576   0.199317  0.351406  0.254360
4  Random Forest  Bayesian Search  0.952469   0.406752  0.508032  0.451786
5  Decision Tree  Bayesian Search  0.888450   0.227299  0.789157  0.352941


In [38]:
# For each metric, pick the results_df row holding that metric's highest value
# and print it. Each best_* variable is the full row (a pandas Series), so the
# output shows the model, the tuning method and all four scores.
best_by_metric = {
    metric: results_df.loc[results_df[metric].idxmax()]
    for metric in ('Accuracy', 'Precision', 'Recall', 'F1 Score')
}
best_accuracy = best_by_metric['Accuracy']
best_precision = best_by_metric['Precision']
best_recall = best_by_metric['Recall']
best_f1_score = best_by_metric['F1 Score']

print("\nBest model results (based on different metrics):")
for label, best_row in (
    ("Best Accuracy:", best_accuracy),
    ("Best Precision:", best_precision),
    ("Best Recall:", best_recall),
    ("Best F1 Score:", best_f1_score),
):
    print(label, best_row)


Best model results (based on different metrics):
Best Accuracy: Model            Random Forest
Tuning Method      Grid Search
Accuracy              0.963152
Precision              0.51358
Recall                0.835341
F1 Score              0.636086
Name: 2, dtype: object
Best Precision: Model            Random Forest
Tuning Method      Grid Search
Accuracy              0.963152
Precision              0.51358
Recall                0.835341
F1 Score              0.636086
Name: 2, dtype: object
Best Recall: Model            Random Forest
Tuning Method      Grid Search
Accuracy              0.963152
Precision              0.51358
Recall                0.835341
F1 Score              0.636086
Name: 2, dtype: object
Best F1 Score: Model            Random Forest
Tuning Method      Grid Search
Accuracy              0.963152
Precision              0.51358
Recall                0.835341
F1 Score              0.636086
Name: 2, dtype: object
