In [12]:
import os
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [13]:
# Folder that holds the two CSV files. It defaults to the original author's
# local directory; set the FRAUD_DATA_DIR environment variable to point the
# notebook at another location without editing this cell.
DATA_DIR = Path(os.environ.get("FRAUD_DATA_DIR", r"C:\Users\user\Desktop\ASU\CIS 508\508 HW 2"))

train_data = pd.read_csv(DATA_DIR / "Insurance Fraud - TRAIN-3000.csv")
test_data = pd.read_csv(DATA_DIR / "Insurance Fraud -TEST-12900.csv")

# Combine train and test data for consistent preprocessing: stacking the two
# frames (train rows first, fresh 0..n-1 index) lets each encoder below see
# every category value that occurs in either file.
data = pd.concat([train_data, test_data], ignore_index=True)

# Encode categorical features: every object-dtype column is replaced in place
# by integer codes, and the fitted encoder is kept per column so the codes can
# be mapped back to the original labels later.
# NOTE(review): the loop covers *all* object columns of `data`, so the target
# column is encoded as well if it is stored as text -- confirm that is intended.
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

In [14]:
# Undo the earlier concatenation: the first len(train_data) rows of the
# combined frame are the training rows, everything after them is the test set.
# Both slices are taken in one statement so they share the same boundary.
train_data, test_data = (
    data.iloc[:len(train_data)],
    data.iloc[len(train_data):],
)

In [15]:
# Split each frame into the model inputs (every column except FRAUDFOUND)
# and the label column FRAUDFOUND itself.
X_train, y_train = train_data.drop('FRAUDFOUND', axis=1), train_data['FRAUDFOUND']
X_test, y_test = test_data.drop('FRAUDFOUND', axis=1), test_data['FRAUDFOUND']

In [16]:
# Search space for the random forest: every combination of the values below
# is a candidate for the grid search.
rf_params = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

In [17]:
# Search space for the decision tree: every combination of the values below
# is a candidate for the grid search.
dt_params = dict(
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    criterion=['gini', 'entropy'],
)

In [18]:
# Hyperparameter tuning for RandomForestClassifier
# Every combination in rf_params is scored with 5-fold cross-validation on the
# training set, and candidates are ranked by mean accuracy.
# NOTE(review): accuracy can favour the majority class on imbalanced targets;
# if FRAUDFOUND is imbalanced in the training data, consider scoring='f1' or
# 'balanced_accuracy' -- confirm against the class distribution.
rf_model = RandomForestClassifier(random_state=42)  # fixed seed keeps reruns reproducible
rf_grid_search = GridSearchCV(
    rf_model,
    rf_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # candidate fits are independent, so run them on all CPU cores
)
rf_grid_search.fit(X_train, y_train)
# best_estimator_ is the top-ranked configuration refit on the whole training
# set (GridSearchCV's default refit behaviour).
best_rf = rf_grid_search.best_estimator_

In [19]:
# Hyperparameter tuning for DecisionTreeClassifier
# Every combination in dt_params is scored with 5-fold cross-validation on the
# training set, and candidates are ranked by mean accuracy.
# NOTE(review): accuracy can favour the majority class on imbalanced targets;
# if FRAUDFOUND is imbalanced in the training data, consider scoring='f1' or
# 'balanced_accuracy' -- confirm against the class distribution.
dt_model = DecisionTreeClassifier(random_state=42)  # fixed seed keeps reruns reproducible
dt_grid_search = GridSearchCV(
    dt_model,
    dt_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # candidate fits are independent, so run them on all CPU cores
)
dt_grid_search.fit(X_train, y_train)
# best_estimator_ is the top-ranked configuration refit on the whole training
# set (GridSearchCV's default refit behaviour).
best_dt = dt_grid_search.best_estimator_

In [20]:
# Score the test features with each tuned model (random forest first, then
# decision tree); the predictions are compared against y_test when reporting.
rf_predictions, dt_predictions = (
    tuned_model.predict(X_test) for tuned_model in (best_rf, best_dt)
)

In [21]:
# Report the tuned random forest: the grid-search winner's parameters, its
# accuracy on the test set, and the per-class precision/recall/F1 table.
print(
    "Random Forest Classifier Results:",
    f"Best Parameters: {rf_grid_search.best_params_!s}",
    f"Accuracy: {accuracy_score(y_test, rf_predictions)!s}",
    classification_report(y_test, rf_predictions),
    sep="\n",
)

Random Forest Classifier Results:
Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 50}
Accuracy: 0.942328533828766
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     12420
           1       0.28      0.32      0.30       498

    accuracy                           0.94     12918
   macro avg       0.63      0.65      0.64     12918
weighted avg       0.95      0.94      0.94     12918



In [22]:
# Report the tuned decision tree: the grid-search winner's parameters, its
# accuracy on the test set, and the per-class precision/recall/F1 table.
print(
    "\nDecision Tree Classifier Results:",
    f"Best Parameters: {dt_grid_search.best_params_!s}",
    f"Accuracy: {accuracy_score(y_test, dt_predictions)!s}",
    classification_report(y_test, dt_predictions),
    sep="\n",
)


Decision Tree Classifier Results:
Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 2}
Accuracy: 0.8923207926923672
              precision    recall  f1-score   support

           0       0.98      0.90      0.94     12420
           1       0.20      0.59      0.30       498

    accuracy                           0.89     12918
   macro avg       0.59      0.75      0.62     12918
weighted avg       0.95      0.89      0.92     12918

