# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the dataset (fetched over the network each time this cell runs)
url = "https://raw.githubusercontent.com/dsrscientist/Data-Science-ML-Capstone-Projects/master/Automobile_insurance_fraud.csv"
data = pd.read_csv(url)

# Display basic information about the dataset.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()
print(data.head())

# Drop columns that are not used as model features
data = data.drop(columns=['policy_number', 'policy_bind_date', 'incident_date', '_c39'])

# Preprocessing
# Convert categorical variables to integer codes using label encoding
label_encoder = LabelEncoder()
categorical_cols = ['policy_state', 'policy_csl', 'insured_sex', 'insured_education_level', 'insured_occupation',
                    'insured_hobbies', 'insured_relationship', 'incident_type', 'collision_type', 'incident_severity',
                    'authorities_contacted', 'incident_state', 'incident_city', 'property_damage', 'police_report_available',
                    'auto_make', 'auto_model']
# The fixed list above may not name every text column in the CSV. Any
# non-numeric feature left un-encoded would make the estimator's fit() fail
# with "could not convert string to float", so pick up the remainder here.
categorical_cols += [
    col for col in data.columns
    if col != 'fraud_reported'
    and col not in categorical_cols
    and not pd.api.types.is_numeric_dtype(data[col])
]
for col in categorical_cols:
    # Cast to str first so missing values become the ordinary category "nan"
    # rather than mixing float NaN with strings, which LabelEncoder can reject.
    # NOTE: the same encoder object is re-fitted per column, so only the
    # mapping of the last column remains available on `label_encoder`.
    data[col] = label_encoder.fit_transform(data[col].astype(str))

# Convert 'fraud_reported' to binary labels: 1 for 'Y', 0 for anything else
data['fraud_reported'] = (data['fraud_reported'] == 'Y').astype(int)

# Separate the target column (y) from the predictor columns (X)
y = data['fraud_reported']
X = data.drop(columns='fraud_reported')

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a seeded random forest on the training portion
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the true test labels and report the results
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
