## **Step 1: Load Data and Preprocess**

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Fetch the car-insurance dataset from its raw GitHub URL
url = "https://raw.githubusercontent.com/mohd-faizy/CAREER-TRACK-Data-Scientist-with-Python/main/__Projects__/14_Modeling%20Car%20Insurance%20Claim%20Outcomes/car_insurance.csv"
df_ml = pd.read_csv(url)

# Integer code assigned to each raw value, keyed by column name.
# NOTE(review): the 'income' codes put 'middle class' (1) before 'working class' (2);
# the codes are only used as category labels below, so the order is presumably
# irrelevant -- confirm before ever treating them as ordinal.
mappings = {
    'driving_experience': {'0-9y': 0, '10-19y': 1, '20-29y': 2, '30y+': 3},
    'education': {'none': 0, 'high school': 1, 'university': 2},
    'income': {'poverty': 0, 'middle class': 1, 'working class': 2, 'upper class': 3},
    'vehicle_year': {'before 2015': 0, 'after 2015': 1},
    'vehicle_type': {'sedan': 0, 'sports car': 1},
    'postal_code': {10238: 1, 32765: 2, 92101: 3, 21217: 4}
}

# Swap the raw values for their codes (column-wise, via the nested dict)
df_ml = df_ml.replace(mappings)

# Store every discrete feature with the pandas 'category' dtype
cat_cols = ['age', 'gender', 'driving_experience', 'education', 'income', 'vehicle_ownership',
            'vehicle_year', 'married', 'children', 'vehicle_type', 'postal_code']
df_ml = df_ml.astype({col: 'category' for col in cat_cols})

# Fill the gaps in 'credit_score' and 'annual_mileage' with the column mean.
# The imputer is re-fitted for each column, one column at a time.
# NOTE(review): the mean (and the min/max used for scaling below) is computed over
# every row, including rows that are later held out as the test set.
imputer = SimpleImputer(strategy='mean')
for column in ('credit_score', 'annual_mileage'):
    df_ml[column] = imputer.fit_transform(df_ml[[column]]).ravel()

# Rescale the two imputed columns with MinMaxScaler
scaler = MinMaxScaler()
scaled_cols = ['credit_score', 'annual_mileage']
df_ml[scaled_cols] = scaler.fit_transform(df_ml[scaled_cols])

# Discard columns that are not used by the model
columns_to_drop = ['id']
df_ml = df_ml.drop(columns=columns_to_drop)

## **Step 2: Column Transformations**

In [2]:
# Define columns for preprocessing
numerical_features = ['credit_score', 'annual_mileage', 'speeding_violations', 'duis', 'past_accidents']
categorical_features = ['age', 'gender', 'driving_experience', 'education', 'income', 'vehicle_ownership',
                        'vehicle_year', 'married', 'children', 'vehicle_type', 'postal_code']

# Preprocessing for numerical data: standardise each numeric column
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: one-hot encode, dropping the first level of
# each feature. handle_unknown='ignore' keeps transform() from raising when a
# level occurs only in rows the encoder was not fitted on; such a value is
# encoded as all zeros for that feature.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

# Combine numerical and categorical transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit the transformers on the training rows only, so that scaling statistics and
# category levels are not learned from rows later used for evaluation.
# These arguments must stay identical to the train_test_split call in the
# "Split Data" step: with the same number of rows, test_size and random_state,
# both calls select the same row positions.
fit_rows, _ = train_test_split(df_ml, test_size=0.2, random_state=42)
preprocessor.fit(fit_rows)

# Apply the fitted transformations to every row (row order is unchanged)
X_preprocessed = preprocessor.transform(df_ml)

# Capture transformed column names for categorical features
categorical_columns = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features)

# Combine numerical and transformed categorical column names
# (same order as the transformers are listed in the ColumnTransformer)
all_column_names = numerical_features + list(categorical_columns)

# Convert NumPy array back to a DataFrame with correct column names
X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=all_column_names)

## **Step 3: Split Data**

In [3]:
# Features (X) are the preprocessed columns; the target (y) is the 'outcome' column
X = X_preprocessed_df
y = df_ml.loc[:, 'outcome']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## **Step 4: Define and Train Individual Models**

In [4]:
# One estimator per model family, seeded where the estimator accepts a seed
lr = LogisticRegression(random_state=42)
rf = RandomForestClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)
svm = SVC(probability=True, random_state=42)  # Set probability=True for soft voting
knn = KNeighborsClassifier()

# (report header, estimator) pairs, listed in the order they are trained and reported
candidates = [
    ("Logistic Regression Accuracy:", lr),
    ("Random Forest Accuracy:", rf),
    ("Gradient Boosting Accuracy:", gb),
    ("SVM Accuracy:", svm),
    ("KNN Accuracy:", knn),
]

# Fit every model on the training split
for _, model in candidates:
    model.fit(X_train, y_train)

# Predict the test split with every fitted model (same order as `candidates`)
test_predictions = [model.predict(X_test) for _, model in candidates]
y_pred_lr, y_pred_rf, y_pred_gb, y_pred_svm, y_pred_knn = test_predictions

# Report accuracy followed by the per-class metrics for each model
for (header, _), y_pred in zip(candidates, test_predictions):
    print(header, accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print()

Logistic Regression Accuracy: 0.8485
              precision    recall  f1-score   support

         0.0       0.87      0.92      0.89      1367
         1.0       0.79      0.70      0.75       633

    accuracy                           0.85      2000
   macro avg       0.83      0.81      0.82      2000
weighted avg       0.85      0.85      0.85      2000

Random Forest Accuracy: 0.8285
              precision    recall  f1-score   support

         0.0       0.85      0.91      0.88      1367
         1.0       0.77      0.66      0.71       633

    accuracy                           0.83      2000
   macro avg       0.81      0.78      0.79      2000
weighted avg       0.82      0.83      0.82      2000

Gradient Boosting Accuracy: 0.8465
              precision    recall  f1-score   support

         0.0       0.87      0.91      0.89      1367
         1.0       0.79      0.70      0.74       633

    accuracy                           0.85      2000
   macro avg       0.83  

## **Step 5: Ensemble Voting Classifier**

In [5]:
from sklearn.ensemble import VotingClassifier

# Named base learners that take part in the vote
ensemble_members = [
    ('lr', lr),
    ('rf', rf),
    ('gb', gb),
    ('svm', svm),
    ('knn', knn),
]

# Majority-vote ensemble over the predicted class labels
# (Use 'soft' for probability-based voting)
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')

# Fit the ensemble on the training split, then predict the test split
y_pred_voting = voting_clf.fit(X_train, y_train).predict(X_test)

# Report accuracy followed by the per-class metrics
voting_accuracy = accuracy_score(y_test, y_pred_voting)
print("Voting Classifier Accuracy:", voting_accuracy)
print(classification_report(y_test, y_pred_voting))

Voting Classifier Accuracy: 0.848
              precision    recall  f1-score   support

         0.0       0.87      0.91      0.89      1367
         1.0       0.79      0.71      0.75       633

    accuracy                           0.85      2000
   macro avg       0.83      0.81      0.82      2000
weighted avg       0.85      0.85      0.85      2000



## **Step 6: Hyperparameter Tuning**

Once the best-performing model is identified, we can further improve its performance by tuning its hyperparameters using `GridSearchCV` from `sklearn.model_selection`. Grid search is a powerful method for systematically testing a range of hyperparameters to find the optimal set that maximizes the model's performance.

**Step-by-Step Guide for Hyperparameter Tuning with GridSearchCV**

- `Identify the Best Model`: Suppose the best model from your previous evaluations is GradientBoostingClassifier.

- `Define Hyperparameter Grid`: Create a dictionary specifying the hyperparameters and the range of values you want to test.

- `Set Up Grid Search`: Use GridSearchCV to perform an exhaustive search over the specified parameter values.

- `Fit and Evaluate`: Fit the grid search on the training data, then evaluate the best estimator it finds on the held-out test data.

### **Define Hyperparameter Grid**




In [6]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Candidate values for each gradient-boosting hyperparameter;
# the grid search tries every combination of these lists.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    subsample=[0.8, 1.0],
    min_samples_split=[2, 5, 10],
)


### **Set Up Grid Search**

In [7]:
# Fresh, unfitted gradient-boosting model to be tuned (rebinds the name `gb`)
gb = GradientBoostingClassifier(random_state=42)

# Exhaustive search over param_grid: 5-fold cross-validation, scored by accuracy,
# with n_jobs=-1 to run on all available cores
grid_search = GridSearchCV(
    gb,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

### **Fit and Evaluate**

In [8]:
# Run the cross-validated search on the training split
grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination
best_params = grid_search.best_params_
print("Best hyperparameters found: ", best_params)

# Take the best estimator from the search and predict the held-out test split
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Report accuracy followed by the per-class metrics
best_accuracy = accuracy_score(y_test, y_pred_best)
print("Best Model Accuracy:", best_accuracy)
print(classification_report(y_test, y_pred_best))

Best hyperparameters found:  {'learning_rate': 0.1, 'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 0.8}
Best Model Accuracy: 0.8475
              precision    recall  f1-score   support

         0.0       0.87      0.91      0.89      1367
         1.0       0.78      0.72      0.75       633

    accuracy                           0.85      2000
   macro avg       0.83      0.81      0.82      2000
weighted avg       0.85      0.85      0.85      2000



```
Best hyperparameters found:  {'learning_rate': 0.1, 'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 0.8}
Best Model Accuracy: 0.8475
              precision    recall  f1-score   support

         0.0       0.87      0.91      0.89      1367
         1.0       0.78      0.72      0.75       633

    accuracy                           0.85      2000
   macro avg       0.83      0.81      0.82      2000
weighted avg       0.85      0.85      0.85      2000
```

### **Example with Logistic Regression**

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Candidate values for logistic regression: regularisation strength C and
# penalty type, all with the 'liblinear' solver
param_grid_lr = dict(
    C=[0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

# Fresh, unfitted logistic-regression model to be tuned (rebinds the name `lr`)
lr = LogisticRegression(random_state=42)

# Exhaustive search over param_grid_lr: 5-fold cross-validation, scored by accuracy
grid_search_lr = GridSearchCV(
    lr,
    param_grid_lr,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split
grid_search_lr.fit(X_train, y_train)

# Report the winning hyperparameter combination
best_params_lr = grid_search_lr.best_params_
print("Best hyperparameters found for Logistic Regression: ", best_params_lr)

# Take the best estimator from the search and predict the held-out test split
best_model_lr = grid_search_lr.best_estimator_
y_pred_best_lr = best_model_lr.predict(X_test)

# Report accuracy followed by the per-class metrics
best_accuracy_lr = accuracy_score(y_test, y_pred_best_lr)
print("Best Logistic Regression Model Accuracy:", best_accuracy_lr)
print(classification_report(y_test, y_pred_best_lr))


Best hyperparameters found for Logistic Regression:  {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best Logistic Regression Model Accuracy: 0.848
              precision    recall  f1-score   support

         0.0       0.87      0.91      0.89      1367
         1.0       0.79      0.70      0.75       633

    accuracy                           0.85      2000
   macro avg       0.83      0.81      0.82      2000
weighted avg       0.85      0.85      0.85      2000



```
Best hyperparameters found for Logistic Regression:  {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best Logistic Regression Model Accuracy: 0.848
              precision    recall  f1-score   support

         0.0       0.87      0.91      0.89      1367
         1.0       0.79      0.70      0.75       633

    accuracy                           0.85      2000
   macro avg       0.83      0.81      0.82      2000
weighted avg       0.85      0.85      0.85      2000
```