In [9]:
# Raw GitHub location of the car-insurance CSV that this notebook reads.
# The literal is split for readability; adjacent string literals are joined
# by the parser, so the resulting value is a single unbroken URL.
url = (
    "https://raw.githubusercontent.com/mohd-faizy/"
    "CAREER-TRACK-Data-Scientist-with-Python/main/__Projects__/"
    "14_Modeling%20Car%20Insurance%20Claim%20Outcomes/car_insurance.csv"
)

In [10]:
import pandas as pd

# Download the CSV referenced by `url` into a DataFrame, then preview the
# first five rows (five is also the default for head()).
df_ml = pd.read_csv(filepath_or_buffer=url)
print(df_ml.head(n=5))


       id  age  gender driving_experience    education         income  \
0  569520    3       0               0-9y  high school    upper class   
1  750365    0       1               0-9y         none        poverty   
2  199901    0       0               0-9y  high school  working class   
3  478866    0       1               0-9y   university  working class   
4  731664    1       1             10-19y         none  working class   

   credit_score  vehicle_ownership vehicle_year  married  children  \
0      0.629027                1.0   after 2015      0.0       1.0   
1      0.357757                0.0  before 2015      0.0       0.0   
2      0.493146                1.0  before 2015      0.0       0.0   
3      0.206013                1.0  before 2015      0.0       1.0   
4      0.388366                1.0  before 2015      0.0       0.0   

   postal_code  annual_mileage vehicle_type  speeding_violations  duis  \
0        10238         12000.0        sedan                    0  

In [11]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Integer codes for the string-valued columns and for postal_code.
# NOTE(review): every mapped column is cast to `category` below, so the codes
# act as labels. `income` codes 'middle class' (1) below 'working class' (2);
# confirm the intended order before relying on these codes as ordinal values.
mappings = {
    'driving_experience': {'0-9y': 0, '10-19y': 1, '20-29y': 2, '30y+': 3},
    'education': {'none': 0, 'high school': 1, 'university': 2},
    'income': {'poverty': 0, 'middle class': 1, 'working class': 2, 'upper class': 3},
    'vehicle_year': {'before 2015': 0, 'after 2015': 1},
    'vehicle_type': {'sedan': 0, 'sports car': 1},
    'postal_code': {10238: 1, 32765: 2, 92101: 3, 21217: 4}
}

# Apply the mappings and rebind the name instead of using inplace=True, so the
# DataFrame object produced by the loading cell is not mutated behind its back.
df_ml = df_ml.replace(mappings)

# Cast the coded / binary columns to the categorical dtype in a single call.
cat_cols = ['age', 'gender', 'driving_experience', 'education', 'income', 'vehicle_ownership',
            'vehicle_year', 'married', 'children', 'vehicle_type', 'postal_code']
df_ml = df_ml.astype({col: 'category' for col in cat_cols})

# Mean-impute missing values in 'credit_score' and 'annual_mileage'.
# SimpleImputer learns one mean per column, so a single fit over both columns
# gives the same result as fitting the imputer separately on each of them.
# NOTE(review): these means (and the min/max used below) are computed from
# every row of df_ml, i.e. before any train/test split has been made.
num_cols = ['credit_score', 'annual_mileage']
imputer = SimpleImputer(strategy='mean')
df_ml[num_cols] = imputer.fit_transform(df_ml[num_cols])

# Rescale the two continuous columns to the [0, 1] range.
scaler = MinMaxScaler()
df_ml[num_cols] = scaler.fit_transform(df_ml[num_cols])

# Drop columns not needed for the model. errors='ignore' keeps this step from
# raising KeyError when the cell is executed again after 'id' is already gone.
columns_to_drop = ['id']
df_ml = df_ml.drop(columns=columns_to_drop, errors='ignore')
print(df_ml.head())


  age gender driving_experience education income  credit_score  \
0   3      0                  0         1      3      0.634374   
1   0      1                  0         0      0      0.335441   
2   0      0                  0         1      2      0.484636   
3   0      1                  0         2      2      0.168222   
4   1      1                  1         0      2      0.369171   

  vehicle_ownership vehicle_year married children postal_code  annual_mileage  \
0               1.0            1     0.0      1.0           1            0.50   
1               0.0            0     0.0      0.0           1            0.70   
2               1.0            0     0.0      0.0           1            0.45   
3               1.0            0     0.0      1.0           2            0.45   
4               1.0            0     0.0      0.0           2            0.50   

  vehicle_type  speeding_violations  duis  past_accidents  outcome  
0            0                    0     0      

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Column groups routed to each transformer. Columns of df_ml that appear in
# neither list (such as the 'outcome' target) are left out of the result,
# because ColumnTransformer's default is remainder='drop'.
numerical_features = ['credit_score', 'annual_mileage', 'speeding_violations', 'duis', 'past_accidents']
categorical_features = ['age', 'gender', 'driving_experience', 'education', 'income', 'vehicle_ownership',
                        'vehicle_year', 'married', 'children', 'vehicle_type', 'postal_code']

# Numerical columns: standardise to zero mean / unit variance.
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical columns: one-hot encode as a dense array, dropping the first
# level of each feature so the dummy columns are not redundant.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first', sparse_output=False))
])

# Combine numerical and categorical transformers. Output columns follow the
# order of this list: numerical block first, then the one-hot block.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit on df_ml and transform it in one step (returns a NumPy array).
X_preprocessed = preprocessor.fit_transform(df_ml)

# Names of the one-hot columns generated by the fitted encoder.
categorical_columns = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features)

# Full header, in the same order as the ColumnTransformer output.
all_column_names = numerical_features + list(categorical_columns)

# Rebuild a DataFrame from the array. Reusing df_ml's index keeps each feature
# row labelled like the df_ml row it came from, so the features remain aligned
# with columns taken directly from df_ml even if its index is not 0..n-1.
X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=all_column_names, index=df_ml.index)
print(X_preprocessed_df.head(20))
print(X_preprocessed_df.shape)


    credit_score  annual_mileage  speeding_violations      duis  \
0       0.865914    1.130571e-01            -0.661462 -0.431020   
1      -1.208879    1.605576e+00            -0.661462 -0.431020   
2      -0.173367   -2.600726e-01            -0.661462 -0.431020   
3      -2.369485   -2.600726e-01            -0.661462 -0.431020   
4      -0.974770    1.130571e-01             0.230657 -0.431020   
5       0.790195    4.861868e-01             0.676717 -0.431020   
6      -0.174914    4.861868e-01             2.460955 -0.431020   
7      -0.360421    8.593165e-01            -0.661462 -0.431020   
8       0.045907    4.861868e-01            -0.661462 -0.431020   
9       0.349673   -2.600726e-01            -0.661462 -0.431020   
10      0.799632   -6.332023e-01             2.014895  3.172827   
11      1.636903    1.130571e-01             1.122776 -0.431020   
12      0.927235   -1.379462e+00             1.122776  1.370903   
13      0.577053    4.142572e-16            -0.661462 -0.43102

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Features are the preprocessed matrix; the label is df_ml's 'outcome' column.
X, y = X_preprocessed_df, df_ml['outcome']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest with default hyper-parameters.
# fit() returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred_rf = rf_model.predict(X_test)

# Report overall accuracy, then the per-class precision / recall / F1 table.
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.8285
              precision    recall  f1-score   support

         0.0       0.85      0.91      0.88      1367
         1.0       0.77      0.66      0.71       633

    accuracy                           0.83      2000
   macro avg       0.81      0.78      0.79      2000
weighted avg       0.82      0.83      0.82      2000



In [6]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters for the random forest
# (3 * 4 * 3 * 3 = 108 combinations, each evaluated with 5-fold CV).
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search scored by accuracy, using all available cores.
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
)

# Run the search on the training split only.
rf_grid_search.fit(X_train, y_train)

# Show the winning combination.
print("Best parameters for Random Forest: ", rf_grid_search.best_params_)

# Take the estimator refitted with the best parameters and score the held-out rows.
best_rf_model = rf_grid_search.best_estimator_
y_pred_rf_best = best_rf_model.predict(X_test)

# Report accuracy and the per-class breakdown for the tuned model.
print("Tuned Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf_best))
print(classification_report(y_test, y_pred_rf_best))


Best parameters for Random Forest:  {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Tuned Random Forest Accuracy: 0.848
              precision    recall  f1-score   support

         0.0       0.88      0.91      0.89      1367
         1.0       0.78      0.72      0.75       633

    accuracy                           0.85      2000
   macro avg       0.83      0.81      0.82      2000
weighted avg       0.85      0.85      0.85      2000



In [7]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient-boosting classifier with default hyper-parameters.
# fit() returns the estimator itself, so construction and training can be chained.
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred_gb = gb_model.predict(X_test)

# Report overall accuracy, then the per-class precision / recall / F1 table.
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))
print(classification_report(y_test, y_pred_gb))

Gradient Boosting Accuracy: 0.8465
              precision    recall  f1-score   support

         0.0       0.87      0.91      0.89      1367
         1.0       0.79      0.70      0.74       633

    accuracy                           0.85      2000
   macro avg       0.83      0.81      0.82      2000
weighted avg       0.84      0.85      0.84      2000



In [8]:
# Candidate hyper-parameters for gradient boosting.
# 3 * 3 * 3 * 2 * 3 * 3 = 486 combinations, each evaluated with 5-fold CV,
# so this cell is by far the most expensive one in the notebook.
gb_param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search scored by accuracy, using all available cores.
gb_grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=gb_param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
)

# Run the search on the training split only.
gb_grid_search.fit(X_train, y_train)

# Show the winning combination.
print("Best parameters for Gradient Boosting: ", gb_grid_search.best_params_)

# Take the estimator refitted with the best parameters and score the held-out rows.
best_gb_model = gb_grid_search.best_estimator_
y_pred_gb_best = best_gb_model.predict(X_test)

# Report accuracy and the per-class breakdown for the tuned model.
print("Tuned Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb_best))
print(classification_report(y_test, y_pred_gb_best))

Best parameters for Gradient Boosting:  {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200, 'subsample': 1.0}
Tuned Gradient Boosting Accuracy: 0.845
              precision    recall  f1-score   support

         0.0       0.87      0.91      0.89      1367
         1.0       0.78      0.70      0.74       633

    accuracy                           0.84      2000
   macro avg       0.83      0.81      0.82      2000
weighted avg       0.84      0.84      0.84      2000



```
Best parameters for Gradient Boosting:  {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200, 'subsample': 1.0}
Tuned Gradient Boosting Accuracy: 0.845
              precision    recall  f1-score   support

         0.0       0.87      0.91      0.89      1367
         1.0       0.78      0.70      0.74       633

    accuracy                           0.84      2000
   macro avg       0.83      0.81      0.82      2000
weighted avg       0.84      0.84      0.84      2000

```