In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings

# GitHub Copilot
# Insurance premium prediction - end-to-end notebook cell
# Assumes "Insurance Premium Prediction Dataset.csv" is available in working directory

In [16]:

warnings.filterwarnings("ignore")

# 1) Load data
df = pd.read_csv(r"C:\Users\Mr. Louis Obadiah\Desktop\OKAN\Machine Learning\The Projects\Predicting Insurance Premiums with Data-Driven Insights for SecureLife Insurance Co\Insurance Premium Prediction Dataset.csv")


# Quick look
print("Rows, cols:", df.shape)
display(df.head())

# 2) Basic cleanup & type corrections
# Parse Policy Start Date and create policy age in years
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')
current_date = pd.to_datetime("today")
df['Policy_Age_Years'] = ((current_date - df['Policy Start Date']).dt.days / 365.25).fillna(0).clip(lower=0)

# Simple text feature: length of feedback
df['Feedback_Len'] = df['Customer Feedback'].fillna("").astype(str).map(len)

# Map binary and ordinal fields
if 'Smoking Status' in df.columns:
    df['Smoking Status'] = df['Smoking Status'].map({'Yes': 1, 'No': 0})
# Ordinal for Exercise Frequency
exercise_order = ['Rarely', 'Monthly', 'Weekly', 'Daily']
if 'Exercise Frequency' in df.columns:
    df['Exercise Frequency'] = pd.Categorical(df['Exercise Frequency'], categories=exercise_order, ordered=True)

# 3) Target and features
target = 'Premium Amount'
y = df[target].copy()
X = df.drop(columns=[target])

# Remove rows where target is NaN
mask = ~y.isna()
X = X.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)

# 4) Identify numeric and categorical features
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
# remove Policy_Age_Years if accidentally in numeric list (we want to keep it though)
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Remove text columns that were transformed / redundant
for col in ['Customer Feedback', 'Policy Start Date']:
    if col in categorical_features:
        categorical_features.remove(col)

# 5) Handle skewed numeric columns by applying log1p transformer where appropriate
skewed = X[numeric_features].skew().abs()
skewed_cols = skewed[skewed > 1].index.tolist()  # threshold for skew
print("Skewed numeric columns to log1p:", skewed_cols)

def log_transform(df_in):
    df_out = df_in.copy()
    for c in skewed_cols:
        if c in df_out.columns:
            df_out[c] = np.log1p(df_out[c].astype(float))
    return df_out

log_transformer = FunctionTransformer(log_transform)

# 6) Preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('log', log_transformer, numeric_features),  # apply log1p then numeric pipeline
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
], remainder='drop')

# Note: The above applies numeric features twice (log and num). To avoid duplication, we'll build a simple custom pipeline:
# Instead, create a combined pipeline that first applies log transform to skewed numeric cols only, then imputes/scales all numeric.
def make_preprocessor(numeric_features, skewed_cols, categorical_features):
    # Custom transformer using ColumnTransformer properly
    transformers = []
    if len(skewed_cols) > 0:
        transformers.append(('skewed_log', FunctionTransformer(
            lambda df_in: df_in.assign(**{c: np.log1p(df_in[c].astype(float)) for c in skewed_cols}), validate=False),
            skewed_cols))
    # For numeric imputing/scaling (apply to all numeric)
    transformers.append(('num', numeric_transformer, numeric_features))
    # Categorical
    transformers.append(('cat', categorical_transformer, categorical_features))
    return ColumnTransformer(transformers=transformers, remainder='drop')

preprocessor = make_preprocessor(numeric_features, skewed_cols, categorical_features)

# 7) Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 8) Build a baseline pipeline with RandomForest and one with GradientBoosting
rf_pipeline = Pipeline(steps=[('pre', preprocessor),
                              ('est', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))])

gbr_pipeline = Pipeline(steps=[('pre', preprocessor),
                               ('est', GradientBoostingRegressor(n_estimators=100, random_state=42))])

# Fit baseline models
print("Training RandomForest...")
rf_pipeline.fit(X_train, y_train)
print("Training GradientBoosting...")
gbr_pipeline.fit(X_train, y_train)

# 9) Evaluation helper
def evaluate_model(pipe, X_tr, X_te, y_tr, y_te, name="Model"):
    y_pred_tr = pipe.predict(X_tr)
    y_pred_te = pipe.predict(X_te)
    metrics = {
        'train_mae': mean_absolute_error(y_tr, y_pred_tr),
        'test_mae': mean_absolute_error(y_te, y_pred_te),
        'test_mse': mean_squared_error(y_te, y_pred_te),
        'test_rmse': mean_squared_error(y_te, y_pred_te, squared=False),
        'test_r2': r2_score(y_te, y_pred_te)
    }
    print(f"\n{name} metrics:")
    for k, v in metrics.items():
        print(f"  {k}: {v:.4f}")
    return metrics

rf_metrics = evaluate_model(rf_pipeline, X_train, X_test, y_train, y_test, "RandomForest")
gbr_metrics = evaluate_model(gbr_pipeline, X_train, X_test, y_train, y_test, "GradientBoosting")

# 10) Feature importance extraction for RandomForest
# To get feature names after preprocessing:
pre = rf_pipeline.named_steps['pre']
# numeric names (as-is)
num_names = numeric_features
# categorical names from onehot
cat_names = []
if categorical_features:
    # find the 'cat' transformer inside ColumnTransformer
    for name, trans, cols in pre.transformers_:
        if name == 'cat':
            onehot = trans.named_steps['onehot']
            # get feature names
            cat_names = list(onehot.get_feature_names_out(cols))
            break

feature_names = num_names + cat_names
# Sometimes skewed_log transformer may reorder; try to fall back on numeric features only if mismatch
try:
    importances = rf_pipeline.named_steps['est'].feature_importances_
    if len(importances) == len(feature_names):
        feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False).head(20)
    else:
        # fallback: show top numeric importances
        feat_imp = pd.Series(importances, index=[f"f_{i}" for i in range(len(importances))]).sort_values(ascending=False).head(20)
except Exception:
    feat_imp = pd.Series(dtype=float)

print("\nTop feature importances (RandomForest):")
display(feat_imp)

# 11) Simple hyperparameter tuning for RandomForest using RandomizedSearchCV
param_dist = {
    'est__n_estimators': [100, 200, 400],
    'est__max_depth': [None, 10, 20, 30],
    'est__min_samples_split': [2, 5, 10],
    'est__min_samples_leaf': [1, 2, 4]
}
rs = RandomizedSearchCV(rf_pipeline, param_distributions=param_dist, n_iter=8, cv=3, scoring='neg_mean_absolute_error',
                        random_state=42, n_jobs=-1, verbose=0)
print("Running RandomizedSearchCV for RandomForest (this may take a while)...")
rs.fit(X_train, y_train)
print("Best params:", rs.best_params_)
best_rf = rs.best_estimator_
best_metrics = evaluate_model(best_rf, X_train, X_test, y_train, y_test, "Tuned RandomForest")

# 12) Quick EDA plots (optional, comment/uncomment as needed)
try:
    plt.figure(figsize=(6,4))
    sns.histplot(y, bins=50, kde=True)
    plt.title("Premium Amount distribution")
    plt.show()

    plt.figure(figsize=(8,6))
    sns.heatmap(df.select_dtypes(include=[np.number]).corr(), cmap='coolwarm', center=0)
    plt.title("Numeric features correlation")
    plt.show()
except Exception:
    pass

# 13) Actionable Insights (printed summary)
print("\nActionable insights:")
print("- Review highly important features above to craft underwriting rules.")
print("- Consider log-transforming highly skewed monetary features (done automatically).")
print("- Investigate outliers in 'Previous Claims' and 'Annual Income' as they can skew premiums.")
print("- Use tuned RandomForest for deployment but validate with a hold-out or time-based split before production.")

Rows, cols: (278860, 20)


Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Premium Amount,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,56.0,Male,99990.0,Married,1.0,Master's,,31.074627,Urban,Comprehensive,,13,320.0,5,308.0,2022-12-10 15:21:39.078837,Poor,Yes,Daily,Condo
1,46.0,Male,2867.0,Single,1.0,Bachelor's,,50.271335,Urban,Comprehensive,,3,694.0,4,517.0,2023-01-31 15:21:39.078837,Good,Yes,Monthly,House
2,32.0,Female,30154.0,Divorced,3.0,Bachelor's,,14.714909,Suburban,Comprehensive,2.0,16,652.0,8,849.0,2023-11-26 15:21:39.078837,Poor,No,Monthly,House
3,60.0,Female,48371.0,Divorced,0.0,PhD,Self-Employed,25.346926,Rural,Comprehensive,1.0,11,330.0,7,927.0,2023-02-27 15:21:39.078837,Poor,No,Rarely,Condo
4,25.0,Female,54174.0,Divorced,0.0,High School,Self-Employed,6.659499,Urban,Comprehensive,,9,,8,303.0,2020-11-25 15:21:39.078837,Poor,No,Rarely,Condo


Skewed numeric columns to log1p: ['Previous Claims']
Training RandomForest...
Training GradientBoosting...


ValueError: Input X contains NaN.
GradientBoostingRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values