In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, FunctionTransformer

warnings.filterwarnings("ignore")


In [2]:

# Location of the training CSV. Set the INSURANCE_DATA_PATH environment
# variable to point at your own copy; when it is unset the original author's
# local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "INSURANCE_DATA_PATH",
    r"C:\Users\Mr. Louis Obadiah\Desktop\OKAN\Machine Learning\Insurance Premium Prediction Dataset.csv",
)

df = pd.read_csv(DATA_PATH)

# Quick sanity check of what was loaded: dimensions and the first few rows.
print("Rows & Columns:", df.shape)
print(df.head())

# ============================
# 2. FEATURE ENGINEERING
# ============================

# Parse the start dates; values that cannot be parsed become NaT instead of
# raising.
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')

# Measure policy age against the most recent start date present in the data
# rather than the wall clock. Using "today" made every run produce different
# feature values; anchoring to the data keeps the feature reproducible.
current_date = df['Policy Start Date'].max()

# Age in years. Rows with no usable start date get 0, and negative ages are
# floored at 0.
df['Policy_Age_Years'] = (
    (current_date - df['Policy Start Date']).dt.days / 365.25
).fillna(0).clip(lower=0)

# Safe feedback length: number of characters in the feedback text, with
# missing feedback counted as length 0. Falls back to a constant 0 when the
# column does not exist.
if 'Customer Feedback' in df.columns:
    df['Feedback_Len'] = df['Customer Feedback'].fillna("").astype(str).str.len()
else:
    df['Feedback_Len'] = 0

# Safe smoking encoding: yes -> 1, no -> 0, ignoring case and surrounding
# whitespace. Anything else (including missing values) becomes NaN.
if 'Smoking Status' in df.columns:
    df['Smoking Status'] = (
        df['Smoking Status']
        .astype(str)
        .str.strip()
        .str.lower()
        .map({'yes': 1, 'no': 0})
    )

# ============================
# 3. TARGET & FEATURES
# ============================

target = 'Premium Amount'

# Rows without a premium cannot be used for supervised learning, so keep only
# the labelled rows in both the feature frame and the target series.
mask = df[target].notna()
y = df.loc[mask, target]
X = df.loc[mask].drop(columns=[target])

# ============================
# 4. FEATURE TYPES
# ============================

# Columns that must never be treated as categorical inputs, whatever dtype
# they happen to have.
excluded_from_categorical = ('Customer Feedback', 'Policy Start Date')

# Numeric columns are selected by dtype.
numeric_features = list(X.select_dtypes(include=[np.number]).columns)

# Categorical columns are the object/category dtype columns minus the
# exclusions above.
categorical_features = [
    column
    for column in X.select_dtypes(include=['object', 'category']).columns
    if column not in excluded_from_categorical
]

# ============================
# 5. PREPROCESSING
# ============================

# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps prediction from failing on categories
# that were not seen during fitting.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list to its transformer. Columns in neither list are not
# passed through (ColumnTransformer's default remainder is 'drop').
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_routes)

# ============================
# 6. TRAIN / TEST SPLIT
# ============================

# Hold out 15% of the labelled rows for evaluation. The fixed random_state
# makes the split repeatable from run to run.
split_parts = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# ============================
# 7. FAST RANDOM FOREST
# ============================

# 120 trees with depth capped at 12, a fixed seed for repeatability, and
# n_jobs=-1 to fit trees on all available cores.
forest = RandomForestRegressor(
    n_estimators=120, max_depth=12, random_state=42, n_jobs=-1
)

# Preprocessing and estimator live in one pipeline so the imputers, scaler and
# encoder are fitted on the training rows only.
rf_model = Pipeline([
    ('pre', preprocessor),
    ('model', forest),
])

# ============================
# 8. TRAIN
# ============================

print("\nTraining Fast Random Forest Model...")
rf_model.fit(X_train, y_train)
print(" Training Completed")

# ============================
# 9. EVALUATION
# ============================

# Score the fitted pipeline on the held-out rows.
y_pred = rf_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
# RMSE is the square root of the mean squared error, putting it back in the
# same units as the target.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\n MODEL PERFORMANCE")
print(f"MAE  : {mae:.2f}")
print(f"RMSE : {rmse:.2f}")
# The superscript two is written as an escape so the label stays correct even
# if the file is re-saved with a different encoding (it previously rendered as
# mojibake).
print(f"R\u00b2   : {r2:.4f}")

# ============================
# 10. FINAL INSIGHTS
# ============================
# The statements below are derived from the fitted model rather than being
# hard-coded, so they cannot contradict the metrics that were just computed.

print("\nActionable Insights:")

fitted_forest = rf_model.named_steps['model']
fitted_preprocessor = rf_model.named_steps['pre']
importances = fitted_forest.feature_importances_

# Recover the post-encoding column names when the installed scikit-learn
# supports it; otherwise fall back to positional labels.
try:
    feature_names = list(fitted_preprocessor.get_feature_names_out())
except AttributeError:
    feature_names = [f"feature_{i}" for i in range(len(importances))]

# Impurity-based importances: useful for a quick ranking, but known to favour
# continuous and high-cardinality features, so treat them as indicative only.
top_features = (
    pd.Series(importances, index=feature_names)
    .sort_values(ascending=False)
    .head(5)
)
print("- Most influential features (Random Forest impurity importance):")
for feature_name, importance in top_features.items():
    print(f"    {feature_name}: {importance:.3f}")

# Generalization check: a large train/test gap signals overfitting. For a
# regressor, Pipeline.score returns R-squared.
train_r2 = rf_model.score(X_train, y_train)
test_r2 = rf_model.score(X_test, y_test)
print(
    f"- R\u00b2 on train: {train_r2:.4f}, on test: {test_r2:.4f} "
    f"(gap: {train_r2 - test_r2:.4f})."
)

# R-squared <= 0 means the model is no better than always predicting the mean.
if test_r2 <= 0:
    print("- Model does no better than predicting the mean premium; it is not ready for deployment.")
else:
    print("- Judge deployment readiness against business error tolerances using the MAE/RMSE above.")


Rows & Columns: (278860, 20)
    Age  Gender  Annual Income Marital Status  Number of Dependents  \
0  56.0    Male        99990.0        Married                   1.0   
1  46.0    Male         2867.0         Single                   1.0   
2  32.0  Female        30154.0       Divorced                   3.0   
3  60.0  Female        48371.0       Divorced                   0.0   
4  25.0  Female        54174.0       Divorced                   0.0   

  Education Level     Occupation  Health Score  Location    Policy Type  \
0        Master's            NaN     31.074627     Urban  Comprehensive   
1      Bachelor's            NaN     50.271335     Urban  Comprehensive   
2      Bachelor's            NaN     14.714909  Suburban  Comprehensive   
3             PhD  Self-Employed     25.346926     Rural  Comprehensive   
4     High School  Self-Employed      6.659499     Urban  Comprehensive   

   Previous Claims  Vehicle Age  Credit Score  Insurance Duration  \
0              NaN      

KeyboardInterrupt: 