In [5]:
# Task 4: Predictive Modeling Notebook

# **1. Import Necessary Libraries and Functions**
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath('../src/utils'))
from modeling import (
    handle_missing_data,
    encode_categorical_variables,
    split_data,
    train_regression_models,
    evaluate_regression_models,
    train_classification_models,
    evaluate_classification_models,
    analyze_feature_importance
)

# **2. Load and Preprocess the Dataset**
# Load dataset
print("Loading dataset...")
data_path = "../data/processed/claim_metrics.csv"
df = pd.read_csv(data_path)
print("Dataset loaded successfully.")

# Print shape after each major step for debugging
def print_shape(label, df):
    print(f"{label} shape: {df.shape}")

print_shape("After loading", df)

# Handle missing values
print("Handling missing values...")
df = handle_missing_data(df, method="impute")
print_shape("After first impute", df)

# Encode categorical variables
print("Encoding categorical variables...")
df = encode_categorical_variables(df)
print_shape("After encoding", df)

# Impute again after encoding to ensure no NaNs remain
print("Handling missing values after encoding...")
df = handle_missing_data(df, method="impute")
print_shape("After second impute", df)

# **3. Data Preparation for Claim Severity Prediction**
# Define target variable and split data
print("Preparing data for Claim Severity Prediction...")
target_col_severity = "TotalClaims"
severity_data = df[df[target_col_severity] > 0]  # Use subset where claims > 0
print_shape("After filtering TotalClaims > 0", severity_data)
X_train_severity, X_test_severity, y_train_severity, y_test_severity = split_data(severity_data, target_col_severity)
print_shape("X_train_severity", X_train_severity)
print_shape("y_train_severity", y_train_severity)

# Check for missing values in features and target before training
print("Checking for missing values in training features and target...")
print("NaNs in X_train_severity:", X_train_severity.isna().sum().sum())
print("NaNs in y_train_severity:", y_train_severity.isna().sum())

if X_train_severity.isna().sum().sum() > 0 or y_train_severity.isna().sum() > 0:
    print("Dropping rows with NaNs in training data...")
    nan_mask = ~(X_train_severity.isna().any(axis=1) | y_train_severity.isna())
    X_train_severity = X_train_severity[nan_mask]
    y_train_severity = y_train_severity[nan_mask]
    print("After dropping, NaNs in X_train_severity:", X_train_severity.isna().sum().sum())
    print("After dropping, NaNs in y_train_severity:", y_train_severity.isna().sum())

# Train models for claim severity prediction
print("Training regression models...")
regression_models = train_regression_models(X_train_severity, y_train_severity)

# Evaluate regression models
print("Evaluating regression models...")
severity_results = evaluate_regression_models(regression_models, X_test_severity, y_test_severity)
print("Severity Prediction Results:")
print(severity_results)

# **4. Data Preparation for Premium Optimization**
# Define target variable and split data
print("Preparing data for Premium Optimization...")
target_col_premium = "CalculatedPremiumPerTerm"
X_train_premium, X_test_premium, y_train_premium, y_test_premium = split_data(df, target_col_premium)

# Train models for premium prediction
print("Training regression models...")
premium_models = train_regression_models(X_train_premium, y_train_premium)

# Evaluate premium prediction models
print("Evaluating regression models...")
premium_results = evaluate_regression_models(premium_models, X_test_premium, y_test_premium)
print("Premium Prediction Results:")
print(premium_results)

# **5. Claim Probability Prediction (Advanced Task)**
# Define target variable and split data
print("Preparing data for Claim Probability Prediction...")
target_col_probability = "ClaimOccurred"  # Binary classification (1 if claim occurred, 0 otherwise)
X_train_prob, X_test_prob, y_train_prob, y_test_prob = split_data(df, target_col_probability)

# Train classification models
print("Training classification models...")
classification_models = train_classification_models(X_train_prob, y_train_prob)

# Evaluate classification models
print("Evaluating classification models...")
probability_results = evaluate_classification_models(classification_models, X_test_prob, y_test_prob)
print("Claim Probability Prediction Results:")
print(probability_results)

# **6. Feature Importance Analysis**
print("Analyzing feature importance...")
for model_name, model in regression_models.items():
    print(f"Analyzing feature importance for {model_name}...")
    analyze_feature_importance(model, X_train_severity)

# **7. Save Results**
print("Saving results...")
severity_results_path = "../results/severity_results.csv"
severity_results_df = pd.DataFrame.from_dict(severity_results, orient="index")
severity_results_df.to_csv(severity_results_path)
print(f"Severity results saved to {severity_results_path}.")

premium_results_path = "../results/premium_results.csv"
premium_results_df = pd.DataFrame.from_dict(premium_results, orient="index")
premium_results_df.to_csv(premium_results_path)
print(f"Premium results saved to {premium_results_path}.")

probability_results_path = "../results/probability_results.csv"
probability_results_df = pd.DataFrame.from_dict(probability_results, orient="index")
probability_results_df.to_csv(probability_results_path)
print(f"Probability results saved to {probability_results_path}.")


Loading dataset...


  df = pd.read_csv(data_path)


Dataset loaded successfully.
After loading shape: (1000098, 56)
Handling missing values...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


After first impute shape: (1000098, 56)
Encoding categorical variables...
After encoding shape: (1000098, 1878)
Handling missing values after encoding...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


After second impute shape: (1000098, 1878)
Preparing data for Claim Severity Prediction...
After filtering TotalClaims > 0 shape: (2788, 1878)
X_train_severity shape: (2230, 1877)
y_train_severity shape: (2230,)
Checking for missing values in training features and target...
NaNs in X_train_severity: 4460
NaNs in y_train_severity: 0
Dropping rows with NaNs in training data...
After dropping, NaNs in X_train_severity: 0
After dropping, NaNs in y_train_severity: 0
Training regression models...


ValueError: Found array with 0 sample(s) (shape=(0, 1877)) while a minimum of 1 is required by LinearRegression.