In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# ============================================================================
# TASK 1: EXPLORATORY DATA ANALYSIS & STATISTICAL THINKING
# ============================================================================

print("\n" + "="*80)
print("TASK 1: EXPLORATORY DATA ANALYSIS & STATISTICAL THINKING")
print("="*80)

# Load the data
print("\n[1.1] Loading data...")
df = pd.read_csv('./Data/MachineLearningRating_v3.txt', sep='|',encoding='utf-8-sig') 



print("Data loaded successfully!")
print(f"Dataset shape: {df.shape}")

print("\n[1.2] Data Understanding - Basic Information")
print("-" * 80)
print(df.info())
print("\nFirst few rows:")
print(df.head())


TASK 1: EXPLORATORY DATA ANALYSIS & STATISTICAL THINKING

[1.1] Loading data...
Data loaded successfully!
Dataset shape: (1000098, 52)

[1.2] Data Understanding - Basic Information
--------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000098 entries, 0 to 1000097
Data columns (total 52 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   UnderwrittenCoverID       1000098 non-null  int64  
 1   PolicyID                  1000098 non-null  int64  
 2   TransactionMonth          1000098 non-null  object 
 3   IsVATRegistered           1000098 non-null  bool   
 4   Citizenship               1000098 non-null  object 
 5   LegalType                 1000098 non-null  object 
 6   Title                     1000098 non-null  object 
 7   Language                  1000098 non-null  object 
 8   Bank                      854137 non-null   objec

In [4]:
print("="*80)
print("FEATURE ENGINEERING AND MODEL BUILDING")
print("="*80)

# ============================================================================
# STEP 1: COMPREHENSIVE FEATURE ENCODING
# ============================================================================

def prepare_and_encode_features(df):
    """
    Prepare and encode all features for machine learning
    Ensures all data is numeric and compatible with all ML algorithms
    """
    print("\n[Step 1] Preparing and Encoding Features...")
    print("-" * 80)
    
    df_model = df.copy()
    
    # Create new features
    print("\n1.1 Creating derived features...")
    if 'RegistrationYear' in df_model.columns:
        df_model['VehicleAge'] = 2015 - df_model['RegistrationYear']
    
    if 'TotalClaims' in df_model.columns:
        df_model['HasClaim'] = (df_model['TotalClaims'] > 0).astype(int)
    
    if 'TotalPremium' in df_model.columns and 'TotalClaims' in df_model.columns:
        df_model['Margin'] = df_model['TotalPremium'] - df_model['TotalClaims']
    
    # Identify categorical and numerical columns
    print("\n1.2 Identifying column types...")
    
    # Define categorical columns that need encoding
    categorical_cols = []
    for col in df_model.columns:
        if df_model[col].dtype == 'object':
            categorical_cols.append(col)
        elif df_model[col].dtype == 'bool':
            df_model[col] = df_model[col].astype(int)
    
    print(f"Found {len(categorical_cols)} categorical columns")
    
    # Encode categorical variables
    print("\n1.3 Encoding categorical variables...")
    label_encoders = {}
    
    for col in categorical_cols:
        print(f"  Encoding: {col}")
        le = LabelEncoder()
        # Handle missing values
        df_model[col] = df_model[col].fillna('Unknown')
        # Encode
        df_model[col + '_encoded'] = le.fit_transform(df_model[col].astype(str))
        label_encoders[col] = le
    
    # Handle numerical columns - fill missing values
    print("\n1.4 Handling missing values in numerical columns...")
    numerical_cols = df_model.select_dtypes(include=[np.number]).columns
    for col in numerical_cols:
        if df_model[col].isnull().sum() > 0:
            median_val = df_model[col].median()
            df_model[col].fillna(median_val, inplace=True)
            print(f"  Filled {col} with median: {median_val:.2f}")
    
    print("\nFeature preparation complete!")
    return df_model, categorical_cols, label_encoders

FEATURE ENGINEERING AND MODEL BUILDING


In [5]:
# ============================================================================
# STEP 2: SELECT FEATURES FOR MODELING
# ============================================================================

def select_model_features(df_model, categorical_cols):
    """
    Select appropriate features for modeling
    """
    print("\n[Step 2] Selecting Model Features...")
    print("-" * 80)
    
    # Features to use (encoded categorical + numerical)
    feature_cols = []
    
    # Add encoded categorical features
    for col in categorical_cols:
        if col + '_encoded' in df_model.columns:
            feature_cols.append(col + '_encoded')
    
    # Define important numerical features
    important_numerical = [
        'VehicleAge', 'SumInsured', 'CalculatedPremiumPerTerm',
        'Cylinders', 'cubiccapacity', 'kilowatts', 'NumberOfDoors',
        'CustomValueEstimate', 'ExcessSelected'
    ]
    
    # Add numerical features that exist
    for col in important_numerical:
        if col in df_model.columns:
            feature_cols.append(col)
    
    # Remove any features that shouldn't be used for prediction
    exclude_cols = ['TotalClaims', 'TotalPremium', 'Margin', 'HasClaim', 
                   'PolicyID', 'UnderwrittenCoverID', 'TransactionMonth']
    
    feature_cols = [col for col in feature_cols if col not in exclude_cols]
    
    # Ensure all features are numeric
    print("\n2.1 Verifying all features are numeric...")
    valid_features = []
    for col in feature_cols:
        if col in df_model.columns:
            if pd.api.types.is_numeric_dtype(df_model[col]):
                valid_features.append(col)
            else:
                print(f"  ⚠ Skipping non-numeric column: {col} (dtype: {df_model[col].dtype})")
    
    feature_cols = valid_features
    
    print(f"\nSelected {len(feature_cols)} features for modeling:")
    for i, col in enumerate(feature_cols[:20], 1):  # Show first 20
        print(f"  {i}. {col}")
    if len(feature_cols) > 20:
        print(f"  ... and {len(feature_cols) - 20} more")
    
    return feature_cols


In [6]:
# ============================================================================
# STEP 3: BUILD CLAIM SEVERITY MODELS
# ============================================================================

def build_claim_severity_model(df, feature_cols):
    """
    Build models to predict claim amount for policies with claims
    Target: TotalClaims (where TotalClaims > 0)
    """
    
    print("\n[Step 3] Building Claim Severity Prediction Models...")
    print("="*80)
    
    # Filter to policies with claims
    df_claims = df[df['TotalClaims'] > 0].copy()
    print(f"Training on {len(df_claims):,} policies with claims")
    
    # Prepare data - ensure all features exist and are numeric
    available_features = [col for col in feature_cols if col in df_claims.columns]
    print(f"Using {len(available_features)} features")
    
    X = df_claims[available_features].copy()
    y = df_claims['TotalClaims'].copy()
    
    # Final check - ensure no object types
    object_cols = X.select_dtypes(include=['object']).columns
    if len(object_cols) > 0:
        print(f"\n⚠ Warning: Found object columns, converting...")
        for col in object_cols:
            print(f"  Converting {col}")
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col].astype(str))
    
    # Fill any remaining NaN values
    X = X.fillna(0)
    
    print(f"\nFinal X shape: {X.shape}")
    print(f"Target y shape: {y.shape}")
    print(f"X dtypes summary: {X.dtypes.value_counts().to_dict()}")
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    print(f"\nTrain set: {X_train.shape[0]:,} samples")
    print(f"Test set: {X_test.shape[0]:,} samples")
    
    # Scale features for Linear Regression
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Dictionary to store models and results
    models = {}
    results = []
    
    # 1. Linear Regression
    print("\n" + "-"*80)
    print("1. Training Linear Regression...")
    try:
        lr = LinearRegression()
        lr.fit(X_train_scaled, y_train)
        y_pred_lr = lr.predict(X_test_scaled)
        
        rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
        r2_lr = r2_score(y_test, y_pred_lr)
        mae_lr = mean_absolute_error(y_test, y_pred_lr)
        
        models['Linear Regression'] = lr
        results.append({
            'Model': 'Linear Regression',
            'RMSE': rmse_lr,
            'R2_Score': r2_lr,
            'MAE': mae_lr
        })
        print(f"✓ Linear Regression - RMSE: {rmse_lr:,.2f}, R²: {r2_lr:.4f}, MAE: {mae_lr:,.2f}")
    except Exception as e:
        print(f"✗ Linear Regression failed: {str(e)}")
    
    # 2. Decision Tree
    print("\n" + "-"*80)
    print("2. Training Decision Tree...")
    try:
        dt = DecisionTreeRegressor(max_depth=10, min_samples_split=20, random_state=42)
        dt.fit(X_train, y_train)
        y_pred_dt = dt.predict(X_test)
        
        rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt))
        r2_dt = r2_score(y_test, y_pred_dt)
        mae_dt = mean_absolute_error(y_test, y_pred_dt)
        
        models['Decision Tree'] = dt
        results.append({
            'Model': 'Decision Tree',
            'RMSE': rmse_dt,
            'R2_Score': r2_dt,
            'MAE': mae_dt
        })
        print(f"✓ Decision Tree - RMSE: {rmse_dt:,.2f}, R²: {r2_dt:.4f}, MAE: {mae_dt:,.2f}")
    except Exception as e:
        print(f"✗ Decision Tree failed: {str(e)}")
    
    # 3. Random Forest
    print("\n" + "-"*80)
    print("3. Training Random Forest...")
    try:
        rf = RandomForestRegressor(n_estimators=100, max_depth=15, 
                                   min_samples_split=20, random_state=42, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_pred_rf = rf.predict(X_test)
        
        rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
        r2_rf = r2_score(y_test, y_pred_rf)
        mae_rf = mean_absolute_error(y_test, y_pred_rf)
        
        models['Random Forest'] = rf
        results.append({
            'Model': 'Random Forest',
            'RMSE': rmse_rf,
            'R2_Score': r2_rf,
            'MAE': mae_rf
        })
        print(f"✓ Random Forest - RMSE: {rmse_rf:,.2f}, R²: {r2_rf:.4f}, MAE: {mae_rf:,.2f}")
    except Exception as e:
        print(f"✗ Random Forest failed: {str(e)}")
    
    # 4. Gradient Boosting
    print("\n" + "-"*80)
    print("4. Training Gradient Boosting...")
    try:
        gb = GradientBoostingRegressor(n_estimators=100, max_depth=5, 
                                       learning_rate=0.1, random_state=42)
        gb.fit(X_train, y_train)
        y_pred_gb = gb.predict(X_test)
        
        rmse_gb = np.sqrt(mean_squared_error(y_test, y_pred_gb))
        r2_gb = r2_score(y_test, y_pred_gb)
        mae_gb = mean_absolute_error(y_test, y_pred_gb)
        
        models['Gradient Boosting'] = gb
        results.append({
            'Model': 'Gradient Boosting',
            'RMSE': rmse_gb,
            'R2_Score': r2_gb,
            'MAE': mae_gb
        })
        print(f"✓ Gradient Boosting - RMSE: {rmse_gb:,.2f}, R²: {r2_gb:.4f}, MAE: {mae_gb:,.2f}")
    except Exception as e:
        print(f"✗ Gradient Boosting failed: {str(e)}")
    
    # 5. XGBoost
    print("\n" + "-"*80)
    print("5. Training XGBoost...")
    try:
        # Ensure X_train and X_test are completely numeric
        X_train_xgb = X_train.copy()
        X_test_xgb = X_test.copy()
        
        # Convert all to float
        X_train_xgb = X_train_xgb.astype(float)
        X_test_xgb = X_test_xgb.astype(float)
        
        xgb_model = xgb.XGBRegressor(
            n_estimators=100, 
            max_depth=6, 
            learning_rate=0.1, 
            random_state=42,
            tree_method='hist',  # Use histogram-based algorithm
            enable_categorical=False  # Disable categorical features
        )
        xgb_model.fit(X_train_xgb, y_train)
        y_pred_xgb = xgb_model.predict(X_test_xgb)
        
        rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
        r2_xgb = r2_score(y_test, y_pred_xgb)
        mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
        
        models['XGBoost'] = xgb_model
        results.append({
            'Model': 'XGBoost',
            'RMSE': rmse_xgb,
            'R2_Score': r2_xgb,
            'MAE': mae_xgb
        })
        print(f"✓ XGBoost - RMSE: {rmse_xgb:,.2f}, R²: {r2_xgb:.4f}, MAE: {mae_xgb:,.2f}")
    except Exception as e:
        print(f"✗ XGBoost failed: {str(e)}")
        import traceback
        traceback.print_exc()
    
    # Results comparison
    if len(results) > 0:
        results_df = pd.DataFrame(results)
        results_df = results_df.sort_values('R2_Score', ascending=False)
        
        print("\n" + "="*80)
        print("MODEL COMPARISON - CLAIM SEVERITY PREDICTION")
        print("="*80)
        print(results_df.to_string(index=False))
        
        # Visualize model comparison
        fig, axes = plt.subplots(1, 3, figsize=(18, 6))
        
        colors = plt.cm.viridis(np.linspace(0, 1, len(results_df)))
        
        axes[0].barh(results_df['Model'], results_df['RMSE'], color=colors)
        axes[0].set_xlabel('RMSE (lower is better)', fontweight='bold')
        axes[0].set_title('Model Comparison: RMSE', fontsize=13, fontweight='bold')
        axes[0].invert_yaxis()
        axes[0].grid(True, alpha=0.3, axis='x')
        
        axes[1].barh(results_df['Model'], results_df['R2_Score'], color=colors)
        axes[1].set_xlabel('R² Score (higher is better)', fontweight='bold')
        axes[1].set_title('Model Comparison: R² Score', fontsize=13, fontweight='bold')
        axes[1].invert_yaxis()
        axes[1].grid(True, alpha=0.3, axis='x')
        axes[1].axvline(x=0, color='red', linestyle='--', alpha=0.5)
        
        axes[2].barh(results_df['Model'], results_df['MAE'], color=colors)
        axes[2].set_xlabel('MAE (lower is better)', fontweight='bold')
        axes[2].set_title('Model Comparison: MAE', fontsize=13, fontweight='bold')
        axes[2].invert_yaxis()
        axes[2].grid(True, alpha=0.3, axis='x')
        
        plt.tight_layout()
        plt.savefig('model_comparison_claims.png', dpi=300, bbox_inches='tight')
        print("\n✓ Saved: model_comparison_claims.png")
        plt.close()
        
        return models, results_df, scaler, available_features
    else:
        print("\n✗ No models trained successfully")
        return None, None, None, available_features

In [7]:
# ============================================================================
# STEP 4: FEATURE IMPORTANCE ANALYSIS
# ============================================================================

def analyze_feature_importance(model, feature_names, model_name='Model', top_n=20):
    """Analyze and visualize feature importance"""
    
    print(f"\n[Step 4] Feature Importance Analysis - {model_name}")
    print("-" * 80)
    
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        
        feature_importance = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importances
        }).sort_values('Importance', ascending=False)
        
        print(f"\nTop {top_n} Important Features:")
        print(feature_importance.head(top_n).to_string(index=False))
        
        # Visualize
        plt.figure(figsize=(12, 10))
        top_features = feature_importance.head(top_n)
        
        colors = plt.cm.RdYlGn(np.linspace(0.3, 0.9, len(top_features)))
        
        plt.barh(range(len(top_features)), top_features['Importance'], color=colors, edgecolor='black')
        plt.yticks(range(len(top_features)), top_features['Feature'])
        plt.xlabel('Feature Importance', fontsize=12, fontweight='bold')
        plt.title(f'Top {top_n} Feature Importances - {model_name}', fontsize=14, fontweight='bold')
        plt.gca().invert_yaxis()
        plt.grid(True, alpha=0.3, axis='x')
        plt.tight_layout()
        
        filename = f'feature_importance_{model_name.replace(" ", "_")}.png'
        plt.savefig(filename, dpi=300, bbox_inches='tight')
        print(f"\n✓ Saved: {filename}")
        plt.close()
        
        return feature_importance
    else:
        print(f"Model {model_name} does not have feature_importances_ attribute")
        return None

In [8]:
df_model, categorical_cols, label_encoders = prepare_and_encode_features(df)


[Step 1] Preparing and Encoding Features...
--------------------------------------------------------------------------------

1.1 Creating derived features...

1.2 Identifying column types...
Found 36 categorical columns

1.3 Encoding categorical variables...
  Encoding: TransactionMonth
  Encoding: Citizenship
  Encoding: LegalType
  Encoding: Title
  Encoding: Language
  Encoding: Bank
  Encoding: AccountType
  Encoding: MaritalStatus
  Encoding: Gender
  Encoding: Country
  Encoding: Province
  Encoding: MainCrestaZone
  Encoding: SubCrestaZone
  Encoding: ItemType
  Encoding: VehicleType
  Encoding: make
  Encoding: Model
  Encoding: bodytype
  Encoding: VehicleIntroDate
  Encoding: AlarmImmobiliser
  Encoding: TrackingDevice
  Encoding: CapitalOutstanding
  Encoding: NewVehicle
  Encoding: WrittenOff
  Encoding: Rebuilt
  Encoding: Converted
  Encoding: CrossBorder
  Encoding: TermFrequency
  Encoding: ExcessSelected
  Encoding: CoverCategory
  Encoding: CoverType
  Encoding: Cov

In [9]:
feature_cols = select_model_features(df_model, categorical_cols)


[Step 2] Selecting Model Features...
--------------------------------------------------------------------------------

2.1 Verifying all features are numeric...
  ⚠ Skipping non-numeric column: ExcessSelected (dtype: object)

Selected 44 features for modeling:
  1. TransactionMonth_encoded
  2. Citizenship_encoded
  3. LegalType_encoded
  4. Title_encoded
  5. Language_encoded
  6. Bank_encoded
  7. AccountType_encoded
  8. MaritalStatus_encoded
  9. Gender_encoded
  10. Country_encoded
  11. Province_encoded
  12. MainCrestaZone_encoded
  13. SubCrestaZone_encoded
  14. ItemType_encoded
  15. VehicleType_encoded
  16. make_encoded
  17. Model_encoded
  18. bodytype_encoded
  19. VehicleIntroDate_encoded
  20. AlarmImmobiliser_encoded
  ... and 24 more


In [10]:
models, results_df, scaler, features_used = build_claim_severity_model(df_model, feature_cols)


[Step 3] Building Claim Severity Prediction Models...
Training on 2,788 policies with claims
Using 44 features

Final X shape: (2788, 44)
Target y shape: (2788,)
X dtypes summary: {dtype('int64'): 37, dtype('float64'): 7}

Train set: 2,230 samples
Test set: 558 samples

--------------------------------------------------------------------------------
1. Training Linear Regression...
✓ Linear Regression - RMSE: 33,799.34, R²: 0.2897, MAE: 17,911.45

--------------------------------------------------------------------------------
2. Training Decision Tree...
✓ Decision Tree - RMSE: 37,228.65, R²: 0.1382, MAE: 17,055.66

--------------------------------------------------------------------------------
3. Training Random Forest...
✓ Random Forest - RMSE: 34,953.47, R²: 0.2403, MAE: 16,219.32

--------------------------------------------------------------------------------
4. Training Gradient Boosting...
✓ Gradient Boosting - RMSE: 38,263.07, R²: 0.0897, MAE: 17,082.29

--------------------

In [11]:
if models and 'Random Forest' in models:
    fi_rf = analyze_feature_importance(models['Random Forest'], features_used, 'Random_Forest')


[Step 4] Feature Importance Analysis - Random_Forest
--------------------------------------------------------------------------------

Top 20 Important Features:
                   Feature  Importance
  CalculatedPremiumPerTerm    0.344833
                SumInsured    0.217418
       CustomValueEstimate    0.072077
CapitalOutstanding_encoded    0.071502
       VehicleType_encoded    0.033473
  TransactionMonth_encoded    0.031686
                 kilowatts    0.026276
                VehicleAge    0.022002
             cubiccapacity    0.020729
     SubCrestaZone_encoded    0.019388
             Model_encoded    0.015569
  VehicleIntroDate_encoded    0.014051
              make_encoded    0.012248
          Province_encoded    0.011503
             Title_encoded    0.011310
     CoverCategory_encoded    0.010041
              Bank_encoded    0.009564
    MainCrestaZone_encoded    0.009282
          bodytype_encoded    0.008444
             NumberOfDoors    0.006870

✓ Saved: feature_

In [12]:
print("\n[4.7] Model Interpretation with SHAP (Advanced)")
print("-" * 80)


[4.7] Model Interpretation with SHAP (Advanced)
--------------------------------------------------------------------------------
