In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [2]:
# Load the CSV file
df = pd.read_csv('../data/MachineLearningRating_v3.csv', low_memory=False, index_col=False)

In [3]:
# Define numeric and categorical features
numeric_features = ['SumInsured', 'CalculatedPremiumPerTerm', 'RegistrationYear', 'PostalCode']
categorical_features = ['Province', 'CoverType', 'VehicleType', 'make', 'Gender', 'MaritalStatus']

In [4]:
# Preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values for numeric data
    ('scaler', StandardScaler())  # Standardize numeric data
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values for categorical data
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical data
])

In [5]:
# Combine transformations for both numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [6]:
# Define target variable and features
X = df[numeric_features + categorical_features]
y = df['TotalPremium']  # Change to 'TotalClaims' if needed

In [7]:
# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Apply preprocessing
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

**Model building**

In [9]:
# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

In [10]:

Random_Forest =RandomForestRegressor(n_estimators=50, random_state=42)
Random_Forest.fit(X_train, y_train)

In [11]:
# XGBoost
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

**Model Evaluation**

In [17]:
# Calculate Mean Squared Error (MSE) for each model
mse_lr = mean_squared_error(y_test, y_pred_lr) 
mse_rf = mean_squared_error(y_test, y_pred_rf)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)

In [19]:
print(f"Linear Regression MSE: {mse_lr}")
print(f"Random Forest MSE: {mse_rf}")
print(f"XGBoost MSE: {mse_xgb}")

Linear Regression MSE: 15331.258382268134
XGBoost MSE: 10413.508190271543


In [21]:
# importances = rf_model.feature_importances_
# print(f"Feature Importances (Random Forest): {importances}")

# Accessing feature importances from the Random Forest model
importances = Random_Forest.feature_importances_
print(f"Feature Importances (Random Forest): {importances}")


Feature Importances (Random Forest): [2.15450449e-01 6.88366315e-01 1.70581966e-02 5.58180969e-02
 7.61483500e-04 6.71971721e-05 1.98460016e-03 1.20098663e-03
 4.80272740e-04 7.01483209e-04 9.64904026e-04 2.72702695e-04
 5.99501503e-04 6.84329345e-05 1.45901577e-05 1.14579444e-07
 1.53847431e-05 5.45606522e-07 4.11093455e-07 1.85128803e-04
 5.24942947e-06 3.88545745e-07 3.35095441e-13 3.78835929e-05
 1.65884045e-05 1.83905902e-07 3.12339583e-04 6.56296850e-07
 3.05661879e-06 3.15118576e-08 1.29886901e-06 2.11869177e-04
 3.59278627e-07 2.87122179e-08 5.46122151e-07 5.40565003e-05
 3.13479504e-04 1.02177807e-04 8.83421481e-04 9.04277361e-04
 4.16292648e-04 1.58592880e-04 6.29613050e-04 3.00775010e-04
 2.86174952e-05 1.40007185e-04 5.08078803e-05 7.97472676e-04
 0.00000000e+00 1.02091568e-04 2.18831054e-05 1.03237509e-04
 1.60948664e-04 5.57138722e-07 4.33615608e-04 8.39333195e-06
 2.39663659e-05 9.51344118e-05 2.43067932e-04 1.86283757e-04
 3.53509920e-04 3.34279412e-06 1.23361299e-05 1.