In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Step 1: Load the dataset
df = pd.read_csv('C:/Users/user/Desktop/Github/Insurance_solution/data/MachineLearningRating_v3.txt', delimiter='|', low_memory=False)

# Step 2: Initial DataFrame Info
print("Initial dataset shape:", df.shape)
print("Initial DataFrame sample:")
print(df.head())

# Step 3: Identify numerical and categorical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Step 4: Handle missing values
# Remove columns that are entirely empty
df_clean = df.dropna(axis=1, how='all')

# Ensure columns are in the DataFrame before imputation
existing_num_cols = [col for col in num_cols if col in df_clean.columns]
existing_cat_cols = [col for col in cat_cols if col in df_clean.columns]

# Impute numerical columns with mean
if existing_num_cols:
    num_imputer = SimpleImputer(strategy='mean')
    df_clean[existing_num_cols] = num_imputer.fit_transform(df_clean[existing_num_cols])

# Impute categorical columns with the most frequent value
if existing_cat_cols:
    cat_imputer = SimpleImputer(strategy='most_frequent')
    df_clean[existing_cat_cols] = cat_imputer.fit_transform(df_clean[existing_cat_cols])

# Optionally encode categorical variables
label_encoders = {}
for col in existing_cat_cols:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

# Step 5: Define the target variables
# You're predicting both TotalPremium and TotalClaims
target_columns = ['TotalPremium', 'TotalClaims']

# Check if both target columns are present in the DataFrame
for target_column in target_columns:
    if target_column not in df_clean.columns:
        raise ValueError(f"Target variable '{target_column}' not found in the DataFrame.")

# Features (X) are all columns except TotalPremium and TotalClaims
X = df_clean.drop(target_columns, axis=1)

# Target (y) are both TotalPremium and TotalClaims
y = df_clean[target_columns]

# Step 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Model Building and Evaluation

# 1. Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
lr_mse_totalpremium = mean_squared_error(y_test['TotalPremium'], y_pred_lr[:, 0])
lr_mse_totalclaims = mean_squared_error(y_test['TotalClaims'], y_pred_lr[:, 1])
lr_r2_totalpremium = r2_score(y_test['TotalPremium'], y_pred_lr[:, 0])
lr_r2_totalclaims = r2_score(y_test['TotalClaims'], y_pred_lr[:, 1])

print("\nLinear Regression:")
print("Mean Squared Error for TotalPremium:", lr_mse_totalpremium)
print("Mean Squared Error for TotalClaims:", lr_mse_totalclaims)
print("R² Score for TotalPremium:", lr_r2_totalpremium)
print("R² Score for TotalClaims:", lr_r2_totalclaims)

# 2. Random Forest Regressor
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
rf_mse_totalpremium = mean_squared_error(y_test['TotalPremium'], y_pred_rf[:, 0])
rf_mse_totalclaims = mean_squared_error(y_test['TotalClaims'], y_pred_rf[:, 1])
rf_r2_totalpremium = r2_score(y_test['TotalPremium'], y_pred_rf[:, 0])
rf_r2_totalclaims = r2_score(y_test['TotalClaims'], y_pred_rf[:, 1])

print("\nRandom Forest Regressor:")
print("Mean Squared Error for TotalPremium:", rf_mse_totalpremium)
print("Mean Squared Error for TotalClaims:", rf_mse_totalclaims)
print("R² Score for TotalPremium:", rf_r2_totalpremium)
print("R² Score for TotalClaims:", rf_r2_totalclaims)

# 3. XGBoost Regressor
xgb = XGBRegressor(random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
xgb_mse_totalpremium = mean_squared_error(y_test['TotalPremium'], y_pred_xgb[:, 0])
xgb_mse_totalclaims = mean_squared_error(y_test['TotalClaims'], y_pred_xgb[:, 1])
xgb_r2_totalpremium = r2_score(y_test['TotalPremium'], y_pred_xgb[:, 0])
xgb_r2_totalclaims = r2_score(y_test['TotalClaims'], y_pred_xgb[:, 1])

print("\nXGBoost Regressor:")
print("Mean Squared Error for TotalPremium:", xgb_mse_totalpremium)
print("Mean Squared Error for TotalClaims:", xgb_mse_totalclaims)
print("R² Score for TotalPremium:", xgb_r2_totalpremium)
print("R² Score for TotalClaims:", xgb_r2_totalclaims)


Initial dataset shape: (1000098, 52)
Initial DataFrame sample:
   UnderwrittenCoverID  PolicyID     TransactionMonth  IsVATRegistered  \
0               145249     12827  2015-03-01 00:00:00             True   
1               145249     12827  2015-05-01 00:00:00             True   
2               145249     12827  2015-07-01 00:00:00             True   
3               145255     12827  2015-05-01 00:00:00             True   
4               145255     12827  2015-07-01 00:00:00             True   

  Citizenship          LegalType Title Language                 Bank  \
0              Close Corporation    Mr  English  First National Bank   
1              Close Corporation    Mr  English  First National Bank   
2              Close Corporation    Mr  English  First National Bank   
3              Close Corporation    Mr  English  First National Bank   
4              Close Corporation    Mr  English  First National Bank   

       AccountType  ...                    ExcessSelected C

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[existing_num_cols] = num_imputer.fit_transform(df_clean[existing_num_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[existing_cat_cols] = cat_imputer.fit_transform(df_clean[existing_cat_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[col] = le.fit_transform(df

Mean Squared Error for TotalPremium prediction: 15789.813318415207
Mean Squared Error for TotalClaims prediction: 4861367.807125847
