In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("../data/insurance_data.csv", sep='|')

# Check for missing values
print(df.isnull().sum())

# Example: Remove rows where 'TotalClaims' is missing
df = df.dropna(subset=['TotalClaims'])

# Or Impute: Example for numerical features using mean
df['TotalClaims'] = df['TotalClaims'].fillna(df['TotalClaims'].mean())

# For categorical features, use mode (most frequent value) imputation
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])

# Confirm no missing values
print(df.isnull().sum())


  df = pd.read_csv("../data/insurance_data.csv", sep='|')


UnderwrittenCoverID               0
PolicyID                          0
TransactionMonth                  0
IsVATRegistered                   0
Citizenship                       0
LegalType                         0
Title                             0
Language                          0
Bank                         145961
AccountType                   40232
MaritalStatus                  8259
Gender                         9536
Country                           0
Province                          0
PostalCode                        0
MainCrestaZone                    0
SubCrestaZone                     0
ItemType                          0
mmcode                          552
VehicleType                     552
RegistrationYear                  0
make                            552
Model                           552
Cylinders                       552
cubiccapacity                   552
kilowatts                       552
bodytype                        552
NumberOfDoors               

In [5]:
df['TotalPolicies'] = 1
df['ClaimFreq'] = df['TotalClaims'] / df['TotalPolicies']


In [6]:
# Create new features
df['ClaimFreq'] = df['TotalClaims'] / df['TotalPolicies']  # if these columns exist
df['PremiumToClaimsRatio'] = df['TotalPremium'] / df['TotalClaims']


In [17]:
# One-Hot Encoding for categorical variables
df = pd.get_dummies(df, drop_first=True)

# If using label encoding for a binary column like 'Gender':
#from sklearn.preprocessing import LabelEncoder
#label_encoder = LabelEncoder()
#df['Gender_Male'] = label_encoder.fit_transform(df['Gender_Male'])


In [18]:
from sklearn.model_selection import train_test_split

# Define features and target
X = df.drop(columns=['TotalClaims', 'CalculatedPremiumPerTerm'])
y = df['TotalClaims']  # or 'ClaimOccurrence' for the binary classification model

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
# Create HasClaim column (if not already created)
df['HasClaim'] = (df['TotalClaims'] > 0).astype(int)

# Select features and target
X = df[['Gender_Male', 'Gender_Not specified']].fillna(0)
y = df['HasClaim']

# Split dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict
y_pred_lr = lr_model.predict(X_test)


In [24]:
from sklearn.ensemble import RandomForestRegressor

# Initialize model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit model
rf_model.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_model.predict(X_test)


In [33]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Select features and target
X = df[['Gender_Male', 'Gender_Not specified']].fillna(0)
y = df['HasClaim']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to make duplicate columns unique by adding suffixes
def make_cols_unique(cols):
    seen = {}
    result = []
    for col in cols:
        if col not in seen:
            seen[col] = 0
            result.append(col)
        else:
            seen[col] += 1
            result.append(f"{col}_{seen[col]}")
    return result

# Fix duplicate columns in train and test sets
X_train.columns = make_cols_unique(X_train.columns)
X_test.columns = make_cols_unique(X_test.columns)

# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set parameters and train model
params = {'objective': 'reg:squarederror'}
xg_model = xgb.train(params, dtrain, num_boost_round=100)

# Predict
y_pred_xg = xg_model.predict(dtest)

print(y_pred_xg[:5])  # print first 5 predictions


[0.00278351 0.00278351 0.00278351 0.00278351 0.00278351]


In [34]:
from sklearn.linear_model import LogisticRegression

# Initialize logistic regression model
logreg_model = LogisticRegression()

# Fit model to predict claim occurrence (binary)
logreg_model.fit(X_train, y_train)

# Predict claim occurrence (0 or 1)
y_pred_logreg = logreg_model.predict(X_test)


In [35]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# RMSE for Linear Regression
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)

# RMSE for Random Forest
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

# RMSE for XGBoost
rmse_xg = np.sqrt(mean_squared_error(y_test, y_pred_xg))
r2_xg = r2_score(y_test, y_pred_xg)

# Print Evaluation Results
print(f"Linear Regression - RMSE: {rmse_lr}, R-squared: {r2_lr}")
print(f"Random Forest - RMSE: {rmse_rf}, R-squared: {r2_rf}")
print(f"XGBoost - RMSE: {rmse_xg}, R-squared: {r2_xg}")


Linear Regression - RMSE: 0.05381683765911473, R-squared: 6.9661862265313346e-06
Random Forest - RMSE: 0.053816863418747495, R-squared: 6.008885106489892e-06
XGBoost - RMSE: 0.05381683833382197, R-squared: 7.033348083496094e-06


In [36]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Accuracy, F1-Score, Precision, and Recall for Logistic Regression
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
f1_logreg = f1_score(y_test, y_pred_logreg)
precision_logreg = precision_score(y_test, y_pred_logreg)
recall_logreg = recall_score(y_test, y_pred_logreg)

# Print Classification Results
print(f"Logistic Regression - Accuracy: {accuracy_logreg}, F1-score: {f1_logreg}, Precision: {precision_logreg}, Recall: {recall_logreg}")


Logistic Regression - Accuracy: 0.9970952904709529, F1-score: 0.0, Precision: 0.0, Recall: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [14]:
# Convert 'TransactionMonth' to datetime format
df['TransactionMonth'] = pd.to_datetime(df['TransactionMonth'], errors='coerce')

# Create a new feature: months since the earliest transaction
df['MonthsSinceStart'] = (df['TransactionMonth'] - df['TransactionMonth'].min()).dt.days // 30

# Drop the original 'TransactionMonth' column if it's no longer needed
df = df.drop(columns=['TransactionMonth'])


In [15]:
from sklearn.preprocessing import LabelEncoder

# One-hot encode categorical variables with multiple categories
df = pd.get_dummies(df, columns=['Citizenship', 'LegalType', 'Title', 'Language', 'Bank', 'Country', 'Province', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'make', 'Model'], drop_first=True)

# Label encode binary categorical columns like 'Gender' (Male/Female)
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])

# Verify the columns after encoding
print(df.columns)


Index(['UnderwrittenCoverID', 'PolicyID', 'IsVATRegistered', 'AccountType',
       'MaritalStatus', 'Gender', 'PostalCode', 'mmcode', 'VehicleType',
       'RegistrationYear',
       ...
       'Model_VOLARE W8 28 SEAT B/S B/S', 'Model_X TRAIL 2.0 4X2 XE (R79/R85)',
       'Model_YARIS 1.3 5Dr', 'Model_YARIS T1 3Dr', 'Model_YARIS T1 5Dr',
       'Model_YARIS T3 A/C', 'Model_YARIS T3 SPIRIT 5Dr', 'Model_YARIS T3+',
       'Model_YARIS ZEN3', 'Model_YARIS ZEN3 ACS'],
      dtype='object', length=585)


In [16]:
# Features: drop the target variable and non-useful columns (like IDs, etc.)
X = df.drop(columns=['TotalClaims', 'PolicyID', 'UnderwrittenCoverID'])

# Target: TotalClaims
y = df['TotalClaims']


In [17]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify split
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")


Training set size: 800078, Test set size: 200020


In [21]:
import pandas as pd

# Assuming you have loaded your dataset into `df`
# List the categorical columns you need to one-hot encode
categorical_columns = ['AccountType', 'MaritalStatus', 'VehicleType']

# Apply One-Hot Encoding to the categorical columns
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Check the resulting dataframe to verify encoding
print(df.head())


   UnderwrittenCoverID  PolicyID  IsVATRegistered  Gender  PostalCode  \
0               145249     12827             True       2        1459   
1               145249     12827             True       2        1459   
2               145249     12827             True       2        1459   
3               145255     12827             True       2        1459   
4               145255     12827             True       2        1459   

       mmcode  RegistrationYear  Cylinders  cubiccapacity  kilowatts  ...  \
0  44069150.0              2004        6.0         2597.0      130.0  ...   
1  44069150.0              2004        6.0         2597.0      130.0  ...   
2  44069150.0              2004        6.0         2597.0      130.0  ...   
3  44069150.0              2004        6.0         2597.0      130.0  ...   
4  44069150.0              2004        6.0         2597.0      130.0  ...   

  Model_YARIS ZEN3  Model_YARIS ZEN3 ACS AccountType_Savings account  \
0            False        

In [22]:
# Check for missing values
print(df.isnull().sum())

# Fill missing values with appropriate strategies (mean, median, or mode)
df['mmcode'] = df['mmcode'].fillna(df['mmcode'].median())  # For numeric columns
df['PostalCode'] = df['PostalCode'].fillna(df['PostalCode'].mode()[0])  # For categorical columns


UnderwrittenCoverID              0
PolicyID                         0
IsVATRegistered                  0
Gender                           0
PostalCode                       0
                                ..
MaritalStatus_Single             0
VehicleType_Heavy Commercial     0
VehicleType_Light Commercial     0
VehicleType_Medium Commercial    0
VehicleType_Passenger Vehicle    0
Length: 590, dtype: int64


In [23]:
# Define features (X) and target (y)
X = df.drop(columns=['TotalClaims', 'UnderwrittenCoverID', 'PolicyID'])  # Drop unnecessary columns
y = df['TotalClaims']  # Target: TotalClaims


In [24]:
from sklearn.model_selection import train_test_split

# Split the data (80% for training, 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the split
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")


Training set size: 800078, Test set size: 200020


In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model (RMSE and R-squared)
rmse = mean_squared_error(y_test, y_pred_rf, squared=False)
r2 = r2_score(y_test, y_pred_rf)

print(f"RMSE: {rmse:.2f}")
print(f"R-squared: {r2:.2f}")


ValueError: could not convert string to float: 'B/S'