# Preliminary Analysis

In [3]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import joblib


In [4]:
data_original = pd.read_csv('training.csv', sep=';')
data_original.head(5)

Unnamed: 0,PolNum,CalYear,Gender,Type,Category,Occupation,Age,Group1,Bonus,Poldur,Value,Adind,SubGroup2,Group2,Density,Exppdays,Numtppd,Numtpbi,Indtppd,Indtpbi
1,200114978,2009,Male,C,Large,Employed,25,18,90,3,15080,0,L46,L,72.012883,365,1,0,0.0,0.0
2,200114994,2009,Male,E,Large,Employed,20,11,30,2,22370,1,O38,O,39.550411,365,1,0,0.0,0.0
3,200115001,2009,Female,E,Large,Unemployed,42,11,150,0,39650,0,Q28,Q,169.529148,365,2,0,0.0,0.0
4,200115011,2009,Female,C,Medium,Housewife,21,5,0,0,12600,1,L6,L,58.894688,365,1,0,0.0,0.0
5,200115015,2009,Female,D,Large,Employed,33,12,30,10,9065,0,N4,N,109.631885,365,2,0,0.0,0.0


## Preprocessing

In [5]:
# Eliminating wrong values
polnums = []
for i, v in data_original["PolNum"].value_counts().items():
    if v == 2:
        polnums.append(i)
ttt = data_original[data_original["PolNum"].apply(lambda x: True if x in polnums else False)]

def idx_lowest(dff):
    # Group by the first column and find the index of the minimum value in the third column
    df = dff.copy()
    df["total_cost"] = df["Indtppd"] + df["Indtpbi"]
    idx_to_drop = df.groupby(df.columns[0])[df.columns[-1]].idxmin()
    return idx_to_drop

data_original.drop(idx_lowest(ttt), inplace=True)
#data_original["PolNum"].value_counts()

In [6]:
# Selecting one particular year
data = data_original[data_original["CalYear"] == 2009]
cols_to_drop = ['PolNum', 'CalYear', 'SubGroup2', 'Category']
data.drop(cols_to_drop, axis=1, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50000 entries, 22 to 50021
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      50000 non-null  object 
 1   Type        50000 non-null  object 
 2   Occupation  50000 non-null  object 
 3   Age         50000 non-null  int64  
 4   Group1      50000 non-null  int64  
 5   Bonus       50000 non-null  int64  
 6   Poldur      50000 non-null  int64  
 7   Value       50000 non-null  int64  
 8   Adind       50000 non-null  int64  
 9   Group2      50000 non-null  object 
 10  Density     50000 non-null  float64
 11  Exppdays    50000 non-null  int64  
 12  Numtppd     50000 non-null  int64  
 13  Numtpbi     50000 non-null  int64  
 14  Indtppd     50000 non-null  float64
 15  Indtpbi     50000 non-null  float64
dtypes: float64(3), int64(9), object(4)
memory usage: 6.5+ MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(cols_to_drop, axis=1, inplace=True)


In [7]:
data.describe()

Unnamed: 0,Age,Group1,Bonus,Poldur,Value,Adind,Density,Exppdays,Numtppd,Numtpbi,Indtppd,Indtpbi
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,41.10156,10.6758,-6.695,5.48276,16495.1332,0.51136,116.458351,327.64954,0.13996,0.04678,98.585283,208.42643
std,14.307887,4.679553,48.816786,4.595224,10528.285772,0.499876,79.190922,73.603182,0.421206,0.219983,422.853097,1761.53048
min,18.0,1.0,-50.0,0.0,1000.0,0.0,14.377142,91.0,0.0,0.0,0.0,0.0
25%,30.0,7.0,-40.0,1.0,8395.0,0.0,50.566406,340.0,0.0,0.0,0.0,0.0
50%,40.0,11.0,-30.0,4.0,14652.5,1.0,93.382351,365.0,0.0,0.0,0.0,0.0
75%,51.0,14.0,10.0,9.0,22595.0,1.0,171.372936,365.0,0.0,0.0,0.0,0.0
max,75.0,20.0,150.0,15.0,49990.0,1.0,297.38517,365.0,7.0,3.0,10955.476251,60914.68741


In [8]:
# Feature engineering
data["total_cost"] = np.log1p(data["Indtppd"] + data["Indtpbi"])
data["frequence_claims"] = data["Numtppd"] + data["Numtpbi"]
data["Exppdays"] = data["Exppdays"] / 365

# Removing already encoded features
df = data.copy()
cols_to_drop2 = ["Numtppd",	"Numtpbi", "Indtppd", "Indtpbi", "Group2", "Gender"]
df.drop(cols_to_drop2, axis=1, inplace=True)

# Removing outliers
percentile_cost = np.percentile(data["total_cost"], 98)
percentile_freq = np.percentile(data["frequence_claims"], 99)
df = df[(df["total_cost"] < percentile_cost) & (df["frequence_claims"] < percentile_freq)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["total_cost"] = np.log1p(data["Indtppd"] + data["Indtpbi"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["frequence_claims"] = data["Numtppd"] + data["Numtpbi"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Exppdays"] = data["Exppdays"] / 365


In [9]:
# The problem is now binary
print(df.columns)
df["frequence_claims"].value_counts()

Index(['Type', 'Occupation', 'Age', 'Group1', 'Bonus', 'Poldur', 'Value',
       'Adind', 'Density', 'Exppdays', 'total_cost', 'frequence_claims'],
      dtype='object')


frequence_claims
0    42294
1     5751
Name: count, dtype: int64

In [10]:
def random_sampling(x, y, values=[1], new_sizes=[0]):
    """
    This function performs oversampling or undersampling,
    depending on the class size and the requested new_size

    PARAMETERS
    ----------
    x: DataFrame
       Dataframe containing the features
    y: Series
       1D array with axis labels that contains the different classes
    values: List of integers    
            It contains the class values required to resample 
    new_sizes: List of integers
               size required for the corresponding class in values
    OUTPUT
    ------
    x_result: DataFrame
              Resampled dataframe containing the features
    y_result: Series
              Resampled series object containing the different classes
    """

    x['target'] = y
    for val, size in zip(values, new_sizes):
        df_sampled = x[x['target'] == val]
        n_lines = df_sampled.shape[0]
        # Over_sampling
        if n_lines <= size:
            rdn_rows = random.choices(range(0, n_lines), k=size - df_sampled.shape[0])
            x = pd.concat([x, df_sampled.iloc[rdn_rows]], ignore_index=True)
        # Under_sampling    
        else:    
            rdn_rows = random.sample(list(df_sampled.index), k=df_sampled.shape[0] - size)
            x = x.drop(rdn_rows)

    x_result = x.drop('target', axis=1)
    y_result = x['target']   
    
    return x_result, y_result

# Defining a selector to get the categorical and numerical variables 
categorical_selector = selector(dtype_include = object)
numerical_selector = selector(dtype_exclude=object)


### Feature selection

In [9]:
X = df.drop(["total_cost", "frequence_claims"], axis=1)
y = df["frequence_claims"]

cat_variables = categorical_selector(X)
num_variables = numerical_selector(X)

X = pd.get_dummies(X, columns=cat_variables) * 1

# Create a random forest classifier object
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model to the training data
rfc.fit(X, y)

# Get feature importances from the trained model
importances = rfc.feature_importances_

# Sort the feature importances in descending order
indices = np.argsort(importances)[::-1]

# Select the features that sum up to 90%
X_columns = X.columns
suma = 0
for i in indices:
    print(X_columns[i] , '----->' + str(importances[i]))
    suma += importances[i]
    if suma >= 0.9:
        break


Value ----->0.17079641924469724
Density ----->0.1706789779551798
Age ----->0.1335943715457432
Bonus ----->0.11143591314378308
Group1 ----->0.10285737604649824
Poldur ----->0.09816428053939621
Exppdays ----->0.05374331357322445
Adind ----->0.020003519365901137
Type_A ----->0.016621656771068378
Type_D ----->0.015406859470082633
Type_B ----->0.015346275510049783


# ML

## Frequency prediction

In [11]:
# Resampling
X = df.drop(["total_cost", "frequence_claims"], axis=1)
y = df["frequence_claims"]

cat_variables = categorical_selector(X)
num_variables = numerical_selector(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0) 

# resampling the data
X_train, y_train = random_sampling(X_train, y_train, values=[0, 1], new_sizes=[10000, 10000])

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

In [12]:
# Rescaling
scaler = StandardScaler()
numeric_tranformation = Pipeline(
    steps=[('scaler', scaler)]
)

onehot = OneHotEncoder()
categorical_tranformation = Pipeline(
    steps=[('onehot', onehot)]
)

preprocessor = ColumnTransformer(
    transformers=[("numerical", numeric_tranformation, num_variables),
                    ("categorical", categorical_tranformation, cat_variables)]
)

# Model training
svc = SVC(kernel='poly', degree=3, class_weight='balanced', probability=True, random_state=0)

freq_pipeline = Pipeline(steps=
    [('preprocessor', preprocessor), ("model", svc)]
    )

freq_pipeline.fit(X_train, y_train)
prediction = freq_pipeline.predict(X_val)

score = classification_report(y_val, prediction)
print(score)

              precision    recall  f1-score   support

           0       0.67      0.67      0.67      2014
           1       0.66      0.66      0.66      1986

    accuracy                           0.66      4000
   macro avg       0.66      0.66      0.66      4000
weighted avg       0.66      0.66      0.66      4000



In [13]:
# Testing with data without resampling (original distribution)
prediction_test = freq_pipeline.predict(X_test)
score = classification_report(y_test, prediction_test)
print(score)

              precision    recall  f1-score   support

           0       0.93      0.68      0.79      8459
           1       0.21      0.63      0.32      1150

    accuracy                           0.68      9609
   macro avg       0.57      0.66      0.55      9609
weighted avg       0.85      0.68      0.73      9609



## Cost calculation

In [14]:
def removing_zero_cost(x, y):
    """
    Function to remove the values zero from the target value 
    """
    x_new = pd.DataFrame()
    x['target'] = y
    x_new = x[x['target'] != 0]
    return x_new.drop(['target'], axis=1), x_new['target']

def get_preprocessor():  
    """ 
    Thanks to this function the sklearn pipeline works as expected for
    the ColumnTransformer and the skorch model
    """
    scaler = StandardScaler()
    numeric_tranformation = Pipeline(
        steps=[('scaler', scaler)]
    )

    onehot = OneHotEncoder()
    categorical_tranformation = Pipeline(
        steps=[('onehot', onehot)]
    )

    preprocessor = ColumnTransformer(
        transformers=[("numerical", numeric_tranformation, num_variables),
                      ("categorical", categorical_tranformation, cat_variables)]
    )
    
    return preprocessor

# Splitting data
X2 = df.drop(["total_cost"], axis=1)
y2 = df["total_cost"]

cat_variables = categorical_selector(X2)
num_variables = numerical_selector(X2)


print(X2.shape, y2.shape, 'with zeros')
X2, y2 = removing_zero_cost(X2, y2)
print(X2.shape, y2.shape, 'NO zeros')
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=0) 

(48045, 11) (48045,) with zeros
(5751, 11) (5751,) NO zeros


In [15]:
model = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

nn_pipeline = Pipeline(steps=
    [('preprocessor', get_preprocessor()), ("model", model)]
    )

nn_pipeline.fit(X_train2, y_train2)



In [16]:
# Prediction
predict = nn_pipeline.predict(X_test2)
mse = mean_squared_error(y_test2, predict)
mse

1.9791726070152056

# Final model

In [17]:
X = df.drop(["total_cost", "frequence_claims"], axis=1)
y = df["frequence_claims"]

cat_variables = categorical_selector(X)
num_variables = numerical_selector(X)

prime_avg = sum(np.expm1(df['total_cost'])) / len(df['total_cost'])
print("The average prime is: " + str(prime_avg)) 

class ModelEnsemble(BaseEstimator, TransformerMixin):
    """ 
    With this class we can estimate the prime value for a given client.
    We first calculate the frequency of accidents, then we proceed to calculate the prime.
    This class is not trainable, it receives models that have been already trained
    """
    def __init__(self, model1, model2, prime_avg=86.76, n0=42294, n1=5751):
        self.model1 = model1
        self.model2 = model2
        self.prime_avg = prime_avg
        self.n0 = n0
        self.n1 = n1
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.to_frame(0).T
        model1_output = self.model1.predict(X)
        print("proba:", self.model1.predict_proba(X))
        if model1_output == 0:
            prime = self.prime_avg
        else:
            X["frequence_claims"] = model1_output
            prime = self.prime_avg + self.n1 * np.expm1(self.model2.predict(X)[0]) / self.n0
        return prime
    

ensemble_model = ModelEnsemble(freq_pipeline, nn_pipeline)

# Saving the model
joblib.dump(ensemble_model, 'ensemble_model.joblib')

The average prime is: 86.7633105417968


['ensemble_model.joblib']

In [18]:
# Testing the model
test_data = df.drop(["total_cost", "frequence_claims"], axis=1)
n = random.randint(0, test_data.shape[0])
test_df = test_data.iloc[n]

prediction = ensemble_model.transform(test_df)
print("The prime for this client is: %5.2f" % prediction)

proba: [[0.36097132 0.63902868]]
The prime for this client is: 123.01


In [19]:
# Testing the saved model
model_load = joblib.load('ensemble_model.joblib')
model_load.transform(test_df)

proba: [[0.36097132 0.63902868]]


123.00729794274864