In [33]:
# packages

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [34]:
# Load the training data
# The CSV location is resolved relative to the directory the kernel was started in.
cwd = os.getcwd()
train_csv_path = cwd + '/../datasets/train.csv'

# Read the file and strip spaces out of the column names in the same step
data = pd.read_csv(train_csv_path).rename(columns=lambda name: name.replace(' ', ''))

In [35]:
# Inspect the freshly loaded frame: column names, dtypes and non-null counts.
# This shows which columns are still strings (object dtype) and need encoding,
# and whether any column has missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB


In [36]:
# One-hot encode the four binary columns. With drop_first=True each of them
# collapses to a single indicator column (Gender_Male, Vehicle_Damage_Yes,
# Driving_License_1, Previously_Insured_1).
data = pd.get_dummies(data, columns=['Gender', 'Vehicle_Damage', 'Driving_License', 'Previously_Insured'], drop_first=True)

# Encode Vehicle_Age as an ordinal integer that follows the real age order.
# LabelEncoder numbers its classes in sorted string order, which does not match
# the natural order of the age brackets, so the mapping is spelled out instead.
# NOTE(review): the labels below are the ones expected in train.csv; the check
# underneath fails loudly if the file uses different strings.
vehicle_age_order = ['< 1 Year', '1-2 Year', '> 2 Years']
vehicle_age_codes = pd.Categorical(data['Vehicle_Age'], categories=vehicle_age_order, ordered=True).codes
if (vehicle_age_codes < 0).any():
    # pd.Categorical marks values outside `categories` (and NaN) with code -1
    unexpected_labels = data.loc[vehicle_age_codes < 0, 'Vehicle_Age'].unique().tolist()
    raise ValueError(f'Unexpected Vehicle_Age labels: {unexpected_labels}')
data['Vehicle_Age'] = vehicle_age_codes.astype('int64')

# drop id and Vintage columns
data = data.drop(['id', 'Vintage'], axis=1)

# transform Annual_Premium to the natural-log scale
# (np.log maps a premium of 0 to -inf, so this assumes strictly positive premiums)
data['Annual_Premium'] = np.log(data['Annual_Premium'])

In [37]:
# Re-check dtypes and non-null counts after encoding the categorical columns,
# dropping id/Vintage and log-transforming Annual_Premium: the indicator
# columns should now appear and no object-dtype column should remain.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 10 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Age                             381109 non-null  int64  
 1   Region_Code                     381109 non-null  float64
 2   Vehicle_Age                     381109 non-null  int64  
 3   Annual_Premium                  381109 non-null  float64
 4   Policy_Sales_Channel            381109 non-null  float64
 5   Response                        381109 non-null  int64  
 6   Gender_Male                     381109 non-null  bool   
 7   Vehicle_Damage_Yes              381109 non-null  bool   
 8   Driving_License_1               381109 non-null  bool   
 9   Previously_Insured_1            381109 non-null  bool   
dtypes: bool(4), float64(3), int64(3)
memory usage: 18.9 MB


In [38]:
def lump_rare_categories(column, min_share, other=0):
    """Replace infrequent values of a Series with a single placeholder.

    Parameters
    ----------
    column : pd.Series
        Categorical codes to clean up.
    min_share : float
        Minimum relative frequency (inclusive) a value needs to be kept.
    other : scalar, default 0
        Placeholder written in place of every value below ``min_share``.

    Returns
    -------
    pd.Series
        Copy of ``column`` with rare values replaced by ``other``.
    """
    # Relative frequency of each row's own value, aligned to the original index
    share_per_row = column.map(column.value_counts(normalize=True))
    return column.where(share_per_row.ge(min_share), other)


# Keep the Policy_Sales_Channel values whose share is at least 0.05 and replace the rest with 0
# NOTE(review): 0 doubles as the "other" bucket; this assumes 0 is not itself a frequent code.
data['Policy_Sales_Channel'] = lump_rare_categories(data['Policy_Sales_Channel'], min_share=0.05)

# Keep the Region_Code values whose share is at least 0.04 (a lower cut-off than above) and replace the rest with 0
data['Region_Code'] = lump_rare_categories(data['Region_Code'], min_share=0.04)

# convert the columns Policy_Sales_Channel and Region_Code to integer
data['Policy_Sales_Channel'] = data['Policy_Sales_Channel'].astype(int)
data['Region_Code'] = data['Region_Code'].astype(int)

In [39]:
# Show the relative frequency of every remaining value in the two lumped columns
for lumped_column in ('Policy_Sales_Channel', 'Region_Code'):
    print(data[lumped_column].value_counts(normalize=True))

Policy_Sales_Channel
152    0.353663
26     0.209127
124    0.194157
0      0.185907
160    0.057146
Name: proportion, dtype: float64
Region_Code
0     0.532144
28    0.279225
8     0.088891
46    0.051820
41    0.047921
Name: proportion, dtype: float64


In [40]:
# One-hot encode Policy_Sales_Channel and Region_Code.
# drop_first=True removes the first level of each column, so one fewer
# indicator column than distinct values is produced per feature.
lumped_columns = ['Policy_Sales_Channel', 'Region_Code']
data = pd.get_dummies(data, columns=lumped_columns, drop_first=True)

In [41]:
# Check dtypes and non-null counts once more after one-hot encoding
# Policy_Sales_Channel and Region_Code, to confirm the new indicator columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 16 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Age                             381109 non-null  int64  
 1   Vehicle_Age                     381109 non-null  int64  
 2   Annual_Premium                  381109 non-null  float64
 3   Response                        381109 non-null  int64  
 4   Gender_Male                     381109 non-null  bool   
 5   Vehicle_Damage_Yes              381109 non-null  bool   
 6   Driving_License_1               381109 non-null  bool   
 7   Previously_Insured_1            381109 non-null  bool   
 8   Policy_Sales_Channel_26         381109 non-null  bool   
 9   Policy_Sales_Channel_124        381109 non-null  bool   
 10  Policy_Sales_Channel_152        381109 non-null  bool   
 11  Policy_Sales_Channel_160        381109 non-null  bool   
 12  Region_Code_8   

In [42]:
# Class balance of the target column, followed by the frame's dimensions
response_share = data['Response'].value_counts(normalize=True)
print(response_share)
print(f'Dataset size: {data.shape}')

Response
0    0.877437
1    0.122563
Name: proportion, dtype: float64
Dataset size: (381109, 16)


In [43]:
# Project goal: transform instances with Response == 1 into instances with Response == 0.
# Rather than using every row, a subset is drawn; it is still large and might be enough
# for the diffusion model planned for that transformation.
# ! This could change later
# * Plan: build a classifier first, then use the generative model for the transformation
# For the classifier, the share of Response == 1 rows is increased by drawing a fixed
# number of rows per class according to the proportions defined below.

# Sampling configuration
prop_0 = 0.8  # share of the sample taken from class 0
prop_1 = 0.2  # share of the sample taken from class 1
n_samples = 100_000  # total number of rows in the sample

# Number of rows to draw from each class
n_samples_0 = int(n_samples * prop_0)
n_samples_1 = int(n_samples * prop_1)

# Split the frame by the value of the binary target
data_0 = data.loc[data['Response'].eq(0)]
data_1 = data.loc[data['Response'].eq(1)]

# Draw from each class separately, with a fixed seed for reproducibility
sampled_data_0 = data_0.sample(n=n_samples_0, random_state=1)
sampled_data_1 = data_1.sample(n=n_samples_1, random_state=1)

# Stack the two samples: class 0 rows first, then class 1 rows
sampled_data = pd.concat([sampled_data_0, sampled_data_1])


In [44]:
# Turn the target into a single indicator column named Response_1
# (drop_first=True discards the indicator for the first level)
target_columns = ['Response']
sampled_data = pd.get_dummies(sampled_data, columns=target_columns, drop_first=True)

# Class balance of the sample, read from the new indicator column
target_share = sampled_data['Response_1'].value_counts(normalize=True)
print(target_share)

# dtypes and non-null counts of the sampled frame
sampled_data.info()

Response_1
False    0.8
True     0.2
Name: proportion, dtype: float64
<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 149144 to 326208
Data columns (total 16 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Age                             100000 non-null  int64  
 1   Vehicle_Age                     100000 non-null  int64  
 2   Annual_Premium                  100000 non-null  float64
 3   Gender_Male                     100000 non-null  bool   
 4   Vehicle_Damage_Yes              100000 non-null  bool   
 5   Driving_License_1               100000 non-null  bool   
 6   Previously_Insured_1            100000 non-null  bool   
 7   Policy_Sales_Channel_26         100000 non-null  bool   
 8   Policy_Sales_Channel_124        100000 non-null  bool   
 9   Policy_Sales_Channel_152        100000 non-null  bool   
 10  Policy_Sales_Channel_160        100000 non-null  bool   
 11  Region_C

In [45]:
# Write the preprocessed sample next to the raw data; the index is not written
output_csv_path = cwd + '/../datasets/sample_data_preprocessed.csv'
sampled_data.to_csv(output_csv_path, index=False)

In [46]:
# # Let's split the sampled data into train and test sets
# from sklearn.model_selection import train_test_split

# X = sampled_data.drop('Response_1', axis=1)
# y = sampled_data['Response_1']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# # Normalize Age, Vehicle_Age, Annual_Premium columns of the train and test sets

# from sklearn.preprocessing import StandardScaler  # not imported at the top of the notebook
# scaler = StandardScaler()

# X_train[['Age', 'Vehicle_Age', 'Annual_Premium']] = scaler.fit_transform(X_train[['Age', 'Vehicle_Age', 'Annual_Premium']])
# X_test[['Age', 'Vehicle_Age', 'Annual_Premium']] = scaler.transform(X_test[['Age', 'Vehicle_Age', 'Annual_Premium']])