In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import arff
import os

In [14]:
# Locate the ARFF file: it is expected at
# <parent of the current working directory>/data/freMTPL2freq.arff.
dir_path = os.path.dirname(os.getcwd())
data_path = os.path.join(dir_path, 'data/freMTPL2freq.arff')

# Parse the file; the result is indexed below by its 'data' and 'attributes' keys.
with open(data_path) as arff_file:
    dataset = arff.load(arff_file)

# Build the frame, taking each column label from the first item of its attribute entry.
column_names = [attribute[0] for attribute in dataset['attributes']]
df = pd.DataFrame(data=dataset['data'], columns=column_names)


def _bytes_to_str(value):
    """Return ``value`` decoded to ``str`` if it is ``bytes``; otherwise return it unchanged."""
    if isinstance(value, bytes):
        return value.decode()
    return value


# Normalise every object-dtype column so that any bytes values become strings.
object_columns = df.select_dtypes([object]).columns
for column in object_columns:
    df[column] = df[column].apply(_bytes_to_str)

# Preview the first rows.
df.head()

Unnamed: 0,IDpol,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region
0,1.0,1.0,0.1,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,R82
1,3.0,1.0,0.77,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,R82
2,5.0,1.0,0.75,B,6.0,2.0,52.0,50.0,B12,Diesel,54.0,R22
3,10.0,1.0,0.09,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72
4,11.0,1.0,0.84,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72


In [15]:
# Check for missing values: each column's non-null count should equal the
# number of entries. (It does, so there are no missing values.)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 678013 entries, 0 to 678012
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   IDpol       678013 non-null  float64
 1   ClaimNb     678013 non-null  float64
 2   Exposure    678013 non-null  float64
 3   Area        678013 non-null  object 
 4   VehPower    678013 non-null  float64
 5   VehAge      678013 non-null  float64
 6   DrivAge     678013 non-null  float64
 7   BonusMalus  678013 non-null  float64
 8   VehBrand    678013 non-null  object 
 9   VehGas      678013 non-null  object 
 10  Density     678013 non-null  float64
 11  Region      678013 non-null  object 
dtypes: float64(8), object(4)
memory usage: 62.1+ MB


In [16]:
# Count how many rows carry each distinct ClaimNb value (most frequent first).
df['ClaimNb'].value_counts()

ClaimNb
0.0     643953
1.0      32178
2.0       1784
3.0         82
4.0          7
11.0         3
5.0          2
6.0          1
8.0          1
16.0         1
9.0          1
Name: count, dtype: int64

In [19]:
# Mapping from the original dataset headers to snake_case names.
# NOTE: 'ClaimAmount' is not among this frame's columns, so rename() simply
# skips that entry (missing labels are ignored by default). 'IDpol' has no
# entry here and therefore keeps its original name.
column_renames = dict(
    ClaimNb='n_claims',
    Exposure='exposure',
    ClaimAmount='claim_amount',
    DrivAge='driver_age',
    VehAge='vehicle_age',
    BonusMalus='bonus_malus',
    Density='density',
    Area='area',
    VehPower='vehicle_power',
    VehBrand='vehicle_brand',
    VehGas='vehicle_gas',
    Region='region',
)

# rename() returns a new frame; rebind `df` to the renamed version.
df = df.rename(columns=column_renames)

In [20]:
# Display the column labels to confirm the result of the rename.
df.columns

Index(['IDpol', 'n_claims', 'exposure', 'area', 'vehicle_power', 'vehicle_age',
       'driver_age', 'bonus_malus', 'vehicle_brand', 'vehicle_gas', 'density',
       'region'],
      dtype='object')