# Customer Analysis Case Study

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import boxcox
pd.options.display.max_rows = 100
## Install xlrd package to load Excel files
#!conda install openpyxl
#!conda install xlrd

## Activity 1 - Monday 2023-01-16

### 1. Loading Data

In [2]:
# Load datasets individually
file1 = pd.read_csv('Data/file1.csv')
file2 = pd.read_csv('Data/file2.csv')
file3 = pd.read_csv('Data/file3.csv')

In [3]:
# Check if the datasets have the same column names. Applying sorted() to identify differences easier.
file1_colnames = sorted(list(file1.columns))
file2_colnames = sorted(list(file2.columns))
file3_colnames = sorted(list(file3.columns))
print(file1_colnames, file2_colnames, file3_colnames, sep='\n')

['Customer', 'Customer Lifetime Value', 'Education', 'GENDER', 'Income', 'Monthly Premium Auto', 'Number of Open Complaints', 'Policy Type', 'ST', 'Total Claim Amount', 'Vehicle Class']
['Customer', 'Customer Lifetime Value', 'Education', 'GENDER', 'Income', 'Monthly Premium Auto', 'Number of Open Complaints', 'Policy Type', 'ST', 'Total Claim Amount', 'Vehicle Class']
['Customer', 'Customer Lifetime Value', 'Education', 'Gender', 'Income', 'Monthly Premium Auto', 'Number of Open Complaints', 'Policy Type', 'State', 'Total Claim Amount', 'Vehicle Class']


In [4]:
# There are several colnames in file3 are not align with which of the others

file3.rename(columns={'Gender':'GENDER', 'State':'ST'}, inplace=True )

# re-sorts and re-prints the column names of each dataframe to check that the column name changes have been applied correctly
file1_colnames = sorted(list(file1.columns))
file2_colnames = sorted(list(file2.columns))
file3_colnames = sorted(list(file3.columns))
print(file1_colnames, file2_colnames, file3_colnames, sep='\n')

['Customer', 'Customer Lifetime Value', 'Education', 'GENDER', 'Income', 'Monthly Premium Auto', 'Number of Open Complaints', 'Policy Type', 'ST', 'Total Claim Amount', 'Vehicle Class']
['Customer', 'Customer Lifetime Value', 'Education', 'GENDER', 'Income', 'Monthly Premium Auto', 'Number of Open Complaints', 'Policy Type', 'ST', 'Total Claim Amount', 'Vehicle Class']
['Customer', 'Customer Lifetime Value', 'Education', 'GENDER', 'Income', 'Monthly Premium Auto', 'Number of Open Complaints', 'Policy Type', 'ST', 'Total Claim Amount', 'Vehicle Class']


### 2. Concatenating Data

In [5]:
# Concatenating the loaded dataframe into a single dataframe
ca_df = pd.concat([file1,file2,file3], axis=0)
ca_df

Unnamed: 0,Customer,ST,GENDER,Education,Customer Lifetime Value,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,Vehicle Class,Total Claim Amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [6]:
# Check the information of the concatenated dataframe
ca_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12074 entries, 0 to 7069
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Customer                   9137 non-null   object 
 1   ST                         9137 non-null   object 
 2   GENDER                     9015 non-null   object 
 3   Education                  9137 non-null   object 
 4   Customer Lifetime Value    9130 non-null   object 
 5   Income                     9137 non-null   float64
 6   Monthly Premium Auto       9137 non-null   float64
 7   Number of Open Complaints  9137 non-null   object 
 8   Policy Type                9137 non-null   object 
 9   Vehicle Class              9137 non-null   object 
 10  Total Claim Amount         9137 non-null   float64
dtypes: float64(3), object(8)
memory usage: 1.1+ MB


### 3. Standardizing Column Names

In [7]:
# Define a function to standardize the column names of the dataframe 
# by converting them to lowercase and replacing spaces with underscores

def standardize_column_names(ca_df):
    ca_df.columns = ['_'.join(i.lower().split()) for i in ca_df.columns]
    return ca_df

In [8]:
# Apply the standardize_column_names function to the concatenated dataframe
ca_df = standardize_column_names(ca_df)
ca_df

Unnamed: 0,customer,st,gender,education,customer_lifetime_value,income,monthly_premium_auto,number_of_open_complaints,policy_type,vehicle_class,total_claim_amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


### 4. Renaming Column

In [9]:
# Define a function to rename the column 'st' to 'state'
def rename_columns(ca_df):
    ca_df.rename(columns={'st':'state'}, inplace=True )
    return ca_df

In [10]:
# Apply the rename_columns function to the concatenated dataframe
rename_columns(ca_df)
ca_df

Unnamed: 0,customer,state,gender,education,customer_lifetime_value,income,monthly_premium_auto,number_of_open_complaints,policy_type,vehicle_class,total_claim_amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


### 5. Removing Duplicates

In [11]:
# Remove duplicates from the dataframe in place 
ca_df.drop_duplicates(inplace = True)
ca_df

Unnamed: 0,customer,state,gender,education,customer_lifetime_value,income,monthly_premium_auto,number_of_open_complaints,policy_type,vehicle_class,total_claim_amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


### 6. Deleting Column

In [12]:
# Delete the column `customer` as it is only a unique identifier for each row of data
def drop_columns(ca_df):
    ca_df.drop(columns=['customer'], inplace=True)
    return ca_df

In [13]:
drop_columns(ca_df)

Unnamed: 0,state,gender,education,customer_lifetime_value,income,monthly_premium_auto,number_of_open_complaints,policy_type,vehicle_class,total_claim_amount
0,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
7065,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


### 7. Resetting Index

In [14]:
# Resets the index of the dataframe, 
## dropping the current index and replacing it with a default sequential index, in place
ca_df.reset_index(drop = True, inplace = True)
ca_df

Unnamed: 0,state,gender,education,customer_lifetime_value,income,monthly_premium_auto,number_of_open_complaints,policy_type,vehicle_class,total_claim_amount
0,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
9130,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
9131,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
9132,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
9133,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


### 8. Rearanging Columns

In [15]:
list(ca_df.columns)

['state',
 'gender',
 'education',
 'customer_lifetime_value',
 'income',
 'monthly_premium_auto',
 'number_of_open_complaints',
 'policy_type',
 'vehicle_class',
 'total_claim_amount']

In [16]:
# Put categorical data before numeric_data
ca_df = ca_df[['state',
               'gender',
               'education',
               'policy_type', 
               'vehicle_class',
               'number_of_open_complaints', 
               'customer_lifetime_value',
               'income',
               'monthly_premium_auto',
               'total_claim_amount']]

ca_df.head()

Unnamed: 0,state,gender,education,policy_type,vehicle_class,number_of_open_complaints,customer_lifetime_value,income,monthly_premium_auto,total_claim_amount
0,Washington,,Master,Personal Auto,Four-Door Car,1/0/00,,0.0,1000.0,2.704934
1,Arizona,F,Bachelor,Personal Auto,Four-Door Car,1/0/00,697953.59%,0.0,94.0,1131.464935
2,Nevada,F,Bachelor,Personal Auto,Two-Door Car,1/0/00,1288743.17%,48767.0,108.0,566.472247
3,California,M,Bachelor,Corporate Auto,SUV,1/0/00,764586.18%,0.0,106.0,529.881344
4,Washington,M,High School or Below,Personal Auto,Four-Door Car,1/0/00,536307.65%,36357.0,68.0,17.269323


### 9. Correcting Data Types

#### 9.1. Correcting Numerical Data

In [17]:
# Remove the percentage from the customer lifetime value and truncate it to an integer value

## The following line will create a sizable number of nan 
## if the .str.rstrip() applied before using .astype() to convert the column to string
ca_df['customer_lifetime_value'] = pd.to_numeric(ca_df['customer_lifetime_value'].astype(str).str.rstrip('%'), errors='coerce').round()

ca_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9135 entries, 0 to 9134
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   state                      9134 non-null   object 
 1   gender                     9012 non-null   object 
 2   education                  9134 non-null   object 
 3   policy_type                9134 non-null   object 
 4   vehicle_class              9134 non-null   object 
 5   number_of_open_complaints  9134 non-null   object 
 6   customer_lifetime_value    9127 non-null   float64
 7   income                     9134 non-null   float64
 8   monthly_premium_auto       9134 non-null   float64
 9   total_claim_amount         9134 non-null   float64
dtypes: float64(4), object(6)
memory usage: 713.8+ KB


In [18]:
# FOCUS



# Clean the column `number_of_open_complaints` and extract the middle number which is changing between records

ca_df.loc[ca_df['number_of_open_complaints'].notna() & \
          ca_df['number_of_open_complaints'].astype(str).str.contains('/'), 'number_of_open_complaints'] = \
ca_df.loc[ca_df['number_of_open_complaints'].notna() & \
          ca_df['number_of_open_complaints'].astype(str).str.contains('/'), 'number_of_open_complaints'].apply(lambda x: x[2])

ca_df['number_of_open_complaints'] = pd.to_numeric(ca_df['number_of_open_complaints'], errors='coerce')

ca_df

# Another means of acheiving using str.slice()
#ca_df['number_of_open_complaints'] = ca_df['number_of_open_complaints'].apply(lambda x: (x.dtype(str).str.slice(2,3) if len(x.dtype(str) > 1)))

Unnamed: 0,state,gender,education,policy_type,vehicle_class,number_of_open_complaints,customer_lifetime_value,income,monthly_premium_auto,total_claim_amount
0,Washington,,Master,Personal Auto,Four-Door Car,0.0,,0.0,1000.0,2.704934
1,Arizona,F,Bachelor,Personal Auto,Four-Door Car,0.0,697954.0,0.0,94.0,1131.464935
2,Nevada,F,Bachelor,Personal Auto,Two-Door Car,0.0,1288743.0,48767.0,108.0,566.472247
3,California,M,Bachelor,Corporate Auto,SUV,0.0,764586.0,0.0,106.0,529.881344
4,Washington,M,High School or Below,Personal Auto,Four-Door Car,0.0,536308.0,36357.0,68.0,17.269323
...,...,...,...,...,...,...,...,...,...,...
9130,California,M,Bachelor,Personal Auto,Four-Door Car,0.0,23406.0,71941.0,73.0,198.234764
9131,California,F,College,Corporate Auto,Four-Door Car,0.0,3097.0,21604.0,79.0,379.200000
9132,California,M,Bachelor,Corporate Auto,Four-Door Car,3.0,8164.0,0.0,85.0,790.784983
9133,California,M,College,Personal Auto,Four-Door Car,0.0,7524.0,21941.0,96.0,691.200000


#### 9.2. Correcting Categorical Data¶

In [19]:
# Check `state` column
ca_df.state.unique()

array(['Washington', 'Arizona', 'Nevada', 'California', 'Oregon', 'Cali',
       'AZ', 'WA', nan], dtype=object)

In [20]:
def clean_state(x):
    if x in ['Cali']:
        return 'California'
    elif x in ['AZ']:
        return 'Arizona'
    elif x in ['WA']:
        return 'Washington'
    else:
        return x
    
ca_df['state'] = list(map(clean_state, ca_df['state'])) 
ca_df.state.unique()

array(['Washington', 'Arizona', 'Nevada', 'California', 'Oregon', nan],
      dtype=object)

In [21]:
# Check `gender` column
ca_df.gender.unique()

array([nan, 'F', 'M', 'Femal', 'Male', 'female'], dtype=object)

In [22]:
def clean_gender(x):
    if x in ['M']:
        return 'Male'
    elif x in ['F', 'Femal',"female"]:
        return 'Female'
    else:
        return x

ca_df['gender'] = list(map(clean_gender, ca_df['gender'])) 
ca_df.gender.unique()

array([nan, 'Female', 'Male'], dtype=object)

In [23]:
# Check `education` column
ca_df.education.unique()

array(['Master', 'Bachelor', 'High School or Below', 'College',
       'Bachelors', 'Doctor', nan], dtype=object)

In [24]:
def clean_edu(x):
    if x in ['Bachelors', 'Bachelor']:
        return 'Bachelor'
    else:
        return x

ca_df['education'] = list(map(clean_edu, ca_df['education'])) 
ca_df.education.unique()

array(['Master', 'Bachelor', 'High School or Below', 'College', 'Doctor',
       nan], dtype=object)

In [25]:
# Check `policy_type` column
ca_df.policy_type.unique()

array(['Personal Auto', 'Corporate Auto', 'Special Auto', nan],
      dtype=object)

In [51]:
# Check `vehicle_class` column
ca_df.vehicle_class.unique()

array(['four-door car', 'two-door car', 'suv', 'luxury vehicle',
       'sports car'], dtype=object)

In [50]:
ca_df.vehicle_class.value_counts()

four-door car     4640
two-door car      1895
suv               1773
sports car         483
luxury vehicle     343
Name: vehicle_class, dtype: int64

## Activity 2 - Tuesday 2023-01-17

### 10. Replacing Null Values

#### 10.1. Numerical Columns

In [27]:
# Check the rows with null values in ca_df.customer_lifetime_value (only numerical column with null values)
customer_lifetime_value_nan = ca_df[ca_df.customer_lifetime_value.isna() == True]

customer_lifetime_value_nan

Unnamed: 0,state,gender,education,policy_type,vehicle_class,number_of_open_complaints,customer_lifetime_value,income,monthly_premium_auto,total_claim_amount
0,Washington,,Master,Personal Auto,Four-Door Car,0.0,,0.0,1000.0,2.704934
78,Washington,Female,Master,Personal Auto,Four-Door Car,0.0,,41275.0,96.0,41.122303
988,Washington,Male,High School or Below,Personal Auto,Four-Door Car,0.0,,55561.0,63.0,227.872071
1071,,,,,,,,,,
1394,Washington,Male,High School or Below,Personal Auto,Four-Door Car,1.0,,51878.0,66.0,316.8
1441,Washington,,High School or Below,Personal Auto,Four-Door Car,1.0,,36765.0,66.0,320.849072
1649,Washington,Male,Master,Personal Auto,Four-Door Car,0.0,,0.0,70.0,336.0
1930,Washington,Female,Bachelor,Special Auto,Four-Door Car,0.0,,25859.0,74.0,355.2


In [28]:
# Replace missing values with means of the column (for numerical columns)
mean_customer_lifetime_value = np.mean(ca_df.customer_lifetime_value)
ca_df.customer_lifetime_value = ca_df.customer_lifetime_value.fillna(mean_customer_lifetime_value)

# Check nan values again
ca_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9135 entries, 0 to 9134
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   state                      9134 non-null   object 
 1   gender                     9012 non-null   object 
 2   education                  9134 non-null   object 
 3   policy_type                9134 non-null   object 
 4   vehicle_class              9134 non-null   object 
 5   number_of_open_complaints  9134 non-null   float64
 6   customer_lifetime_value    9135 non-null   float64
 7   income                     9134 non-null   float64
 8   monthly_premium_auto       9134 non-null   float64
 9   total_claim_amount         9134 non-null   float64
dtypes: float64(5), object(5)
memory usage: 713.8+ KB


#### 10.2. Categorical Column

In [29]:
# Check the rows with null values in ca_df.gender
gender_nan = ca_df[ca_df.gender.isna() == True]

gender_nan

Unnamed: 0,state,gender,education,policy_type,vehicle_class,number_of_open_complaints,customer_lifetime_value,income,monthly_premium_auto,total_claim_amount
0,Washington,,Master,Personal Auto,Four-Door Car,0.0,182071.519557,0.0,1000.0,2.704934
11,California,,College,Personal Auto,SUV,0.0,819720.000000,0.0,110.0,528.000000
12,California,,Master,Corporate Auto,Four-Door Car,2.0,879880.000000,77026.0,110.0,472.029737
13,Arizona,,High School or Below,Corporate Auto,SUV,1.0,881902.000000,99845.0,110.0,528.000000
14,California,,College,Corporate Auto,Four-Door Car,2.0,538443.000000,83689.0,70.0,307.139132
...,...,...,...,...,...,...,...,...,...,...
1236,Washington,,College,Special Auto,Four-Door Car,0.0,244367.000000,92834.0,61.0,292.800000
1297,Washington,,Bachelor,Corporate Auto,Two-Door Car,0.0,501889.000000,54500.0,63.0,302.400000
1326,Washington,,Bachelor,Personal Auto,Four-Door Car,0.0,485475.000000,29834.0,6464.0,307.200000
1441,Washington,,High School or Below,Personal Auto,Four-Door Car,1.0,182071.519557,36765.0,66.0,320.849072


In [30]:
# Replace missing values with `Other/Unkown`
ca_df.gender = ca_df.gender.fillna('Other/Unkown')

# Check nan values again
ca_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9135 entries, 0 to 9134
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   state                      9134 non-null   object 
 1   gender                     9135 non-null   object 
 2   education                  9134 non-null   object 
 3   policy_type                9134 non-null   object 
 4   vehicle_class              9134 non-null   object 
 5   number_of_open_complaints  9134 non-null   float64
 6   customer_lifetime_value    9135 non-null   float64
 7   income                     9134 non-null   float64
 8   monthly_premium_auto       9134 non-null   float64
 9   total_claim_amount         9134 non-null   float64
dtypes: float64(5), object(5)
memory usage: 713.8+ KB


In [31]:
# Remove the only row with null values 
ca_df = ca_df.dropna()
ca_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9134 entries, 0 to 9134
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   state                      9134 non-null   object 
 1   gender                     9134 non-null   object 
 2   education                  9134 non-null   object 
 3   policy_type                9134 non-null   object 
 4   vehicle_class              9134 non-null   object 
 5   number_of_open_complaints  9134 non-null   float64
 6   customer_lifetime_value    9134 non-null   float64
 7   income                     9134 non-null   float64
 8   monthly_premium_auto       9134 non-null   float64
 9   total_claim_amount         9134 non-null   float64
dtypes: float64(5), object(5)
memory usage: 785.0+ KB


### 11. Bucketing Data

In [32]:
# Replace states with zones

def replace_state_with_region(df):
    # Create a list of state to region mappings
    west = ['California']
    north_west = ['Oregon']
    east = ['Washington']
    central = ['Arizona','Nevada']

    # Use the map function to replace the values in the "State" column
    df = df.rename(columns={'state': 'region'})
    df['region'] = df['region'].map(lambda x: ('West' if x in west else
                                               ('North West' if x in north_west else
                                                ('East' if x in east else
                                                 ('Central' if x in central else x)))))
    return df

In [33]:
ca_df = replace_state_with_region(ca_df)
ca_df

Unnamed: 0,region,gender,education,policy_type,vehicle_class,number_of_open_complaints,customer_lifetime_value,income,monthly_premium_auto,total_claim_amount
0,East,Other/Unkown,Master,Personal Auto,Four-Door Car,0.0,1.820715e+05,0.0,1000.0,2.704934
1,Central,Female,Bachelor,Personal Auto,Four-Door Car,0.0,6.979540e+05,0.0,94.0,1131.464935
2,Central,Female,Bachelor,Personal Auto,Two-Door Car,0.0,1.288743e+06,48767.0,108.0,566.472247
3,West,Male,Bachelor,Corporate Auto,SUV,0.0,7.645860e+05,0.0,106.0,529.881344
4,East,Male,High School or Below,Personal Auto,Four-Door Car,0.0,5.363080e+05,36357.0,68.0,17.269323
...,...,...,...,...,...,...,...,...,...,...
9130,West,Male,Bachelor,Personal Auto,Four-Door Car,0.0,2.340600e+04,71941.0,73.0,198.234764
9131,West,Female,College,Corporate Auto,Four-Door Car,0.0,3.097000e+03,21604.0,79.0,379.200000
9132,West,Male,Bachelor,Corporate Auto,Four-Door Car,3.0,8.164000e+03,0.0,85.0,790.784983
9133,West,Male,College,Personal Auto,Four-Door Car,0.0,7.524000e+03,21941.0,96.0,691.200000


### 12. Merging Categories

In [34]:
#In the column Vehicle Class, merge the two categories Luxury SUV and Luxury Car into one category named Luxury Vehicle
def merge_luxury(df):
    luxury = ['Luxury SUV', 'Luxury Car']
    df['vehicle_class'] = df['vehicle_class'].map(lambda x: ('Luxury Vehicle' if x in luxury else x))
    return df

In [35]:
ca_df = merge_luxury(ca_df)
ca_df.vehicle_class.unique()

array(['Four-Door Car', 'Two-Door Car', 'SUV', 'Luxury Vehicle',
       'Sports Car'], dtype=object)

### 13. Removing Outliers

In [36]:
# Check the descriptive statistics as a reference
before_del_outliers = ca_df.describe().T

In [37]:
# Define a function for identifying numerical columns
def get_num_cols(df):
    num_cols = df.select_dtypes(include=np.number).columns
    return num_cols

In [38]:
# Identify numerical columns
num_cols = get_num_cols(ca_df)

# Alternatively, select certain numerical columns that need to exclude ouliers
#num_cols = ['income', 'monthly_premium_auto', 'total_claim_amount']

num_cols

Index(['number_of_open_complaints', 'customer_lifetime_value', 'income',
       'monthly_premium_auto', 'total_claim_amount'],
      dtype='object')

In [39]:
# FOCUS


# Define a function for removing outliers using 1.5*IQR technique for all numerical columns
def ca_df_remove_outliers(ca_df, num_cols):
    for col in num_cols:
        # Calculate the quartiles ("0.25" and "0.75" can be changed if the range need to be enlarged)
        Q1 = ca_df[col].quantile(0.25)
        Q3 = ca_df[col].quantile(0.75)
        
        # Calculate the IQR
        iqr = Q3 - Q1
        
        # Determine the boundaries for outliers
        lower_bound = Q1 - (1.5 * iqr)
        upper_bound = Q3 + (1.5 * iqr)
        
        # Identify outliers
        #outliers = ca_df[(ca_df.num_cols < lower_bound) | (ca_df.num_cols > upper_bound)]
        #print(outliers)
        
        # Print boundaries for reference
        print(f'From {lower_bound} to {upper_bound} for column "{col}".')
        
        # Remove any data points that fall outside of the boundaries
        ca_df[col] = ca_df.loc[ca_df[col].between(lower_bound, upper_bound), col]

    return ca_df

In [40]:
ca_df_remove_outliers(ca_df, num_cols)

From 0.0 to 0.0 for column "number_of_open_complaints".
From -27673.875 to 58523.125 for column "customer_lifetime_value".
From -93669.75 to 156116.25 for column "income".
From 6.5 to 170.5 for column "monthly_premium_auto".
From -151.673349375 to 964.685323625 for column "total_claim_amount".


Unnamed: 0,region,gender,education,policy_type,vehicle_class,number_of_open_complaints,customer_lifetime_value,income,monthly_premium_auto,total_claim_amount
0,East,Other/Unkown,Master,Personal Auto,Four-Door Car,0.0,,0.0,,2.704934
1,Central,Female,Bachelor,Personal Auto,Four-Door Car,0.0,,0.0,94.0,
2,Central,Female,Bachelor,Personal Auto,Two-Door Car,0.0,,48767.0,108.0,566.472247
3,West,Male,Bachelor,Corporate Auto,SUV,0.0,,0.0,106.0,529.881344
4,East,Male,High School or Below,Personal Auto,Four-Door Car,0.0,,36357.0,68.0,17.269323
...,...,...,...,...,...,...,...,...,...,...
9130,West,Male,Bachelor,Personal Auto,Four-Door Car,0.0,23406.0,71941.0,73.0,198.234764
9131,West,Female,College,Corporate Auto,Four-Door Car,0.0,3097.0,21604.0,79.0,379.200000
9132,West,Male,Bachelor,Corporate Auto,Four-Door Car,,8164.0,0.0,85.0,790.784983
9133,West,Male,College,Personal Auto,Four-Door Car,0.0,7524.0,21941.0,96.0,691.200000


In [41]:
# Save the result
after_del_outliers = ca_df.describe().T

In [42]:
before_del_outliers

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number_of_open_complaints,9134.0,0.383841,0.90911,0.0,0.0,0.0,0.0,5.0
customer_lifetime_value,9134.0,182071.519557,440877.444704,1898.0,4650.0,7716.5,26199.25,5816655.0
income,9134.0,37824.847055,30359.232933,0.0,0.0,34240.0,62446.5,99981.0
monthly_premium_auto,9134.0,110.393146,581.471461,61.0,68.0,83.0,109.0,35354.0
total_claim_amount,9134.0,430.480412,289.617985,0.099007,266.961153,377.505619,546.050821,2893.24


In [43]:
after_del_outliers

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number_of_open_complaints,7252.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
customer_lifetime_value,7061.0,7954.437332,6621.557819,1898.0,3980.0,5776.0,8946.0,58207.0
income,9134.0,37824.847055,30359.232933,0.0,0.0,34240.0,62446.5,99981.0
monthly_premium_auto,8691.0,87.409619,22.803124,61.0,68.0,80.0,106.0,170.0
total_claim_amount,8688.0,386.983657,210.985594,0.099007,252.155821,357.842312,518.69376,964.51598


In [44]:
ca_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9134 entries, 0 to 9134
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   region                     9134 non-null   object 
 1   gender                     9134 non-null   object 
 2   education                  9134 non-null   object 
 3   policy_type                9134 non-null   object 
 4   vehicle_class              9134 non-null   object 
 5   number_of_open_complaints  7252 non-null   float64
 6   customer_lifetime_value    7061 non-null   float64
 7   income                     9134 non-null   float64
 8   monthly_premium_auto       8691 non-null   float64
 9   total_claim_amount         8688 non-null   float64
dtypes: float64(5), object(5)
memory usage: 785.0+ KB


### 14. Standardizing Data

In [45]:
# Define a function for identifying categorical columns
def get_cat_cols(df):
    cat_cols = df.select_dtypes(include=np.object).columns
    return cat_cols

In [46]:
cat_cols = get_cat_cols(ca_df)
cat_cols

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  cat_cols = df.select_dtypes(include=np.object).columns


Index(['region', 'gender', 'education', 'policy_type', 'vehicle_class'], dtype='object')

In [47]:
# Define a function for lowercase the strings in categorical columns
def lower_cat_cols(ca_df, cat_cols):
    ca_df[cat_cols] = ca_df[cat_cols].apply(lambda x: x.str.lower())
    return ca_df

In [48]:
lower_cat_cols(ca_df, cat_cols)
ca_df

Unnamed: 0,region,gender,education,policy_type,vehicle_class,number_of_open_complaints,customer_lifetime_value,income,monthly_premium_auto,total_claim_amount
0,east,other/unkown,master,personal auto,four-door car,0.0,,0.0,,2.704934
1,central,female,bachelor,personal auto,four-door car,0.0,,0.0,94.0,
2,central,female,bachelor,personal auto,two-door car,0.0,,48767.0,108.0,566.472247
3,west,male,bachelor,corporate auto,suv,0.0,,0.0,106.0,529.881344
4,east,male,high school or below,personal auto,four-door car,0.0,,36357.0,68.0,17.269323
...,...,...,...,...,...,...,...,...,...,...
9130,west,male,bachelor,personal auto,four-door car,0.0,23406.0,71941.0,73.0,198.234764
9131,west,female,college,corporate auto,four-door car,0.0,3097.0,21604.0,79.0,379.200000
9132,west,male,bachelor,corporate auto,four-door car,,8164.0,0.0,85.0,790.784983
9133,west,male,college,personal auto,four-door car,0.0,7524.0,21941.0,96.0,691.200000
