In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw CSV into `df`; the file name has no directory part, so it is
# looked up relative to the current working directory.
df = pd.read_csv('marketing_customer_analysis.csv')

# Show the dataframe shape

In [7]:
# (rows, columns) of the loaded frame, shown as this cell's output.
df.shape

(10910, 26)

# Standardize header names

In [8]:
# Normalise every header: lower-case, spaces -> underscores, colons removed.
cols = [
    header.lower().replace(' ', '_').replace(':', '')
    for header in df.columns
]
df.columns = cols
df.columns

Index(['unnamed_0', 'customer', 'state', 'customer_lifetime_value', 'response',
       'coverage', 'education', 'effective_to_date', 'employmentstatus',
       'gender', 'income', 'location_code', 'marital_status',
       'monthly_premium_auto', 'months_since_last_claim',
       'months_since_policy_inception', 'number_of_open_complaints',
       'number_of_policies', 'policy_type', 'policy', 'renew_offer_type',
       'sales_channel', 'total_claim_amount', 'vehicle_class', 'vehicle_size',
       'vehicle_type'],
      dtype='object')

# Which columns are numerical?

In [9]:
# Names of the columns whose dtype is a NumPy numeric type.
numeric_part = df.select_dtypes(include=[np.number])
num_cols = numeric_part.columns.tolist()
print(f'The columns {num_cols} are numerical.')

The columns ['unnamed_0', 'customer_lifetime_value', 'income', 'monthly_premium_auto', 'months_since_last_claim', 'months_since_policy_inception', 'number_of_open_complaints', 'number_of_policies', 'total_claim_amount'] are numerical.


# Which columns are categorical?

In [10]:
# Every column that is neither numeric nor boolean is treated as categorical.
categorical_part = df.select_dtypes(exclude=['number', 'bool_'])
cat_cols = categorical_part.columns.tolist()
print(f'The columns {cat_cols} are categorical.')

The columns ['customer', 'state', 'response', 'coverage', 'education', 'effective_to_date', 'employmentstatus', 'gender', 'location_code', 'marital_status', 'policy_type', 'policy', 'renew_offer_type', 'sales_channel', 'vehicle_class', 'vehicle_size', 'vehicle_type'] are categorical.


# Check and deal with NaN values

First, we will remove all rows in which all values are NaN.

In [11]:
# Discard rows in which every value is NaN, then renumber the index from 0.
df = df.dropna(how='all').reset_index(drop=True)

Then, we'll deal with NaN values in numerical columns. We'll replace them with the median of their own column because, although the mean and median are not very different, the mean might be influenced by outliers.

In [12]:
num_vars = ['months_since_last_claim', 'number_of_open_complaints']
# One fill value per column: that column's own median.
column_medians = {name: df[name].median() for name in num_vars}
df = df.fillna(value=column_medians)

Finally, we'll address the NaN values in categorical columns. In this case, we'll replace them with the string 'Unknown'. Since these are the only nulls remaining, we can simply apply the function fillna() to the whole DataFrame.

In [13]:
# Replace every remaining NaN, in any column, with the string 'Unknown'.
# NOTE(review): the fill is frame-wide, so a numeric column that still held NaN
# would receive the string as well — assumes only categorical columns have NaN
# left at this point; confirm against the data.
df = df.fillna('Unknown')

In [14]:
# Snapshot table for handling nulls: percentage of NaN values per column.
# The values are scaled to percent *before* rounding; rounding first and then
# multiplying by 100 can leak float noise into the table (0.07 * 100 evaluates
# to 7.000000000000001). The former rename key 'index' matched no column and
# has been dropped.
nulls_df = (
    df.isna()
    .mean()                      # fraction of NaN per column (sum / row count)
    .mul(100)
    .round(2)
    .to_frame(name='prop_nulls')
)
nulls_df.index.name = 'header'
nulls_df

Unnamed: 0_level_0,prop_nulls
header,Unnamed: 1_level_1
unnamed_0,0.0
customer,0.0
state,0.0
customer_lifetime_value,0.0
response,0.0
coverage,0.0
education,0.0
effective_to_date,0.0
employmentstatus,0.0
gender,0.0


# Datetime format - Extract the months from the dataset and store in a separate column

First, we will import the necessary modules and convert the column effective_to_date to datetime type.

In [15]:
from datetime import date

In [16]:
# Parse the column to datetime; errors='coerce' turns values that cannot be
# parsed into NaT instead of raising.
df['effective_to_date'] = pd.to_datetime(df['effective_to_date'], errors='coerce')

Then, we will create a new column in our dataframe with the value of the month for each row, which we will have extracted from the effective_to_date column.

In [17]:
# Month of each date formatted with '%m' (a zero-padded two-digit string such
# as '01'), stored in a new 'month' column.
df['month'] = df['effective_to_date'].dt.strftime('%m')

## Filter the data to show only the information for the first quarter

Once we have extracted the month for each row and stored it in a new column, we will filter the dataframe and create a new one with only the results from the first quarter (months 01, 02 and 03).

In [18]:
months = ['01', '02', '03']
# Boolean mask: True for rows whose month string is one of the Q1 months.
in_first_quarter = df['month'].isin(months)
first_quarter = df[in_first_quarter]
first_quarter.head()

Unnamed: 0,unnamed_0,customer,state,customer_lifetime_value,response,coverage,education,effective_to_date,employmentstatus,gender,...,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size,vehicle_type,month
0,0,DK49336,Arizona,4809.21696,No,Basic,College,2011-02-18,Employed,M,...,9,Corporate Auto,Corporate L3,Offer3,Agent,292.8,Four-Door Car,Medsize,Unknown,2
1,1,KX64629,California,2228.525238,No,Basic,College,2011-01-18,Unemployed,F,...,1,Personal Auto,Personal L3,Offer4,Call Center,744.924331,Four-Door Car,Medsize,Unknown,1
2,2,LZ68649,Washington,14947.9173,No,Basic,Bachelor,2011-02-10,Employed,M,...,2,Personal Auto,Personal L3,Offer3,Call Center,480.0,SUV,Medsize,A,2
3,3,XL78013,Oregon,22332.43946,Yes,Extended,College,2011-01-11,Employed,M,...,2,Corporate Auto,Corporate L3,Offer2,Branch,484.013411,Four-Door Car,Medsize,A,1
4,4,QA50777,Oregon,9025.067525,No,Premium,Bachelor,2011-01-17,Medical Leave,F,...,7,Personal Auto,Personal L2,Offer1,Branch,707.925645,Four-Door Car,Medsize,Unknown,1


# BONUS: Put all the previously mentioned data transformations into a function

In [6]:
def data_transformation(data):
    """Clean a marketing-customer DataFrame and return its first-quarter rows.

    Steps, in order:
      1. Print the shape of the input.
      2. Standardize header names (lower-case, spaces -> underscores,
         colons removed).
      3. Print the numerical and the categorical column names.
      4. Drop rows that are entirely NaN, fill NaN in
         'months_since_last_claim' and 'number_of_open_complaints' with the
         column median, and fill every remaining NaN with 'Unknown'.
      5. Convert 'effective_to_date' to datetime (unparseable values become
         NaT) and add a 'month' column holding the zero-padded month string.
      6. Keep only rows whose month is '01', '02' or '03' (the year is not
         taken into account).

    The input frame is never modified: all work happens on a copy, so the
    caller's column names and values stay as they were.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that, after header standardization, contains the columns
        'months_since_last_claim', 'number_of_open_complaints' and
        'effective_to_date'.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows that fall in the first quarter, including the new
        'month' column.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    num_vars = ['months_since_last_claim', 'number_of_open_complaints']
    months = ['01', '02', '03']

    # 1. Shape
    print(f"This dataset's shape is: {data.shape}")

    # 2. Standardize header names on a copy, so that renaming the columns
    #    does not leak back into the caller's DataFrame.
    data = data.copy()
    data.columns = [
        str(name).lower().replace(' ', '_').replace(':', '')
        for name in data.columns
    ]

    # 3. Numerical columns
    num_cols = list(data.select_dtypes([np.number]).columns)
    print(f'The columns {num_cols} are numerical.')

    # 4. Categorical columns
    cat_cols = list(data.select_dtypes(exclude=['number','bool_']).columns)
    print(f'The columns {cat_cols} are categorical.')

    # 5. Checking and dealing with NaN values
    data = data.dropna(how='all').reset_index(drop=True)
    for var in num_vars:
        data[var] = data[var].fillna(data[var].median())
    # Whatever is still NaN at this point is labelled 'Unknown'.
    data = data.fillna('Unknown')

    # 6. Datetime: parse the date, derive the month, keep the first quarter
    data['effective_to_date'] = pd.to_datetime(data['effective_to_date'], errors='coerce')
    data['month'] = data['effective_to_date'].dt.strftime('%m')
    first_quarter = data[data['month'].isin(months)]
    return first_quarter

# Run the whole pipeline on `df`; the returned first-quarter frame is this
# cell's displayed output.
data_transformation(df)

This dataset's shape is: (10910, 26)
The columns ['unnamed_0', 'customer_lifetime_value', 'income', 'monthly_premium_auto', 'months_since_last_claim', 'months_since_policy_inception', 'number_of_open_complaints', 'number_of_policies', 'total_claim_amount'] are numerical.
The columns ['customer', 'state', 'response', 'coverage', 'education', 'effective_to_date', 'employmentstatus', 'gender', 'location_code', 'marital_status', 'policy_type', 'policy', 'renew_offer_type', 'sales_channel', 'vehicle_class', 'vehicle_size', 'vehicle_type'] are categorical.


Unnamed: 0,unnamed_0,customer,state,customer_lifetime_value,response,coverage,education,effective_to_date,employmentstatus,gender,...,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size,vehicle_type,month
0,0,DK49336,Arizona,4809.216960,No,Basic,College,2011-02-18,Employed,M,...,9,Corporate Auto,Corporate L3,Offer3,Agent,292.800000,Four-Door Car,Medsize,Unknown,02
1,1,KX64629,California,2228.525238,No,Basic,College,2011-01-18,Unemployed,F,...,1,Personal Auto,Personal L3,Offer4,Call Center,744.924331,Four-Door Car,Medsize,Unknown,01
2,2,LZ68649,Washington,14947.917300,No,Basic,Bachelor,2011-02-10,Employed,M,...,2,Personal Auto,Personal L3,Offer3,Call Center,480.000000,SUV,Medsize,A,02
3,3,XL78013,Oregon,22332.439460,Yes,Extended,College,2011-01-11,Employed,M,...,2,Corporate Auto,Corporate L3,Offer2,Branch,484.013411,Four-Door Car,Medsize,A,01
4,4,QA50777,Oregon,9025.067525,No,Premium,Bachelor,2011-01-17,Medical Leave,F,...,7,Personal Auto,Personal L2,Offer1,Branch,707.925645,Four-Door Car,Medsize,Unknown,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10905,10905,FE99816,Nevada,15563.369440,No,Premium,Bachelor,2011-01-19,Unemployed,F,...,7,Personal Auto,Personal L1,Offer3,Web,1214.400000,Luxury Car,Medsize,A,01
10906,10906,KX53892,Oregon,5259.444853,No,Basic,College,2011-01-06,Employed,F,...,6,Personal Auto,Personal L3,Offer2,Branch,273.018929,Four-Door Car,Medsize,A,01
10907,10907,TL39050,Arizona,23893.304100,No,Extended,Bachelor,2011-02-06,Employed,F,...,2,Corporate Auto,Corporate L3,Offer1,Web,381.306996,Luxury SUV,Medsize,Unknown,02
10908,10908,WA60547,California,11971.977650,No,Premium,College,2011-02-13,Employed,F,...,6,Personal Auto,Personal L1,Offer1,Branch,618.288849,SUV,Medsize,A,02
