In [30]:
import pandas as pd
import numpy as np


In [31]:
# Load the marketing customer analysis workbook into a DataFrame

df = pd.read_excel(io='files_for_lab/excel_files/marketing_customer_analysis.xlsx')


In [32]:
# Report how many rows and columns the DataFrame has.

print("Rows={} Columns={}".format(*df.shape))


Rows=10910 Columns=26


In [33]:
# Standardize header names: lower case, with spaces replaced by underscores.

cols = [name.lower().replace(' ', '_') for name in df.columns]
df.columns = cols

# Show the resulting header names
df.columns.tolist()

['unnamed:_0',
 'customer',
 'state',
 'customer_lifetime_value',
 'response',
 'coverage',
 'education',
 'effective_to_date',
 'employmentstatus',
 'gender',
 'income',
 'location_code',
 'marital_status',
 'monthly_premium_auto',
 'months_since_last_claim',
 'months_since_policy_inception',
 'number_of_open_complaints',
 'number_of_policies',
 'policy_type',
 'policy',
 'renew_offer_type',
 'sales_channel',
 'total_claim_amount',
 'vehicle_class',
 'vehicle_size',
 'vehicle_type']

In [34]:
# 3. Which columns are numerical?

numeric_part = df.select_dtypes(include=np.number)
list(numeric_part.columns)

['unnamed:_0',
 'customer_lifetime_value',
 'income',
 'monthly_premium_auto',
 'months_since_last_claim',
 'months_since_policy_inception',
 'number_of_open_complaints',
 'number_of_policies',
 'total_claim_amount']

In [35]:
# 4. Which columns are categorical?

object_part = df.select_dtypes(include=['object'])
list(object_part.columns)

['customer',
 'state',
 'response',
 'coverage',
 'education',
 'employmentstatus',
 'gender',
 'location_code',
 'marital_status',
 'policy_type',
 'policy',
 'renew_offer_type',
 'sales_channel',
 'vehicle_class',
 'vehicle_size',
 'vehicle_type']

In [36]:
# 5. Check and deal with NaN values.

# Count the missing values in every column
display(df.isna().sum())

unnamed:_0                          0
customer                            0
state                             631
customer_lifetime_value             0
response                          631
coverage                            0
education                           0
effective_to_date                   0
employmentstatus                    0
gender                              0
income                              0
location_code                       0
marital_status                      0
monthly_premium_auto                0
months_since_last_claim           633
months_since_policy_inception       0
number_of_open_complaints         633
number_of_policies                  0
policy_type                         0
policy                              0
renew_offer_type                    0
sales_channel                       0
total_claim_amount                  0
vehicle_class                     622
vehicle_size                      622
vehicle_type                     5482
dtype: int64

In [37]:
# Cleaning the columns

# Drop "unnamed:_0" and "vehicle_type" first.
# - "vehicle_type" has values in only ~50% of the rows, so imputing it would
#   mostly invent data.
# - "unnamed:_0" is presumably a row counter left over from the export
#   (NOTE(review): confirm). While it is present no two rows compare equal,
#   so it has to go BEFORE de-duplication or drop_duplicates() finds nothing.
# errors='ignore' keeps the cell re-runnable after the columns are already gone.
df = df.drop(columns=['unnamed:_0', 'vehicle_type'], errors='ignore')

# Delete duplicate rows (now compared on the real data columns only)
df = df.drop_duplicates()

# Fill all missing values of all columns (with numeric data) with their mean value
for col in df.select_dtypes(include=np.number).columns:
    df[col] = df[col].fillna(df[col].mean())

# Fill all missing values of all columns (with categorical data) with their mode value.
# mode() returns a Series (there can be ties); [0] picks the first mode as a scalar.
for col in df.select_dtypes(include=['object']).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])

# Reset row labels
df = df.reset_index(drop=True)

# Review the table for missing values; there should not be any left
display(df.isnull().sum())

customer                         0
state                            0
customer_lifetime_value          0
response                         0
coverage                         0
education                        0
effective_to_date                0
employmentstatus                 0
gender                           0
income                           0
location_code                    0
marital_status                   0
monthly_premium_auto             0
months_since_last_claim          0
months_since_policy_inception    0
number_of_open_complaints        0
number_of_policies               0
policy_type                      0
policy                           0
renew_offer_type                 0
sales_channel                    0
total_claim_amount               0
vehicle_class                    0
vehicle_size                     0
dtype: int64

In [38]:
# 6. Datetime format - extract the month from each date and store it in a separate column.
# Then filter the data to keep only the first quarter, i.e. January, February and March.
# _Hint_: If data from March does not exist, only January and February remain.

df['month'] = pd.to_datetime(df['effective_to_date']).dt.month
df = df[df['month'].isin([1, 2, 3])]
df['month']


0        2
1        1
2        2
3        1
4        1
        ..
10905    1
10906    1
10907    2
10908    2
10909    1
Name: month, Length: 10910, dtype: int64

In [39]:
#7. BONUS: Put all the previously mentioned data transformations into a function.

def clean_data(data, drop_columns=()):
    """Return a cleaned copy of ``data`` restricted to first-quarter rows.

    Steps, in order:

    1. Print the shape of the input.
    2. Standardize header names (lower case, spaces replaced by underscores).
    3. Display the names of the numerical and the categorical (object) columns.
    4. Drop the columns listed in ``drop_columns``, then drop duplicate rows.
    5. Fill missing values: column mean for numeric columns, most frequent
       value for object columns.
    6. Reset the row labels.
    7. Add a ``month`` column derived from ``effective_to_date`` and keep only
       the rows whose month is 1, 2 or 3.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table. After header standardization it must contain an
        ``effective_to_date`` column that ``pd.to_datetime`` can parse.
    drop_columns : str or iterable of str, optional
        Standardized names of columns to remove before de-duplication, e.g.
        a unique row-counter column that would otherwise stop any two rows
        from comparing equal. Names that are not present are ignored.
        Defaults to dropping nothing.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. ``data`` itself is not modified. Row labels are
        assigned before the month filter, so they can have gaps.

    Notes
    -----
    Uses ``display``, which IPython/Jupyter provides as a builtin.
    """
    df = data.copy()

    # Show tables shape
    print(df.shape)

    # Standardize header names.
    df.columns = [name.lower().replace(' ', '_') for name in df.columns]

    # Show numerical columns
    display(df.select_dtypes(include=np.number).columns.tolist())

    # Show categorical columns
    display(df.select_dtypes(include=['object']).columns.tolist())

    # Drop unwanted columns first so they cannot mask duplicate rows
    if isinstance(drop_columns, str):
        drop_columns = [drop_columns]
    drop_columns = list(drop_columns)
    if drop_columns:
        df = df.drop(columns=drop_columns, errors='ignore')

    # Delete duplicate rows
    df = df.drop_duplicates()

    # Fill all missing values of all columns (with numeric data) with their mean value
    for col in df.select_dtypes(include=np.number).columns:
        df[col] = df[col].fillna(df[col].mean())

    # Fill all missing values of all columns (with categorical data) with their mode value.
    # mode() returns a Series; passing it to fillna() directly would align on the
    # index and fill almost nothing, so use its first entry as a scalar instead.
    for col in df.select_dtypes(include=['object']).columns:
        if df[col].isnull().any():
            modes = df[col].mode()
            if not modes.empty:  # an all-NaN column has no mode; leave it as is
                df[col] = df[col].fillna(modes.iloc[0])

    # Reset row labels
    df = df.reset_index(drop=True)

    # Datetime format - extract the month and store it in a separate column,
    # then keep only the first quarter, i.e. January, February and March.
    df['month'] = pd.to_datetime(df['effective_to_date']).dt.month
    df = df[df['month'].isin([1, 2, 3])]

    return df