# 01/02/03 - Getting Data & Cleaning/Wrangling/EDA

In [None]:
import warnings

import numpy as np
import pandas as pd

# Notebook-wide configuration: show every column when a DataFrame is
# displayed, and silence all warnings for the rest of the session.
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# Load the raw marketing customer analysis data (path is relative to the
# notebook's working directory).
market_cust_data = pd.read_csv(
    "files_for_lab/csv_files/marketing_customer_analysis.csv"
)
    
def convert_nan_to_mode(df, colname, threshold=0.1):
    """Fill missing values in ``df[colname]`` with its most frequent value.

    The fill happens only when the share of missing values is greater than
    zero and strictly below ``threshold``; otherwise the column is left
    untouched.

    Args:
        df: DataFrame that holds the column. It is modified in place.
        colname: Label of the column to impute.
        threshold: Exclusive upper bound on the fraction of missing values
            for which the mode imputation is applied. Defaults to 0.1.

    Returns:
        The same ``df`` object, so callers can write
        ``df = convert_nan_to_mode(df, col)``.
    """
    column = df[colname]
    # The mean of the boolean mask is the missing fraction. Unlike
    # ``value_counts(normalize=True)[True]`` it does not raise KeyError when
    # the column contains no missing values at all.
    missing_share = column.isna().mean()
    # value_counts() ignores NaN and sorts by frequency, so index[0] is the
    # most frequent non-null value.
    counts = column.value_counts()
    if 0 < missing_share < threshold and not counts.empty:
        df[colname] = column.fillna(counts.index[0])
    return df

# NOTE: Run cell 1 and define the convert_nan_to_mode(df, colname) function before running this transformation function.

def transform_df(df):
    """Clean the raw marketing customer frame and add a month column.

    Steps, in order:
      1. Drop the 'Unnamed: 0' column (if present).
      2. Standardize header names (lower case, spaces -> underscores).
      3. Impute 'state' and 'response' through convert_nan_to_mode.
      4. Drop rows whose 'months_since_last_claim' is missing.
      5. Impute 'vehicle_class' and 'vehicle_size' through
         convert_nan_to_mode (computed on the rows left after step 4).
      6. Replace missing 'vehicle_type' values with the placeholder 'U'.
      7. Parse 'effective_to_date' as datetime (unparseable values become
         NaT) and store its month in 'effective_to_month'.

    The input frame is not modified; a new DataFrame is returned.

    NOTE: Run cell 1 and define convert_nan_to_mode(df, colname) before
    calling this function.

    Args:
        df: Raw DataFrame as read from marketing_customer_analysis.csv.

    Returns:
        The cleaned DataFrame.
    """
    # Drop the Unnamed column. drop() without inplace returns a new frame, so
    # the caller's DataFrame stays untouched, and errors='ignore' lets the
    # function be re-run on data that no longer has the column.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    # 2. Standardize header names.
    df.columns = [col.lower().replace(' ', '_') for col in df.columns]

    # 3./4./5. The exploratory look-ups (numerical columns, categorical
    # columns, NaN counts per column) produced values that were discarded
    # inside this function, so they are not repeated here.

    # Per the original analysis, NaNs account for less than 10% in these
    # columns, so they are replaced with the mode (most frequent value).
    for colname in ('state', 'response'):
        df = convert_nan_to_mode(df, colname)

    # Per the original analysis, moving the NaNs of this column into any
    # existing value would skew its distribution, so the affected rows are
    # removed instead. The explicit copy keeps the later column assignments
    # from writing into a slice of the previous frame.
    df = df[df['months_since_last_claim'].notna()].copy()

    # Same mode imputation as above, evaluated on the remaining rows.
    for colname in ('vehicle_class', 'vehicle_size'):
        df = convert_nan_to_mode(df, colname)

    # Per the original analysis, NaNs account for more than 50% of this
    # column, so the rows cannot simply be removed. A dedicated default value
    # 'U' is used instead of the most frequent value 'A' to avoid skewing the
    # distribution.
    df['vehicle_type'] = df['vehicle_type'].fillna('U')

    # 6. Extract the months from the dataset and store in a separate column.
    df['effective_to_date'] = pd.to_datetime(df['effective_to_date'], errors='coerce')
    df['effective_to_month'] = df['effective_to_date'].dt.month

    # NOTE(review): the original also evaluated a first-quarter filter
    # (effective_to_month in [1, 2, 3]) and discarded the result. The returned
    # frame was never filtered, and it is still returned unfiltered here.
    return df


In [None]:
# Clean the raw frame, then persist the result without the index column.
market_cust_data = transform_df(market_cust_data)
market_cust_data.to_csv(path_or_buf="market_cust_data.csv", index=False)

In [None]:
# 2. Describe DataFrame: summary statistics, one row per described column.
market_cust_data.describe().transpose()

In [None]:
# 3. Show a plot of the total number of responses.

import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

# Bar chart with one bar per distinct 'response' value, height = row count.
market_cust_data['response'].value_counts().plot.bar()

In [None]:
# 4. Show a plot of the response rate by the sales channel.
# Encode the response as 1 for 'Yes' and 0 for anything else; the bar height
# drawn per sales channel is then the share of 'Yes' responses.
market_cust_data['response_numeric'] = (
    market_cust_data['response'].eq('Yes').astype('int64')
)
sns.barplot(data=market_cust_data, x="sales_channel", y="response_numeric")
plt.show()

In [None]:
# 5. Show a plot of the response rate by the total claim amount.
# The bars show the summed total_claim_amount for each response value.
claim_totals_by_response = (
    market_cust_data.groupby(['response'])['total_claim_amount']
    .sum()
    .reset_index()
)
sns.barplot(data=claim_totals_by_response, x='response', y='total_claim_amount')
plt.show()

In [None]:
# 6. Show a plot of the response rate by income.
#sns.boxplot(x = 'response',y='income', data=pd.DataFrame(market_cust_data.groupby(['response'])['income'].sum()).reset_index())
#plt.show()

In [None]:
# 1. Check the data types of the columns. Get the numeric data into dataframe called numerical and categorical columns in a dataframe called categoricals.
import numpy as np
# Split the frame by dtype: numeric columns go to `numerical`,
# object-typed columns go to `categoricals`.
numerical = market_cust_data.select_dtypes(include='number')
categoricals = market_cust_data.select_dtypes(include='object')

In [None]:
# 2.1. Use seaborn library to construct distribution plots for the numerical variables
# One displot figure is created per numeric column; all are shown at the end.
for column_name in numerical:
    sns.displot(data=numerical[column_name])
plt.show()

# 04 - Processing Data