In [1]:
import numpy as np
import pandas as pd
import sqlite3
from sklearn.preprocessing import StandardScaler
from pip._internal import main as pipmain

from utils.data_extraction import data_extract
# from utils.preprocessing import preprocessing_df

In [27]:
# preprocessing.py

def cleaning_df(df):
    """Clean raw columns in place and return the same dataframe.

    * ``Birthday`` values below 1900 and ``First_Policy`` values above 2020
      are treated as impossible and replaced by NaN.
    * ``Education`` is reduced to the first digit found in each string and
      stored as float (NaN where no digit is found).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Birthday``, ``First_Policy`` and a
        string-valued ``Education``.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in (it is modified in place).
    """
    # turning impossible values into NaN
    df.loc[df["Birthday"] < 1900, "Birthday"] = np.nan
    df.loc[df["First_Policy"] > 2020, "First_Policy"] = np.nan
    # turning Education into numeric.
    # The builtin ``float`` replaces the ``np.float`` alias, which was removed
    # in NumPy 1.24; ``expand=False`` yields a Series for the single group.
    df["Education"] = df["Education"].str.extract(r"(\d)", expand=False).astype(float)
    return df


def add_dummies(df, cols):
    """One-hot encode the columns listed in `cols`.

    Each selected column is replaced by its dummy columns; the dummy for the
    first category of every column is dropped. The input frame is left
    untouched and a new dataframe is returned.
    """
    return pd.get_dummies(data=df, columns=cols, drop_first=True)


def outlier_conditions(df):
    """
    Sets the condition for the identification of outliers in a dataframe.

    A value is an outlier when it lies more than 3 standard deviations away
    from the mean of its column. Returns a boolean dataframe shaped like `df`:
    True where the value is kept, False where it is an outlier. NaNs are never
    flagged (their comparison is False, so they come out as True).
    """
    deviation = (df - df.mean()).abs()
    return ~(deviation > 3 * df.std())


def remove_outliers(df, cols):
    """
    Replaces outliers by NaNs.
    Selected columns must be numerical.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place; only the columns in `cols` are touched.
    cols : list
        Names of the numerical columns to screen for outliers.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The same dataframe object and, per column in `cols`, the number of
        values that were replaced.
    """
    # Restrict the statistics to `cols`, so unrelated (possibly non-numeric)
    # columns can neither break mean()/std() nor be computed needlessly.
    keep = outlier_conditions(df[cols])

    # Count the flagged cells directly. Comparing the masked frame with the
    # original would also count pre-existing NaNs, since NaN == NaN is False.
    outliers_count = (~keep).sum()

    df[cols] = df[cols].where(keep)
    return df, outliers_count


def handle_nans(df, cols):
    """
    Replaces NaNs by column mean.
    Selected columns must be numerical.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place; columns outside `cols` are left as they are.
    cols : list
        Names of the numerical columns whose NaNs are filled.

    Returns
    -------
    pandas.DataFrame
        The same dataframe object that was passed in.
    """
    # Average only the requested columns: taking the mean of the whole frame
    # raises on non-numeric columns in recent pandas versions.
    df[cols] = df[cols].fillna(df[cols].mean())
    return df


def standardize_data(df, cols):
    """Standardize the columns listed in `cols` with a freshly fitted
    ``StandardScaler``.

    cols -> list

    The scaled values overwrite the original columns of `df`, and the same
    dataframe object is returned.
    """
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df[cols])
    df[cols] = scaled_values
    return df


def preprocessing_df(df):
    """Run the whole preprocessing pipeline on `df`.

    Steps, in order: cleaning, outlier removal, mean imputation, casting
    ``Children`` to int8, standardization and one-hot encoding of ``Area``
    and ``Education``.

    Returns a tuple ``(df, outliers_count, dups_df)``: the processed
    dataframe, the per-column outlier counts reported by `remove_outliers`,
    and a copy of the rows that duplicate an earlier row of the result.
    """
    # columns that get outlier removal and standardization
    target_cols = ['Motor', 'Household', 'Health', 'Life', 'Work_Compensation']
    # columns whose NaNs get replaced by the column mean
    imputed_cols = ["Salary", "First_Policy", "Birthday", "Children"] + target_cols

    processed = cleaning_df(df)
    processed, outliers_count = remove_outliers(processed, target_cols)
    processed = handle_nans(processed, imputed_cols)
    processed["Children"] = processed["Children"].astype(np.int8)
    processed = standardize_data(processed, target_cols)
    processed = add_dummies(processed, ['Area', 'Education'])

    # duplicated rows (showing only the duplicates)
    is_repeat = processed.duplicated(keep="first")
    dups_df = processed[is_repeat].copy()
    return processed, outliers_count, dups_df

In [28]:
# project.py

# Forward slashes are accepted on every OS; the previous backslash literal
# only resolved to data/insurance.db on Windows.
my_path = './data/insurance.db'
# Set to True to also write a pandas-profiling HTML report of the raw data.
profile = False

_, df = data_extract(my_path)

if profile:
    try:
        import pandas_profiling
    except ImportError as e:
        print(e.args)
        # pip offers no supported in-process API, so it is run as a
        # subprocess of the interpreter that backs this kernel.
        import subprocess
        import sys
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pandas_profiling'])
        import pandas_profiling
    prof = df.profile_report(style={'full_width': True}, title='Pandas Profiling Report')
    prof.to_file(output_file="./out/df_profiling.html")

# Replaces the raw frame with the preprocessed one and keeps the diagnostics.
df, outliers_count, dups_df = preprocessing_df(df)

In [31]:
# Number of rows of the preprocessed frame that repeat an earlier row
# (dups_df holds every duplicate except the first occurrence).
len(dups_df)

3

In [4]:
# Per-column outlier counts returned by preprocessing_df.
outliers_count

Motor                 40
Household             36
Health                46
Life                 314
Work_Compensation    245
dtype: int64

In [5]:
# Preview the first rows of the preprocessed dataframe.
df.head()

Unnamed: 0_level_0,First_Policy,Birthday,Salary,Children,CMV,Claims,Motor,Household,Health,Life,Work_Compensation,Area_2.0,Area_3.0,Area_4.0,Education_2.0,Education_3.0,Education_4.0
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,1985.0,1982.0,2177.0,1,380.97,0.39,0.571983,-0.55047,-0.289996,0.2384159,-0.537669,0,0,0,1,0,0
2,1981.0,1995.0,677.0,1,-131.13,1.12,-1.5926,0.955143,-0.687859,6.581986e-14,1.75254,0,0,1,1,0,0
3,1991.0,1970.0,2277.0,0,504.67,0.28,-0.659056,0.09805,-0.582057,1.267848,1.570073,0,1,0,0,0,0
4,1990.0,1981.0,1099.0,1,-16.99,0.99,-0.830763,-0.711873,1.920039,-0.06695965,-0.243822,0,0,1,0,1,0
5,1986.0,1973.0,1763.0,1,35.23,0.9,0.301909,-0.691977,0.195833,-0.5002947,0.092626,0,0,1,0,1,0


In [6]:
# Column names after preprocessing, including the generated dummy columns.
df.columns

Index(['First_Policy', 'Birthday', 'Salary', 'Children', 'CMV', 'Claims',
       'Motor', 'Household', 'Health', 'Life', 'Work_Compensation', 'Area_2.0',
       'Area_3.0', 'Area_4.0', 'Education_2.0', 'Education_3.0',
       'Education_4.0'],
      dtype='object')

In [7]:
# True if any cell of the preprocessed dataframe is still NaN.
df.isna().any().any()

False

In [8]:
# Print, for every column, its number of distinct values and dtype; columns
# with fewer than 10 distinct values also list those values.
# The unique counts are computed once up front instead of being recomputed
# for the whole frame on every iteration.
unique_counts = df.nunique()
for i in df.columns:
    if unique_counts[i] < 10:
        print(f"column '{i}':\nno. of uniques: {unique_counts[i]} || {df.dtypes[i]} || {df[i].unique().tolist()}\n")
    else:
        print(f"column '{i}':\nno. of uniques: {unique_counts[i]} || {df.dtypes[i]}\n")

column 'First_Policy':
no. of uniques: 26 || float64

column 'Birthday':
no. of uniques: 68 || float64

column 'Salary':
no. of uniques: 3566 || float64

column 'Children':
no. of uniques: 2 || int8 || [1, 0]

column 'CMV':
no. of uniques: 7012 || float64

column 'Claims':
no. of uniques: 165 || float64

column 'Motor':
no. of uniques: 1945 || float64

column 'Household':
no. of uniques: 1028 || float64

column 'Health':
no. of uniques: 1004 || float64

column 'Life':
no. of uniques: 465 || float64

column 'Work_Compensation':
no. of uniques: 771 || float64

column 'Area_2.0':
no. of uniques: 2 || uint8 || [0, 1]

column 'Area_3.0':
no. of uniques: 2 || uint8 || [0, 1]

column 'Area_4.0':
no. of uniques: 2 || uint8 || [0, 1]

column 'Education_2.0':
no. of uniques: 2 || uint8 || [1, 0]

column 'Education_3.0':
no. of uniques: 2 || uint8 || [0, 1]

column 'Education_4.0':
no. of uniques: 2 || uint8 || [0, 1]



In [21]:
# Names of the columns selected for clustering; starts empty and is filled
# by the cells below.
cols_for_clustering = []

In [22]:
# Dummy columns are the ones whose name starts with "Education" or "Area".
dummycols = [
    name
    for name in df.columns
    if name.startswith("Education") or name.startswith("Area")
]

In [23]:
# Append the dummy columns to the clustering selection (in place).
cols_for_clustering += dummycols

In [24]:
# Show the columns selected so far (the stray trailing dot was a syntax error).
cols_for_clustering

['Area_2.0',
 'Area_3.0',
 'Area_4.0',
 'Education_2.0',
 'Education_3.0',
 'Education_4.0']

In [None]:
# might be handy: https://github.com/joaolcorreia/RFM-analysis