In [274]:

import sqlite3
import pandas as pd


def data_extract(path_to_db):
    # connect to the database
    conn = sqlite3.connect(path_to_db)

    query = """
        SELECT
        e."Customer Identity",
        "First Policy´s Year",
        "Brithday Year",
        "Educational Degree",
        "Gross Monthly Salary",
        "Geographic Living Area",
        "Has Children (Y=1)",
        "Customer Monetary Value",
        "Claims Rate",
        l."Premiums in LOB: Motor",
        l."Premiums in LOB: Household",
        l."Premiums in LOB: Health",
        l."Premiums in LOB:  Life",
        l."Premiums in LOB: Work Compensations"
        FROM
        Engage AS e
        JOIN LOB AS l ON l."Customer Identity" = e."Customer Identity"
        ORDER BY
        e."Customer Identity";
    """

    data_df = pd.read_sql_query(query, conn)
    df = data_df.copy()  # let's keep a copy of the original data

    # remaining column names to manageable variable names
    column_names = ['ID', 'First_Policy', 'Birthday', 'Education',
                    'Salary', 'Area', 'Children', 'CMV', 'Claims',
                    'Motor', 'Household', 'Health', 'Life',
                    'Work_Compensation']
    # renaming the columns
    df.columns = column_names
    # seting 'ID' as index
    df.set_index('ID', inplace=True, drop=True)
    return data_df, df


my_path = r'/home/kalrashid/Dropbox/nova/data_mining/project/data/insurance.db'
_, df = data_extract(my_path)


"""new file"""



'new file'

In [275]:
#get the data 

my_path = r'./data/insurance.db' #path of the data file
_, df = data_extract(my_path)

In [276]:
#check the data

df.head()

Unnamed: 0_level_0,First_Policy,Birthday,Education,Salary,Area,Children,CMV,Claims,Motor,Household,Health,Life,Work_Compensation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,1985.0,1982.0,2 - High School,2177.0,1.0,1.0,380.97,0.39,375.85,79.45,146.36,47.01,16.89
2,1981.0,1995.0,2 - High School,677.0,4.0,1.0,-131.13,1.12,77.46,416.2,116.69,194.48,106.13
3,1991.0,1970.0,1 - Basic,2277.0,3.0,0.0,504.67,0.28,206.15,224.5,124.58,86.35,99.02
4,1990.0,1981.0,3 - BSc/MSc,1099.0,4.0,1.0,-16.99,0.99,182.48,43.35,311.17,35.34,28.34
5,1986.0,1973.0,3 - BSc/MSc,1763.0,4.0,1.0,35.23,0.9,338.62,47.8,182.59,18.78,41.45


In [277]:
df.isna().sum()

First_Policy          30
Birthday              17
Education             17
Salary                36
Area                   1
Children              21
CMV                    0
Claims                 0
Motor                 34
Household              0
Health                43
Life                 104
Work_Compensation     86
dtype: int64

In [246]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sb
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsRegressor

"""
Steps to follow, according to the lectures:
Data preparation
 Exploratory data analysis
 Detecting outliers
 Dealing with missing values
 Data discretization
 Imbalanced learning and data generation

Data preprocessing
 The curse of dimensionality
 Identifying informative attributes/features
 Creating attributes/features
 Dimensionality reduction
  Relevancy
  Redundancy
 Data standardization

"""

# undocumented handy function: df._get_numeric_data()


def cleaning_df(df):
    # removing duplicate rows
    df = df[~df.duplicated(keep="last")]
    # turning impossible values into NaN
    df.loc[df["Birthday"] < 1900, "Birthday"] = np.nan
    df.loc[df["Birthday"] > 2016, "Birthday"] = np.nan
    df.loc[df["First_Policy"] > 2016, "First_Policy"] = np.nan
    df.loc[df["Birthday"] > df["First_Policy"], "First_Policy"] = np.nan

    # turning Education into numeric
    df["Education"] = df["Education"].str.extract(r"(\d)").astype(np.float)
    return df


def add_dummies(df, cols):
    """Adds dummy columns to selected variables using the One Hot Encoding method.
    Drops the first column."""
    df_with_dummies = pd.get_dummies(df, columns=cols, drop_first=True)
    return df_with_dummies


def outlier_conditions(df):
    """
    Sets the condition for the identification of outliers in a dataframe
    """
    # ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR)))


    # Q1 = df['col'].quantile(.25)
    # Q3 = df['col'].quantile(.75)
    # mask = d['col'].between(q1, q2, inclusive=True)
    # iqr = d.loc[mask, 'col']



    return ~(np.abs(df - df.mean()) > (3 * df.std()))


def remove_outliers(df, cols):
    """
    Replaces outliers by NaNs.
    Selected columns must be numerical.
    """
    outlier_df_cond = outlier_conditions(df)
    outliers_count = (
        (df[cols] == df[outlier_df_cond][cols]) == False
        )[cols].sum()
    
    temp_df = df[cols].copy()
    outlier_tempdf_cond = outlier_conditions(temp_df)
    temp_df = temp_df[outlier_tempdf_cond]
    
    df.loc[:, cols] = temp_df.loc[:, cols].copy()
    return df, outliers_count


# def remove_outliers(df, cols):
#     """
#     Replaces outliers by NaNs.
#     Selected columns must be numerical.
#     """
#     ~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR)))


#     for col in cols:
#         Q1 = df[col].quantile(.25)
#         Q3 = df[col].quantile(.75)

#         mask = df[col].between(Q1, Q3, inclusive=True)
#         IQR = df.loc[mask, col]
        
#         df.loc[
#             (df[col] < (Q1 - 1.5 * IQR)) | ( )
#             , col
#         ] = np.nan

#         cond = (df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))



#     return df, outliers_count


def handle_nans(df, cols):
    """
    Replaces NaNs by column mean.
    Selected columns must be continuous.
    """
    df.fillna(df.mean()[cols], inplace=True)
    return df


def handle_premium_nans(df, cols):
    """
    Replaces NaNs with 0.
    Selected columns must be continuous.
    """
    for col in cols:
        df[col].fillna(0, inplace=True)
    return df


def handle_cat_nans(df, cat_cols, independent_cols):
    """
    Uses a Random Forest classifier to predict and impute the nan values 
    for each categorical column given in `cols`.
    """
    
    Xcols, imputated_cols = [], []

    for cat_col in cols:
        if df[cat_col].isna().any().sum() > 0:
            Xcols.append(cat_col)
    
    if len(Xcols) > 0:
        for nan_col in Xcols:
            X_train = df.loc[:, df.columns.difference(list(set(Xcols) - set(imputated_cols)))].values
            y_train = df.loc[:, nan_col].values

            # TODO: tune Random Forest
            clf = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=2019)
            clf.fit(X_train, y_train)

            X_test = df.loc[df[cat_col].isna(), Xcols].copy()
            no_of_nans = len(X_test)

            y_pred = clf.predict(X_test)
            
            for pred, index in zip(y_pred, X_test.index.tolist()):
                df.loc[index, cat_col] = pred

            imputated_cols.append(cat_col)
            print(f'{no_of_nans} NaN values of "{cat_col}" column were imputed.')
        return df
    else:
        return df


def standardize_data(df, cols):
    """Standardizes data from `cols`.
    cols -> list
    """
    df_Norm = df[cols].copy()
    df_Norm[cols] = StandardScaler().fit_transform(df[cols])
    return df, df_Norm


def feature_selection(df):
    corr = df.corr(method='pearson')

    # Obtain Correlation and plot it
    plt.figure(figsize=(16,6))

    h_map = sb.heatmap(corr, 
            xticklabels=corr.columns,
            yticklabels=corr.columns,
            cmap='PRGn', annot=True, linewidths=.5)

    bottom, top = h_map.get_ylim()
    h_map.set_ylim(bottom + 0.5, top - 0.5)

    plt.show()


def feature_eng(df):
    """
    Creates useful features from the original ones in the dataframe.
    """
    if "Birthday" in df.columns:
        df["Age"] = 2016 - df["Birthday"]
        del df["Birthday"]
    
    if "First_Policy" in df.columns:
        df["Customer_Years"] = 2016 - df["First_Policy"]
        del df["First_Policy"]
    
    return df


def dim_reduction(df):
    """
    Applies Principal Component Analysis (PCA) to the dataframe.
    """
    x = df.values
    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(x)
    principalDf = pd.DataFrame(data = principalComponents, columns = ['principal_comp_1', 'principal_comp_2'])
    print(pca.explained_variance_ratio_)
    return principalDf




In [287]:
one_hot_encoded_education = pd.get_dummies(df['Education'], prefix = 'Education')    
one_hot_encoded_area = pd.get_dummies(df['Area'], prefix = 'Area')
one_hot_encoded_children = pd.get_dummies(df['Children'], prefix = 'Children')

In [297]:
len(one_hot_encoded_area)


10296

In [298]:
df05 = df.drop(['Area', 'Children', 'Education'], axis=1)

df1 = pd.concat([df05, one_hot_encoded_area, one_hot_encoded_children, one_hot_encoded_education], axis=1)

len(df)
#len(df1)
df1.reset_index(drop=True)
df1.head()

df1.index = np.arange(0, len(df1))


In [309]:
df1.head()
df1.index = np.arange(0, len(df1))
len(df)

10296

In [300]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [301]:
df_no = scaler.fit_transform(df1)
df_no = pd.DataFrame(df_no, columns = df1.columns)


In [306]:
len(df1)

10296

In [305]:
len(df_no)

10296

In [303]:
from sklearn.cluster import KMeans



Sum_of_squared_distances = []
K = range(1,20)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(df_no)
    Sum_of_squared_distances.append(km.inertia_)

# Plot the elbow
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [248]:
from sklearn.ensemble import RandomForestRegressor

def handle_cat_nans(df, cols):
    """
    Uses a Random Forest classifier to predict and impute the nan values 
    for each categorical column given in `cols`.
    """
    """
    Xcols, imputated_cols = [], []

    for cat_col in cols:
        if df[cat_col].isna().any().sum() > 0:
            Xcols.append(cat_col)
    
    if len(Xcols) > 0:
        for nan_col in Xcols:
            X_train = df.loc[:, df.columns.difference(list(set(Xcols) - set(imputated_cols)))].values
            y_train = df.loc[:, nan_col].values

            # TODO: tune Random Forest
            clf = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=2019)
            clf.fit(X_train, y_train)

            X_test = df.loc[df[cat_col].isna(), Xcols].copy()
            no_of_nans = len(X_test)

            y_pred = clf.predict(X_test)
            
            for pred, index in zip(y_pred, X_test.index.tolist()):
                df.loc[index, cat_col] = pred

            imputated_cols.append(cat_col)
            print(f'{no_of_nans} NaN values of "{cat_col}" column were imputed.')
        return df
    else:
        return df
        
        titanicWithAge = titanic[pd.isnull(titanic['age']) == False]
        titanicWithoutAge = titanic[pd.isnull(titanic['age'])]
        """
    
    
    variables = cols
    
    #for col in cols:
    one_hot_encoded_area = pd.get_dummies(df['Area'])
    one_hot_encoded_education = pd.get_dummies(df['Education'])
    one_hot_encoded_children = pd.get_dummies(df['Children'])
    
    customer_with_area = df[pd.isnull(df['Area']) == False]
    customer_without_area = df[pd.isnull(df['Area'])]
    
    customer_with_edu = df[pd.isnull(df['Education']) == False]
    customer_without_edu = df[pd.isnull(df['Education'])]

    customer_with_child = df[pd.isnull(df['Children']) == False]
    customer_without_child = df[pd.isnull(df['Children'])]
    
    
    customer_with_area = df[variables]
    customer_with_area = pd.concat([customer_with_area, one_hot_encoded_children, one_hot_encoded_education], axis =1)
    
    customer_without_area = customer_without_area[variables]
    customer_without_area = pd.concat([customer_without_area, one_hot_encoded_children, one_hot_encoded_education], axis = 1)
    

    customer_with_edu = df[variables]
    customer_with_edu = pd.concat([customer_with_edu, one_hot_encoded_area, one_hot_encoded_education], axis =1)
    
    customer_without_edu = customer_without_area[variables]
    customer_without_edu = pd.concat([customer_without_edu, one_hot_encoded_area, one_hot_encoded_education], axis = 1)
    
    customer_with_child = df[variables]
    customer_with_child = pd.concat([customer_with_child, one_hot_encoded_children, one_hot_encoded_area], axis =1)
    
    customer_without_child = customer_without_area[variables]
    customer_without_child = pd.concat([customer_without_child, one_hot_encoded_children, one_hot_encoded_area], axis = 1)
    
    independentVariables = ['pclass', 'female', 'male', 'sibsp', 'parch', 'fare', 'C', 'Q', 'S']

    rfModel_area = RandomForestRegressor()
    rfModel_area.fit(titanicWithAge[independentVariables], df['Area'])

    generatedAgeValues = rfModel_age.predict(X = titanicWithoutAge[independentVariables])
    
    
    
    titanicWithAge = titanicWithAge[variables]
    titanicWithAge = pd.concat([titanicWithAge, one_hot_encoded_sex, one_hot_encoded_embarked], axis = 1)

    one_hot_encoded_embarked = pd.get_dummies(titanicWithoutAge['embarked'])
    one_hot_encoded_sex = pd.get_dummies(titanicWithoutAge['sex'])
    titanicWithoutAge = titanicWithoutAge[variables]
    titanicWithoutAge = pd.concat([titanicWithoutAge, one_hot_encoded_sex, one_hot_encoded_embarked], axis = 1)

In [214]:
from sklearn.ensemble import RandomForestRegressor

def handle_cats(df, cat_cols, independent_cols):
    """
    Uses a Random Forest classifier to predict and impute the nan values 
    for each categorical column given in `cols`.
    """
    
    Xcols, ind_cols, imputated_cols = [], [], []

    
    
    
    for cat_col in cat_cols:
        if df[cat_col].isna().any().sum() > 0:
            Xcols.append(cat_col)
    

    
    if len(Xcols) > 0:
        for nan_col in Xcols:
            #create dataframe
            temp_df = df[[*independent_cols, nan_col]]
            temp_df_non_nan = temp_df.dropna()
            
            X_train = temp_df_non_nan.loc[:,independent_cols]
            y_train = temp_df_non_nan.loc[:, nan_col]
            
            #print(y_train)

            # TODO: tune Random Forest
            clf = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=2019)
            clf.fit(X_train, y_train)
            
            #X_test = temp_df.copy()
            X_test = temp_df[independent_cols]

            print(f'Xtrain-length {len(X_train)}, X-test leng: {len(X_test)}')
            
            
            #print(X_test.isna().any())
            #print(df.loc[df[cat_col].isna(), independent_cols].head())
            no_of_nans = len(X_test)



            y_pred = clf.predict(X_test)
            
            for pred, index in zip(y_pred, X_test.index.tolist()):
                df.loc[index, cat_col] = pred

            imputated_cols.append(cat_col)
            print(f'{no_of_nans} NaN values of "{cat_col}" column were imputed.')
        return df
    else:
        return df
        #"""

In [272]:
from sklearn.ensemble import RandomForestRegressor

def handle_cats1(df, cat_cols, independent_cols):
    """
    Uses a Random Forest classifier to predict and impute the nan values 
    for each categorical column given in `cols`.
    """
    print(independent_cols)
    
    Xcols, imputated_cols = [], []

    for cat_col in cat_cols:
        if df[cat_col].isna().any().sum() > 0:
            Xcols.append(cat_col)
    
    if len(Xcols) > 0:
        for nan_col in Xcols:
            #X_train = df.loc[:, df.columns.difference(list(set(independent_cols) - set(Xcols)))].values
            temp_df = df.copy()
            
            temp_df[nan_col].fillna(df[nan_col].mode()[0], inplace=True)

            X_train = temp_df.loc[:, independent_cols].values
            y_train = temp_df.loc[:, nan_col].values
            #print(df.loc[:, independent_cols])
            # TODO: tune Random Forest
            clf = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=2019)
            clf.fit(X_train, y_train)
            #print(df.loc[df[cat_col].isna()])
            X_test = temp_df.loc[:, independent_cols].copy()
            print(X_test.isna().any())
            no_of_nans = len(X_test)

            y_pred = clf.predict(X_test)
            
            for pred, index in zip(y_pred, X_test.index.tolist()):
                df.loc[index, cat_col] = pred

            imputated_cols.append(cat_col)
            print(f'{no_of_nans} NaN values of "{cat_col}" column were imputed.')
        return df
    else:
        return df

In [273]:
new_df = handle_cats1(df, cat, independent)

['Salary', 'CMV', 'Motor', 'Household', 'Health', 'Life', 'Work_Compensation', 'Customer_Years']
Salary               False
CMV                  False
Motor                False
Household            False
Health               False
Life                 False
Work_Compensation    False
Customer_Years       False
dtype: bool


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


10293 NaN values of "Education" column were imputed.
Salary               False
CMV                  False
Motor                False
Household            False
Health               False
Life                 False
Work_Compensation    False
Customer_Years       False
dtype: bool


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


ValueError: Cannot setitem on a Categorical with a new category, set the categories first

In [194]:
df.head()

Unnamed: 0_level_0,Education,Salary,Area,Children,CMV,Claims,Motor,Household,Health,Life,Work_Compensation,Age,Customer_Years
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2.0,2177.0,1.0,1.0,380.97,0.39,375.85,79.45,146.36,47.01,16.89,34.0,31.0
2,2.0,677.0,4.0,1.0,-131.13,1.12,77.46,416.2,116.69,0.0,106.13,21.0,29.0
3,1.0,2277.0,3.0,0.0,504.67,0.28,206.15,224.5,124.58,86.35,99.02,46.0,25.0
4,3.0,1099.0,4.0,1.0,-16.99,0.99,182.48,43.35,311.17,35.34,28.34,35.0,26.0
5,3.0,1763.0,4.0,1.0,35.23,0.9,338.62,47.8,182.59,18.78,41.45,43.0,30.0


In [249]:
cat = ['Area', 'Children', 'Education']

In [250]:
df[cat].isna().any()

Area         True
Children     True
Education    True
dtype: bool

In [251]:
independent = ['Salary', 'CMV', 'Motor', 'Household', 'Health', 'Life', 'Work_Compensation', 'Customer_Years']

In [254]:
df[independent].isna().sum()

Salary               0
CMV                  0
Motor                0
Household            0
Health               0
Life                 0
Work_Compensation    0
Customer_Years       0
dtype: int64

In [253]:

def preprocessing_df(df):
    #separation of variables
    ValueEngage = ['Age', 'Education', 'Salary', 'Area', 'Children', 'CMV', 'Customer_Years']

    ConsAff = ['Motor', 'Household', 'Health', 'Life', 'Work_Compensation']
    Cat_Values = ["Area", "Education", "Children"]
    
    collist = []
    collist.extend(ConsAff)
    collist.extend(Cat_Values)

    df = cleaning_df(df)
    df, outliers_count = remove_outliers(df, df.columns)
    
    df = handle_nans(df, df.columns.difference(collist))
    df = handle_premium_nans(df, ConsAff)
    print(df[Cat_Values])
    print(df.isna().sum())
    #df = handle_cat_nans(df, Cat_Values)

    df.loc[:, ["First_Policy", "Birthday", "Salary"]] = df[["First_Policy", "Birthday", "Salary"]].round().astype(np.int32)

    df.loc[:, Cat_Values] = df[Cat_Values].astype("category")
    
    df = feature_eng(df)
    df, df_Norm = standardize_data(df, [*ConsAff, 'Salary', 'CMV', 'Customer_Years'])

    # df = dim_reduction(df)
    
    return df, df_Norm



df, df_Norm = preprocessing_df(df)

df_Norm['Area'], df_Norm['Education'], df_Norm['Children'] = df['Area'], df['Education'], df['Children']

df_Norm.isna().any()

       Area  Education  Children
ID                              
1       1.0        2.0       1.0
2       4.0        2.0       1.0
3       3.0        1.0       0.0
4       4.0        3.0       1.0
5       4.0        3.0       1.0
...     ...        ...       ...
10292   2.0        4.0       0.0
10293   3.0        1.0       0.0
10294   1.0        3.0       1.0
10295   2.0        1.0       1.0
10296   1.0        4.0       1.0

[10293 rows x 3 columns]
First_Policy          0
Birthday              0
Education            17
Salary                0
Area                  1
Children             21
CMV                   0
Claims                0
Motor                 0
Household             0
Health                0
Life                  0
Work_Compensation     0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Motor                False
Household            False
Health               False
Life                 False
Work_Compensation    False
Salary               False
CMV                  False
Customer_Years       False
Area                  True
Education             True
Children              True
dtype: bool