In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [2]:
class DataLoader:
    """
    This class provides method to load data
    """

    def __init__(self, path):
        self.path = path
        # print('in DataLoader')

    def loader(self):
        # print('Loading data.')
        file = pd.read_csv(self.path)
        # print('Finish loading')
        return file

In [3]:
train = DataLoader(path='train.csv')
train_data = train.loader()
test = DataLoader(path='test.csv')
test_data = test.loader()
data = train_data.append(test_data)
type(data)

pandas.core.frame.DataFrame

In [4]:
def parse_data(df):
    
    df["BMI_Ins_age"] = df.BMI*df.Ins_Age
    
    df["Product_Info2_let"] = df.Product_Info_2.str[0]
    df["Product_Info2_num"] = df.Product_Info_2.str[1]
    
    df["Product_Info2_let"] = pd.factorize(df.Product_Info2_let)[0]+1
    df["Product_Info_2"] = pd.factorize(df.Product_Info_2)[0]+1
    
    df['Medical_KW'] = df[['Medical_Keyword_1',
       'Medical_Keyword_2', 'Medical_Keyword_3', 'Medical_Keyword_4',
       'Medical_Keyword_5', 'Medical_Keyword_6', 'Medical_Keyword_7',
       'Medical_Keyword_8', 'Medical_Keyword_9', 'Medical_Keyword_10',
       'Medical_Keyword_11', 'Medical_Keyword_12', 'Medical_Keyword_13',
       'Medical_Keyword_14', 'Medical_Keyword_15', 'Medical_Keyword_16',
       'Medical_Keyword_17', 'Medical_Keyword_18', 'Medical_Keyword_19',
       'Medical_Keyword_20', 'Medical_Keyword_21', 'Medical_Keyword_22',
       'Medical_Keyword_23', 'Medical_Keyword_24', 'Medical_Keyword_25',
       'Medical_Keyword_26', 'Medical_Keyword_27', 'Medical_Keyword_28',
       'Medical_Keyword_29', 'Medical_Keyword_30', 'Medical_Keyword_31',
       'Medical_Keyword_32', 'Medical_Keyword_33', 'Medical_Keyword_34',
       'Medical_Keyword_35', 'Medical_Keyword_36', 'Medical_Keyword_37',
       'Medical_Keyword_38', 'Medical_Keyword_39', 'Medical_Keyword_40',
       'Medical_Keyword_41', 'Medical_Keyword_42', 'Medical_Keyword_43',
       'Medical_Keyword_44', 'Medical_Keyword_45', 'Medical_Keyword_46',
       'Medical_Keyword_47', 'Medical_Keyword_48']].sum(axis = 1)
    
    df['Na_Num'] = df.isnull().sum(axis = 1)
    
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    col = df.columns.values
    df = pd.DataFrame(imp.fit_transform(df))
    df.columns = col
    
    return df

#Matching the name of the selected features with original features`
def match(select_ls, ls):
    new_ls = []
    length = select_ls.shape[0]
    for i in np.arange(length):
        if str(select_ls[i]) == "True":
            new_ls.append(str(ls[i]))
    return new_ls

In [5]:
# add BMI*Ins_age etc.
data = parse_data(data)

In [6]:
# Preprocess data #

# factorize categorical variables
data['Product_Info_2'] = pd.factorize(data['Product_Info_2'])[0]

# drop id variable
data = data.drop('Id', axis=1)

# drop response variable
data = data.drop('Response', axis=1)

# data.to_csv('complete_data.csv')

# feature scaling and standardisation/ normalisation
def feature_scale(df):
    scale_df = (df - df.mean()) / df.std(ddof=1)
    return scale_df


data = feature_scale(data)


#  dealing missing value

def check_missing(df):
    # Explore missing data
    missing_data = df.isnull().sum()
    # print(missing_data.dtypes)
    # print(type(missing_data))
    total_data = len(df)
    df_missing_data = missing_data.to_frame()
    df_missing_data.columns = ['counts']
    # Identify missing categories
    df_missing_data = df_missing_data[df_missing_data.counts != 0]
    # Calculate missing percentage
    df_missing_data['missing_percent'] = df_missing_data.counts / total_data
    print(df_missing_data)
    print(len(df_missing_data))
    return df_missing_data


# check_missing(data)

# Create list of variable types

cont_variable_list = ['Product_Info_4', 'Ins_Age', 'Ht', 'Wt', 'BMI', 'Employment_Info_1', 'Employment_Info_4',
                      'Employment_Info_6', 'Insurance_History_5', 'Family_Hist_2', 'Family_Hist_3', 'Family_Hist_4',
                      'Family_Hist_5']

dis_variable_list = ['Medical_History_1', 'Medical_History_10', 'Medical_History_15', 'Medical_History_24',
                     'Medical_History_32']

for i in range(48):
    i += 1
    dis_variable_list.append('Medical_Keyword_' + str(i))

cat_variable_list = []
for header in data.columns:
    if header in cont_variable_list and dis_variable_list:
        pass
    else:
        cat_variable_list.append(header)

missing_list = ['Employment_Info_1', 'Employment_Info_4', 'Employment_Info_6', 'Family_Hist_2', 'Family_Hist_3',
                'Family_Hist_4', 'Family_Hist_5', 'Insurance_History_5', 'Medical_History_1', 'Medical_History_10',
                'Medical_History_15', 'Medical_History_24', 'Medical_History_32']


# recommend method : pca, interpolation,svd, boosting

class MissingMethod:
    """
    This class will provide various method to handle missing values
    """

    def __init__(self, data):
        self.df = data

    def drop_response(self):
        self.df = self.df.drop('Response', axis=1, inplace=True)
        return self.df

    def fill_mode(self):
        for var in missing_list:
            if var in dis_variable_list and cat_variable_list:
                self.df[var] = self.df[var].fillna(self.df[var].mode()[0])
        return self.df

    def fill_avg(self):
        for var in missing_list:
            self.df[var] = self.df[var].fillna(self.df[var].mean())
        return self.df

    def drop_col(self):
        self.df = self.df.drop(['Medical_History_10', 'Medical_History_24',
                                'Medical_History_32'])
        return self.df


# preprocess = MissingMethod(data).fill_mode()
# preprocess = MissingMethod(data).fill_avg()
# check_missing(preprocess)
# preprocess = MissingMethod(data)

# use SVD to fill missing data
# pls normalise the data before using this function
# 1. if filling missing data, pls drop response
# 2. if use it to predict response, pls keep response
def fill_svd(df):
    col_mean = np.nanmean(df, axis=0, keepdims=1)
    valid = np.isfinite(df)
    df0 = np.where(valid, df, col_mean)
    halt = True
    maxiter = 100
    ii = 1
    normlist = []
    while halt == True:
        U, s, V = np.linalg.svd(df0, full_matrices=False)
        s1 = [(i * 0 if i <= 30 else i) for i in s]
        df1 = U.dot(np.diag(s1).dot(V))
        df2 = np.where(~valid, df1, df0)
        norm = np.linalg.norm(df2 - df1)
        normlist.append(norm)
        #        print(norm)
        df0 = df2
        if norm < 0.00001 or ii >= maxiter:
            halt = False
            error = np.nansum((df1 - df) ** 2)
        ii += 1
        print(ii)
    return df2, normlist, error

In [7]:
# fill missing data
data_average = MissingMethod(data).fill_avg()
data_average.head()
# data, list, error = fill_svd(data_3)

Unnamed: 0,BMI,Employment_Info_1,Employment_Info_2,Employment_Info_3,Employment_Info_4,Employment_Info_5,Employment_Info_6,Family_Hist_1,Family_Hist_2,Family_Hist_3,...,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Wt,BMI_Ins_age,Product_Info2_let,Product_Info2_num,Medical_KW,Na_Num
0,-1.19792,-0.603498,0.807795,-0.425875,-0.201719,2.433374,1.950315e-13,-1.421739,5.784993e-13,1.08808,...,-0.890333,-0.083038,-2.246073,-0.1492,-1.611867,0.112046,-0.619589,-0.192947,-0.855011,-0.74602
1,-1.611906,-0.941931,-1.799771,2.348078,-0.201719,-0.410947,-1.141934,-1.421739,-2.577391,-4.317251e-12,...,-0.890333,-0.083038,0.445216,-0.1492,-1.799149,-1.596686,0.59799,-1.124988,-0.855011,-1.605874
2,-0.334589,-0.579325,0.09664,-0.425875,-0.201719,-0.410947,-1.052836,0.648942,-1.533079,-4.317251e-12,...,-0.890333,-0.083038,0.445216,-0.1492,-0.043379,-1.627602,1.815568,-1.124988,-0.855011,0.113835
3,-0.95771,-0.434282,0.09664,-0.425875,-0.201719,2.433374,-0.5157191,0.648942,-0.4887674,-4.317251e-12,...,0.565558,-0.083038,0.445216,-0.1492,-0.97979,-1.224553,-0.619589,0.273073,-0.183208,0.113835
4,-0.373236,-0.615585,0.09664,-0.425875,-0.201719,-0.410947,-0.9896456,-1.421739,-0.09715044,-4.317251e-12,...,-0.344374,-0.083038,0.445216,-0.1492,-0.652046,-0.157084,-0.619589,-0.658968,-0.855011,0.973689


In [8]:
#chose features
#      3. use LinearSVC with L2 C=0.01
features =  ['Product_Info_2', 'Product_Info_3', 'Product_Info_4', 'Product_Info_6', 'Ins_Age', 'Ht', 'Wt', 'BMI', 'Employment_Info_1', 'Employment_Info_2', 'Employment_Info_3', 'Employment_Info_4', 'Employment_Info_5', 'Employment_Info_6', 'InsuredInfo_1', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_6', 'Insurance_History_1', 'Insurance_History_4', 'Insurance_History_5', 'Insurance_History_7', 'Insurance_History_8', 'Family_Hist_1', 'Family_Hist_2', 'Family_Hist_3', 'Family_Hist_4', 'Family_Hist_5', 'Medical_History_1', 'Medical_History_2', 'Medical_History_3', 'Medical_History_4', 'Medical_History_8', 'Medical_History_9', 'Medical_History_13', 'Medical_History_15', 'Medical_History_16', 'Medical_History_21', 'Medical_History_23', 'Medical_History_24', 'Medical_History_25', 'Medical_History_26', 'Medical_History_28', 'Medical_History_29', 'Medical_History_30', 'Medical_History_34', 'Medical_History_36', 'Medical_History_39', 'Medical_History_41', 'Medical_Keyword_3', 'Medical_Keyword_15', 'Medical_Keyword_25', 'Medical_Keyword_37', 'BMI_Ins_age', 'Product_Info2_let', 'Product_Info2_num', 'Medical_KW', 'Na_Num']

data_3 = data[features]
# type(data_3)
# data_3.shape

In [9]:
data_3 = feature_scale(data_3)
data_3.head()

Unnamed: 0,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_6,Ins_Age,Ht,Wt,BMI,Employment_Info_1,Employment_Info_2,...,Medical_History_41,Medical_Keyword_3,Medical_Keyword_15,Medical_Keyword_25,Medical_Keyword_37,BMI_Ins_age,Product_Info2_let,Product_Info2_num,Medical_KW,Na_Num
0,-1.142966,-2.821329,-0.890333,-2.246073,1.18016,-1.683671,-1.611867,-1.19792,-0.603498,0.807795,...,1.451172,-0.229862,-0.488197,-0.320819,-0.266733,0.112046,-0.619589,-0.192947,-0.855011,-0.74602
1,-0.861071,0.31518,-0.890333,0.445216,-1.756901,-1.438982,-1.799149,-1.611906,-0.941931,-1.799771,...,-0.689099,-0.229862,-0.488197,-0.320819,-0.266733,-1.596686,0.59799,-1.124988,-0.855011,-1.605874
2,-0.579176,0.31518,-0.890333,0.445216,-1.90752,0.518537,-0.043379,-0.334589,-0.579325,0.09664,...,-0.689099,-0.229862,-0.488197,-0.320819,-0.266733,-1.627602,1.815568,-1.124988,-0.855011,0.113835
3,-0.297282,-2.821329,0.565558,0.445216,-1.229736,-0.460222,-0.97979,-0.95771,-0.434282,0.09664,...,-0.689099,-0.229862,-0.488197,-0.320819,-0.266733,-1.224553,-0.619589,0.273073,-0.183208,0.113835
4,-0.015387,0.31518,-0.344374,0.445216,0.050521,-0.704912,-0.652046,-0.373236,-0.615585,0.09664,...,-0.689099,-0.229862,-0.488197,-0.320819,-0.266733,-0.157084,-0.619589,-0.658968,-0.855011,0.973689


In [10]:
data_3.to_csv('data_features4.csv')