In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import Imputer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

In [2]:
def parse_data(df):
    
    df["BMI_Ins_age"] = df.BMI*df.Ins_Age
    
    df["Product_Info2_let"] = df.Product_Info_2.str[0]
    df["Product_Info2_num"] = df.Product_Info_2.str[1]
    
    df["Product_Info2_let"] = pd.factorize(df.Product_Info2_let)[0]+1
    df["Product_Info_2"] = pd.factorize(df.Product_Info_2)[0]+1
    
    df['Medical_KW'] = df[['Medical_Keyword_1',
       'Medical_Keyword_2', 'Medical_Keyword_3', 'Medical_Keyword_4',
       'Medical_Keyword_5', 'Medical_Keyword_6', 'Medical_Keyword_7',
       'Medical_Keyword_8', 'Medical_Keyword_9', 'Medical_Keyword_10',
       'Medical_Keyword_11', 'Medical_Keyword_12', 'Medical_Keyword_13',
       'Medical_Keyword_14', 'Medical_Keyword_15', 'Medical_Keyword_16',
       'Medical_Keyword_17', 'Medical_Keyword_18', 'Medical_Keyword_19',
       'Medical_Keyword_20', 'Medical_Keyword_21', 'Medical_Keyword_22',
       'Medical_Keyword_23', 'Medical_Keyword_24', 'Medical_Keyword_25',
       'Medical_Keyword_26', 'Medical_Keyword_27', 'Medical_Keyword_28',
       'Medical_Keyword_29', 'Medical_Keyword_30', 'Medical_Keyword_31',
       'Medical_Keyword_32', 'Medical_Keyword_33', 'Medical_Keyword_34',
       'Medical_Keyword_35', 'Medical_Keyword_36', 'Medical_Keyword_37',
       'Medical_Keyword_38', 'Medical_Keyword_39', 'Medical_Keyword_40',
       'Medical_Keyword_41', 'Medical_Keyword_42', 'Medical_Keyword_43',
       'Medical_Keyword_44', 'Medical_Keyword_45', 'Medical_Keyword_46',
       'Medical_Keyword_47', 'Medical_Keyword_48']].sum(axis = 1)
    
    df['Na_Num'] = df.isnull().sum(axis = 1)
    
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    col = df.columns.values
    df = pd.DataFrame(imp.fit_transform(df))
    df.columns = col
    
    return df
    

In [3]:
#Matching the name of the selected features with original features`
def match(select_ls, ls):
    new_ls = []
    length = select_ls.shape[0]
    for i in np.arange(length):
        if str(select_ls[i]) == "True":
            new_ls.append(str(ls[i]))
    return new_ls

In [4]:
train = pd.read_csv("train.csv")
df = parse_data(train)
y= df.Response

In [5]:
list = train.columns.values
len(list)

133

In [6]:
#extremely randomized trees model
#Train a extremely randomized trees model with "n_estimators=300" and other paramer by default
columns_to_drop = ["Id", "Response"]
et = ExtraTreesClassifier(n_estimators=300, random_state=0)
et.fit(df.drop(columns_to_drop, axis = 1), y)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=300, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [7]:
et2 = ExtraTreesClassifier(n_estimators=100, random_state=0)
et2.fit(df.drop(columns_to_drop, axis = 1), y)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [None]:
#Display the 20th first features by importances
importances = pd.DataFrame({'features': df.drop(columns_to_drop, axis = 1).columns,
                           'importances':et.feature_importances_})
importances.sort_values(by = 'importances', ascending = False).head(20)

In [None]:
#Display the 20th last feature
importances.sort_values(by = 'importances', ascending = False).tail(20)

In [None]:
#plot the features importances
%matplotlib inline
importances.sort_values(by = 'importances', ascending = True, inplace =True)
val = importances.importances*100 #the bar lengths
pos = np.arange(importances.shape[0])+.5

plt.figure(figsize = (13,28))
plt.barh(pos, val, align = 'center')
plt.yticks(pos, importances.features.values)
plt.xlabel('Importances')
plt.title('Features importances')
plt.grid(True)

In [None]:
#cumsum of importances
importances.sort_values(by = 'importances', ascending = False,
                       inplace = True)
importances['cumul'] = np.cumsum(importances.importances,
                                axis = 0)
importances.sort_values(by = 'importances', ascending = True, 
                       inplace = True)
val = importances.cumul*100 # the bar lengths
pos = np.arange(importances.shape[0])+.5
plt.figure(figsize = (13,28))
plt.barh(pos, val, align = 'center')
plt.yticks(pos, importances.features.values)
plt.xlabel('Importances')
plt.title('Features importances')
plt.grid(True)

In [None]:

for i in np.arange(50,100,5):
    print('Number of variables to have {0} % d\' \"importance\" of variables  : {1} sur {2}'.format(i,importances.features[importances.cumul < i/100].shape[0],
                                                                                                    importances.features.shape[0]))

In [None]:
#Variables ro remove to get X % of importances 

X = 90

importances.features[importances.cumul>X/100].values

In [8]:
model = SelectFromModel(et, prefit=True, threshold = 0.005)
X_new = model.transform(df.drop(columns_to_drop, axis = 1))
X_new.shape  

(59381, 58)

In [9]:
match(model.get_support(), df.drop(columns_to_drop, axis = 1).columns.values)

['Product_Info_2',
 'Product_Info_3',
 'Product_Info_4',
 'Product_Info_6',
 'Ins_Age',
 'Ht',
 'Wt',
 'BMI',
 'Employment_Info_1',
 'Employment_Info_2',
 'Employment_Info_3',
 'Employment_Info_4',
 'Employment_Info_5',
 'Employment_Info_6',
 'InsuredInfo_1',
 'InsuredInfo_3',
 'InsuredInfo_4',
 'InsuredInfo_6',
 'Insurance_History_1',
 'Insurance_History_4',
 'Insurance_History_5',
 'Insurance_History_7',
 'Insurance_History_8',
 'Family_Hist_1',
 'Family_Hist_2',
 'Family_Hist_3',
 'Family_Hist_4',
 'Family_Hist_5',
 'Medical_History_1',
 'Medical_History_2',
 'Medical_History_3',
 'Medical_History_4',
 'Medical_History_8',
 'Medical_History_9',
 'Medical_History_13',
 'Medical_History_15',
 'Medical_History_16',
 'Medical_History_21',
 'Medical_History_23',
 'Medical_History_24',
 'Medical_History_25',
 'Medical_History_26',
 'Medical_History_28',
 'Medical_History_29',
 'Medical_History_30',
 'Medical_History_34',
 'Medical_History_36',
 'Medical_History_39',
 'Medical_History_41',

In [10]:
model2 = SelectFromModel(et2, prefit=True, threshold = 0.005)
X_new2 = model.transform(df.drop(columns_to_drop, axis = 1))
X_new2.shape  

(59381, 58)

In [11]:
match(model2.get_support(), df.drop(columns_to_drop, axis = 1).columns.values)

['Product_Info_2',
 'Product_Info_3',
 'Product_Info_4',
 'Product_Info_6',
 'Ins_Age',
 'Ht',
 'Wt',
 'BMI',
 'Employment_Info_1',
 'Employment_Info_2',
 'Employment_Info_3',
 'Employment_Info_4',
 'Employment_Info_5',
 'Employment_Info_6',
 'InsuredInfo_1',
 'InsuredInfo_3',
 'InsuredInfo_4',
 'InsuredInfo_6',
 'Insurance_History_1',
 'Insurance_History_4',
 'Insurance_History_5',
 'Insurance_History_7',
 'Insurance_History_8',
 'Family_Hist_1',
 'Family_Hist_2',
 'Family_Hist_3',
 'Family_Hist_4',
 'Family_Hist_5',
 'Medical_History_1',
 'Medical_History_2',
 'Medical_History_3',
 'Medical_History_4',
 'Medical_History_8',
 'Medical_History_9',
 'Medical_History_13',
 'Medical_History_15',
 'Medical_History_16',
 'Medical_History_21',
 'Medical_History_23',
 'Medical_History_24',
 'Medical_History_25',
 'Medical_History_26',
 'Medical_History_28',
 'Medical_History_29',
 'Medical_History_30',
 'Medical_History_34',
 'Medical_History_36',
 'Medical_History_39',
 'Medical_History_41',