### Feature selection--wrapper model
* This notebook iteratively select 50 most significant features out of the 907 numerical features using silhouette score of the K-means algorithm as criteria. The algorithm was run on a subsample of the original data (1.25% of the original data set).

In [21]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist, pdist
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib.cm as cm
import time

In [22]:
# Use a smaller data set to save time
df = pd.read_csv('PHBsample14_sss.csv', low_memory=False)
# drop the column resulted from sampling of the original data set
df.drop('Unnamed: 0', axis=1, inplace=True)
# In order to run K-means, drop all the categoricald data for now.
df = df.select_dtypes(include=['float64', 'int64'])
# Impute missing values with means
df = df.fillna(df.mean())

In [23]:
selected_variable = pd.read_csv('selectedVariables.csv')
selected_variable.drop('Unnamed: 0', axis=1, inplace=True)
df1 = df[df.columns.intersection(selected_variable.columns)]
PolNum = np.asarray(df1.PolNum_UW)
df2 = df1.iloc[:, :-1]
df3 = StandardScaler().fit_transform(df2)

In [24]:
X = pd.DataFrame(df3, columns = df2.columns)

In [18]:
X.head()

Unnamed: 0,ValDate,IssDate,IssAgeALB,Dur,AttAge,JointInd,AV,CSV,SCPeriod,WDtoDate,...,Match4,tie3,HealthScore_C5,Surr,EligibleInd,WDResponse,FirstEligQInd,UtilizationInd,WDModelFilterIn,PolNum_UW
0,1.218114,1.836405,0.464104,-1.522253,0.001056,-0.584071,2.32331,2.245969,-0.325517,-0.350792,...,0.384917,-1.366834e-15,-0.6390812,-0.098372,0.475903,-0.12155,-0.07341,-0.603657,0.804456,0.237161
1,-1.458401,0.081612,0.891779,-0.420762,0.703782,-0.584071,-0.414788,-0.419684,0.715922,-0.350792,...,0.384917,-1.140632,1.92255e-16,-0.098372,0.475903,-0.12155,-0.07341,-0.603657,0.804456,0.148166
2,0.217171,-0.821998,-0.605082,0.680729,-0.33966,-0.584071,1.264387,1.309258,0.715922,1.914637,...,0.384917,-0.7559021,-0.6390812,-0.098372,0.475903,-0.12155,-0.07341,1.937469,-1.243077,1.449031
3,1.218114,0.104228,-0.818919,0.313565,-0.797497,-0.584071,0.447228,0.449699,0.715922,-0.350792,...,0.384917,3.76467,-0.6390812,-0.098372,0.475903,-0.12155,-0.07341,-0.603657,0.804456,0.197315
4,-1.458401,0.746727,0.250267,-1.155089,-0.062829,-0.584071,0.417146,0.38263,0.195203,-0.276719,...,-2.597963,-1.366834e-15,1.92255e-16,-0.098372,0.475903,-0.12155,-0.07341,1.937469,0.804456,-1.632484


In [25]:
n, m = X.shape[0], X.shape[1]
print(n, m)

59159 498


In [10]:
model_test = KMeans(n_clusters=7)
model_test.fit(X) 
pred_y=model_test.labels_

In [9]:
print("Clustered class labels:", "\n", pd.value_counts(pd.Series(pred_y)))

Clustered class labels: 
 0    15491
5    14160
4     8910
6     5925
3     5687
1     5313
2     3673
dtype: int64


In [10]:
X.columns

Index(['ValDate', 'IssDate', 'IssAgeALB', 'Dur', 'AttAge', 'JointInd', 'AV',
       'CSV', 'SCPeriod', 'WDtoDate',
       ...
       'Match4', 'tie3', 'HealthScore_C5', 'Surr', 'EligibleInd', 'WDResponse',
       'FirstEligQInd', 'UtilizationInd', 'WDModelFilterIn', 'PolNum_UW'],
      dtype='object', length=907)

In [15]:
X.head()

Unnamed: 0,ValDate,IssDate,IssAgeALB,Dur,AttAge,JointInd,AV,CSV,SCPeriod,WDtoDate,...,Match4,tie3,HealthScore_C5,Surr,EligibleInd,WDResponse,FirstEligQInd,UtilizationInd,WDModelFilterIn,PolNum_UW
0,16343.0,16104.0,65.0,1.0,65.8,0.0,448559.96,421076.98,5,0.0,...,1.0,64.859049,0.5,0.0,1.0,0.0,0.0,0.0,1.0,294692
1,15613.0,14397.0,69.0,4.0,72.4,0.0,67321.31,64451.77,7,0.0,...,1.0,53.0,0.869053,0.0,1.0,0.0,0.0,0.0,1.0,281394
2,16070.0,13518.0,55.0,7.0,62.6,0.0,301121.04,295758.92,7,56438.97,...,1.0,57.0,0.5,0.0,1.0,0.0,0.0,1.0,0.0,475776
3,16343.0,14419.0,53.0,6.0,58.3,0.0,187344.04,180762.56,7,0.0,...,1.0,104.0,0.5,0.0,1.0,0.0,0.0,0.0,1.0,288738
4,15613.0,15044.0,63.0,2.0,65.2,0.0,183155.51,171789.66,6,1845.38,...,0.0,64.859049,0.869053,0.0,1.0,0.0,0.0,1.0,1.0,15320


In [26]:
start = time.time()

# let's assume there are 7 clusters
num_of_cluster = 7
# Let's assume we're going to select 50 features out of 498 features, therefore we're going to iterate 50 times
num_of_iter = 50
model = KMeans(n_clusters=num_of_cluster)
score = np.zeros([num_of_iter, m]) # the sum of squared distances of samples to their closest cluster center
exclude_columns = [] # best performed models with selected features will be added to this list after every iteration
include_columns = [i for i in range(np.shape(score)[1]) if i not in exclude_columns] # rest of the features

for iteration in range(num_of_iter):
    # The first iteration, we're going to test clustering models on each individual variables
    if iteration == 0:
        print("Now processing iteration %d" %iteration, "\n")   
        for i in range(m):
            data = X.iloc[:, i][:, np.newaxis]
            model.fit(data)
#             pred_y = model.labels_
#             print("cluster labels based on variable %s:" %X.columns[i], "\n", pd.value_counts(pd.Series(pred_y)))
#             
            
            score[iteration][i] = model.inertia_            
            
        selected_feature_index = np.argmin(score[iteration], axis=0) 
        selected_feature_score = np.amin(score[iteration], axis=0) 
        selected_feature = X[X.columns[selected_feature_index]]
        exclude_columns.append(selected_feature_index)
        print("Conclusion: cluster based on variable %s" %X.columns[selected_feature_index], "gives the best performance", "\n") 
    #for following iteration, we're going to add the rest the feature to the selected feature and perform cluster model
    else:
        print("Now processing iteration %d" %iteration, "\n") 
        for i in range(m):
            if i not in exclude_columns:
                # Generate data with features selected from last iteration plus each individual rest of the features
                data = pd.concat([selected_feature, X[X.columns[i]]], axis=1)
                model.fit(data)
#                 pred_y = model.labels_
#                 print("cluster labels based on variables:", data.columns, "\n", pd.value_counts(pd.Series(pred_y)))
 
                score[iteration][i] = silhouette_score(data, model.labels_, sample_size=3000)
        include_columns = [i for i in range(np.shape(score)[1]) if i not in exclude_columns]
        selected_feature_score = np.amax(score[:,include_columns][iteration], axis=0) 
        selected_feature_index = np.argmax(score[:,include_columns][iteration], axis=0) 
        selected_feature = pd.concat([selected_feature, X[X.columns[selected_feature_index]]], axis=1)
        exclude_columns.append(selected_feature_index)
        print("Conclusion: cluster based on variable %s" %X.columns[exclude_columns], "gives the best performance", "\n") 
print("Selected features are %s" %X.columns[exclude_columns])

end = time.time()
hours, rem = divmod(end-start, 3600)
minutes, seconds = divmod(rem, 60)
print('The whole process took:')
print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))


Now processing iteration 0 

Conclusion: cluster based on variable JointInd gives the best performance 

Now processing iteration 1 



  sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)


Conclusion: cluster based on variable Index(['JointInd', 'Number.of.Active.Sources_C3'], dtype='object') gives the best performance 

Now processing iteration 2 

Conclusion: cluster based on variable Index(['JointInd', 'Number.of.Active.Sources_C3', 'Mortgage.Liability_C3'], dtype='object') gives the best performance 

Now processing iteration 3 

Conclusion: cluster based on variable Index(['JointInd', 'Number.of.Active.Sources_C3', 'Mortgage.Liability_C3',
       'Housing.Units.Percent.5.Units_C3'],
      dtype='object') gives the best performance 

Now processing iteration 4 

Conclusion: cluster based on variable Index(['JointInd', 'Number.of.Active.Sources_C3', 'Mortgage.Liability_C3',
       'Housing.Units.Percent.5.Units_C3', 'iat96m12_C4'],
      dtype='object') gives the best performance 

Now processing iteration 5 

Conclusion: cluster based on variable Index(['JointInd', 'Number.of.Active.Sources_C3', 'Mortgage.Liability_C3',
       'Housing.Units.Percent.5.Units_C3', 'iat

Conclusion: cluster based on variable Index(['JointInd', 'Number.of.Active.Sources_C3', 'Mortgage.Liability_C3',
       'Housing.Units.Percent.5.Units_C3', 'iat96m12_C4', 'ifn21_C4',
       'ifn20xam_C4', 'icc33_C4', 'iat96m03_C4', 'iat89_C4', 'ifn01_C4',
       'Property.Lot.Size.In.Acres_num', 'Target.Narrow.Band.Income_num',
       'ibr34_C4', 'Match3', 'Housing.Units.Percent.Built.2005.or.Later_C3',
       'ibr20_C4', 'GMDBInd', 'ihi01y5_C4', 'icc07_C4', 'CEN_tr_pctArtsCon',
       'icc21_C4', 'iau20_C4', 'icc96m06_C4'],
      dtype='object') gives the best performance 

Now processing iteration 24 

Conclusion: cluster based on variable Index(['JointInd', 'Number.of.Active.Sources_C3', 'Mortgage.Liability_C3',
       'Housing.Units.Percent.5.Units_C3', 'iat96m12_C4', 'ifn21_C4',
       'ifn20xam_C4', 'icc33_C4', 'iat96m03_C4', 'iat89_C4', 'ifn01_C4',
       'Property.Lot.Size.In.Acres_num', 'Target.Narrow.Band.Income_num',
       'ibr34_C4', 'Match3', 'Housing.Units.Percent.Built.

Conclusion: cluster based on variable Index(['JointInd', 'Number.of.Active.Sources_C3', 'Mortgage.Liability_C3',
       'Housing.Units.Percent.5.Units_C3', 'iat96m12_C4', 'ifn21_C4',
       'ifn20xam_C4', 'icc33_C4', 'iat96m03_C4', 'iat89_C4', 'ifn01_C4',
       'Property.Lot.Size.In.Acres_num', 'Target.Narrow.Band.Income_num',
       'ibr34_C4', 'Match3', 'Housing.Units.Percent.Built.2005.or.Later_C3',
       'ibr20_C4', 'GMDBInd', 'ihi01y5_C4', 'icc07_C4', 'CEN_tr_pctArtsCon',
       'icc21_C4', 'iau20_C4', 'icc96m06_C4', 'Number.of.Lifestyles.All_C3',
       'ibr31_C4', 'CEN_tr_pctRetailService', 'i12ccxd1_C4', 'iau31_C4',
       'icc34_C4', 'Political.Donor.Propensity_C3', 'icc31_C4', 'icc96m12_C4',
       'Housing.Units.Percent.Built.2000.to.2004_C3',
       'CEN_tr_pctLT25KAge65plus', 'CEN_tr_pctLT10KAge65plus'],
      dtype='object') gives the best performance 

Now processing iteration 36 

Conclusion: cluster based on variable Index(['JointInd', 'Number.of.Active.Sources_C3', 

Conclusion: cluster based on variable Index(['JointInd', 'Number.of.Active.Sources_C3', 'Mortgage.Liability_C3',
       'Housing.Units.Percent.5.Units_C3', 'iat96m12_C4', 'ifn21_C4',
       'ifn20xam_C4', 'icc33_C4', 'iat96m03_C4', 'iat89_C4', 'ifn01_C4',
       'Property.Lot.Size.In.Acres_num', 'Target.Narrow.Band.Income_num',
       'ibr34_C4', 'Match3', 'Housing.Units.Percent.Built.2005.or.Later_C3',
       'ibr20_C4', 'GMDBInd', 'ihi01y5_C4', 'icc07_C4', 'CEN_tr_pctArtsCon',
       'icc21_C4', 'iau20_C4', 'icc96m06_C4', 'Number.of.Lifestyles.All_C3',
       'ibr31_C4', 'CEN_tr_pctRetailService', 'i12ccxd1_C4', 'iau31_C4',
       'icc34_C4', 'Political.Donor.Propensity_C3', 'icc31_C4', 'icc96m12_C4',
       'Housing.Units.Percent.Built.2000.to.2004_C3',
       'CEN_tr_pctLT25KAge65plus', 'CEN_tr_pctLT10KAge65plus', 'iat96m01_C4',
       'CEN_tr_pctProductionFamily', 'iin01_C4', 'ibr31_C4', 'iat51_C4',
       'iau02y5_C4', 'CEN_tr_pctConstructionFamily', 'MarkettoArea_C1',
       'ib

In [27]:
res = ['JointInd', 'Number.of.Active.Sources_C3', 'Mortgage.Liability_C3',
       'Housing.Units.Percent.5.Units_C3', 'iat96m12_C4', 'ifn21_C4',
       'ifn20xam_C4', 'icc33_C4', 'iat96m03_C4', 'iat89_C4', 'ifn01_C4',
       'Property.Lot.Size.In.Acres_num', 'Target.Narrow.Band.Income_num',
       'ibr34_C4', 'Match3', 'Housing.Units.Percent.Built.2005.or.Later_C3',
       'ibr20_C4', 'GMDBInd', 'ihi01y5_C4', 'icc07_C4', 'CEN_tr_pctArtsCon',
       'icc21_C4', 'iau20_C4', 'icc96m06_C4', 'Number.of.Lifestyles.All_C3',
       'ibr31_C4', 'CEN_tr_pctRetailService', 'i12ccxd1_C4', 'iau31_C4',
       'icc34_C4', 'Political.Donor.Propensity_C3', 'icc31_C4', 'icc96m12_C4',
       'Housing.Units.Percent.Built.2000.to.2004_C3',
       'CEN_tr_pctLT25KAge65plus', 'CEN_tr_pctLT10KAge65plus', 'iat96m01_C4',
       'CEN_tr_pctProductionFamily', 'iin01_C4', 'ibr31_C4', 'iat51_C4',
       'iau02y5_C4', 'CEN_tr_pctConstructionFamily', 'MarkettoArea_C1',
       'ibr02y5_C4', 'Match3', 'ibr02y5_C4', 'iat41_C4', 'ire31_C4',
       'iau102_C4']
res2 = {}
df_dic = pd.read_excel("/data/capstone_data/DataDictionary_allPHB_allvendors_cleaned.xlsx")
for column in res:
    res2[column] = df_dic.loc[df_dic['Variable'] == column, 'Description'].item()
    print(df_dic.loc[df_dic['Variable'] == column, 'Description'].item())
selected_feature = pd.DataFrame.from_dict(res2, orient='index')
selected_feature.reset_index(level=0, inplace=True)
selected_feature.columns = ['Variable', 'Description']

Indicator of a joint contract
Self explanatory
Self explanatory
Self explanatory
Number of inquiries (dedupped) in last 12 months
Months since most recent auto trade opened
Months since oldest finance trade opened excluding auto and mortgage
Total balance of open Credit -Charged Cards verified in last 12 months
Number of inquiries (dedupped) in last 3 months
Highest delinquency on a trade
Number of finance trades
Self explanatory
Self explanatory
Utilization of open bank revolving trades updated in last 12 months
Self explanatory
Self explanatory
Months since oldest bank revolving trade opened
Indicator of death benefit rider
Number of home equity trades verified in last 5 years
Number of credit-charge cards opened in last 12 months
Pecentage of people in Arts, entertainment, and recreation, and accommodation and food services Industry with Natural resources, construction, and maintenance occupations
Months since most recent credit-charge card opened
Months since oldest auto trade open

In [28]:
pd.set_option('display.max_colwidth', -1) 
selected_feature

Unnamed: 0,Variable,Description
0,JointInd,Indicator of a joint contract
1,Number.of.Active.Sources_C3,Self explanatory
2,Mortgage.Liability_C3,Self explanatory
3,Housing.Units.Percent.5.Units_C3,Self explanatory
4,iat96m12_C4,Number of inquiries (dedupped) in last 12 months
5,ifn21_C4,Months since most recent auto trade opened
6,ifn20xam_C4,Months since oldest finance trade opened excluding auto and mortgage
7,icc33_C4,Total balance of open Credit -Charged Cards verified in last 12 months
8,iat96m03_C4,Number of inquiries (dedupped) in last 3 months
9,iat89_C4,Highest delinquency on a trade


In [29]:
selected_feature.to_csv('selected_feature_Kmeans_silhouettes.csv')