In [1]:
%matplotlib inline
from pandas import read_csv, DataFrame, get_dummies, Series
from numpy import nanmean
import matplotlib.pyplot as plt
from multiprocessing import Pool, cpu_count
from functools import partial
from scipy import stats
from sklearn.preprocessing import normalize
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
# from boruta import BorutaPy
from random import sample
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFE, SelectFromModel, SelectKBest, VarianceThreshold, SelectFpr, chi2, mutual_info_classif

In [2]:
# Silence every warning category for the rest of the session.
from warnings import simplefilter
simplefilter(action="ignore")

In [3]:
# Train data: read the CSV, treating the value -1 as missing (NaN)
train = read_csv('train.csv', na_values=[-1])
print(train.head(n=2))

      PERID  IFATHER  NRCH17_2  IRHHSIZ2  IIHHSIZ2  IRKI17_2  IIKI17_2  \
0  25095143      4.0       2.0       4.0       1.0       3.0       1.0   
1  13005143      4.0       1.0       3.0       1.0       2.0       1.0   

   IRHH65_2  IIHH65_2  PRXRETRY    ...     TOOLONG  TROUBUND  PDEN10  COUTYP2  \
0       1.0       1.0      99.0    ...         1.0       2.0     1.0      1.0   
1       1.0       1.0      99.0    ...         2.0       2.0     2.0      3.0   

   MAIIN102  AIIND102     ANALWT_C    VESTR  VEREP  Criminal  
0       2.0       2.0  3884.805998  40026.0    1.0         0  
1       2.0       2.0  1627.108106  40015.0    2.0         1  

[2 rows x 72 columns]


In [4]:
# Remove the 'PERID' column and every row that contains a missing value.
# `train` is rebound instead of being modified in place, and errors='ignore'
# lets the cell be executed again without a KeyError once 'PERID' is gone.
train = train.drop('PERID', axis=1, errors='ignore').dropna()
train.columns.values

array(['IFATHER', 'NRCH17_2', 'IRHHSIZ2', 'IIHHSIZ2', 'IRKI17_2',
       'IIKI17_2', 'IRHH65_2', 'IIHH65_2', 'PRXRETRY', 'PRXYDATA',
       'MEDICARE', 'CAIDCHIP', 'CHAMPUS', 'PRVHLTIN', 'GRPHLTIN',
       'HLTINNOS', 'HLCNOTYR', 'HLCNOTMO', 'HLCLAST', 'HLLOSRSN',
       'HLNVCOST', 'HLNVOFFR', 'HLNVREF', 'HLNVNEED', 'HLNVSOR',
       'IRMCDCHP', 'IIMCDCHP', 'IRMEDICR', 'IIMEDICR', 'IRCHMPUS',
       'IICHMPUS', 'IRPRVHLT', 'IIPRVHLT', 'IROTHHLT', 'IIOTHHLT',
       'HLCALLFG', 'HLCALL99', 'ANYHLTI2', 'IRINSUR4', 'IIINSUR4',
       'OTHINS', 'CELLNOTCL', 'CELLWRKNG', 'IRFAMSOC', 'IIFAMSOC',
       'IRFAMSSI', 'IIFAMSSI', 'IRFSTAMP', 'IIFSTAMP', 'IRFAMPMT',
       'IIFAMPMT', 'IRFAMSVC', 'IIFAMSVC', 'IRWELMOS', 'IIWELMOS',
       'IRPINC3', 'IRFAMIN3', 'IIPINC3', 'IIFAMIN3', 'GOVTPROG',
       'POVERTY3', 'TOOLONG', 'TROUBUND', 'PDEN10', 'COUTYP2', 'MAIIN102',
       'AIIND102', 'ANALWT_C', 'VESTR', 'VEREP', 'Criminal'], dtype=object)

In [None]:
# Normalization of Train and Test
# NOTE(review): in notebook order this cell comes before `X` is assigned (that
# happens in the "Splitting data" section) and `test_xgb_org` is not created
# anywhere in the visible notebook, so it only runs with leftover kernel state.
cols = list(X.columns.values)

# Train: rescale with sklearn's normalize and put the column labels back
X = DataFrame(normalize(X), columns=cols)
X.head(2)

# Test: same treatment, labelled with the training column names
test_xgb_org = DataFrame(normalize(test_xgb_org), columns=cols)
test_xgb_org.head(2)

# Data Exploration and preprocessing

In [5]:
# Class imbalance: row count per value of the target column
class_counts = train['Criminal'].value_counts()
print('Target Class\n', class_counts)
cols = train.columns.values

# Cardinality of every column
print('\nNumber of unique values in each column')
for col in cols:
    distinct_values = train[col].unique()
    print(col, len(distinct_values))

Target Class
 0    42233
1     3060
Name: Criminal, dtype: int64

Number of unique values in each column
IFATHER 4
NRCH17_2 4
IRHHSIZ2 6
IIHHSIZ2 1
IRKI17_2 4
IIKI17_2 2
IRHH65_2 3
IIHH65_2 3
PRXRETRY 5
PRXYDATA 6
MEDICARE 5
CAIDCHIP 5
CHAMPUS 5
PRVHLTIN 5
GRPHLTIN 7
HLTINNOS 5
HLCNOTYR 7
HLCNOTMO 17
HLCLAST 9
HLLOSRSN 17
HLNVCOST 6
HLNVOFFR 6
HLNVREF 6
HLNVNEED 6
HLNVSOR 6
IRMCDCHP 2
IIMCDCHP 2
IRMEDICR 2
IIMEDICR 2
IRCHMPUS 2
IICHMPUS 2
IRPRVHLT 2
IIPRVHLT 2
IROTHHLT 3
IIOTHHLT 3
HLCALLFG 2
HLCALL99 2
ANYHLTI2 5
IRINSUR4 2
IIINSUR4 2
OTHINS 2
CELLNOTCL 6
CELLWRKNG 6
IRFAMSOC 2
IIFAMSOC 2
IRFAMSSI 2
IIFAMSSI 2
IRFSTAMP 2
IIFSTAMP 2
IRFAMPMT 2
IIFAMPMT 2
IRFAMSVC 2
IIFAMSVC 2
IRWELMOS 13
IIWELMOS 3
IRPINC3 7
IRFAMIN3 7
IIPINC3 2
IIFAMIN3 2
GOVTPROG 2
POVERTY3 3
TOOLONG 3
TROUBUND 3
PDEN10 3
COUTYP2 3
MAIIN102 2
AIIND102 2
ANALWT_C 45250
VESTR 50
VEREP 2
Criminal 2


In [6]:
# Partition the columns: the target, the columns handled as numeric, and
# everything else (handled as categorical).
target = ['Criminal']
num_cols = [
    'NRCH17_2', 'IRHHSIZ2', 'IRKI17_2', 'IRHH65_2',
    'HLCNOTMO', 'HLCLAST', 'IRWELMOS', 'ANALWT_C',
]
cat_cols = [
    name for name in train.columns.values
    if name not in num_cols and name not in target
]

In [7]:
# Total column count, then the numeric and categorical counts
print(train.shape[1], len(num_cols), len(cat_cols))

71 8 62


In [8]:
# Converting to categorical and one hot encoding
# A single get_dummies call replaces the former per-column loop, which joined
# the indicator columns onto the frame and dropped the source column once for
# every entry of cat_cols (copying the whole frame each time).
# The columns not listed in cat_cols are left as they are; each listed column
# is replaced by indicator columns named '<column>_<value>'.
train = get_dummies(train, columns=cat_cols, prefix_sep='_')
# Same progress listing as before: one encoded column name per line.
print(*cat_cols, sep='\n')

IFATHER
IIHHSIZ2
IIKI17_2
IIHH65_2
PRXRETRY
PRXYDATA
MEDICARE
CAIDCHIP
CHAMPUS
PRVHLTIN
GRPHLTIN
HLTINNOS
HLCNOTYR
HLLOSRSN
HLNVCOST
HLNVOFFR
HLNVREF
HLNVNEED
HLNVSOR
IRMCDCHP
IIMCDCHP
IRMEDICR
IIMEDICR
IRCHMPUS
IICHMPUS
IRPRVHLT
IIPRVHLT
IROTHHLT
IIOTHHLT
HLCALLFG
HLCALL99
ANYHLTI2
IRINSUR4
IIINSUR4
OTHINS
CELLNOTCL
CELLWRKNG
IRFAMSOC
IIFAMSOC
IRFAMSSI
IIFAMSSI
IRFSTAMP
IIFSTAMP
IRFAMPMT
IIFAMPMT
IRFAMSVC
IIFAMSVC
IIWELMOS
IRPINC3
IRFAMIN3
IIPINC3
IIFAMIN3
GOVTPROG
POVERTY3
TOOLONG
TROUBUND
PDEN10
COUTYP2
MAIIN102
AIIND102
VESTR
VEREP


In [9]:
# Preview the first two rows of the encoded frame
train.head(n=2)

Unnamed: 0,NRCH17_2,IRHHSIZ2,IRKI17_2,IRHH65_2,HLCNOTMO,HLCLAST,IRWELMOS,ANALWT_C,Criminal,IFATHER_1.0,...,VESTR_40043.0,VESTR_40044.0,VESTR_40045.0,VESTR_40046.0,VESTR_40047.0,VESTR_40048.0,VESTR_40049.0,VESTR_40050.0,VEREP_1.0,VEREP_2.0
0,2.0,4.0,3.0,1.0,99.0,99.0,99.0,3884.805998,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1.0,3.0,2.0,1.0,99.0,99.0,99.0,1627.108106,1,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Check for duplicate rows
# Number of rows that exactly repeat an earlier row (pandas' default keeps the
# first occurrence unmarked, so only the repeats are counted).
train.duplicated().sum()

In [None]:
# Missing value check: count of NaN entries per column
train.isna().sum()

In [None]:
# Outliers
# Only an empty 15x15 figure with one axes is created; nothing is drawn yet.
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(1, 1, 1)
# X_train.boxplot(by='target', ax=ax)

In [None]:
# Histograms of the first four columns
leading_columns = train.iloc[:, :4]
leading_columns.hist()

In [None]:
# Finding best distribution for each feature

cdfs = [
    "norm",            # Normal (Gaussian)
    "alpha",           # Alpha
    "beta",            # Beta
    "expon",           # Exponential
    "gamma",           # Gamma
    "laplace",         # Laplace
    "rayleigh",        # Rayleigh
    "uniform",         # Uniform
]


def _best_fit_distribution(values, candidates):
    """Return the name of the candidate distribution that best matches `values`.

    Every candidate (a scipy.stats distribution name) is fitted to `values`
    and then scored with a Kolmogorov-Smirnov test against the fitted
    distribution; the candidate with the largest p-value wins.

    Returns '' when no candidate produces a p-value above the starting
    sentinel (for example when `candidates` is empty or every p-value is NaN).
    """
    best_name, best_p = '', -100
    for name in candidates:
        # Attribute lookup replaces eval() on a string-built expression.
        fitted_params = getattr(stats, name).fit(values)
        # kstest runs with its default `alternative` (two-sided per the SciPy
        # documentation), not a one-sided test.
        _, p_value = stats.kstest(values, name, args=fitted_params)
        if p_value > best_p:
            best_name, best_p = name, p_value
    return best_name


col_name = list(X_train.columns.values)
# Rebind rather than filling in place, so the frame object produced by the
# train/test split is never modified; later cells still see the filled frame.
X_train = X_train.fillna(0)
trans = {}
for column in col_name:
    trans[column] = _best_fit_distribution(X_train[column].tolist(), cdfs)
    print(column, ":", trans[column], "distribution")

# Feature Engineering / Selection

In [None]:
# Checking collinearity: pairwise Pearson correlation of all columns in train
correl = train.corr(method='pearson')
# For a single pair: train["feat_1"].corr(train["feat_2"])

In [None]:
# Print every pair of distinct columns that is strongly correlated in either
# direction.  The previous condition (0.9 <= r < 1) skipped strongly negative
# pairs and perfectly correlated pairs; because the inner loop starts after
# position i, a column is never compared with itself and r == 1 needs no
# special exclusion.
CORR_THRESHOLD = 0.9
cols = train.columns.values
for i, first in enumerate(cols):
    for second in cols[i + 1:]:
        curr_cor = correl.loc[first, second]
        # A NaN correlation compares False here and is therefore skipped.
        if abs(curr_cor) >= CORR_THRESHOLD:
            print(first, second, curr_cor)

### Variance Threshold Check

In [None]:
# Fit a zero-threshold VarianceThreshold on the whole frame (target included)
# to obtain per-column variances.
vt = VarianceThreshold(threshold=0.0)
vt_train = vt.fit(train)

In [None]:
# Tabulate the fitted variances in ascending order and show the ten largest
variance_table = DataFrame({'feature': list(train.columns.values), 'variance': vt.variances_})
vt_df = variance_table.sort_values(by='variance', ascending=True)
print(vt_df.tail(10))

### Splitting data

In [10]:
# Target vector, and a feature matrix holding every column except 'PERID'
# and 'Criminal'
y = train['Criminal']
X = train.drop(['PERID', 'Criminal'], axis=1, errors='ignore')

In [12]:
# Stratified 60/40 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=11
)
print(X_train.shape, X_test.shape)

(27175, 278) (18118, 278)


### Normalization

In [None]:
# Normalizing data: rescale X_train with sklearn's normalize, keeping labels
norm_train = DataFrame(normalize(X_train), columns=list(X_train.columns.values))
norm_train.head(2)

### PCA

In [None]:
from sklearn.decomposition import PCA
# Request as many components as there are input columns
pca = PCA(n_components=norm_train.shape[1])

In [None]:
# Fit the PCA on the normalized training data and keep the projected rows
pca_scores = pca.fit_transform(norm_train)
pca_train = DataFrame(pca_scores)

In [None]:
# Total explained variance carried by the first 20 components
leading_variance = pca.explained_variance_[:20]
sum(leading_variance)

### Feature Importance

In [None]:
# Fit an extra-trees ensemble whose importances are used to rank features.
# A fixed random_state makes the ranking repeatable across kernel restarts.
# The variable keeps the name `rf` because the next cells read
# rf.feature_importances_.
rf = ExtraTreesClassifier(random_state=1)
rf.fit(X_train, y_train)

In [None]:
# Importances indexed by feature name, largest first
raw_importances = Series(rf.feature_importances_, index=X_train.columns.values)
feat_imp = raw_importances.sort_values(ascending=False)

In [None]:
# Bar chart of the 20 largest importances.  In this notebook the importances
# are taken from an ExtraTreesClassifier, so the title names that model
# instead of "Random Forest".
feat_imp[:20].plot(kind='bar', title='Feature Importance with Extra Trees', figsize=(12,8))
plt.ylabel('Feature Importance values')
plt.show()

In [None]:
# Names of the 20 top-ranked features
imp_feats = feat_imp.index[:20].tolist()
print(imp_feats)

### Feature Selection

In [None]:
# Select From Model
feats = list(X_train.columns.values)

# Random forest whose importances drive the selection
rf = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=1, verbose=2)

# Wrap the forest in SelectFromModel (this is not Boruta).  Neither a
# threshold nor max_features is passed, so the number of kept features is
# whatever scikit-learn's default cut-off yields, not a fixed 20.
feat_selector = SelectFromModel(estimator=rf)

# Fit the selector on the training split
feat_selector.fit(X_train, y_train)

In [None]:
# Names of the features the fitted selector kept
keep_mask = feat_selector.get_support()
sfmodel_feats = [name for name, kept in zip(feats, keep_mask) if kept]
print(sfmodel_feats)

# Model Training

## Pipeline (GradientBoosting, AdaBoost, RF, ET)

In [None]:
# Candidate models
rf = RandomForestClassifier()
et = ExtraTreesClassifier()
# The base learner is passed positionally: newer scikit-learn releases renamed
# the keyword from `base_estimator` to `estimator`, and the positional form is
# accepted under either name.
ada = AdaBoostClassifier(et)
gb = GradientBoostingClassifier()

# Candidate feature-selection steps
rfe = RFE(rf, step=0.2)
select = SelectFromModel(rf)
kbest = SelectKBest(chi2)

# Two-step pipeline; both steps are placeholders that the grid below replaces.
pipe = Pipeline([('feat_sel', rfe), ('model', rf)])

# One dict per feature-selection strategy
feat_sel_params = [
    {
        'feat_sel': [kbest],
        'feat_sel__k': [20, 30]},
    {
        'feat_sel': [rfe],
        'feat_sel__estimator': [ada], #rf, et, 
        'feat_sel__n_features_to_select': [20]},
    {
        'feat_sel': [select],
        'feat_sel__estimator': [ada]} #rf, et, 
]

# One dict per final model
model_params = [
    {
        'model': [gb],
        'model__n_estimators': [10], #500, 1000, 2000, 4000
        'model__learning_rate': [0.05]}, #0.01, 0.04, 0.1, 0.5, 1
    {
        'model': [ada],
        'model__n_estimators': [10], #500, 1000, 2000, 4000
        'model__learning_rate': [0.05], #0.01, 0.04, 0.1, 0.5, 1
        'model__random_state': [2]},
    {
        'model': [rf],
        'model__n_estimators': [10], #500, 1000, 2000, 4000
        'model__criterion': ['gini', 'entropy'],
        'model__max_features': ['sqrt'], #, 'log2'
        'model__min_samples_leaf': [3], #3, 5, 7, 9
        'model__max_depth': [9]}, #8, 10, 14
    {
        'model': [et],
        'model__n_estimators': [1000], #500, 1000, 2000, 4000
        'model__criterion': ['gini', 'entropy'],
        'model__max_features': ['sqrt'], #, 'log2'
        'model__min_samples_leaf': [3], #3, 5, 7
        'model__max_depth': [9]} #8, 10, 14
]

# Cross product: every feature-selection dict merged with every model dict,
# in the same order the nested loops produced.
params = [
    {**feat_sel, **model}
    for feat_sel in feat_sel_params
    for model in model_params
]

In [None]:
# Search the merged grid, scoring each candidate by Matthews correlation
mcc_scorer = make_scorer(matthews_corrcoef)
grid = GridSearchCV(pipe, params, scoring=mcc_scorer, n_jobs=-1, verbose=20)
grid.fit(X_train, y_train)

In [None]:
# CV results
# to_csv returns None when it is given a path, so the frame itself is kept in
# cv_result_pipe and the file is written as a separate step.
cv_result_pipe = DataFrame(grid.cv_results_)
cv_result_pipe.to_csv('cv_result_pipe.csv', index=False)
print(grid.best_score_)
print(grid.best_estimator_)

In [None]:
# Features kept by the feature-selection step of the winning pipeline.
# Two fixes: `X-train` was parsed as the subtraction `X - train`, and the
# support mask is now read from the refitted best_estimator_ step.  The object
# stored in grid.best_params_['feat_sel'] comes from the parameter grid and
# may never have been fitted, depending on the scikit-learn version.
best_selector = grid.best_estimator_.named_steps['feat_sel']
imp_feats = X_train.columns.values[best_selector.get_support(indices=True)]
print(imp_feats)

In [None]:
# Evaluate the tuned pipeline on the hold-out split.
# The search was fitted on all columns of X_train and the pipeline does its
# own feature selection, so it is given the full X_test; X_test[imp_feats]
# would hand it fewer columns than it was fitted on.
y_pred = grid.predict(X_test)

print('MCC:', matthews_corrcoef(y_test, y_pred))
print('Acc:', accuracy_score(y_test, y_pred))
print('Confusion Matrix\n', confusion_matrix(y_test, y_pred))

### XGBoost

In [None]:
import xgboost as xgb

# Doing gridsearch to find best params configuration
clf = xgb.XGBClassifier(objective='binary:logistic', eval_metric='error')

# Parameter descriptions follow the XGBoost parameter reference.
params = {
    'learning_rate': [0.01, 0.03],   # step-size shrinkage per boosting round
    'max_depth': [8, 10, 14],   # maximum depth of the tree
    'gamma': [0.5, 1],   # minimum loss reduction required to make a split
    'min_child_weight': [6],  # minimum sum of instance weight needed in a child
    'subsample': [0.9],  # fraction of rows sampled for each tree
    'colsample_bytree': [0.5],  # fraction of columns sampled for each tree
    'n_estimators': [1000],   # number of trees to build
    }

# NOTE(review): no `scoring` is passed, so the search uses the classifier's
# default score, while the pipeline search and the hold-out evaluation in this
# notebook use Matthews correlation - confirm that this is intended.
grid = GridSearchCV(clf, params, cv=5, verbose=20, n_jobs=-1, refit=True)

grid.fit(X_train[imp_feats], y_train)

# CV results
# to_csv returns None when it is given a path, so keep the frame in cv_result
# and write the file as a separate step.
cv_result = DataFrame(grid.cv_results_)
cv_result.to_csv('cv_results_xgb.csv', index=False)

In [None]:
# Best cross-validated score, then the estimator refitted with it
print(grid.best_score_, grid.best_estimator_, sep='\n')

In [None]:
# Testing on X_test
# The metrics are computed on `pred`, the predictions made in this cell; the
# previous version scored `y_pred`, which holds the predictions of the
# earlier pipeline evaluation.
pred = grid.predict(X_test[imp_feats])
print('MCC:', matthews_corrcoef(y_test, pred))
print('Acc:', accuracy_score(y_test, pred))
print('Confusion Matrix\n', confusion_matrix(y_test, pred))

In [None]:
# Using best params to find optimum number of iterations
grid_output = grid.best_params_
# 'num_class' is no longer set: the XGBoost parameter reference lists it for
# the multi-class objectives only, and the objective here is binary:logistic.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'error',
    }

# 'n_estimators' is a setting of the sklearn wrapper; the native xgb.cv and
# xgb.train calls below receive the round count through num_boost_round, so
# it is left out of the booster parameters.
tuned = {name: value for name, value in grid_output.items() if name != 'n_estimators'}
best_params = {**tuned, **params}
#best_params['learning_rate'] = 0.02
print(best_params)

In [None]:
# Wrap the selected training columns and the labels for the native xgboost API
train_xgb = xgb.DMatrix(data=X_train[imp_feats], label=y_train)

# Stratified, shuffled 5-fold CV over up to 10000 rounds with early stopping
cv_results = xgb.cv(
    params=best_params,
    dtrain=train_xgb,
    num_boost_round=10000,
    nfold=5,
    stratified=True,
    early_stopping_rounds=20,
    as_pandas=True,
    verbose_eval=True,
    seed=1,
    shuffle=True,
)

In [None]:
nround = cv_results.shape[0]  # number of rows (rounds) returned by xgb.cv
print('Best Iteration:', nround)
xgb_clf = xgb.train(best_params, train_xgb, num_boost_round=nround, verbose_eval=True)

# Predicting on the test set
# NOTE(review): `test` and `test_xgb_org` are not created in the visible part
# of this notebook.  They are assumed to be the raw test frame (with an 'id'
# column) and its encoded counterpart with the training column names -
# confirm before running.
# The booster was trained on X_train[imp_feats], so the same columns are used.
test_xgb = xgb.DMatrix(test_xgb_org[imp_feats])
test_pred = xgb_clf.predict(test_xgb)

# The target 'Criminal' is binary, so there is a single score per row; the
# previous code unpacked nine class columns, which cannot match this model.
# Should the booster return one column per class, the last column is used.
positive_score = test_pred[:, -1] if test_pred.ndim == 2 else test_pred
DECISION_THRESHOLD = 0.5  # NOTE(review): conventional cut-off, tune if needed
output = DataFrame({'id': test['id'],
                    'Criminal': (positive_score >= DECISION_THRESHOLD).astype(int)})
output = output[['id', 'Criminal']]

output.to_csv('output.csv', index=False)
output.head(2)