In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import cross_val_score
from sklearn.metrics import precision_score
from sklearn.linear_model import LogisticRegression

import xgboost
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import SMOTE

In [95]:
train = pd.read_csv('./criminal_train.csv')
test = pd.read_csv('./criminal_test.csv')

In [96]:
train.head()

Unnamed: 0,PERID,IFATHER,NRCH17_2,IRHHSIZ2,IIHHSIZ2,IRKI17_2,IIKI17_2,IRHH65_2,IIHH65_2,PRXRETRY,...,TOOLONG,TROUBUND,PDEN10,COUTYP2,MAIIN102,AIIND102,ANALWT_C,VESTR,VEREP,Criminal
0,25095143,4,2,4,1,3,1,1,1,99,...,1,2,1,1,2,2,3884.805998,40026,1,0
1,13005143,4,1,3,1,2,1,1,1,99,...,2,2,2,3,2,2,1627.108106,40015,2,1
2,67415143,4,1,2,1,2,1,1,1,99,...,2,2,2,3,2,2,4344.95798,40024,1,0
3,70925143,4,0,2,1,1,1,1,1,99,...,2,2,1,1,2,2,792.521931,40027,1,0
4,75235143,1,0,6,1,4,1,1,1,99,...,2,2,2,2,2,2,1518.118526,40001,2,0


In [97]:
train.Criminal.value_counts()

0    42543
1     3175
Name: Criminal, dtype: int64

In [98]:
train.ANALWT_C.median()

2719.3351599999996

In [99]:
drop_list = ['PERID']

In [100]:
ANALWT_C_train = train['ANALWT_C']
train = train.drop(['ANALWT_C'] , axis=1)

In [101]:
ANALWT_C_test = test['ANALWT_C']
test = test.drop(['ANALWT_C'] , axis=1)

In [102]:
train = train.drop(drop_list , axis=1)
test = test.drop(drop_list , axis=1)


In [103]:
le = LabelEncoder()
for col in train.columns:
    if col != 'Criminal':
        data=train[[col]].append(test[[col]])
        le.fit(data);
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

In [104]:
from sklearn.preprocessing import OneHotEncoder
enc=OneHotEncoder(sparse=False)
for col in train.columns:
    if col != 'Criminal':
        data=train[[col]].append(test[[col]])
        enc.fit(data)
        # Fitting One Hot Encoding on train data
        temp = enc.transform(train[[col]])
        # Changing the encoded features into a data frame with new column names
        temp=pd.DataFrame(temp,columns=[(col+"_"+str(i)) for i in data[col]
                .value_counts().index])
        # In side by side concatenation index values should be same
        # Setting the index values similar to the X_train data frame
        temp=temp.set_index(train.index.values)
        # adding the new One Hot Encoded varibales to the train data frame
        train=pd.concat([train,temp],axis=1)
        # fitting One Hot Encoding on test data
        temp = enc.transform(test[[col]])
        # changing it into data frame and adding column names
        temp=pd.DataFrame(temp,columns=[(col+"_"+str(i)) for i in data[col]
                .value_counts().index])
        # Setting the index for proper concatenation
        temp=temp.set_index(test.index.values)
        # adding the new One Hot Encoded varibales to test data frame
        test=pd.concat([test,temp],axis=1)


In [105]:
y_train = train['Criminal']
train = train.drop( ['Criminal'] , axis=1)

In [106]:
train['ANALWT_C'] = ANALWT_C_train
test['ANALWT_C'] = ANALWT_C_test


In [107]:
print train.shape
print test.shape
print y_train.shape

(45718, 471)
(11430, 471)
(45718,)


In [108]:
from sklearn.cluster import KMeans
cluster = KMeans(n_clusters = 2)
train['predicted_cluster'] = cluster.fit_predict(train)
test['predicted_cluster'] = cluster.predict(test)

In [109]:
sm= SMOTE(random_state=42)
X_new , Y_new = sm.fit_sample(train , y_train )

In [87]:
y = cluster.fit_predict(X_new)

In [89]:
np.bincount(y)

array([72798, 12288])

In [92]:
print np.bincount(yp)
print np.bincount(y_train)

[39838  5880]
[42543  3175]


In [93]:
precision_score(yp , y_train)

0.16881889763779528

In [110]:
print X_new.shape
print Y_new.shape

(85086, 472)
(85086,)


In [111]:
class Ensemble(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]

                print ("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
                clf.fit(X_train, y_train)
                y_pred = clf.predict_proba(X_holdout)[:,1]                

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict_proba(T)[:,1]
            S_test[:, i] = S_test_i.mean(axis=1)

        results = cross_val_score(self.stacker, S_train, y, cv=3)
        print("Stacker score: %.5f" % (results.mean()))

        self.stacker.fit(S_train, y)
        res = self.stacker.predict_proba(S_test)[:,1]
        return self.stacker, res , S_train , S_test

In [112]:
lgb_params = {}
lgb_params['learning_rate'] = 0.02
lgb_params['n_estimators'] = 650
lgb_params['max_bin'] = 10
lgb_params['subsample'] = 0.8
lgb_params['subsample_freq'] = 10
lgb_params['colsample_bytree'] = 0.8   
lgb_params['min_child_samples'] = 500
lgb_params['feature_fraction'] = 0.9
lgb_params['bagging_freq'] = 1
lgb_params['seed'] = 200

lgb_params2 = {}
lgb_params2['n_estimators'] = 1090
lgb_params2['learning_rate'] = 0.02
lgb_params2['colsample_bytree'] = 0.3   
lgb_params2['subsample'] = 0.7
lgb_params2['subsample_freq'] = 2
lgb_params2['num_leaves'] = 16

lgb_params2['feature_fraction'] = 0.9
lgb_params2['bagging_freq'] = 1
lgb_params2['seed'] = 200


lgb_params3 = {}
lgb_params3['n_estimators'] = 1100
lgb_params3['max_depth'] = 4
lgb_params3['learning_rate'] = 0.02
lgb_params3['feature_fraction'] = 0.9
lgb_params3['bagging_freq'] = 1
lgb_params3['seed'] = 200


xgb_params = {}
xgb_params['n_estimators'] = 1000
xgb_params['max_depth'] = 4
xgb_params['learning_rate'] = 0.02

In [113]:
lgb_model = LGBMClassifier(**lgb_params)

lgb_model2 = LGBMClassifier(**lgb_params2)

lgb_model3 = LGBMClassifier(**lgb_params3)

xgbmodel = xgboost.XGBClassifier(**xgb_params)

In [114]:
log_model = LogisticRegression()      
stack = Ensemble(n_splits=5,
        stacker = log_model,
        base_models = (lgb_model, lgb_model2, lgb_model3))        
        
model ,y_pred , S_train , S_test = stack.fit_predict(X_new, Y_new, test)

Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 4
Fit LGBMClassifier fold 5
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 4
Fit LGBMClassifier fold 5
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 4
Fit LGBMClassifier fold 5
Stacker score: 0.96098


In [115]:
y_pred = model.predict(S_train)
precision_score(y_pred, Y_new)

0.97155818818607054

In [116]:
from sklearn.ensemble import RandomForestClassifier

lr = RandomForestClassifier()
lr.fit(S_train , Y_new)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [117]:
y_lr = lr.predict(S_train)

In [118]:
precision_score(y_pred, Y_new)
#0.97195778388924148

0.97155818818607054

In [49]:
wrong_indixes = (np.where(y_pred != Y_new))
len(wrong_indixes[0])

2147

In [50]:
Y_filterd = Y_new[wrong_indixes[0]]
len(Y_filterd)

In [53]:
S_train_filtered = S_train[wrong_indixes[0]]
len(S_train_filtered)

2147

In [72]:
rf = RandomForestClassifier()
rf.fit(S_train_filtered , Y_filterd)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [78]:
len(y_pred_2)

85086

In [80]:
y_pred4 = model2.predict(S_train_2)
precision_score(y_pred4 , Y_filterd)

0.99161777032690701

In [81]:
log_model = LogisticRegression()      
stack = Ensemble(n_splits=5,
        stacker = log_model,
        base_models = (lgb_model, lgb_model2, lgb_model3))        
        
model2 ,y_pred_2 , S_train_2 , S_test_2 = stack.fit_predict(S_train_filtered, Y_filterd, S_test)

Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 4
Fit LGBMClassifier fold 5
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 4
Fit LGBMClassifier fold 5
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 4
Fit LGBMClassifier fold 5
Stacker score: 0.99068


In [82]:
len(S_test_2)

11430

In [58]:
y_pred3 = model2.predict(S_train)
precision_score(y_pred3 , Y_new)

0.97219284018522434

In [85]:
y1= model2.predict(S_test_2)

array([1, 1, 1, ..., 1, 1, 1])

In [66]:
y2 = model.predict(S_test)
len(y2)

11430

In [84]:
print np.bincount(y1)
print np.bincount(y2)

[  743 10687]
[10688   742]


In [120]:
test = pd.read_csv('./criminal_test.csv')
test['Criminal'] = lr.predict(S_test)

In [121]:
test[['PERID' , 'Criminal']].to_csv('./result.csv' , index= False)

In [40]:
# 97.2 precision score
#  accuracy score 

In [99]:
test.Criminal.value_counts()

0    10688
1      742
Name: Criminal, dtype: int64

In [199]:
np.bincount(Y_new)

array([42543, 42543])

In [198]:
np.bincount(y_pred)

array([42974, 42112])

In [103]:
742.0/10688

0.06942365269461077

In [155]:
import xgboost

In [186]:
xbb = xgboost.XGBClassifier( n_estimators=500)

In [187]:
xbb.fit(S_train , Y_new)


XGBClassifier(base_score=0.5, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=500,
       nthread=-1, objective='binary:logistic', seed=0, silent=True,
       subsample=1)

In [188]:
y_pred = xbb.predict(S_train)

In [189]:
precision_score(y_pred, Y_new)
#0.97195778388924148

0.97075899677972877

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,461,462,463,464,465,466,467,468,469,470
0,4.000000,3.000000,4.000000,1.0,3.000000,1.0,1.000000,1.0,5.0,6.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.0,1.000000,0.000000,3884.805998
1,4.000000,2.000000,3.000000,1.0,2.000000,1.0,1.000000,1.0,5.0,6.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,1.000000,1627.108106
2,4.000000,2.000000,2.000000,1.0,2.000000,1.0,1.000000,1.0,5.0,6.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.0,1.000000,0.000000,4344.957980
3,4.000000,1.000000,2.000000,1.0,1.000000,1.0,1.000000,1.0,5.0,6.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.0,1.000000,0.000000,792.521931
4,1.000000,1.000000,6.000000,1.0,4.000000,1.0,1.000000,1.0,5.0,1.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,1.000000,1518.118526
5,4.000000,1.000000,2.000000,1.0,1.000000,1.0,1.000000,1.0,5.0,6.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,1.000000,9129.229124
6,4.000000,4.000000,6.000000,1.0,4.000000,1.0,1.000000,1.0,5.0,6.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,1.000000,6561.895497
7,4.000000,3.000000,4.000000,1.0,3.000000,1.0,1.000000,1.0,5.0,6.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,1.000000,3341.718873
8,4.000000,2.000000,3.000000,1.0,2.000000,1.0,1.000000,1.0,5.0,6.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,1.000000,3384.147888
9,4.000000,1.000000,1.000000,1.0,1.000000,1.0,2.000000,1.0,5.0,6.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.0,1.000000,0.000000,2636.943978


In [206]:
colmn = train.columns

In [207]:
newdf = pd.DataFrame(X_new  , columns=colmn)

In [210]:
newdf['Pred_Criminal'] = y_pred

In [211]:
newdf['Criminal']= Y_new

In [220]:
incorrectdf = newdf[newdf.Pred_Criminal != newdf.Criminal]
y_incorrectdf = newdf['Criminal']

In [218]:
incorrectdf = incorrectdf.drop(['Criminal' , 'Pred_Criminal'] , axis=1)

In [225]:
incorrectdf.ANALWT_C.median()

2600.987919