In [13]:
#basic
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from tqdm import tqdm_notebook
from sklearn.decomposition import PCA
#Pretreatment
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
#model
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import svm, neighbors, linear_model, neural_network
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.covariance import GraphicalLasso
from sklearn.mixture import GaussianMixture
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

In [3]:
def get_mean_cov(x,y):
    model = GraphicalLasso()
    ones = (y==1).astype(bool)
    x2 = x[ones]
    model.fit(x2)
    p1 = model.precision_
    m1 = model.location_
    
    onesb = (y==0).astype(bool)
    x2b = x[onesb]
    model.fit(x2b)
    p2 = model.precision_
    m2 = model.location_
    
    ms = np.stack([m1,m2])
    ps = np.stack([p1,p2])
    return ms,ps

In [None]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [15]:
# INITIALIZE VARIABLES
cols = [c for c in train.columns if c not in ['id', 'target']]
cols.remove('wheezy-copper-turtle-magic')
oof_gm = np.zeros(len(train))
preds_gm = np.zeros(len(test))

# BUILD 512 SEPARATE MODELS
for i in tqdm_notebook(range(512)):
    # ONLY TRAIN WITH DATA WHERE WHEEZY EQUALS I
    train2 = train[train['wheezy-copper-turtle-magic']==i]
    test2 = test[test['wheezy-copper-turtle-magic']==i]
    idx1 = train2.index; idx2 = test2.index
    train2.reset_index(drop=True,inplace=True)
    
    # FEATURE SELECTION (USE APPROX 40 OF 255 FEATURES)
    sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
    train3 = sel.transform(train2[cols])
    test3 = sel.transform(test2[cols])
    
    # STRATIFIED K-FOLD
    skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True)
    for train_index, test_index in skf.split(train3, train2['target']):
        
        # MODEL AND PREDICT WITH QDA
        ms, ps = get_mean_cov(train3[train_index,:],train2.loc[train_index]['target'].values)
        
        gm = GaussianMixture(n_components=2, init_params='random', covariance_type='full', tol=0.001,reg_covar=0.001, max_iter=100, n_init=1,means_init=ms, precisions_init=ps)
        gm.fit(np.concatenate([train3[train_index,:],test3],axis = 0))
        oof_gm[idx1[test_index]] = gm.predict_proba(train3[test_index,:])[:,0]
        preds_gm[idx2] += gm.predict_proba(test3)[:,0] / skf.n_splits

        
# PRINT CV AUC
auc = roc_auc_score(train['target'],oof_gm)
print('QDA scores CV =',round(auc,5))

HBox(children=(IntProgress(value=0, max=512), HTML(value='')))

QDA scores CV = 0.96878


In [21]:
cols = [c for c in train.columns if c not in ['id', 'target', 'wheezy-copper-turtle-magic']]
oof_svnu = np.zeros(len(train)) 
pred_te_svnu = np.zeros(len(test))

oof_svc = np.zeros(len(train)) 
pred_te_svc = np.zeros(len(test))

for i in tqdm_notebook(range(512)):
    train2 = train[train['wheezy-copper-turtle-magic']==i]
    test2 = test[test['wheezy-copper-turtle-magic']==i]
    idx1 = train2.index; idx2 = test2.index
    train2.reset_index(drop=True,inplace=True)

    data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])])
    data2 = StandardScaler().fit_transform(PCA(svd_solver='full',n_components='mle').fit_transform(data[cols]))
    train3 = data2[:train2.shape[0]]; test3 = data2[train2.shape[0]:]
    
    # STRATIFIED K FOLD (Using splits=25 scores 0.002 better but is slower)
    skf = StratifiedKFold(n_splits=5, random_state=42)
    for train_index, test_index in skf.split(train2, train2['target']):

        clf = NuSVC(probability=True, kernel='poly', degree=4, gamma='auto', random_state=4, nu=0.59, coef0=0.053)
        clf.fit(train3[train_index,:],train2.loc[train_index]['target'])
        oof_svnu[idx1[test_index]] = clf.predict_proba(train3[test_index,:])[:,1]
        pred_te_svnu[idx2] += clf.predict_proba(test3)[:,1] / skf.n_splits
                
        clf = svm.SVC(probability=True, kernel='poly', degree=4, gamma='auto', random_state=42)
        clf.fit(train3[train_index,:],train2.loc[train_index]['target'])
        oof_svc[idx1[test_index]] = clf.predict_proba(train3[test_index,:])[:,1]
        pred_te_svc[idx2] += clf.predict_proba(test3)[:,1] / skf.n_splits
        
        
print('svc', roc_auc_score(train['target'], oof_svc))
print('svcnu', roc_auc_score(train['target'], oof_svnu))

HBox(children=(IntProgress(value=0, max=512), HTML(value='')))

svc 0.9506446727431673
svcnu 0.9602912865680392


In [16]:
cols = [c for c in train.columns if c not in ['id', 'target','wheezy-copper-turtle-magic']]
oof_qda = np.zeros(len(train)) 
pred_qda = np.zeros(len(test))

for i in tqdm_notebook(range(512)):
    train1 = train[train['wheezy-copper-turtle-magic']==i]
    test1 = test[test['wheezy-copper-turtle-magic']==i]
    idx1 = train1.index; idx2 = test1.index
    train1.reset_index(drop=True,inplace=True)
    
    data = pd.concat([pd.DataFrame(train1[cols]), pd.DataFrame(test1[cols])])
    data2 =VarianceThreshold(threshold=1.5).fit_transform(data[cols])
    train2 = data2[:train1.shape[0]]; test2 = data2[train1.shape[0]:]
    
    
    #oof
    skf = StratifiedKFold(n_splits=11, random_state=42)
    for train_index, test_index in skf.split(train1, train1['target']):

        clf = QuadraticDiscriminantAnalysis(0.1)
        clf.fit(train2[train_index,:],train1.loc[train_index]['target'])
        oof_qda[idx1[test_index]] = clf.predict_proba(train2[test_index,:])[:,1]
        pred_qda[idx2] += clf.predict_proba(test2)[:,1] / skf.n_splits
                
       
        
print('QDA scores CV', roc_auc_score(train['target'], oof_qda))
# test예측값 할당
test['target'] = pred_qda

oof_qda_pl = np.zeros(len(train)) 
pred_qda_pl = np.zeros(len(test))

for i in tqdm_notebook(range(512)):
    train1 = train[train['wheezy-copper-turtle-magic']==i]
    train_pl = train1.copy()
    test1 = test[test['wheezy-copper-turtle-magic']==i]
    idx1 = train1.index; idx2 = test1.index
    
    #pseudo labeling
    test_pl = test1[(test1['target']>0.99) | (test1['target']<0.01)].copy()
    test_pl.loc[ test_pl['target']>=0.5, 'target' ] = 1
    test_pl.loc[ test_pl['target']< 0.5, 'target' ] = 0 
    train_pl2 =  pd.concat([train_pl,test_pl],axis=0)
    train_pl2.reset_index(drop=True,inplace=True)
    
    useful = VarianceThreshold(threshold=1.5).fit(train_pl2[cols])     
    train_pl3 = useful.transform(train_pl2[cols])
    train2 = useful.transform(train1[cols])
    test2 = useful.transform(test1[cols])
    
    #oof
    skf = StratifiedKFold(n_splits=11, random_state=42,shuffle=True)
    for train_index, test_index in skf.split(train_pl3, train_pl2['target']):
        
        #pseudo label을 무시하는 코드 중요!!
        # 처음 StratifiedKFold로 분할할 때 shuffle=True옵션을 안넣어줘 계속 오류발생
        test_index3 = test_index[ test_index<len(train2) ]
        
        clf = QuadraticDiscriminantAnalysis(0.1)
        clf.fit(train_pl3[train_index,:],train_pl2.loc[train_index]['target'])
        oof_qda_pl[idx1[test_index3]] = clf.predict_proba(train2[test_index3,:])[:,1]
        pred_qda_pl[idx2] += clf.predict_proba(test2)[:,1] / skf.n_splits
                
       
     
print('Pseudo Labeled QDA scores CV =',round(roc_auc_score(train['target'],oof_qda_pl),5))

HBox(children=(IntProgress(value=0, max=512), HTML(value='')))

QDA scores CV 0.9648973416075834


HBox(children=(IntProgress(value=0, max=512), HTML(value='')))

Pseudo Labeled QDA scores CV = 0.97024


In [22]:
oof_svnu = oof_svnu.reshape(-1, 1)
pred_te_svnu = pred_te_svnu.reshape(-1, 1)
oof_svc = oof_svc.reshape(-1, 1)
pred_te_svc = pred_te_svc.reshape(-1, 1)
oof_qda_pl = oof_qda_pl.reshape(-1, 1)
pred_qda_pl = pred_qda_pl.reshape(-1, 1)
oof_gm = oof_gm.reshape(-1, 1)
pred_te_gm=preds_gm.reshape(-1, 1)

tr = np.concatenate((oof_svnu, oof_svc, oof_qda_pl,oof_gm), axis=1)
te = np.concatenate((pred_te_svnu, pred_te_svc,  pred_qda_pl,pred_te_gm), axis=1)
print(tr.shape, te.shape)

(262144, 4) (131073, 4)


In [23]:
from xgboost import XGBClassifier


oof_xgb = np.zeros(len(train)) 
pred_te_xgb = np.zeros(len(test))
skf = StratifiedKFold(n_splits=5, random_state=42)

for train_index, test_index in skf.split(tr, train['target']):
    # lrr = svm.NuSVC(probability=True, kernel='poly', degree=2, gamma='auto', random_state=42, nu=0.6, coef0=0.6) 
    lrr = XGBClassifier() # solver='liblinear',penalty='l1',C=0.1
    lrr.fit(tr[train_index], train['target'][train_index])
    oof_xgb[test_index] = lrr.predict_proba(tr[test_index,:])[:,1]
    pred_te_xgb += lrr.predict_proba(te)[:,1] / skf.n_splits
    
print('stack CV score =',round(roc_auc_score(train['target'],oof_xgb),6))

stack CV score = 0.971757
