In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.svm import *
from sklearn.feature_selection import *
from sklearn.feature_extraction import *
from sklearn.naive_bayes import *
import xgboost
from scipy.sparse import *
from sklearn.decomposition import *
from sklearn.neural_network import *
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from joblib import Parallel, delayed
import pickle

%matplotlib inline

Using TensorFlow backend.


In [2]:
data = np.loadtxt('training_data.txt', skiprows=1)
y = data[:,0]
X = data[:,1:]
Xtest = np.loadtxt('test_data.txt', skiprows=1)

In [3]:
def gen_TfidX(X):
    Tfid = text.TfidfTransformer(norm='l2')
    Tfid.fit(X)
    TfidXsparse = Tfid.transform(X)
    TfidX = np.asarray(csr_matrix.todense(TfidXsparse))
    return TfidX

def gen_svX(X):
    TfidX = gen_TfidX(X)
    svd = TruncatedSVD(n_components=100, algorithm='arpack')
    sv_tfidX = svd.fit_transform(TfidX)
    return sv_tfidX

def gen_combX(X, TfidX, svX):
    Xvecsum = np.sum(X, axis=1)
    Xcomb = []

    for i in range(0, len(X)):
        temp_arr = np.concatenate((TfidX[i], svX[i]))
        temp_arr = np.append(temp_arr, Xvecsum[i])
        Xcomb.append(temp_arr)
    Xcomb = np.array(Xcomb)
    return Xcomb


In [4]:
def create_baseline(input_N):
    def build_fn():
        do = 0.05
        model = Sequential()
        model.add(Dense(100, input_dim=input_N, kernel_initializer='normal', activation='relu'))
        model.add(Dropout(do))
        model.add(Dense(30, activation='relu'))
        model.add(Dropout(do))
        model.add(Dense(30, activation='relu'))
        model.add(Dropout(do))
        model.add(Dense(30, activation='relu'))
        model.add(Dropout(do))


        model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
        # Compile model
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        return model
    return build_fn

In [5]:
def meta_model_iter(X, y):
    X1, X2, y1, y2 = train_test_split(X, y, test_size=0.2)
    TfidX1 = gen_TfidX(X1)
    svX1 = gen_svX(X1)
    combX1 = gen_combX(X1, TfidX1, svX1)
    TfidX2 = gen_TfidX(X2)
    svX2 = gen_svX(X2)
    combX2 = gen_combX(X2, TfidX2, svX2)
    
    rbf_svm = SVC(C=1.5, gamma=0.015)    
    svd_clf = SVC(C=50, gamma=0.2)
    pn_svm = SVC(C=0.05, coef0=10, degree=3)
    nn = create_baseline(combX1.shape[1])()    
    mn_bayes = MultinomialNB(alpha=0.5)
    xg_lin = xgboost.XGBClassifier(booster='gblinear', reg_lambda=0, eval_metric='error')
    xg_tree = xgboost.XGBClassifier(max_depth=5, objective='binary:logistic', eta=0.6)
    
    rbf_svm.fit(X1, y1)
    svd_clf.fit(svX1, y1)
    pn_svm.fit(X1, y1)
    nn.fit(combX1, y1, epochs=30, batch_size=20, verbose=False)
    mn_bayes.fit(TfidX1, y1)
    xg_lin.fit(TfidX1, y1)
    xg_tree.fit(TfidX1, y1)
    
    rbf_svm_pred = rbf_svm.predict(X2)
    svd_clf_pred = svd_clf.predict(svX2)
    pn_svm_pred = pn_svm.predict(X2)
    nn_pred_sig = nn.predict(combX2)
    nn_pred = np.round(nn_pred_sig)[:,0]
    bayes_pred = mn_bayes.predict(TfidX2)
    xg_lin_pred = xg_lin.predict(TfidX2)
    xg_tree_pred = xg_tree.predict(TfidX2)
    
    preds = [rbf_svm_pred, svd_clf_pred, pn_svm_pred, nn_pred, bayes_pred, xg_lin_pred, xg_tree_pred]
    preds = np.array(preds)

    return preds, y2

In [12]:
def make_test_preds(X, y, Xtest):
    TfidX = gen_TfidX(X)
    svX = gen_svX(X)
    combX = gen_combX(X, TfidX, svX)
    
    svXtest = gen_svX(Xtest)
    TfidXtest = gen_TfidX(Xtest)
    combXtest = gen_combX(Xtest, TfidXtest, svXtest)
    
    rbf_svm = SVC(C=1.5, gamma=0.015)    
    svd_clf = SVC(C=50, gamma=0.2)
    pn_svm = SVC(C=0.05, coef0=10, degree=3)
    nn = create_baseline(combX.shape[1])()    
    mn_bayes = MultinomialNB(alpha=0.5)
    xg_lin = xgboost.XGBClassifier(booster='gblinear', reg_lambda=0, eval_metric='error')
    xg_tree = xgboost.XGBClassifier(max_depth=5, objective='binary:logistic', eta=0.6)
    
    rbf_svm.fit(X, y)
    svd_clf.fit(svX, y)
    pn_svm.fit(X, y)
    nn.fit(combX, y, epochs=30, batch_size=20, verbose=False)
    mn_bayes.fit(TfidX, y)
    xg_lin.fit(TfidX, y)
    xg_tree.fit(TfidX, y)
    
    rbf_svm_pred = rbf_svm.predict(Xtest)
    svd_clf_pred = svd_clf.predict(svXtest)
    pn_svm_pred = pn_svm.predict(Xtest)
    nn_pred_sig = nn.predict(combXtest)
    nn_pred = np.round(nn_pred_sig)[:,0]
    bayes_pred = mn_bayes.predict(TfidXtest)
    xg_lin_pred = xg_lin.predict(TfidXtest)
    xg_tree_pred = xg_tree.predict(TfidXtest)
    
    preds = [rbf_svm_pred, svd_clf_pred, pn_svm_pred, nn_pred, bayes_pred, xg_lin_pred, xg_tree_pred]
    preds = np.array(preds)

    return preds

In [29]:
def make_pred_df(test_pred):
    preds = []
    for pred in test_pred:
        preds.append(int(pred))
    out_df = pd.DataFrame({'Prediction':preds})
    out_df.index += 1
    return out_df

In [13]:
test_preds = make_test_preds(X, y, Xtest)

In [None]:
fi = open('TestEnsemblePreds.dat', 'wb')
pickle.dump(test_preds, fi)

In [48]:
lr = xgboost.XGBClassifier(max_depth=3, eta=0.3)

In [53]:
lr_preds = []
for i in range(0, 41):
    fi = open('EnsemblePred_{}.dat'.format(0), 'wb')
    runpred, y = pickle.load(open('EnsemblePred_11.dat', 'rb'))
    runpred = runpred[:,0:3]
    lr.fit(runpred.T, y)
    lr_preds.append(lr.predict(test_preds[:,0:3].T))

XGBoostError: b'[11:18:06] src/objective/regression_obj.cc:44: Check failed: preds.size() == info.labels.size() (3 vs. 4000) labels are not correctly providedpreds.size=3, label.size=4000\n\nStack trace returned 6 entries:\n[bt] (0) 0   libxgboost.dylib                    0x0000001a0d43ffc8 _ZN4dmlc15LogMessageFatalD2Ev + 40\n[bt] (1) 1   libxgboost.dylib                    0x0000001a0d4ad347 _ZN7xgboost3obj10RegLossObjINS0_22LogisticClassificationEE11GetGradientERKNSt3__16vectorIfNS4_9allocatorIfEEEERKNS_8MetaInfoEiPNS5_INS_6detail18bst_gpair_internalIfEENS6_ISG_EEEE + 599\n[bt] (2) 2   libxgboost.dylib                    0x0000001a0d43c616 _ZN7xgboost11LearnerImpl13UpdateOneIterEiPNS_7DMatrixE + 1014\n[bt] (3) 3   libxgboost.dylib                    0x0000001a0d4554ef XGBoosterUpdateOneIter + 79\n[bt] (4) 4   _ctypes.cpython-36m-darwin.so       0x00000001064802c7 ffi_call_unix64 + 79\n[bt] (5) 5   ???                                 0x00007ffeea53dac0 0x0 + 140732829784768\n'

In [50]:
out_df = make_pred_df(np.median(lr_preds, axis=0))
out_df.to_csv('Ens_xg.txt', index_label='Id')

In [37]:
fi = open('EnsemblePred_{}.dat'.format(0), 'wb')
pickle.dump(out, fi)

In [41]:
test = pickle.load(open('EnsemblePred_11.dat', 'rb'))

In [42]:
test

(array([[ 0.,  1.,  0., ...,  1.,  0.,  1.],
        [ 1.,  1.,  1., ...,  0.,  0.,  0.],
        [ 1.,  1.,  1., ...,  1.,  1.,  1.],
        ..., 
        [ 0.,  1.,  0., ...,  1.,  0.,  1.],
        [ 0.,  1.,  0., ...,  1.,  0.,  1.],
        [ 0.,  1.,  0., ...,  1.,  1.,  1.]]),
 array([ 1.,  1.,  0., ...,  1.,  0.,  1.]))

In [43]:
test = pickle.load(open('EnsemblePred_12.dat', 'rb'))

In [44]:
test

(array([[ 1.,  0.,  0., ...,  0.,  0.,  1.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 1.,  1.,  1., ...,  1.,  1.,  1.],
        ..., 
        [ 1.,  0.,  0., ...,  0.,  0.,  1.],
        [ 1.,  0.,  0., ...,  0.,  0.,  1.],
        [ 1.,  1.,  0., ...,  0.,  0.,  0.]]),
 array([ 1.,  0.,  0., ...,  0.,  0.,  1.]))