In [67]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.svm import *
from sklearn.feature_selection import *
from sklearn.feature_extraction import *
from sklearn.naive_bayes import *
import xgboost
from scipy.sparse import *
from sklearn.decomposition import *
from sklearn.neural_network import *


%matplotlib inline

In [3]:
data = np.loadtxt('training_data.txt', skiprows=1)
y = data[:,0]
X = data[:,1:]

In [4]:
Xtest = np.loadtxt('test_data.txt', skiprows=1)

In [5]:
Xvecsum = np.sum(X, axis=1)

Tfid = text.TfidfTransformer(norm='l2')
Tfid.fit(X, y)
TfidXsparse = Tfid.transform(X)
TfidXtest = Tfid.transform(Xtest)
TfidX = np.asarray(csr_matrix.todense(TfidXsparse))
temp_arr = []
for i in range(0, len(TfidX)):
    temp_arr.append(np.append(TfidX[i], Xvecsum[i]))
TfidX_sum = np.array(temp_arr)

In [10]:
svd = TruncatedSVD(n_components=100, algorithm='arpack')
sv_tfidX = svd.fit_transform(TfidX)

In [10]:
binX = []
for i in range(0, len(X)):
    binX.append(X[i] > 0)
binX = np.array(binX)

In [11]:
vocab_count = np.sum(X, axis=0)

In [11]:
Xvecsum = np.sum(X, axis=1)
Xcomb = []

for i in range(0, len(X)):
    temp_arr = np.concatenate((TfidX[i], sv_tfidX[i]))
    temp_arr = np.append(temp_arr, Xvecsum[i])
    Xcomb.append(temp_arr)
Xcomb = np.array(Xcomb)


In [13]:
Xcomb.shape

(20000, 1101)

### Reducing feature space

In [14]:
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
slimX = model.transform(X)

lsvc = LinearSVC(C=0.05, penalty="l1", dual=False).fit(Xnorm, y)
model = SelectFromModel(lsvc, prefit=True)
slimXnorm = model.transform(Xnorm)

In [15]:
slimXnorm.shape

(20000, 246)

### Regularized logistic regression

In [8]:
def fit_lr(X, y):
    lr_clf = LogisticRegressionCV()
    lr_clf.fit(X, y)
    best_C = lr_clf.C_[0]

    lr_clf = LogisticRegression(C=best_C)
    kfold = StratifiedKFold(n_splits=5, shuffle=True)
    scores = cross_val_score(lr_clf, X, y, cv=kfold)
    return np.mean(scores)

In [17]:
fit_lr(Xnorm, y)

0.8515996924812308

In [14]:
fit_lr(Xcomb, y)

0.85089970501560663

### Linear-SVM

In [10]:
def linear_svmfit(X, y):
    svc_params = {'C': np.logspace(-4, 4, 7)}
    svc = LinearSVC(dual=False)
    search_clf = GridSearchCV(svc, svc_params)
    search_clf.fit(X, y)
    clf = search_clf.best_estimator_
    kfold = StratifiedKFold(n_splits=5, shuffle=True)
    scores = cross_val_score(clf, X, y, cv=kfold)
    return np.mean(scores)

In [375]:
linear_svmfit(binX, y)

0.84204977965311123

In [376]:
linear_svmfit(TfidX, y)

0.85119914267494645

### Neural-net

In [18]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.wrappers.scikit_learn import KerasClassifier

Using TensorFlow backend.


In [19]:
def create_baseline(input_N):
    def build_fn():
        model = Sequential()
        model.add(Dense(100, input_dim=input_N, kernel_initializer='normal', activation='relu'))
        model.add(Dropout(0.05))
        model.add(Dense(30, activation='relu'))
        model.add(Dropout(0.05))
        model.add(Dense(30, activation='relu'))
        model.add(Dropout(0.05))

        model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
        # Compile model
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        return model
    return build_fn

In [20]:
def test_NN(X, y):
    estimator = KerasClassifier(build_fn=create_baseline(X.shape[1]), epochs=5, batch_size=20, verbose=0)
    kfold = StratifiedKFold(n_splits=5, shuffle=True)
    results = cross_val_score(estimator, X, y, cv=kfold)
    return results.mean()

In [21]:
test_NN(Xcomb,y)

0.85044867954880343

### Gradient boosted trees

In [433]:
xg_tree = xgboost.XGBClassifier(max_depth=10, objective='binary:logistic', eta=0.6)
kfold = StratifiedKFold(n_splits=5, shuffle=True)
scores = cross_val_score(xg_tree, TfidX_sum, y, cv=kfold)
scores.mean()

0.82295144120009012

### Gradient boosted linear

In [22]:
xg_model = xgboost.XGBClassifier(booster='gblinear', reg_lambda=0, eval_metric='error', alpha=0.1)
kfold = StratifiedKFold(n_splits=5, shuffle=True)
scores = cross_val_score(xg_model, Xcomb, y, cv=kfold)
scores.mean()

0.84674965490935339

In [19]:
xg_model = xgboost.XGBClassifier(booster='gblinear', reg_lambda=0, eval_metric='error')
kfold = StratifiedKFold(n_splits=5, shuffle=True)
scores = cross_val_score(xg_model, sv_tfidX, y, cv=kfold)
scores.mean()

0.82809961642497609

In [90]:
xgtree = xgboost.XGBClassifier()
xgtree.fit(TfidX, y)
kfold = StratifiedKFold(n_splits=10, shuffle=True)
scores = cross_val_score(xgtree, TfidX, y, cv=kfold)

In [91]:
scores.mean()

0.80035241990060491

In [102]:
xg_model = xgboost.XGBClassifier(booster='gblinear', reg_lambda=0, 'objective':'binary:logistic')
xg_model.fit(TfidX, y)
test_pred = xg_model.predict(TfidXtest)

In [119]:
def make_pred_df(test_pred):
    preds = []
    for pred in test_pred:
        preds.append(int(pred))
    out_df = pd.DataFrame({'Prediction':preds})
    out_df.index += 1
    return out_df

In [120]:
out_df = make_pred_df(test_pred)
out_df.to_csv('XGlinear_Tfid_01.txt', index_label='Id')

### Naive Bayes

In [390]:
mn_bayes = MultinomialNB(alpha=0.5)
kfold = StratifiedKFold(n_splits=5, shuffle=True)
scores = cross_val_score(mn_bayes, TfidX, y, cv=kfold)
scores.mean()

0.83019890401555652

0.81569986570624164

### Restricted boltzmann machine

### Ensemble models (not worth it?)

In [45]:
X1, X2, y1, y2 = train_test_split(X, y, test_size=0.25)
partial_Tfid = text.TfidfTransformer()
partial_Tfid.fit(X1, y1)
tfX1 = partial_Tfid.transform(X1)
tfX2 = partial_Tfid.transform(X2)
svd = TruncatedSVD(n_components=100, algorithm='arpack')
svX1 = svd.fit_transform(tfX1)
svX2 = svd.fit_transform(tfX2)

In [46]:
nn = Sequential()
nn.add(Dense(100, input_dim=tfX1.shape[1], kernel_initializer='normal', activation='relu'))
nn.add(Dropout(0.2))
nn.add(Dense(30, activation='relu'))
nn.add(Dropout(0.2))

nn.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
# Compile model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
nn.fit(tfX1, y1, epochs=10, batch_size=20, verbose=0)

<keras.callbacks.History at 0x1a2d3eb978>

In [49]:
nn.evaluate(tfX2, y2)



[0.63085801222324367, 0.83779999999999999]

In [50]:
nn_notfid = Sequential()
nn_notfid.add(Dense(100, input_dim=X1.shape[1], kernel_initializer='normal', activation='relu'))
nn_notfid.add(Dropout(0.2))
nn_notfid.add(Dense(30, activation='relu'))
nn_notfid.add(Dropout(0.2))

nn_notfid.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
# Compile model
nn_notfid.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
nn_notfid.fit(X1, y1, epochs=10, batch_size=20, verbose=0)

<keras.callbacks.History at 0x1a2e94d5f8>

In [56]:
nn_notfid.evaluate(X2, y2)



[0.81601017638444906, 0.83720000000000006]

In [58]:
nn_sv = Sequential()
nn_sv.add(Dense(60, input_dim=svX1.shape[1], kernel_initializer='normal', activation='relu'))
nn_sv.add(Dropout(0.1))

nn_sv.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
# Compile model
nn_sv.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
nn_sv.fit(svX1, y1, epochs=10, batch_size=20, verbose=0)

<keras.callbacks.History at 0x1a2d3f3e48>

In [59]:
nn_sv.evaluate(svX2, y2)



[1.3100045047760009, 0.49659999999999999]

In [68]:
xg_model = xgboost.XGBClassifier()
xg_model.fit(svX1, y1)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [69]:
xg_model.score(svX2, y2)

0.50980000000000003

In [37]:
nn_pred = np.round(nn.predict(tfX2))[:,0]
nn_notfid_pred = np.round(nn_notfid.predict(X2))[:,0]
nn_sv_pred = np.round(nn_sv.predict(sv_tfidX))[0]

In [29]:
ens_X = [nn_pred, nn_notfid_pred, nn_sv_pred]
ens_X = np.array(ens_X).T

In [33]:
np.mean(y2 == nn_sv_pred)

0.4924

In [283]:
ens_Xtest = [lr_predtest, nn_predtest, mn_predtest, xglin_predtest, xgtree_predtest, nn_notfid_predtest]
ens_Xtest = np.array(ens_Xtest).T

In [284]:
test_pred = xgtree_ens.predict(ens_Xtest)

In [286]:
out_df = make_pred_df(test_pred)
out_df.to_csv('Ensemble_01.txt', index_label='Id')