In [1]:
import numpy as np
import pandas as pd

In [289]:
from scipy.special import expit

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss

In [2]:
data = np.load("train.dat.npz")
train_users = data["users"]
train_tokens = data["users_tokens"]
del(data)
data = np.load("test.dat.npz")
test_users = data["users"]
test_tokens = data["users_tokens"]
del(data)

In [3]:
TRAINING_SET_URL = "../twitter_train.csv"
TESTING_SET_URL = "../twitter_test.csv"
df_train = pd.read_csv(TRAINING_SET_URL)
df_test = pd.read_csv(TESTING_SET_URL)

In [10]:
def f(x):
    if x[0] == 1:
        return 1
    if x[1] == 1:
        return 2
    if x[2] == 1:
        return 3

Y = df_train[['is_1', 'is_2', 'is_3']].apply(f, axis=1).values

In [20]:
v = DictVectorizer()
X = v.fit_transform(np.append(train_tokens, test_tokens))

In [71]:
features_counts = np.asarray((X > 0).sum(axis=0)).ravel()
X_tmp = X.tocsc()[:, features_counts > 200].toarray()
train_x = X_tmp[:train_users.size]
test_x = X_tmp[train_users.size:]

In [190]:
tfidf = TfidfTransformer()
train_x_tfidf = tfidf.fit_transform(train_x)
test_x_tfidf = tfidf.fit_transform(test_x)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(train_x_tfidf, Y, test_size=0.33)

# Vowpal Wabbit

In [274]:
def save_to_vw(data, fname, target=None):
    with open(fname, 'w') as fout:
        for i, tokens in enumerate(data):
            text = ' '.join([word for word in tokens.keys() if not word.isnumeric() and len(word) > 3])
            if target is not None:
                fout.write('{0} |t {1}\n'.format(target[i], text))
            else:
                fout.write('|t {0}\n'.format(text))

In [469]:
def save_to_vw_tf(data, fname, target=None):
    with open(fname, 'w') as fout:
        for i, row in enumerate(data):
            s = ''
            if target is not None:
                s = '%s ' % target[i]
            for j, word in enumerate(row):
                s += '|{0} {1} '.format(j, word)
            
            fout.write(s + '\n')

In [396]:
def get_normalized_wv_pred(fname):
    pred_vw_raw = pd.read_table(fname, sep=' |:', engine='python', header=None)[[1, 3, 5]].values
    pred_vw = expit(pred_vw_raw)
    for i in range(pred_vw.shape[0]):
        s = pred_vw[i].sum()
        for j in range(pred_vw.shape[1]):
            pred_vw[i, j] = pred_vw[i, j] / s
        
    return pred_vw

In [489]:
train_idx, test_idx = train_test_split(np.arange(train_tokens.size), test_size=0.33)

In [496]:
save_to_vw_tf(train_x_tfidf.toarray(), "train.vw", Y)

In [497]:
save_to_vw_tf(test_x_tfidf.toarray(), "test.vw")

In [498]:
!vw -d train.vw -c -k -f model.vw --passes 50 --loss_function logistic --oaa 3 --nn 5

final_regressor = model.vw
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = train.vw.cache
Reading datafile = train.vw
num sources = 1
average    since         example     example  current  current  current
loss       last          counter      weight    label  predict features
0.333333   0.333333          3      3.0          3        3     2419
0.166667   0.000000          6      6.0          3        3     2419
0.272727   0.400000         11     11.0          3        3     2419
0.590909   0.909091         22     22.0          1        3     2419
0.590909   0.590909         44     44.0          1        3     2419
0.540230   0.488372         87     87.0          3        3     2419
0.540230   0.540230        174    174.0          1        3     2419
0.557471   0.574713        348    348.0          3        3     2419
0.574713   0.591954        696    696.0          1        1     2419
0.555316   0.535920       1392   1

In [499]:
!vw -d test.vw -i model.vw -t -r pred.txt

only testing
Num weight bits = 18
learning rate = 10
initial_t = 1
power_t = 0.5
raw predictions = pred.txt
using no cache
Reading datafile = test.vw
num sources = 1
average    since         example     example  current  current  current
loss       last          counter      weight    label  predict features
1.000000   1.000000          3      3.0         -1        3     2419
1.000000   1.000000          6      6.0         -1        3     2419
1.000000   1.000000         11     11.0         -1        1     2419
1.000000   1.000000         22     22.0         -1        1     2419
1.000000   1.000000         44     44.0         -1        1     2419
1.000000   1.000000         87     87.0         -1        1     2419
1.000000   1.000000        174    174.0         -1        3     2419
1.000000   1.000000        348    348.0         -1        3     2419
1.000000   1.000000        696    696.0         -1        1     2419
1.000000   1.000000       1392   1392.0         -1        1     2419


In [500]:
pred_vw = get_normalized_wv_pred("pred.txt")

# Gradient Boosting

In [202]:
from sklearn.ensemble import GradientBoostingClassifier

In [203]:
gbc = GradientBoostingClassifier()

In [207]:
gbc.fit(train_x_tfidf, Y)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [206]:
log_loss(y_test, gbc.predict_proba(x_test.toarray()))

0.80953145377040381

In [216]:
Y_pred = gbc.predict_proba(test_x_tfidf.toarray())

# Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

In [37]:
clf = LogisticRegression()

In [211]:
clf.fit(train_x_tfidf, Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
log_loss(y_test, clf.predict_proba(x_test))

In [212]:
Y_pred = clf.predict_proba(test_x_tfidf)

# Writing predict

In [7]:
pred_df = pd.DataFrame()

In [4]:
Y_pred = np.ones((test_users.size, 3))

In [8]:
pred_df['twitter_id'] = test_users
pred_df['is_1'] = Y_pred[:, 0]
pred_df['is_2'] = Y_pred[:, 1]
pred_df['is_3'] = Y_pred[:, 2]

In [9]:
pred_df.to_csv("pred.csv", index=False)