In [39]:
import numpy as np
from scipy import sparse
from sklearn.metrics import f1_score, make_scorer, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

import matplotlib.pyplot as plt

In [2]:
def read_data_from_file(filename, shape):
    values = list()
    rows = list()
    cols = list()
    
    header = True
    for line in open(filename):
        if header:
            header = False
            continue
        row, col, value = [x for x in line.strip().split(',')]
        row, col = int(row), int(col)
        value = float(value)
        row -= 1
        col -= 1
        values.append(value)
        rows.append(row)
        cols.append(col)
        
    return sparse.csr_matrix((values, (rows, cols)), shape=shape)

In [3]:
X_train = read_data_from_file('data/X_train.csv', (15000, 30000)).astype(float)
X_test = read_data_from_file('data/X_test.csv', (15000, 30000)).astype(float)
print(X_train.shape, X_test.shape)

(15000, 30000) (15000, 30000)


In [4]:
from sklearn.preprocessing import scale

X_all = sparse.vstack([X_train, X_test])
X_all = scale(X_all, with_mean=False)
X_train = X_all[:15000, :]
X_test = X_all[15000:, :]

del X_all

In [5]:
def read_labels_from_file(filename, shape):
    labels = np.zeros(shape).astype(int)

    header = True
    for line in open(filename):
        if header:
            header = False
            continue
        row, indeces = line.strip().split(',')
        row = int(row) - 1
        indeces = [int(x) - 1 for x in indeces.split()]
        labels[row, indeces] = 1
    
    return labels

In [13]:
y_train = read_labels_from_file('data/y_train.csv', (15000, 98))

In [6]:
def labels_to_target(labels):
    lbls_size = labels.sum()
    indexes = np.zeros(lbls_size, dtype=np.int)
    target = np.zeros(lbls_size, dtype=np.int)
    
    k = 0
    for i, line in enumerate(labels):
        for j, l in enumerate(line):
            if l == 1:
                indexes[k] = i
                target[k] = j
                k += 1
        
    return indexes, target

In [7]:
def target_to_labels(indexes, target, label_size):
    un_idx = np.unique(indexes).size
    labels = np.zeros((un_idx, label_size), dtype=np.int)

    labels[(indexes, target)] = 1

    return labels

In [152]:
def write_labels_to_file(labels, filename):
    outfile = open(filename, 'w')
    print("Id,Labels", file=outfile)
    for i, line in enumerate(labels):
        elements = [str(x) for x in list(np.nonzero(line)[0] + 1)]
        
        print("%d,%s" % (i + 1, ' '.join(elements)), file=outfile)

In [76]:
indx_train, y_train_ml = labels_to_target(y_train)

In [101]:
while 7:
    indx_tr, indx_te, y_tr, y_te = train_test_split(indx_train, y_train_ml)
    if np.unique(y_tr).shape[0] == 98:
        break

In [102]:
np.unique(y_tr).shape[0]

98

### kNN

In [154]:
clf = KNeighborsClassifier(weights='distance', metric='cosine', algorithm='brute', n_neighbors=50, n_jobs=-1)

In [120]:
clf.fit(X_train[indx_tr], y_tr)

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=-1, n_neighbors=50, p=2,
           weights='distance')

In [121]:
y_pred_proba = clf.predict_proba(X_train[indx_te])

In [122]:
y_pred = y_pred_proba.argmax(axis=1)

In [133]:
f1_score(y_train[indx_te], target_to_labels(np.arange(y_te.shape[0]), y_pred, 98), average='samples')

0.5548744035390919

test

In [155]:
clf.fit(X_train[indx_train], y_train_ml)

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=-1, n_neighbors=50, p=2,
           weights='distance')

In [156]:
y_test_pred = clf.predict_proba(X_test)

In [157]:
y_test_pr = target_to_labels(np.arange(X_test.shape[0]), y_test_pred.argmax(axis=1), 98)

In [158]:
write_labels_to_file(y_test_pr, 'ans_knn.csv')

### XGB

In [54]:
from xgboost import XGBClassifier

In [138]:
clf = XGBClassifier(n_estimators=200)

In [139]:
clf.fit(X_train[indx_tr], y_tr)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [140]:
y_pred_proba = clf.predict_proba(X_train[indx_te])

In [141]:
f1_score(y_train[indx_te], target_to_labels(np.arange(y_te.shape[0]), y_pred_proba.argmax(axis=1), 98), average='samples')

0.53445288638166977

test

In [144]:
clf.fit(X_train[indx_train], y_train_ml)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [145]:
y_test_pred = clf.predict_proba(X_test)

In [146]:
y_test_pr = target_to_labels(np.arange(X_test.shape[0]), y_test_pred.argmax(axis=1), 98)

In [153]:
write_labels_to_file(y_test_pr, 'ans.csv')