In [15]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import matplotlib.pyplot as pl
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

In [16]:
# The training file has no header row. After the transpose, data[0] is the
# first CSV column (iterated below as surnames) and data[1] is the second
# (used below as the labels).
data = np.transpose(pd.read_csv("linear_train.txt", header=None).values)

In [17]:
# Keep only the first column of the header-less test file, as a plain list.
test_data = [row[0] for row in np.array(pd.read_csv("linear_test.txt", header=None))]

In [18]:
# Vocabularies: n-gram -> integer id (filled in by add_in_dict below).
letter_dict, bigramm_dict, trigramm_dict = {}, {}, {}
# Inverse lookups: the n-gram stored at position k is the one whose id is k.
letter_index, bigramm_index, trigramm_index = [], [], []


def add_in_dict(d, l, elem):
    """Register ``elem`` in a vocabulary the first time it is seen.

    If ``d`` has no (non-None) entry for ``elem``, the element is given the
    next free id (the current size of ``d``) and appended to ``l``, so that
    ``l[d[elem]] == elem`` for every element added through this function.
    Both containers are mutated in place; nothing is returned.
    """
    if d.get(elem) is not None:
        # Already registered: keep the existing id.
        return
    d[elem] = len(d)
    l.append(elem)

# Build the three vocabularies from the training surnames in data[0].
for surname in data[0]:
    # At every start position that still fits a whole trigram, register the
    # trigram, bigram and single letter beginning there. The order matters:
    # ids are handed out by first appearance.
    for start in range(len(surname) - 2):
        for width, vocab, order in (
            (3, trigramm_dict, trigramm_index),
            (2, bigramm_dict, bigramm_index),
            (1, letter_dict, letter_index),
        ):
            add_in_dict(vocab, order, surname[start:start + width])
    # The loop above stops two characters before the end, so the trailing
    # letters and the final bigram are registered separately.
    add_in_dict(letter_dict, letter_index, surname[-1])
    if len(surname) >= 2:
        add_in_dict(letter_dict, letter_index, surname[-2])
        add_in_dict(bigramm_dict, bigramm_index, surname[-2:])

In [19]:
# Vocabulary sizes; get_features uses them as offsets to lay the trigram,
# bigram and letter id ranges end to end in one global feature index space.
tri_num, bi_num = len(trigramm_dict), len(bigramm_dict)

In [20]:
def get_features(data, valid_indexes):
    """Count selected letter/bigram/trigram occurrences for each surname.

    Every known n-gram has a global feature index: trigrams use their id from
    ``trigramm_dict`` directly, bigrams are shifted by ``tri_num`` and single
    letters by ``tri_num + bi_num``. N-grams missing from the vocabularies
    are ignored.

    Parameters
    ----------
    data : sequence of str
        Surnames to encode. An empty string produces an all-zero row.
    valid_indexes : sequence of int
        Global feature indices to keep. Column ``k`` of the result counts the
        n-gram whose global index is ``valid_indexes[k]``. If an index is
        repeated, only the column of its first occurrence is filled.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(data), len(valid_indexes))`` holding the
        occurrence counts.
    """
    # (n-gram width, vocabulary, offset of that vocabulary in the global index space)
    vocabularies = (
        (1, letter_dict, tri_num + bi_num),
        (2, bigramm_dict, tri_num),
        (3, trigramm_dict, 0),
    )

    # Global index -> output column, built once so each lookup is a dict hit
    # instead of a linear scan of the list. setdefault keeps the first
    # position of a repeated index, matching list.index().
    column_of = {}
    for column, feature_index in enumerate(valid_indexes):
        column_of.setdefault(feature_index, column)

    result = np.zeros([len(data), len(valid_indexes)])
    for i, surname in enumerate(data):
        for width, vocabulary, offset in vocabularies:
            # Every window of this width; the range is empty when the surname
            # is shorter than the window (including the empty string).
            for start in range(len(surname) - width + 1):
                local_id = vocabulary.get(surname[start:start + width])
                if local_id is None:
                    continue
                column = column_of.get(local_id + offset)
                if column is not None:
                    result[i, column] += 1
    return result

In [21]:
def get_important_features(indexes):
    """Return the entries of ``indexes`` that an L1-penalised fit keeps.

    Training features restricted to ``indexes`` are built with
    ``get_features`` from ``data[0]``; an ``SGDClassifier`` with
    ``loss='log'`` and ``penalty='l1'`` is fitted against ``data[1] + 1``
    cast to int. The result lists, in their original order, the indices whose
    coefficient in ``est.coef_[0]`` is non-zero.
    """
    labels = [int(label) for label in data[1] + 1]
    design = get_features(data[0], indexes)
    est = SGDClassifier(loss='log', penalty='l1')
    est.fit(design, labels)
    # One coefficient per column, and columns follow the order of `indexes`.
    return [index for index, coef in zip(indexes, est.coef_[0]) if coef != 0]

In [22]:
# Incremental feature selection. Start from every bigram and letter index,
# then feed the trigram index range in `parts_num` consecutive slices. After
# each addition only the features that get_important_features returns are
# carried over to the next round.
parts_num = 10
additional_indexes = [list(range(tri_num, tri_num + bi_num + len(letter_dict)))]
additional_indexes.extend(
    list(range(int(part * tri_num / parts_num),
               int((part + 1) * tri_num / parts_num)))
    for part in range(parts_num)
)

current_indexes = []
for new_ind in additional_indexes:
    current_indexes = get_important_features(current_indexes + new_ind)

In [23]:
# Count matrix for the training surnames, restricted to the selected features.
train_features = get_features(data=data[0], valid_indexes=current_indexes)

In [24]:
# Final model: same configuration as the one used during feature selection.
# NOTE(review): newer scikit-learn releases appear to have renamed this loss
# to 'log_loss' - confirm against the installed version before upgrading.
estimator = SGDClassifier(penalty='l1', loss='log')

In [25]:
# 5-fold cross-validation of the final model on the selected features.
cross_val_score(estimator, train_features, [int(label) for label in data[1] + 1], cv=5)

array([ 0.85607928,  0.82585544,  0.87205404,  0.89946255,  0.87747153])

In [26]:
# Fit the final model on the whole training set.
estimator.fit(train_features, [int(label) for label in data[1] + 1])

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='l1', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [27]:
# Encode the test surnames with the same selected feature columns.
test_features = get_features(data=test_data, valid_indexes=current_indexes)

In [28]:
# predict_proba returns one column per class, ordered as estimator.classes_.
# The submission takes a single score per row, so keep only the second column:
# the probability of the larger label (training labels are data[1] + 1).
# Assumes a binary target - TODO confirm against the label column.
final_answer = estimator.predict_proba(test_features)[:, 1]

In [29]:
# Use the example answer file as a template and set its 'Answer' column.
sample_submission = pd.read_csv('linear_ans_example.txt').assign(Answer=final_answer)
# The output is comma-separated even though the file name ends in .tsv.
sample_submission.to_csv("submission.tsv", index=False, sep=',')