In [1]:
import numpy as np
import pandas as pd
import re
from collections import defaultdict
from itertools import product
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [40]:
# Both files are read with header=None, so the column names are supplied here.
train = pd.read_csv('train.txt', names=['word', 'target'], header=None)
test = pd.read_csv('linear_test.txt', names=['word'], header=None)

# Separate the training frame into its input column and its label column.
data, target = train['word'], train['target']

In [41]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,word
0,Аалто
1,ААР
2,Аара
3,Ааре
4,Аарон


In [3]:
# Decode raw UTF-8 byte strings to text so that later slicing (word[k: k+2])
# works per character rather than per byte.  Values that are already text are
# passed through unchanged: this keeps the cell safe to re-run (calling
# .decode on already-decoded Cyrillic text raises on Python 2) and turns it
# into a no-op on Python 3, where str has no .decode method.
data = data.apply(lambda s: s.decode('utf-8') if isinstance(s, bytes) else s)

In [73]:
# Lowercase Russian letters and the subset the author treats as consonants
# (note that the signs 'ъ' and 'ь' are part of `consonants`).
alphabet = u'йцукенгшщзхъфывапролджэячсмитьбюё'
consonants = u'йцкнгшщзхъфвпрлджчсмтьб'

# Every ordered 3-letter string over `consonants`.  Built as a real list so it
# can be iterated or measured more than once; a bare map() is a one-shot
# iterator on Python 3.
consonant_triples = [''.join(triple) for triple in product(consonants, repeat=3)]

In [74]:
# Count how often each 2-character and each 3-character substring occurs over
# all training words.  Unseen keys start at zero.
pairs = defaultdict(int)
all_triples = defaultdict(int)
for word in data:
    n_chars = len(word)
    for start in range(n_chars - 1):
        pairs[word[start:start + 2]] += 1
    for start in range(n_chars - 2):
        all_triples[word[start:start + 3]] += 1

In [75]:
# Minimum number of occurrences an n-gram needs to be kept as a feature.
min_count = 20

# Build `triples` from the freshly counted `all_triples`.  This cell used to
# read `triples` before any visible cell assigned it (its only definition was
# commented out), so it raised NameError on a fresh kernel and otherwise
# silently reused stale kernel state.
# NOTE(review): the restored definition also drops triples made only of
# consonants, as the commented-out line did -- confirm that exclusion is
# still wanted; remove the `consonant_only` condition if not.
consonant_only = set(consonant_triples)
triples = {k: v for k, v in all_triples.items()
           if v >= min_count and k not in consonant_only}
pairs = {k: v for k, v in pairs.items() if v >= min_count}

In [76]:
# Report how many 3-grams and 2-grams remain after the frequency filter.
print('%d %d' % (len(triples), len(pairs)))

4729 1310


In [77]:
# The feature vocabulary is every retained 3-gram together with every retained 2-gram.
features = set(triples).union(pairs)

In [78]:
# Give every feature a column index, following the set's iteration order.
ix = dict((feature, column) for column, feature in enumerate(features))

In [79]:
# Size of the feature vocabulary (number of columns in the design matrix).
len(features)

6039

In [80]:
# Build (value, row, column) triplets: one entry per occurrence of a known
# n-gram inside each training word.
ix_rows, ix_cols, ix_data = [], [], []

# Only scan substring lengths that actually occur among the features; any
# other length can never match, so iterating over it is wasted work.
ngram_lengths = sorted(set(len(feature) for feature in features))

for i, word in enumerate(data):
    for q in ngram_lengths:
        for k in range(len(word) - q + 1):
            # One dict lookup both tests membership and fetches the column.
            col = ix.get(word[k: k + q])
            if col is not None:
                ix_data.append(1)
                ix_rows.append(i)
                ix_cols.append(col)

In [81]:
# Total number of (word, n-gram occurrence) entries collected.
len(ix_data)

1316143

In [82]:
# Row indices and column indices, paired for the sparse-matrix constructor.
ij = [ix_rows, ix_cols]

In [83]:
# Assemble the sparse design matrix; repeated (row, column) entries are summed,
# so each cell holds an n-gram count.  The shape is passed explicitly: when it
# is inferred from the largest index present, trailing words that contain no
# known n-gram (or trailing feature columns that never occur) are silently
# dropped, which can leave X_train with fewer rows than `target`.
X_train = csr_matrix((ix_data, ij), shape=(len(data), len(ix)))

In [84]:
# Baseline classifier with library-default hyperparameters.
clf = LogisticRegression()

In [85]:
# Mean cross-validated ROC AUC of the baseline on the hand-built n-gram matrix.
print(cross_val_score(clf, X_train, target, scoring='roc_auc').mean())

0.762303325564


In [86]:
# Small English toy corpus.
# NOTE(review): `corpus` is not referenced by any other cell visible in this
# notebook -- looks like leftover scratch data; confirm before removing.
corpus = [
     'This is the first document.',
     'This is the second second document.',
     'And the third one.',
     'Is this the first document?',
]

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [5]:
# Hold out 25% of the training words.  The seed is fixed so that the split --
# and every score computed from it -- is identical on each run; without it the
# numbers recorded below cannot be reproduced.
x1, x2, y1, y2 = train_test_split(data, target, test_size=0.25, random_state=42)

In [33]:
def check_cv_score(df, ngram, C):
    """Print the mean cross-validated ROC AUC of a char n-gram logistic model.

    The model is a CountVectorizer (``analyzer='char_wb'``, raw counts)
    followed by LogisticRegression, evaluated with ``cross_val_score`` on the
    module-level ``x2`` / ``y2`` split.

    Parameters
    ----------
    df
        Passed to CountVectorizer as ``min_df``.
    ngram
        Passed to CountVectorizer as ``ngram_range``.
    C
        Passed to LogisticRegression as ``C``.

    Returns
    -------
    None
        The score is printed, not returned.
    """
    pipeline = Pipeline([
        ("vectorizer", CountVectorizer(min_df=df, analyzer='char_wb',
                                       binary=False, ngram_range=ngram)),
        ("algo", LogisticRegression(C=C)),
    ])
    # No separate pipeline.fit(x1, y1) here: cross_val_score clones the
    # estimator and refits it on every fold, so a prior fit never influences
    # the score and only costs time.
    # NOTE(review): as written, the score comes from models trained on folds
    # of x2 only (the 25% split); x1 is not used.  Confirm whether
    # cross-validating on x1/y1, or scoring a model fitted on x1 against x2,
    # was the intent.
    print(cross_val_score(pipeline, x2, y2, scoring='roc_auc').mean())

In [212]:
# Display a default-constructed CountVectorizer to inspect its default parameters.
CountVectorizer()

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [36]:
# Score the configuration min_df=1, 2- to 6-character n-grams, C=0.55.
check_cv_score(df=1, ngram=(2, 6), C=0.55)

0.878377438808


In [42]:
# Final model: character n-gram counts (2 to 6 characters, 'char_wb' analyzer)
# fed into a logistic regression with C=0.55.
vectorizer_step = ("vectorizer",
                   CountVectorizer(analyzer='char_wb', ngram_range=(2, 6), min_df=1))
classifier_step = ("algo", LogisticRegression(C=0.55))
ppl = Pipeline([vectorizer_step, classifier_step])

In [30]:

# Tune only the vectorizer's min_df, over 1, 11, ..., 101, by cross-validated
# ROC AUC on the x1/y1 split, using all available cores.
params = dict(vectorizer__min_df=np.arange(1, 102, 10))
grid_search = GridSearchCV(ppl, param_grid=params, scoring='roc_auc', n_jobs=-1)
grid_search.fit(x1, y1)

# Best mean CV score followed by the parameter setting that achieved it.
print('%s %s' % (grid_search.best_score_, grid_search.best_params_))

0.904304073241 {'vectorizer__min_df': 1}


In [45]:
# Refit the final pipeline on the full training set (not just the x1 split).
ppl.fit(data, target)

Pipeline(steps=[('vectorizer', CountVectorizer(analyzer='char_wb', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 6), preprocessor=None, stop_words=None,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [46]:
# Predict a class label for every test word.
# NOTE(review): model selection above used scoring='roc_auc', which ranks
# continuous scores, while predict() yields hard labels -- confirm whether the
# submission should carry predict_proba output instead.
res = ppl.predict(test['word'])

In [53]:
# Load the sample answer file to reuse its layout for the submission.
# NOTE(review): assumes its rows line up one-to-one with linear_test.txt -- confirm.
example = pd.read_csv('ans_example.txt')

In [55]:
# Overwrite (or create) the 'Answer' column with the model's predictions.
example['Answer'] = res

In [57]:
# Write the submission without the index column.
# NOTE(review): the file is named .tsv but is written comma-separated
# (sep=',') -- confirm which format the consumer expects.
example.to_csv("linear1.tsv", sep=',', index=False)

In [None]:
# Best scores and parameters printed by earlier grid-search runs (kept for reference):
# 0.905322442304 {'vectorizer__ngram_range': (2, 6)}
# 0.905821481113 {'algo__C': 0.55000000000000004}