In [32]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import time
import re
import timeit

from multiprocessing import Pool
from sklearn import cross_validation
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, BaggingClassifier
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.util import ngrams
from scipy.sparse import lil_matrix
from sklearn.decomposition import TruncatedSVD

## Чтение данных

In [3]:
def read_data(file_name):
    data = []
    with open(file_name, 'r') as f:
        for line in f:
            line = line[:-1].replace(' ', '')
            data.append(line.split(','))
    return data

In [4]:
train = pd.DataFrame(read_data('linear_train.txt'), columns=['Name', 'Flag'])
test = pd.DataFrame(read_data('linear_test.txt'), columns=['Name'])
sample = read_data('linear_ans_example.txt')
sample_submission = pd.DataFrame(sample[1:], columns=sample[0])

## Посмотрим на данные

In [5]:
train.head()

Unnamed: 0,Name,Flag
0,Аалтонен,1
1,Аар,0
2,Аарон,0
3,ААРОН,0
4,Аарона,0


In [6]:
test.head()

Unnamed: 0,Name
0,﻿Аалто
1,ААР
2,Аара
3,Ааре
4,Аарон


In [7]:
sample_submission.head()

Unnamed: 0,Id,Answer
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0


## Cоставим словари для ответов на уже известных данных

In [8]:
train_dict = {
    k : int(v)
    for k, v in train.values
}

In [9]:
train_dict

{'Аалтонен': 1,
 'Аар': 0,
 'Аарон': 0,
 'ААРОН': 0,
 'Аарона': 1,
 'Аароне': 0,
 'Ааронов': 0,
 'Аахена': 0,
 'Абабков': 1,
 'абажур': 0,
 'абажуром': 0,
 'абажуры': 0,
 'Абажуры': 0,
 'абак': 0,
 'абаками': 0,
 'Абакана': 0,
 'абаком': 0,
 'Абакумов': 1,
 'Абалкина': 1,
 'Абатуровым': 1,
 'Абашев': 1,
 'Абашидзе': 1,
 'Абашкина': 1,
 'Аббас': 1,
 'АББАС': 1,
 'Аббаса': 1,
 'Аббасов': 1,
 'аббатами': 0,
 'аббате': 0,
 'аббатиса': 0,
 'Аббатиса': 0,
 'аббатисе': 0,
 'аббатисой': 0,
 'Аббатисой': 0,
 'аббатом': 0,
 'аббатств': 0,
 'аббатства': 0,
 'аббатствами': 0,
 'Аббатстве': 0,
 'аббатству': 0,
 'Абботом': 1,
 'аббревиатура': 1,
 'аббревиатуры': 0,
 'Абвилю': 0,
 'Абделем': 0,
 'Абдрашитов': 1,
 'Абдул': 0,
 'Абдуле': 0,
 'АБДУЛЛА': 0,
 'АБДУЛЛАЕВЫХ': 1,
 'Абдулле': 0,
 'Абдуллу': 0,
 'Абдулова': 1,
 'Абдуловна': 0,
 'Абдуловым': 1,
 'Абдулом': 0,
 'Абдурахмана': 0,
 'Абдурахманов': 0,
 'Абдурахманович': 0,
 'Абдурахманом': 0,
 'Абелев': 1,
 'Абелевы': 1,
 'АБЕЛЕВЫ': 1,
 'АБЕЛЕВЫХ':

In [10]:
unique_name_train = set(train['Name'].get_values())
unique_name_test = set(test['Name'].get_values())
print('train size = ', len([x for x in unique_name_train]))
print('test size = ', len([x for x in unique_name_test]))

train size =  100683
test size =  186347


In [11]:
print('Same data of train and test = ', 
      np.array([x for x in unique_name_test.intersection(unique_name_train)]).size)

Same data of train and test =  2785


Незначительно повлияет на результат

## Теория: если маленькой буквы, то не фамилия

In [12]:
buf_train = train.copy()
for i in buf_train.index:
    if not(re.search(u'[А-Я]', str(buf_train['Name'][i])[0]) is None 
           and int(buf_train['Flag'][i]) == 1):
        buf_train.drop([i], axis=0, inplace=True)

In [13]:
buf_train.head()

Unnamed: 0,Name,Flag
43,аббревиатура,1
446,автокаско,1
475,автоматизация,1
486,автоматов,1
658,агент,1


Посмотрим какую часть выборки это занимает

In [14]:
size = 0
for i in train.index:
    if re.search(u'[А-Я]', str(train['Name'][i])[0]) is None:
        size +=1
print(size)

55721


Теория оправдалась

Посмотрим какую часть выборки это покроет(забъем на пересечения с инфой от словарей)

In [15]:
size = 0
for i in test.index:
    if re.search(u'[А-Я]', str(test['Name'][i])[0]) is None:
        size +=1
print(size)

103573


## Теория: если есть большая буква не первая, то не фамилия

In [16]:
buf_train = train.copy()
for i in buf_train.index:
    if not(re.search(u'[А-Я]', str(buf_train['Name'][i])[1:]) is not None 
           and int(buf_train['Flag'][i]) == 1):
        buf_train.drop([i], axis=0, inplace=True)

In [17]:
buf_train.head()

Unnamed: 0,Name,Flag
25,АББАС,1
51,АБДУЛЛАЕВЫХ,1
64,АБЕЛЕВЫ,1
65,АБЕЛЕВЫХ,1
270,АВГУСТ,1


Посмотрим какую часть выборки это занимает

In [18]:
size = 0
for i in train.index:
    if re.search(u'[А-Я]', str(train['Name'][i])[1:]) is not None:
        size +=1
print(size)

9837


Теория оправдалась

Посмотрим какую часть выборки это покроет(забъем на пересечения с инфой от словарей и первой теорией)

In [19]:
size = 0
for i in test.index:
    if re.search(u'[А-Я]', str(test['Name'][i])[1:]) is not None:
        size +=1
print(size)

18584


А обе теории

In [20]:
size = 0
for i in test.index:
    if (re.search(u'[А-Я]', str(test['Name'][i])[1:]) is not None
        or re.search(u'[А-Я]', str(test['Name'][i])[0]) is None):
        size +=1
print(size)

122147


## Теория: если есть не буквенный символ, то это не фамилия

In [21]:
re.search('[^а-яА-Я]', 'ыdя')

<_sre.SRE_Match object; span=(1, 2), match='d'>

In [22]:
buf_train = train.copy()
for i in buf_train.index:
    if not(re.search('[^а-яА-Я]', str(buf_train['Name'][i])[:]) is not None 
           and int(buf_train['Flag'][i]) == 1):
        buf_train.drop([i], axis=0, inplace=True)

In [23]:
buf_train.head()

Unnamed: 0,Name,Flag
1318,Аксёновым,1
1767,Алфёрова,1
1829,Альма-Тадема,1
1901,Алёшина,1
1902,Алёшкиной,1


Посмотрим какую часть выборки это занимает

In [24]:
size = 0
for i in train.index:
    if re.search('[^а-яА-Я]', str(train['Name'][i])[:]) is not None:
        size +=1
print(size)

1673


Теория оправдалась

Посмотрим какую часть выборки это покроет(забъем на пересечения с инфой от словарей и первой теорией)

In [25]:
size = 0
for i in test.index:
    if re.search(u'[^а-яА-Я]', str(test['Name'][i])[:]) is not None:
        size +=1
print(size)

3191


А все теории

In [26]:
size = 0
for i in test.index:
    if (re.search(u'[А-Я]', str(test['Name'][i])[1:]) is not None
        or re.search(u'[А-Я]', str(test['Name'][i])[0]) is None
        or re.search(u'[^а-яА-Я]', str(test['Name'][i])[:]) is not None):
        size +=1
print(size)

122726


Примерно две трети ответов уже знаем, а для оставшихся данных будем использовать линейную модель

Реализуем функцию, которая будет отсекать заведомо известные "не фамилии"

In [33]:
def not_surname(word):
    return (re.search(u'[А-Я]', word[1:]) is not None
            or re.search(u'[А-Я]', word[0]) is None
            or re.search(u'[^а-яА-Я]', word[:]) is not None)

In [38]:
print(not_surname((test['Name'][1])), (test['Name'][1]))

True ААР


## Линейная модель

### Подготовка признаков(n-grams)

In [29]:
[x for x in ngrams('sentence word', 3)]

[('s', 'e', 'n'),
 ('e', 'n', 't'),
 ('n', 't', 'e'),
 ('t', 'e', 'n'),
 ('e', 'n', 'c'),
 ('n', 'c', 'e'),
 ('c', 'e', ' '),
 ('e', ' ', 'w'),
 (' ', 'w', 'o'),
 ('w', 'o', 'r'),
 ('o', 'r', 'd')]

In [30]:
ngrams('sentence word', 3)    

<generator object ngrams at 0x7fbe23223468>

Составим множества n-gamm

In [102]:
lm = {n:set() for n in range(1,4)}
def extract_n_grams(word):
    for n in range(1,4):
        ngram = ngrams(word, n)
        for item in ngram:
            lm[n].add(item)

for i in train.index:
    if not not_surname(train['Name'][i]):
        extract_n_grams(train['Name'][i])
        
for i in test.index:
    if not not_surname(test['Name'][i]):
        extract_n_grams(test['Name'][i])



In [103]:
lm[3]

{('Д', 'у', 'п'),
 ('а', 'ю', 'р'),
 ('л', 'э', 'т'),
 ('П', 'ы', 'ш'),
 ('Р', 'о', 'ч'),
 ('б', 'е', 'ц'),
 ('Л', 'и', 'к'),
 ('Х', 'о', 'п'),
 ('Д', 'е', 'м'),
 ('в', 'я', 'т'),
 ('м', 'т', 'е'),
 ('Г', 'о', 'я'),
 ('х', 'л', 'ы'),
 ('Е', 'в', 'р'),
 ('д', 'м', 'э'),
 ('Б', 'и', 'ч'),
 ('ц', 'з', 'ы'),
 ('о', 'щ', 'а'),
 ('Ч', 'а', 'и'),
 ('э', 'т', 'с'),
 ('Н', 'е', 'е'),
 ('а', 'ц', 'п'),
 ('и', 'о', 'а'),
 ('д', 'и', 'б'),
 ('Щ', 'у', 'п'),
 ('З', 'у', 'н'),
 ('д', 'в', 'л'),
 ('с', 'у', 'е'),
 ('К', 'и', 'л'),
 ('Р', 'у', 'к'),
 ('Г', 'р', 'ы'),
 ('м', 'л', 'ю'),
 ('л', 'е', 'б'),
 ('з', 'г', 'у'),
 ('ш', 'ь', 'е'),
 ('М', 'б', 'о'),
 ('ч', 'ф', 'и'),
 ('н', 'м', 'у'),
 ('У', 'м', 'н'),
 ('п', 'о', 'г'),
 ('с', 'а', 'ш'),
 ('м', 'а', 'м'),
 ('Ф', 'а', 'ш'),
 ('Ш', 'н', 'и'),
 ('Ш', 'р', 'и'),
 ('а', 'о', 'н'),
 ('у', 'д', 'д'),
 ('У', 'и', 'м'),
 ('Т', 'а', 'ч'),
 ('ф', 'е', 'и'),
 ('у', 'а', 'н'),
 ('о', 'а', 'м'),
 ('д', 'о', 'й'),
 ('В', 'а', 'б'),
 ('й', 'е', 'й'),
 ('е', 'ш'

In [56]:
print(type(lm.keys()))

<class 'dict_keys'>


In [104]:
def get_array_from_cont(c):
    return [x for x in c]

features = []
for n in lm.keys():
    features.extend(lm[n])
features = np.array(features)

In [105]:
features_dict = {features[i]: i for i in range(len(features))}

In [106]:
print(features[:3], features[-3:])
print(features_dict[features[2]])

[('а',) ('И',) ('р',)] [('р', 'б', 'и') ('Ш', 'к', 'о') ('М', 'э', 'с')]
2


## Создание разреженной tf-idf таблицы признаков

In [78]:
ex = csr_matrix((4, 3))

In [79]:
ex[1,1] = 2
ex[2,2] = 3
ex[3,2] = 5
ex[0,1] = 1
ex[1,0] = 4

ex.toarray()



array([[ 0.,  1.,  0.],
       [ 4.,  2.,  0.],
       [ 0.,  0.,  3.],
       [ 0.,  0.,  5.]])

In [81]:
ex.sum(axis=1)

matrix([[ 1.],
        [ 6.],
        [ 3.],
        [ 5.]])

In [117]:
[ex[:, col].count_nonzero() for col in range(ex.shape[1])]

[1, 2, 2]

In [85]:
ex[:,1].count_nonzero()

2

In [122]:
class Vectorizer:
    @staticmethod
    def count_fit_transform(data, features, features_dict, lm):
        '''
        Params
        ------
        data : array of words after cut of not surnames
        
        Returns
        -------
        X : scipy.sparse matrix, shape = (n_samples, n_features)
            Document-term matrix.
        '''
        X = lil_matrix((data.size, features.size))
        for i in range(data.size):
            for n in lm.keys():
                for gr in ngrams(data[i], n):
                    X[i, features_dict[gr]] += 1
        return X
              
    @staticmethod
    def tfidf_transform(X):
        '''
        Params
        ------
        X : sparse matrix after count_fit_transform
        
        Returns
        -------
        X : scipy.sparse matrix, shape = (n_samples, n_features)
            Document-term tfidf matrix.
        '''
        print(1)
        # amount of nonzero terms in each colomn
        c_nonzero = [X[:, col].count_nonzero() for col in range(X.shape[1])]
        print(2)
        # sums of amounts of terms in each row
        r_sum = [x.item((0, 0)) for x in X.sum(axis=1)]
        print(3)
        
        doc_size = float(X.shape[0])
        
        print(4)
        # tfidf
        for i, j in X.nonzero():
            X[i, j] = X[i, j]*np.log(doc_size/c_nonzero[j]) / r_sum[i]
            
        print(5)
        return X

Подготовка обучающих данных

In [89]:
%%time
buf_train = train.copy()
for i in buf_train.index:
    if not_surname(buf_train['Name'][i]):
        buf_train.drop([i], axis=0, inplace=True)

CPU times: user 8min 49s, sys: 136 ms, total: 8min 49s
Wall time: 8min 49s


In [93]:
data = np.array([buf_train['Name'][i] for i in buf_train.index])
target = np.array([buf_train['Flag'][i] for i in buf_train.index])

In [124]:
%time X = Vectorizer.count_fit_transform(data, features, features_dict, lm)
#%time X = Vectorizer.tfidf_transform(X)



CPU times: user 12.7 s, sys: 40 ms, total: 12.7 s
Wall time: 12.7 s


In [125]:
X.shape

(35574, 14152)

## Обучение модели

In [158]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, target, test_size = 0.01, random_state=42)

In [159]:
%%time
model = LogisticRegression()
model.fit(X_train, y_train)
print(accuracy_score(model.predict(X_train), y_train))
print(accuracy_score(model.predict(X_test), y_test))

0.908200352093
0.873595505618
CPU times: user 2.13 s, sys: 28 ms, total: 2.16 s
Wall time: 1.1 s


# Предсказание результатов

In [133]:
%%time
preds = np.zeros(test.shape[0])
buf_test = test.copy()
for i in buf_test.index:
    if not_surname(buf_test['Name'][i]):
        buf_test.drop([i], axis=0, inplace=True)

CPU times: user 25min 31s, sys: 404 ms, total: 25min 31s
Wall time: 25min 33s


In [134]:
test_data = np.array([buf_test['Name'][i] for i in buf_test.index])

In [135]:
%time test_X = Vectorizer.count_fit_transform(test_data, features, features_dict, lm)
#%time test_X = Vectorizer.tfidf_transform(test_X)



CPU times: user 23 s, sys: 72 ms, total: 23 s
Wall time: 23 s


In [162]:
model_preds = model.predict_proba(test_X)

In [164]:
model_preds[:2]

array([[ 0.82042442,  0.17957558],
       [ 0.80339182,  0.19660818]])

In [165]:
%%time
j = 0
for i in buf_test.index:
    preds[i] = model_preds[j][0]
    j += 1

CPU times: user 68 ms, sys: 0 ns, total: 68 ms
Wall time: 65.6 ms


# Запись результатов

In [167]:
sample_submission['Answer'] = np.double(preds)
print(sample_submission.head(5))
print(sample_submission[sample_submission['Answer'] < 0])
sample_submission['Answer'] = sample_submission['Answer'].map(lambda x: x if x > 0 else 0.0)
sample_submission.to_csv("baseline_submission.tsv", sep=',', index=False)

  Id    Answer
0  0  0.000000
1  1  0.000000
2  2  0.820424
3  3  0.803392
4  4  0.811601
Empty DataFrame
Columns: [Id, Answer]
Index: []


Данная отправка имеет плохой результат из-за константных ответов(ROC-AUC = 0.65)

## Чтение данных

In [2]:
def read_data(file_name):
    data = []
    with open(file_name, 'r') as f:
        for line in f:
            line = line[:-1].replace(' ', '')
            data.append(line.split(','))
    return data

In [3]:
train = pd.DataFrame(read_data('linear_train.txt'), columns=['Name', 'Flag'])
test = pd.DataFrame(read_data('linear_test.txt'), columns=['Name'])
sample = read_data('linear_ans_example.txt')
sample_submission = pd.DataFrame(sample[1:], columns=sample[0])

Приведем все к нихнему регистру

In [4]:
train['Name'] = train['Name'].map(lambda x: x.lower())
test['Name'] = test['Name'].map(lambda x: x.lower())

In [5]:
test.head()

Unnamed: 0,Name
0,﻿аалто
1,аар
2,аара
3,ааре
4,аарон


## Cоставим словари для ответов на уже известных данных

In [6]:
train_dict = {
    k : int(v)
    for k, v in train.values
}

In [7]:
train_dict

{'аалтонен': 1,
 'аар': 0,
 'аарон': 0,
 'аарона': 1,
 'аароне': 0,
 'ааронов': 0,
 'аахена': 0,
 'абабков': 1,
 'абажур': 0,
 'абажуром': 0,
 'абажуры': 0,
 'абак': 0,
 'абаками': 0,
 'абакана': 0,
 'абаком': 0,
 'абакумов': 1,
 'абалкина': 1,
 'абатуровым': 1,
 'абашев': 1,
 'абашидзе': 1,
 'абашкина': 1,
 'аббас': 1,
 'аббаса': 1,
 'аббасов': 1,
 'аббатами': 0,
 'аббате': 0,
 'аббатиса': 0,
 'аббатисе': 0,
 'аббатисой': 0,
 'аббатом': 0,
 'аббатств': 0,
 'аббатства': 0,
 'аббатствами': 0,
 'аббатстве': 0,
 'аббатству': 0,
 'абботом': 1,
 'аббревиатура': 1,
 'аббревиатуры': 0,
 'абвилю': 0,
 'абделем': 0,
 'абдрашитов': 1,
 'абдул': 0,
 'абдуле': 0,
 'абдулла': 0,
 'абдуллаевых': 1,
 'абдулле': 0,
 'абдуллу': 0,
 'абдулова': 1,
 'абдуловна': 0,
 'абдуловым': 1,
 'абдулом': 0,
 'абдурахмана': 0,
 'абдурахманов': 0,
 'абдурахманович': 0,
 'абдурахманом': 0,
 'абелев': 1,
 'абелевы': 1,
 'абелевых': 1,
 'абеля': 0,
 'абердин': 0,
 'абернети': 1,
 'аберраций': 0,
 'аберрация': 0,
 'аберр

In [8]:
unique_name_train = set(train['Name'].get_values())
unique_name_test = set(test['Name'].get_values())
print('train size = ', len([x for x in unique_name_train]))
print('test size = ', len([x for x in unique_name_test]))

train size =  88560
test size =  148043


In [9]:
print('Same data of train and test = ', 
      np.array([x for x in unique_name_test.intersection(unique_name_train)]).size)

Same data of train and test =  34466


## Линейная модель

### Подготовка признаков(n-grams)

In [10]:
[x for x in ngrams('sentence word', 3)]

[('s', 'e', 'n'),
 ('e', 'n', 't'),
 ('n', 't', 'e'),
 ('t', 'e', 'n'),
 ('e', 'n', 'c'),
 ('n', 'c', 'e'),
 ('c', 'e', ' '),
 ('e', ' ', 'w'),
 (' ', 'w', 'o'),
 ('w', 'o', 'r'),
 ('o', 'r', 'd')]

In [11]:
ngrams('sentence word', 3)    

<generator object ngrams at 0x7f16d0de9c50>

Составим множества n-gamm

In [12]:
lm = {n:set() for n in range(1,7)}
def extract_n_grams(word):
    for n in range(1,7):
        ngram = ngrams(word, n)
        for item in ngram:
            lm[n].add(item)

for i in train.index:
        extract_n_grams(train['Name'][i])
        
for i in test.index:
        extract_n_grams(test['Name'][i])



In [13]:
lm[3]

{('е', 'х', 'с'),
 ('в', 'ш', 'а'),
 ('к', 'о', 'л'),
 ('р', 'е', 'к'),
 ('с', 'о', 'к'),
 ('д', 'н', 'ы'),
 ('р', 'э', 'й'),
 ('в', 'ш', 'е'),
 ('и', 'с', 'ф'),
 ('л', 'у', 'с'),
 ('о', 'р', 'я'),
 ('р', 'ф', 'р'),
 ('м', 'о', 'ш'),
 ('ш', 'в', 'ы'),
 ('с', 'б', 'ы'),
 ('о', 'т', 'е'),
 ('е', 'т', 'т'),
 ('л', 'ю', 'м'),
 ('ш', 'а', 'п'),
 ('е', 'п', 'о'),
 ('б', 'э', 'к'),
 ('ю', 'г', 'у'),
 ('ш', 'к', 'е'),
 ('з', 'ъ', 'ё'),
 ('а', 'л', 'в'),
 ('о', 'н', 'ц'),
 ('а', 'ц', 'п'),
 ('д', 'е', 'р'),
 ('л', 'я', 'у'),
 ('-', 'д', '`'),
 ('и', 'ч', 'е'),
 ('о', 'ц', 'и'),
 ('в', '-', 'п'),
 ('о', 'ц', 'п'),
 ('ю', '-', 'х'),
 ('э', 'й', 'с'),
 ('у', 'в', 'щ'),
 ('з', 'а', 'с'),
 ('и', 'к', 'ш'),
 ('ф', 'т', 'г'),
 ('е', 'й', 'к'),
 ('ц', 'и', 'и'),
 ('и', 'г', 'р'),
 ('ó', 'р', 'о'),
 ('ф', 'о', 'к'),
 ('я', 'м', 'б'),
 ('ф', 'и', 'е'),
 ('а', 'у', 'х'),
 ('р', 'с', 'м'),
 ('а', 'й', 'ю'),
 ('н', 'у', 'у'),
 ('т', 'ю', 'м'),
 ('т', 'л', 'у'),
 ('е', 'е', 'в'),
 ('-', 'а', 'б'),
 ('с', 'р'

In [14]:
print(type(lm.keys()))

<class 'dict_keys'>


In [15]:
def get_array_from_cont(c):
    return [x for x in c]

features = []
for n in lm.keys():
    features.extend(lm[n])
features = np.array(features)

In [16]:
features_dict = {features[i]: i for i in range(len(features))}

In [17]:
print(features[:3], features[-3:])
print(features_dict[features[2]])

[('д',) ('ó',) ('&',)] [('и', 'с', 'к', 'е', 'т', 'о') ('с', 'ъ', 'е', 'з', 'д', 'ы')
 ('б', 'о', 'б', 'ы', 'л', 'я')]
2


## Создание разреженной tf-idf таблицы признаков

In [18]:
ex = lil_matrix((40000, 3000))

In [19]:
ex[1,1] = 2
ex[2,2] = 3
ex[3,2] = 5
ex[0,1] = 1
ex[1,0] = 4

ex.toarray()

array([[ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 4.,  2.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  3., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [20]:
ex.sum(axis=1)

matrix([[ 1.],
        [ 6.],
        [ 3.],
        ..., 
        [ 0.],
        [ 0.],
        [ 0.]])

In [23]:
ex[:,1].count_nonzero()

2

In [24]:
%time ex[:,2].count_nonzero()
%time ex.getcol(2).count_nonzero()

CPU times: user 276 ms, sys: 4 ms, total: 280 ms
Wall time: 278 ms
CPU times: user 24 ms, sys: 0 ns, total: 24 ms
Wall time: 40.5 ms


2

In [25]:
def f(i, n_proc, X, c_nonzero):
    beg = i*np.floor(X.shape[1]/n_proc)
    end = (i + 1)*np.floor(X.shape[1]/n_proc)
    if i == n_proc - 1:
        end = X.shape[1]
        
    if beg >= X.shape[1]:
        return False
    
    for j in range(beg, end):
        c_nonzero[j] = X.getcol(j).count_nonzero()
        
    return True


class Vectorizer:
    @staticmethod
    def count_fit_transform(data, features, features_dict, lm):
        '''
        Params
        ------
        data : array of words after cut of not surnames
        
        Returns
        -------
        X : scipy.sparse matrix, shape = (n_samples, n_features)
            Document-term matrix.
        '''
        X = lil_matrix((data.size, features.size))
        for i in range(data.size):
            for n in lm.keys():
                for gr in ngrams(data[i], n):
                    X[i, features_dict[gr]] += 1
        return X
              
    @staticmethod
    def tfidf_transform(X):
        '''
        Params
        ------
        X : sparse matrix after count_fit_transform
        
        Returns
        -------
        X : scipy.sparse matrix, shape = (n_samples, n_features)
            Document-term tfidf matrix.
        '''        
        print(0)
        n_proc = 4
        pool = Pool(processes=n_proc)
        res = []
        # amount of nonzero terms in each colomn
        c_nonzero = np.zeros((X.shape[1]))
        
        print('01')
        start = timeit.timeit()
        for i in range(n_proc):
            res.append(pool.apply_async(f, [i, n_proc, X, c_nonzero]))
            
        finish  = timeit.timeit() 
        print('02', finish - start)
        
        start = timeit.timeit()
        for i in range(n_proc):
            answer = res[i].get(timeout=10)
            if answer is not True:
                print(answer)
        finish  = timeit.timeit() 
        print('1', finish - start)
                
        #%time c_nonzero = [X.getcol(col).count_nonzero() for col in range(X.shape[1])]
        print(2)
        # sums of amounts of terms in each row
        %time r_sum = [x.item((0, 0)) for x in X.sum(axis=1)]
        print(3)
        
        doc_size = float(X.shape[0])
        
        print(4)
        # tfidf
        for i, j in X.nonzero():
            X[i, j] = X[i, j]*np.log(doc_size/c_nonzero[j]) / r_sum[i]
            
        print(5)
        return X

Подготовка обучающих данных

In [26]:
%%time
buf_train = train.copy()

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.7 ms


In [27]:
data = np.array([buf_train['Name'][i] for i in buf_train.index])
target = np.array([buf_train['Flag'][i] for i in buf_train.index])

In [28]:
%time X = Vectorizer.count_fit_transform(data, features, features_dict, lm)
#%time X = Vectorizer.tfidf_transform(X)



CPU times: user 1min 7s, sys: 132 ms, total: 1min 7s
Wall time: 1min 7s


In [29]:
X.shape

(101408, 346803)

In [61]:
X.shape

(101408, 505811)

## Обучение модели

In [49]:
def print_test(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    
    vf = np.vectorize(lambda x: int(x))
    proba_train = model.predict_proba(X_train)[:,1]
    proba_test = model.predict_proba(X_test)[:,1]
    
    print('Train_acc = ', accuracy_score(model.predict(X_train), y_train))
    print('Test_acc = ', accuracy_score(model.predict(X_test), y_test))
    print('Train_roc = ', roc_auc_score(vf(y_train), proba_train))
    print('Test_roc = ', roc_auc_score(vf(y_test), proba_test))

In [59]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, target, test_size = 0.95, random_state=42)

In [73]:
model = LogisticRegression(C=0.04)
print_test(model, X_train, X_test, y_train, y_test)

Train_acc =  0.899802761341
Test_acc =  0.896312981378
Train_roc =  0.933943506236
Test_roc =  0.81653110239


## Обучение модели(TF-IDF)

In [74]:
trans = TfidfTransformer()
XX = trans.fit_transform(X, target)

In [75]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(XX, target, test_size = 0.95, random_state=42)

In [87]:
model = LogisticRegression(C=0.04)
print_test(model, X_train, X_test, y_train, y_test)

Train_acc =  0.89541935802
Test_acc =  0.895016455681
Train_roc =  0.864677524628
Test_roc =  0.811181093018


## Обучение модели(bagging)

In [115]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, target, test_size = 0.8, random_state=42, stratify=target
)

In [116]:
%%time
model = BaggingClassifier(base_estimator=LogisticRegression(C=5), n_estimators=100, n_jobs=3)
print_test(model, X_train, X_test, y_train, y_test)

Train_acc =  0.991617770327
Test_acc =  0.913851122314
Train_roc =  0.99976231947
Test_roc =  0.863257142932
CPU times: user 10.3 s, sys: 4.84 s, total: 15.2 s
Wall time: 8min 6s


# Предсказание результатов

In [117]:
%%time
preds = np.zeros(test.shape[0])
buf_test = test.copy()

CPU times: user 8 ms, sys: 12 ms, total: 20 ms
Wall time: 175 ms


In [110]:
test_data = np.array([buf_test['Name'][i] for i in buf_test.index])

In [111]:
%time test_X = Vectorizer.count_fit_transform(test_data, features, features_dict, lm)
test_XX = trans.fit_transform(test_X)
#%time test_X = Vectorizer.tfidf_transform(test_X)



CPU times: user 2min 12s, sys: 440 ms, total: 2min 13s
Wall time: 2min 25s


In [118]:
model_preds = model.predict_proba(test_X)

In [119]:
preds = model_preds[:, 0]

# Запись результатов

In [120]:
sample_submission['Answer'] = np.double(preds)
print(sample_submission.head(5))
print(sample_submission[sample_submission['Answer'] < 0])
sample_submission['Answer'] = sample_submission['Answer'].map(lambda x: x if x > 0 else 0.0)
sample_submission.to_csv("baseline_submission.tsv", sep=',', index=False)

  Id    Answer
0  0  0.976959
1  1  0.874981
2  2  0.873328
3  3  0.963534
4  4  0.799708
Empty DataFrame
Columns: [Id, Answer]
Index: []
