In [1]:
import pandas as pd
import numpy as np

## Metadata

In [2]:
PATH_TRAIN = "source/evo_train.csv"

In [3]:
target_label = u'GROUP_ID'
pred_labels = u'NAME'

## Preparing data ##

In [4]:
dtrain = pd.read_csv(PATH_TRAIN)

In [5]:
target = dtrain[target_label]
preds = dtrain[pred_labels]

### Partition

In [6]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    preds, target, 
    test_size=0.33, 
    random_state=42)

### Functions

In [7]:
def check_accuracy(clf, x, y, x_test, y_test):
    clf.fit(x, y)
    t = clf.predict(x_test)
    from sklearn.metrics import accuracy_score
    print "accuracy_score: %f" % accuracy_score(y_test, t)

## Model

### Baseline ###

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
baseline_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])
check_accuracy(baseline_clf, X_train, y_train, X_test, y_test)

accuracy_score: 0.812249


### Best model

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
clf_v4 = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer(use_idf=False)),
                   ('clf', SGDClassifier(loss='hinge', 
                                         penalty='l2',
                                         alpha=1e-3, 
                                         n_iter=5, 
                                         random_state=42))])
check_accuracy(clf_v4, X_train, y_train, X_test, y_test)

accuracy_score: 0.892210


## Best features

In [14]:
# SelectKBest

from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline


clf_v5 = Pipeline([('vect', CountVectorizer()),
#                    ('tfidf', TfidfTransformer(use_idf=False)),
                   ('ch2', SelectKBest(chi2, k=5000)),
                   ('clf', SGDClassifier(loss='hinge', 
                                         penalty='l2',
                                         alpha=1e-3, 
                                         n_iter=5, 
                                         random_state=42))
                  ])

# t = clf_v5.fit_transform(X_train, y_train)
check_accuracy(clf_v5, X_train, y_train, X_test, y_test)

accuracy_score: 0.872898


In [15]:
# SelectFromModel

from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import Pipeline


clf_v5 = Pipeline([('vect', CountVectorizer()),
#                    ('tfidf', TfidfTransformer(use_idf=False)),
                   ('select', SelectFromModel(LogisticRegression(C=0.01, penalty="l2", dual=False), threshold="0.01*mean")),
                   ('clf', SGDClassifier(loss='hinge', 
                                         penalty='l2',
                                         alpha=1e-3, 
                                         n_iter=5, 
                                         random_state=42))
                  ])

# t = clf_v5.fit_transform(X_train, y_train)
check_accuracy(clf_v5, X_train, y_train, X_test, y_test)

accuracy_score: 0.907779


In [30]:
# RFE

from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import Pipeline


clf_v5 = Pipeline([('vect', CountVectorizer()),
#                    ('tfidf', TfidfTransformer(use_idf=False)),
                   ('select', RFE(LogisticRegression(C=0.01, penalty="l2", dual=False), 
                                  n_features_to_select=10000,
                                  step=1000)),
                   ('clf', SGDClassifier(loss='hinge', 
                                         penalty='l2',
                                         alpha=1e-3, 
                                         n_iter=5, 
                                         random_state=42))
                  ])

# t = clf_v5.fit_transform(X_train, y_train)
check_accuracy(clf_v5, X_train, y_train, X_test, y_test)

accuracy_score: 0.903928


In [31]:
%%time
counter = CountVectorizer()

t, test, ty, testy = X_train, X_test, y_train, y_test
# train_test_split(
#     X_train, y_train, 
#     test_size=0.33, 
#     random_state=17)

counter.fit(t)

t = counter.transform(t)
test = counter.transform(test)

nrow = t.shape[0]
ncol = t.shape[1]
print nrow, ncol

import random
from deap import creator, base, tools, algorithms

creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()

# toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("attr_bool", lambda: np.random.random_sample() < 0.95)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=ncol)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

clf = SGDClassifier(loss='hinge', 
                    penalty='l2',
                    alpha=1e-3, 
                    n_iter=5,
                    random_state=42)

def get_accuracy(clf, x, y, x_test, y_test):
    clf.fit(x, y)
    t = clf.predict(x_test)
    from sklearn.metrics import accuracy_score
    res = accuracy_score(y_test, t)
    print "accuracy_score: %f; shape %d" % (res, x.shape[1])
    return res        

def evalAccuracy(individual):
    indices = np.array(individual).astype(bool)
    return get_accuracy(clf, t[:,indices], ty, test[:,indices], testy),

def mutate(individual):
    size = int(sum(individual) * 0.01)
    indices = np.random.randint(ncol, size=size)
    for i in indices:
        individual[i] = 0
    return individual,

def mate(ind1, ind2):
    return (creator.Individual(np.array(ind1).astype(bool) & np.array(ind2).astype(bool)),
           creator.Individual(np.array(ind1).astype(bool)))

toolbox.register("evaluate", evalAccuracy)
toolbox.register("mate", mate)
toolbox.register("mutate", mutate)
toolbox.register("select", tools.selTournament, tournsize=3)

population = toolbox.population(n=300)

NGEN=40
try:
    for gen in range(NGEN):
        offspring = algorithms.varAnd(population, toolbox, cxpb=0.5, mutpb=0.1)
        fits = toolbox.map(toolbox.evaluate, offspring)
        for fit, ind in zip(fits, offspring):
            ind.fitness.values = fit
        population = toolbox.select(offspring, k=len(population))
finally:
    top10 = tools.selBest(population, k=10)
    print top10[0]

37426 37474
accuracy_score: 0.895953; shape 35670
accuracy_score: 0.895682; shape 35652
accuracy_score: 0.897364; shape 35563
accuracy_score: 0.901866; shape 35645
accuracy_score: 0.888901; shape 35690
accuracy_score: 0.900673; shape 35579
accuracy_score: 0.893783; shape 35562
accuracy_score: 0.899588; shape 35629
accuracy_score: 0.900564; shape 35264
accuracy_score: 0.895465; shape 35271
accuracy_score: 0.899208; shape 35303
accuracy_score: 0.901432; shape 35651
accuracy_score: 0.898828; shape 35616
accuracy_score: 0.892427; shape 35626
accuracy_score: 0.902083; shape 35616
accuracy_score: 0.900727; shape 35585
accuracy_score: 0.896279; shape 33906
accuracy_score: 0.901432; shape 35617
accuracy_score: 0.890745; shape 33494
accuracy_score: 0.897201; shape 35572
accuracy_score: 0.896658; shape 35599
accuracy_score: 0.897364; shape 35305
accuracy_score: 0.899045; shape 35595
accuracy_score: 0.897743; shape 35539
accuracy_score: 0.899859; shape 35247
accuracy_score: 0.901703; shape 35654


KeyboardInterrupt: 

In [27]:
rfe.get_support()

array([ True,  True, False, ...,  True, False, False], dtype=bool)

In [21]:
t.shape[1]

(37426, 37474)

### GA

In [12]:
import random
from deap import creator, base, tools, algorithms

creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()

toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=100)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

def evalOneMax(individual):
    return sum(individual),

toolbox.register("evaluate", evalOneMax)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

population = toolbox.population(n=300)

NGEN=40
for gen in range(NGEN):
    offspring = algorithms.varAnd(population, toolbox, cxpb=0.5, mutpb=0.1)
    fits = toolbox.map(toolbox.evaluate, offspring)
    for fit, ind in zip(fits, offspring):
        ind.fitness.values = fit
    population = toolbox.select(offspring, k=len(population))
top10 = tools.selBest(population, k=10)

In [54]:
counter = CountVectorizer()
t = counter.fit_transform(X_train)
k=0.01 * t.shape[1]
k=100
chi = SelectKBest(chi2, k=k)
t =chi.fit_transform(t, y_train)
print t.shape
print t.shape[1]

(37426, 100)
100


In [79]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

counter = CountVectorizer()
t = counter.fit_transform(X_train)

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(t, y_train)
model = SelectFromModel(lsvc)
X_new = model.fit_transform(t)
X_new.shape

ValueError: bad input shape ()

In [76]:
pd.Series(counter.vocabulary_)[model.get_support()]

0mg             380
10              396
100г            453
100мл           461
150мл          1391
1кг            1851
1л             1857
20             1896
30             2799
30ml           2876
30мл           2888
37             3241
3mg            3333
3мг            3368
40             3389
5л             4345
75             4874
adalya         5835
blue           6549
compact        7181
eleaf          7821
fanatik        8049
giay           8480
gsh            8641
iphone         9062
mg            10092
red           11324
silver        11862
smok          11945
strobbs       12204
              ...  
тени          33511
термос        33560
тетрадь       33612
товар         33733
томат         33768
тон           33785
трости        34076
тумба         34149
турка         34166
туф           34182
туфли         34185
тушь          34196
унитаза       34475
услуги        34588
филе          34932
фонарь        35142
хлеб          35553
цифра         35946
чай           35997


In [55]:
print len(chi.get_support())
print len(counter.vocabulary_)

37474
37474


In [56]:
pd.Series(counter.vocabulary_)[chi.get_support()]

00               0
5л            4345
6s            4707
bb            6345
bg            6440
fanatik       8049
ip            9050
iphone        9062
vandoren     12741
алк          13706
альта        13794
анальная     13892
белое        14893
ботинки      15442
брелок       15508
вешалка      16298
винный       16413
вино         16414
водка        16645
волос        16752
вох          16841
гавриш       17081
гер          17271
горшок       17678
детские      18412
для          18653
жен          19297
жидкость     19368
заказ        19547
защитное     19724
             ...  
саксофона    30835
сапоги       30983
сверло       31100
светлое      31117
свободная    31204
свободной    31205
свободные    31206
семена       31363
серьги       31504
сигареты     31565
собак        32113
сопрано      32266
столовое     32780
сувенир      32942
сумка        32978
таб          33236
табл         33243
тенора       33519
тетрадь      33612
трости       34076
тумба        34149
туфли       

## Batch

In [10]:
from sklearn.utils import resample

n_samples = 10000

Bpreds, Btarget = resample(preds, target, 
                           replace=False,
                           n_samples=n_samples,
                           random_state=17)

In [11]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    Bpreds, Btarget, 
    test_size=0.33, 
    random_state=42)

### Baseline ###

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
baseline_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])
check_accuracy(baseline_clf, X_train, y_train, X_test, y_test)

accuracy_score: 0.721515


### Best model

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
clf_v4 = Pipeline([('vect', CountVectorizer()),
#                    ('tfidf', TfidfTransformer(use_idf=False)),
                   ('clf', SGDClassifier(loss='hinge', 
                                         penalty='l2',
                                         alpha=1e-3, 
                                         n_iter=5, 
                                         random_state=42))])
check_accuracy(clf_v4, X_train, y_train, X_test, y_test)

accuracy_score: 0.853636


In [14]:
import re
def replace_digits(line):
    pat = ur'[\d]{2,1000}([\W][\d]+)*'
    repl = " "
    return re.sub(pat, repl, line)

In [15]:
repl = [" ", " %DIGIT% "]
max_df = [0.5, None]
use_idf = [True, False]

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
clf_v4 = Pipeline([('vect', CountVectorizer()),
#                    ('tfidf', TfidfTransformer()),
                   ('clf', SGDClassifier(loss='hinge', 
                                         penalty='l2',
                                         alpha=1e-3, 
                                         n_iter=5, 
                                         random_state=42))])
check_accuracy(clf_v4, X_train.apply(lambda x: replace_digits(x)), y_train, X_test.apply(lambda x: replace_digits(x)), y_test)

accuracy_score: 0.857879


In [18]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()
def normal_form(word):
    return morph.parse(word)[0].normal_form

def normal_form_line(line):    
    return u' '.join([normal_form(unicode(x, 'utf-8')) for x in line.split()])

ImportError: No module named pymorphy2

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
clf_v4 = Pipeline([('vect', CountVectorizer()),
#                    ('tfidf', TfidfTransformer()),
                   ('clf', SGDClassifier(loss='hinge', 
                                         penalty='l2',
                                         alpha=1e-3, 
                                         n_iter=5, 
                                         random_state=42))])
check_accuracy(clf_v4, X_train.apply(lambda x: replace_digits(x)).apply(lambda x: normal_form_line(x)), y_train, X_test.apply(lambda x: replace_digits(x)).apply(lambda x: normal_form_line(x)), y_test)

NameError: global name 'normal_form_line' is not defined

In [None]:
vect =  CountVectorizer()
t = vect.fit_transform(X_train.apply(lambda x: replace_digits(x)))
vocab = pd.Series(vect.vocabulary_)

In [None]:
vocab.count()

In [None]:
pd.DataFrame(t.todense())

In [None]:
vocab.to_csv("vocab_batch.csv", encoding="UTF-8", sep='\t')