In [1]:
import pandas as pd
import numpy as np

## Metadata

In [2]:
# Location of the training CSV; a relative path, so it is resolved against the
# notebook's working directory.
PATH_TRAIN = "source/evo_train.csv"

In [3]:
# Column of the training table that holds the class to predict.
target_label = u'GROUP_ID'
# Column of the training table that holds the text fed to the models.
pred_labels = u'NAME'

## Preparing data ##

In [4]:
# Read the whole training table into a DataFrame.
dtrain = pd.read_csv(PATH_TRAIN)

In [5]:
# Labels (one per row) and the matching text column used as model input.
target = dtrain[target_label]
preds = dtrain[pred_labels]

### Partition

In [87]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; import from sklearn.model_selection and fall back only on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 33% of the full data set for evaluation; the fixed seed keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    preds, target,
    test_size=0.33,
    random_state=42)

### Functions

In [7]:
def check_accuracy(clf, x, y, x_test, y_test):
    """Fit ``clf`` on the training data and print its hold-out accuracy.

    Parameters
    ----------
    clf : object exposing ``fit(x, y)`` and ``predict(x_test)``.
        It is fitted in place by this call.
    x, y : training samples and their labels, passed to ``clf.fit``.
    x_test, y_test : hold-out samples and their true labels.

    Prints one line of the form ``accuracy_score: 0.812249`` and returns
    nothing.
    """
    # Function-local import, as in the original cell.
    from sklearn.metrics import accuracy_score

    clf.fit(x, y)
    predicted = clf.predict(x_test)
    # The parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function; the bare
    # statement form is a SyntaxError on Python 3.
    print("accuracy_score: %f" % accuracy_score(y_test, predicted))

## Model

### Baseline ###

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline: token counts -> TF-IDF weighting -> multinomial Naive Bayes, all
# with default settings.
baseline_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
check_accuracy(baseline_clf, X_train, y_train, X_test, y_test)

accuracy_score: 0.812249


### Best model

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Token counts -> term-frequency scaling with IDF weighting switched off ->
# linear classifier trained by SGD with hinge loss and an L2 penalty.
# NOTE(review): `n_iter` is the older scikit-learn spelling; newer releases
# use `max_iter`/`tol` instead - confirm the installed version before re-running.
clf_v4 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        n_iter=5,
        random_state=42)),
])
check_accuracy(clf_v4, X_train, y_train, X_test, y_test)

accuracy_score: 0.892210


## Batch: experiments on a 10,000-row subsample

In [10]:
from sklearn.utils import resample

# Number of rows drawn for the batch experiments.
n_samples = 10000

# Draw n_samples (name, label) pairs without replacement; the fixed seed makes
# the draw repeatable.
Bpreds, Btarget = resample(
    preds, target,
    n_samples=n_samples,
    replace=False,
    random_state=17)

In [95]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; import from sklearn.model_selection and fall back only on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Split the subsample 67/33. This rebinds X_train / X_test / y_train / y_test,
# so every later cell sees whichever split cell was executed last.
X_train, X_test, y_train, y_test = train_test_split(
    Bpreds, Btarget,
    test_size=0.33,
    random_state=42)

### Baseline ###

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline pipeline (counts -> TF-IDF -> multinomial Naive Bayes), scored on
# whatever split X_train / X_test currently refer to.
baseline_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
check_accuracy(baseline_clf, X_train, y_train, X_test, y_test)

accuracy_score: 0.721515


### Best model

In [96]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Variant without the TF-IDF step: raw token counts go straight into the
# hinge-loss SGD classifier.
# NOTE(review): `n_iter` is the older scikit-learn spelling; newer releases
# use `max_iter`/`tol` instead - confirm the installed version before re-running.
clf_v4 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        n_iter=5,
        random_state=42)),
])
check_accuracy(clf_v4, X_train, y_train, X_test, y_test)

accuracy_score: 0.853636


In [97]:
import re
def replace_digits(line, repl=" "):
    """Replace numeric runs in ``line`` with ``repl``.

    A run is two or more consecutive digits, optionally followed by any
    number of groups made of one non-word character and more digits, so
    ``"12.5"`` and ``"100-200"`` are each replaced as a whole. Isolated
    single digits are left untouched.

    Parameters
    ----------
    line : text to clean.
    repl : replacement for each run. Defaults to a single space, which is
        what the function always used before the parameter existed. The
        value is handed to ``re.sub``, so backslash escapes in it are
        interpreted.

    Returns the cleaned text.
    """
    # Plain raw string instead of the ``ur''`` prefix: that prefix is a
    # SyntaxError on Python 3, while ``r''`` is accepted by both major versions.
    pat = r'[\d]{2,1000}([\W][\d]+)*'
    return re.sub(pat, repl, line)

In [98]:
# Candidate option values - presumably meant for a parameter sweep (TODO confirm).
# None of these three lists is read by any cell visible in this notebook;
# replace_digits() uses its own local `repl`, not this module-level one.
repl = [" ", " %DIGIT% "]
max_df = [0.5, None]
use_idf = [True, False]

In [99]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Counts -> hinge-loss SGD classifier (no TF-IDF step), trained and scored on
# names that have had their numeric runs removed by replace_digits().
# NOTE(review): `n_iter` is the older scikit-learn spelling; newer releases
# use `max_iter`/`tol` instead - confirm the installed version before re-running.
clf_v4 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        n_iter=5,
        random_state=42)),
])
check_accuracy(
    clf_v4,
    X_train.apply(replace_digits), y_train,
    X_test.apply(replace_digits), y_test)

accuracy_score: 0.857879


In [101]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Counts -> hinge-loss SGD classifier, on names that are first stripped of
# numeric runs and then lemmatized token by token.
# NOTE(review): normal_form_line() is defined in a cell further down this
# notebook (In [75]); that cell has to run before this one.
# NOTE(review): `n_iter` is the older scikit-learn spelling; newer releases
# use `max_iter`/`tol` instead - confirm the installed version before re-running.
clf_v4 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        n_iter=5,
        random_state=42)),
])
check_accuracy(
    clf_v4,
    X_train.apply(replace_digits).apply(normal_form_line), y_train,
    X_test.apply(replace_digits).apply(normal_form_line), y_test)

accuracy_score: 0.852727


In [100]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Counts -> logistic regression with default settings, on digit-cleaned names.
# To score on the untouched names instead, pass X_train / X_test as they are.
clf_v4 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('lr', LogisticRegression()),
])
check_accuracy(
    clf_v4,
    X_train.apply(replace_digits), y_train,
    X_test.apply(replace_digits), y_test)

accuracy_score: 0.814242


In [75]:
import pymorphy2
# Single analyzer instance, created once and reused by normal_form() below.
morph = pymorphy2.MorphAnalyzer()
def normal_form(word):
    """Return the normal form of ``word``.

    Takes the first entry of ``morph.parse(word)`` and returns its
    ``normal_form`` attribute.
    """
    return morph.parse(word)[0].normal_form

def normal_form_line(line):
    """Replace every whitespace-separated token of ``line`` by its normal form.

    ``line`` may be a UTF-8 encoded byte string or already-decoded text.
    Returns the normal forms joined by single spaces, as text.
    """
    lemmas = []
    for token in line.split():
        # Decode byte tokens only. The previous unconditional
        # ``unicode(token, 'utf-8')`` raised TypeError on already-decoded
        # input and NameError on Python 3, where ``unicode`` does not exist.
        if isinstance(token, bytes):
            token = token.decode('utf-8')
        lemmas.append(normal_form(token))
    return u' '.join(lemmas)

In [89]:
# Number of non-null entries in the current training split.
# NOTE(review): the stored output (37426) exceeds the 10,000-row subsample, so
# this was presumably run while X_train still held the full-data split - confirm.
X_train.count()

37426

In [90]:
# Vocabulary learned from names that are digit-cleaned and then lemmatized.
# Drop the normal_form_line step to get the digit-cleaned-only vocabulary.
vect = CountVectorizer()
t = vect.fit_transform(
    X_train.apply(replace_digits).apply(normal_form_line))

# vocabulary_ as a Series.
vocab = pd.Series(vect.vocabulary_)

In [91]:
# Number of non-null entries in the vocabulary Series built from vect.vocabulary_.
vocab.count()

27702

In [35]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Counts -> hinge-loss SGD classifier (no TF-IDF step) on digit-cleaned names.
# NOTE(review): the output stored with this cell is not an accuracy line;
# re-run it to refresh the result.
# NOTE(review): `n_iter` is the older scikit-learn spelling; newer releases
# use `max_iter`/`tol` instead - confirm the installed version before re-running.
clf_v4 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        n_iter=5,
        random_state=42)),
])
check_accuracy(
    clf_v4,
    X_train.apply(replace_digits), y_train,
    X_test.apply(replace_digits), y_test)

12341234


In [92]:
# Vocabulary learned from digit-cleaned names only (no lemmatization step).
vect = CountVectorizer()
t = vect.fit_transform(X_train.apply(replace_digits))

# vocabulary_ as a Series.
vocab = pd.Series(vect.vocabulary_)

In [93]:
# Number of non-null entries in the vocabulary Series built from vect.vocabulary_.
vocab.count()

30990

In [None]:
# Preview only the first few documents. Densifying the whole sparse
# document-term matrix allocates n_documents x n_terms cells and can exhaust
# memory, so slice the sparse matrix first and densify just the slice.
pd.DataFrame(t[:10].toarray())

In [None]:
# Write the vocabulary Series to a tab-separated, UTF-8 encoded file in the
# working directory.
vocab.to_csv("vocab_batch.csv", encoding="UTF-8", sep='\t')