In [1]:
import pandas as pd
import numpy as np

## Metadata

In [230]:
PATH_TRAIN = "source/evo_train.csv"
PATH_TEST = "source/evo_test.csv"
PATH_CATS = "source/categories.csv"

In [3]:
target_label = u'GROUP_ID'
pred_labels = u'NAME'

## Preparing data ##

In [231]:
dtrain = pd.read_csv(PATH_TRAIN)
dtest = pd.read_csv(PATH_TEST)
cats = pd.read_csv(PATH_CATS) 

In [232]:
target = dtrain[target_label]
preds = dtrain[pred_labels]
preds_test = dtest[pred_labels]

### Partition

In [6]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    preds, target, 
    test_size=0.33, 
    random_state=42)

### Functions

In [12]:
from sklearn.metrics import accuracy_score
def check_accuracy(clf, x, y, x_test, y_test):
    clf.fit(x, y)
    t = clf.predict(x_test)
    from sklearn.metrics import accuracy_score
    print "accuracy_score: %f" % accuracy_score(y_test, t)

## Model

### Baseline ###

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
baseline_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])
check_accuracy(baseline_clf, X_train, y_train, X_test, y_test)

accuracy_score: 0.812249


### Best model

In [10]:
# SelectFromModel

from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score


clf_v5 = Pipeline([('vect', CountVectorizer()),
#                    ('tfidf', TfidfTransformer(use_idf=False)),
                   ('select', SelectFromModel(LogisticRegression(C=0.01, penalty="l2", dual=False), 
                                              threshold="0.01*mean")),
                   ('clf', SGDClassifier(loss='hinge', 
                                         penalty='l2',
                                         alpha=1e-4, 
                                         n_iter=50, 
                                         random_state=42))
                  ])

# t = clf_v5.fit_transform(X_train, y_train)
check_accuracy(clf_v5, X_train, y_train, X_test, y_test)
# print cross_val_score(clf_v5, X_train, y_train, scoring='accuracy', cv=4, n_jobs=-1)

accuracy_score: 0.930889


In [13]:
preds_sgd = clf_v5.predict(X_test)
print "accuracy_score: %f" % accuracy_score(y_test, preds_sgd)

accuracy_score: 0.930889


## Best features

In [160]:
# counter = CountVectorizer(min_df=5, binary=True)
counter = Pipeline([
        ('counter', CountVectorizer(min_df=5, binary=True)),
#         ('tfidf', TfidfTransformer(use_idf=False))
#         ('tfidf', TfidfTransformer())
    ])
t = counter.fit_transform(X_train1)
print t.shape
# vocal = counter.vocabulary_
vocal = counter.named_steps['counter'].vocabulary_

(25075, 4048)


In [161]:
dtrain = pd.DataFrame(t.todense())
dtrain.columns = pd.Series(vocal).index

In [162]:
dtrain["target"] = y_train1.reset_index()["GROUP_ID"]

In [163]:
# m = dtrain[:10000].groupby("target").max()

In [164]:
m1 = dtrain[:int(10e3)].groupby("target").max()
m2 = dtrain[int(10e3):int(20e3)].groupby("target").max()
m3 = dtrain[int(20e3):int(30e3)].groupby("target").max()
m4 = dtrain[int(30e3):].groupby("target").max()

In [165]:
m = pd.concat([m1,m2,m3,m4]).reset_index().groupby("target").max()

In [166]:
tt = counter.transform(X_test1)
dtest = pd.DataFrame(tt.todense())
dtest.columns = pd.Series(vocal).index

In [167]:
# np.dot(m.as_matrix(), dtest.transpose().as_matrix()).transpose()
x = np.dot(m.as_matrix(), dtest.transpose().as_matrix()).transpose()
preds_evr = m.index[np.argmax(x, axis=(1))]

In [168]:
correct_test = dtest.apply(lambda x: sum(x), axis=1) > 0

In [171]:
from sklearn.metrics import accuracy_score
print "accuracy_score: %f" % accuracy_score(y_test1, preds_evr)

accuracy_score: 0.763582


### Classifier

In [191]:
from sklearn.base import BaseEstimator, ClassifierMixin

class HardClassifier(BaseEstimator, ClassifierMixin):  
    def __init__(self, min_df=5, binary=True):
        self.counter = CountVectorizer(min_df=5, binary=True)

    def fit(self, X, y=None):
        counter = self.counter
        t = counter.fit_transform(X)
        self.vocal = counter.vocabulary_
        dtrain = pd.DataFrame(t.todense())
        dtrain.columns = pd.Series(self.vocal).index
        if type(y) == pd.Series:
            y = y.reset_index(drop=True)
        dtrain["target"] = y
        m1 = dtrain[:int(10e3)].groupby("target").max()
        m2 = dtrain[int(10e3):int(20e3)].groupby("target").max()
        m3 = dtrain[int(20e3):int(30e3)].groupby("target").max()
        m4 = dtrain[int(30e3):].groupby("target").max()
        self.M = pd.concat([m1,m2,m3,m4]).reset_index().groupby("target").max()
        return self

    def predict(self, X, y=None):
        vocal = self.vocal
        m = self.M
#         counter = self.counter
        counter = CountVectorizer(min_df=5, binary=True, vocabulary=vocal)
        tt = counter.fit_transform(X)
        dtest = pd.DataFrame(tt.todense())
        assert len(m.columns) == len(dtest.columns)
        dtest.columns = pd.Series(vocal).index
        x = np.dot(m.as_matrix(), dtest.transpose().as_matrix()).transpose()
        self.correct = dtest.apply(lambda x: sum(x), axis=1) > 0
        return m.index[np.argmax(x, axis=(1))]

In [192]:
hard = HardClassifier(min_df=5, binary=True)
check_accuracy(hard, X_train, y_train, X_test, y_test)


accuracy_score: 0.779809


## ONE more model

In [136]:
clf_v4 = Pipeline([('vect', CountVectorizer()),
                   ('clf', LogisticRegression(C=20000,
                                              penalty="l2",
                                              n_jobs=4,
                                              random_state=17,
                                              max_iter=500, 
#                                               solver='newton-cg',#0.925...
                                              solver='liblinear', #0.926169
#                                               solver='lbfgs', #0.925030
#                                               multi_class='multinomial',#0.923891
                                              class_weight='balanced', #0.927254
                                              dual=False))
                  ])
# check_accuracy(clf_v4, X_train, y_train, X_test, y_test)
# print cross_val_score(clf_v4, X_train, y_train, scoring='accuracy', cv=4, n_jobs=-1)

In [87]:
preds_lr = clf_v4.predict(X_test)
print "accuracy_score: %f" % accuracy_score(y_test, preds_lr)

accuracy_score: 0.927254


## Voting model

In [193]:
from sklearn.ensemble import VotingClassifier
voter = VotingClassifier([('lr', clf_v4), 
                          ('sgd', clf_v5), 
                          ('hard', HardClassifier(min_df=5, binary=True))],
                        voting='hard' # 0.926766
                        )
check_accuracy(voter, X_train, y_train, X_test, y_test)

accuracy_score: 0.928176


In [None]:
check_accuracy.fit(preds, target)

## Create submit

In [234]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(
#     X_train, y_train, 
    preds, target,
    test_size=0.33, 
    random_state=42)

In [236]:
train_sgd = clf_v5.fit(X_train1, y_train1).predict(X_test1)
train_lr = clf_v4.fit(X_train1, y_train1).predict(X_test1)
train_hard = hard.fit(X_train1, y_train1).predict(X_test1)

dfpreds = pd.DataFrame({"preds_sgd": train_sgd, 
                        "preds_hard":train_hard,
                        "preds_lr": train_lr,
                       "y_test": y_test1.reset_index(drop=True)})

dummy_preds = pd.get_dummies(dfpreds[["preds_sgd", "preds_hard", "preds_lr"]].astype(str), drop_first=True)
dummies = dummy_preds.columns

sgd = SGDClassifier(loss='hinge', penalty='l2',
                    alpha=1e-4, n_iter=50, 
                    random_state=42)
sgd.fit(dummy_preds, dfpreds['y_test'])

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False)

In [238]:
X_train1, y_train1, X_test1 = preds, target, preds_test

In [239]:
train_sgd = clf_v5.fit(X_train1, y_train1).predict(X_test1)
train_lr = clf_v4.fit(X_train1, y_train1).predict(X_test1)
train_hard = hard.fit(X_train1, y_train1).predict(X_test1)
dfpreds = pd.DataFrame({"preds_sgd": train_sgd, 
                        "preds_hard":train_hard,
                        "preds_lr": train_lr})

dummy_preds = pd.get_dummies(dfpreds[["preds_sgd", "preds_hard", "preds_lr"]].astype(str), drop_first=True)

assert len(dummy_preds.columns) == len(dummies)

pred_ensemble = sgd.predict(dummy_preds)
print "accuracy_score: %f" % accuracy_score(y_test, pred_ensemble)

AssertionError: 

In [242]:
pred_ensemble = sgd.predict(dummy_preds[dummies])

In [257]:
submit = pd.Series(pred_ensemble).reset_index()
submit.columns = "id","GROUP_ID"
submit.to_csv('submit_ensemble.csv', index=False)