In [1]:
import pandas
import sklearn

In [2]:
# Training words with their labels; the file is read without a header row and
# its two columns are named "word" and "y".
train = pandas.read_csv("linear_train.txt", header=None, names=["word", "y"])
# Words to score; again no header row, single column named "word".
test = pandas.read_csv("linear_test.txt", header=None, names=["word"])
# Example answer file; its first row supplies the column names.
ans = pandas.read_csv("linear_ans_example.txt", header=0)

In [3]:
from sklearn.linear_model import LogisticRegression

In [4]:
from sklearn.model_selection import cross_val_score
from collections import defaultdict

In [59]:
class ExtractFeatures(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    """Stateless transformer that turns each word into a dict of count features.

    ``fit`` learns nothing; ``transform`` returns one feature dict per input
    word (see ``extract`` for the feature set).
    """

    def __init__(self):
        """No hyper-parameters: the feature set is fixed."""
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (there is no state to learn)."""
        return self

    def transform(self, X):
        """Return a list with one feature dict per word in ``X``."""
        return [self.extract(word) for word in X]

    def extract(self, word):
        """Build the feature dict for a single word.

        Features produced:
          * ``capitalized`` -- 1 if the first character is upper case;
          * ``allcaps`` -- 1 if the whole word is upper case;
          * ``"suf" + s`` -- one count for every suffix ``s`` of the
            lower-cased word (the whole word included);
          * every character n-gram of length 1..5 of the lower-cased word,
            counted by number of occurrences;
          * n-grams of length 1..5 that end 1..3 characters before the end of
            the word, keyed as the n-gram followed by one ``*`` per skipped
            trailing character;
          * ``length`` -- number of characters in the lower-cased word.

        Parameters
        ----------
        word : str or bytes
            The word; ``bytes`` input is decoded as UTF-8, text is used as is.

        Returns
        -------
        collections.defaultdict
            Mapping from feature name to integer value (missing keys read as 0).
        """
        # Decode only raw bytes. On Python 2 ``bytes`` is ``str``, so this matches
        # the former ``unicode(word, "utf-8")`` call; already-decoded text (and
        # every Python 3 ``str``) is passed through instead of raising.
        if isinstance(word, bytes):
            word = word.decode("utf-8")

        features = defaultdict(int)

        # Case features are computed before lower-casing. Slicing instead of
        # indexing keeps an empty word from raising IndexError.
        if word[:1].isupper():
            features['capitalized'] = 1
        if word.isupper():
            features['allcaps'] = 1

        word = word.lower()
        word_len = len(word)

        # NOTE(review): all features share one key namespace, so e.g. the
        # 4-gram "sufa" and the suffix feature "suf" + "a" land on the same key
        # and their counts add up. Left unchanged so feature names stay stable.
        for suffix_len in range(1, word_len + 1):
            features["suf" + word[-suffix_len:]] += 1

        for length in range(1, 6):
            # Plain n-grams of this length at every position.
            for start in range(word_len - length + 1):
                features[word[start:start + length]] += 1
            # n-grams anchored `offset` characters before the end of the word;
            # the trailing stars record how many characters were skipped.
            for offset in range(1, 4):
                start = word_len - offset - length
                if start < 0:
                    continue
                features[word[start:word_len - offset] + "*" * offset] += 1

        features["length"] = word_len
        return features
        

In [60]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold

In [61]:
# Turn every training word into a feature dict, then let DictVectorizer learn
# the feature vocabulary and build the training matrix in a single pass.
train_feature_dicts = ExtractFeatures().transform(train["word"])
vectorizer = DictVectorizer()
train_data = vectorizer.fit_transform(train_feature_dicts)

In [62]:
# Encode the test words with the already-fitted vectorizer (transform only, no refit).
test_feature_dicts = ExtractFeatures().transform(test["word"])
test_data = vectorizer.transform(test_feature_dicts)

In [79]:
# Import the two estimators explicitly instead of reaching them through
# `sklearn.linear_model` / `sklearn.preprocessing` attribute access, which only
# works when those submodules happen to have been loaded already.
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import MaxAbsScaler

# Pipeline: scale features with MaxAbsScaler, then fit a logistic regression
# whose regularisation strength is selected by internal cross-validation on ROC AUC.
est = make_pipeline(MaxAbsScaler(), LogisticRegressionCV(scoring="roc_auc"))

In [77]:
# Seed the shuffled folds so the ROC AUC estimates are reproducible from run to
# run (scores will differ slightly from any earlier, unseeded run).
cv_folds = StratifiedKFold(shuffle=True, random_state=0)
cross_val_score(est, train_data, train.y, scoring="roc_auc", cv=cv_folds)

array([ 0.90883896,  0.90778775,  0.91071863])

In [80]:
# Fit the pipeline on the full training matrix, then keep the second
# predict_proba column as the score for each test word.
est.fit(train_data, train["y"])
test_probabilities = est.predict_proba(test_data)
answers = test_probabilities[:, 1]

In [69]:
# Use item assignment: attribute assignment (`ans.Answer = ...`) only updates a
# column that already exists and otherwise sets a plain Python attribute, in
# which case the scores would silently never reach the exported file.
ans["Answer"] = answers

In [36]:
# Predicted labels for the training rows. Stored under their own name so the
# test-set probabilities held in `answers` (the values written to the
# submission) are not overwritten if cells are re-run.
train_predictions = est.predict(train_data)

In [37]:
# Non-null counts per column over the training rows whose label is not 1.
label_is_not_one = train["y"] != 1
train.loc[label_is_not_one].count()

word    90770
y       90770
dtype: int64

In [70]:
# Write the answer table to disk without the DataFrame index column.
ans.to_csv(path_or_buf="answer.txt", index=False)

In [84]:
# The last pipeline step is the cross-validated logistic regression; show the
# grid of C values it stores in `Cs_`.
final_step_name, final_estimator = est.steps[-1]
final_estimator.Cs_

array([  1.00000000e-04,   7.74263683e-04,   5.99484250e-03,
         4.64158883e-02,   3.59381366e-01,   2.78255940e+00,
         2.15443469e+01,   1.66810054e+02,   1.29154967e+03,
         1.00000000e+04])

In [None]:
import keras
import keras.preprocessing.sequence
import numpy as np

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def _char_codes(raw_word):
    """Return the code points of the lower-cased word as a NumPy array.

    ``bytes`` input is decoded as UTF-8 first; text is used as is. This replaces
    the Python 2-only ``unicode(x, 'utf-8')`` call, so the cell runs on both
    Python 2 and Python 3.
    """
    if isinstance(raw_word, bytes):
        raw_word = raw_word.decode("utf-8")
    return np.array([ord(ch) for ch in raw_word.lower()])


# One variable-length array of code points per training word ...
word_codes = train['word'].apply(_char_codes)
# ... padded by keras (default settings) into a single rectangular array.
words = keras.preprocessing.sequence.pad_sequences(word_codes)

In [None]:
# Fixed seed so the held-out split (and any metric computed on it) is reproducible.
# NOTE(review): a model loaded from disk may have been trained on rows that fall
# into this test split -- confirm how it was trained before trusting the score.
X_train, X_test, y_train, y_test = train_test_split(words, train.y, random_state=0)

In [None]:
from keras.models import load_model
# Restore the previously saved keras model from its HDF5 file.
model = load_model(filepath="test.h5")

In [None]:
y_ans = model.predict(words)

In [None]:
sklearn.metrics.roc_auc_score(y_ans, y_test)