In [2]:
import pandas
import sklearn

In [3]:
train = pandas.read_csv("linear_train.txt", names=["word", "y"])
test = pandas.read_csv("linear_test.txt", names=["word"])
ans = pandas.read_csv("linear_ans_example.txt")

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
from sklearn.model_selection import cross_val_score
from collections import defaultdict

In [9]:
class ExtractFeatures(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.extract(word) for word in X]

    def extract(self, word):
        features = defaultdict(lambda: 0)
        if word[0].isupper():
            features['capitalized'] = 1
        if word.isupper():
            features['allcaps'] = 1   
        word = word.lower()
        for i in range(len(word)):
            features["suf" + word[-i:]] += 1
        for length in range(2, 4):
            for i in range(len(word) - length + 1):
                features[word[i:i + length]] += 1
            for offset in range(1, 3):
                features[word[len(word) - offset - length: len(word) - offset] + "*"*offset] += 1
        features["length"] = len(word)
        return features
        

In [10]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold

In [11]:
vectorizer = DictVectorizer()
train_data = vectorizer.fit_transform(ExtractFeatures().transform(train.word))

In [12]:
test_data = vectorizer.transform(ExtractFeatures().transform(test.word))

In [24]:
#est = LogisticRegression(class_weight='balanced', C=1)
est = sklearn.linear_model.LogisticRegressionCV(class_weight='balanced')

In [14]:
cross_val_score(est, train_data, train.y, scoring="roc_auc", cv=StratifiedKFold(shuffle=True))

array([ 0.8939391 ,  0.89360533,  0.90189708])

In [25]:
est.fit(train_data, train.y)
answers = est.predict_proba(test_data)[:, 1]

In [27]:
ans.Answer = answers

In [17]:
answers = est.predict(train_data)

In [18]:
train[train.y != 1].count()

word    90770
y       90770
dtype: int64

In [28]:
ans.to_csv("answer.txt", index=False)

In [26]:
est.scores_

{1: array([[ 0.62038872,  0.69236458,  0.74901636,  0.77611455,  0.7974736 ,
          0.81436559,  0.81309351,  0.81117061,  0.82054847,  0.8190989 ],
        [ 0.71218531,  0.80037275,  0.84306127,  0.86385824,  0.87370943,
          0.87344319,  0.877644  ,  0.88042481,  0.88163772,  0.87968524],
        [ 0.64605645,  0.75820957,  0.82938879,  0.87169398,  0.89145613,
          0.89660375,  0.89728418,  0.89731377,  0.89923673,  0.89849713]])}