In [1]:
import re
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Raw inputs: a tab-separated table of document titles and a CSV of
# (group, document, label) rows.
docs_titles = pd.read_table('docs_titles.tsv')
train_groups = pd.read_csv('train_groups.csv')

# Left join on doc_id keeps every row of train_groups; only the columns used
# by the rest of the script are retained.
data = (
    train_groups
    .merge(docs_titles, how='left', on='doc_id')
    [['group_id', 'title', 'target']]
)

def transform_data(data):
    """Normalize the ``title`` column of *data* in place.

    Every title is passed through ``str()``, lower-cased, and each character
    that is not a word character, an apostrophe or a space is replaced by one
    space. Because of the ``str()`` call, non-string values are kept as their
    text form (a NaN title becomes ``"nan"``).

    The frame is modified in place; nothing is returned.
    """
    non_word = re.compile(r"[^\w\d' ']")

    def clean(raw_title):
        return non_word.sub(" ", str(raw_title).lower())

    cleaned_titles = [clean(raw_title) for raw_title in data['title']]
    data['title'] = np.asarray(cleaned_titles)

def get_features(data, max_features=20):
    """Build per-group TF-IDF features for the titles in *data*.

    A ``TfidfVectorizer`` limited to *max_features* terms is re-fitted on the
    titles of each ``group_id`` separately, so column ``j`` refers to a
    different term in every group.

    Args:
        data: DataFrame with ``group_id`` and ``title`` columns.
        max_features: Integer cap on the per-group vocabulary size; also the
            number of columns in the returned matrix.

    Returns:
        A float array of shape ``(len(data), max_features)`` whose row ``i``
        holds the features of row ``i`` of *data*. Groups whose vocabulary is
        smaller than *max_features* are zero-padded on the right.
    """
    titles = np.asarray(data['title'])
    vectorizer = TfidfVectorizer(max_features=max_features)
    features = np.zeros((len(data), max_features))
    # ``GroupBy.indices`` maps each group to the positional row numbers of its
    # members, so results are written back in the original row order and stay
    # aligned with the labels/groups taken from ``data``.
    for positions in data.groupby('group_id').indices.values():
        group_tfidf = vectorizer.fit_transform(titles[positions]).toarray()
        # A small group may yield fewer than ``max_features`` columns; fill
        # only the columns that exist and leave the rest at zero.
        features[positions, :group_tfidf.shape[1]] = group_tfidf
    return features

# Clean the titles in place before vectorizing them.
transform_data(data)

# NOTE: the scaler is instantiated but feature scaling is currently disabled.
scaler = StandardScaler()
X = get_features(data)
# X = scaler.fit_transform(X)
y = np.asarray(data['target'])

# Grouped 5-fold cross-validation: all rows sharing a group_id fall into the
# same fold. For each fold the classifier is trained once and then evaluated
# at every decision threshold, since training does not depend on the
# threshold.
gkf = GroupKFold(n_splits=5)
thresholds = np.linspace(0, 1, 11)
scores = defaultdict(list)
for train, test in gkf.split(X, y, groups=data['group_id']):
    # NOTE: sklearn's LogisticRegression(C=1) was tried as an alternative, but
    # its predict() takes no threshold argument, so it is not a drop-in
    # replacement in this loop.
    clf = MySGDClassifier(batch_generator(batch_size=100000), max_epoch=20, C=1, alpha=0.001, model_type='log_reg')
    clf.fit(X[train], y[train])
    for threshold in thresholds:
        prediction = clf.predict(X[test], threshold)
        scores[threshold].append(f1_score(y[test], prediction))

# Mean F1 over the folds for each threshold.
for threshold, s in scores.items():
    print(threshold, np.mean(s))