In [1]:
import numpy as np
import pandas as pd
import MeCab
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
from gensim.models import word2vec
from sklearn.mixture import GaussianMixture
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from collections import defaultdict
from tqdm import tqdm, tqdm_pandas, tqdm_notebook
import time
tqdm.pandas(tqdm_notebook)

In [2]:
news_df = pd.read_csv('../data/news.csv.gz')

In [3]:
news_df.head()

Unnamed: 0,label,text
0,movie-enter,【DVDエンター！】誘拐犯に育てられた女が目にした真実は、孤独か幸福か2005年11月から翌...
1,movie-enter,藤原竜也、中学生とともにロケット打ち上げに成功「アンテナを張りながら生活をしていけばいい」2...
2,movie-enter,『戦火の馬』ロイヤル・プレミアにウィリアム王子＆キャサリン妃が出席3月2日より全国ロードショ...
3,movie-enter,香里奈、女子高生100人のガチンコ質問に回答「ラーメンも食べる」女優の香里奈が18日、都内で...
4,movie-enter,ユージの前に立ちはだかったJOY「僕はAKBの高橋みなみを守る」5日、東京・千代田区の内幸町...


In [4]:
news_df.label.value_counts()

sports-watch      900
dokujo-tsushin    870
movie-enter       870
smax              870
it-life-hack      870
kaden-channel     864
peachy            842
topic-news        770
livedoor-homme    511
Name: label, dtype: int64

In [5]:
def get_wakati_text(text):
    tagger = MeCab.Tagger('-Owakati')
    wakati_text = tagger.parse(text).strip()
    return wakati_text

In [6]:
news_df['wakati_text'] = news_df.text.progress_apply(get_wakati_text)

100%|██████████| 7367/7367 [00:07<00:00, 964.30it/s] 


In [7]:
news_df.head()

Unnamed: 0,label,text,wakati_text
0,movie-enter,【DVDエンター！】誘拐犯に育てられた女が目にした真実は、孤独か幸福か2005年11月から翌...,【 DVD エンター ！ 】 誘拐 犯 に 育て られ た 女 が 目 に し た 真実 は...
1,movie-enter,藤原竜也、中学生とともにロケット打ち上げに成功「アンテナを張りながら生活をしていけばいい」2...,藤原 竜也 、 中学生 とともに ロケット 打ち上げ に 成功 「 アンテナ を 張り なが...
2,movie-enter,『戦火の馬』ロイヤル・プレミアにウィリアム王子＆キャサリン妃が出席3月2日より全国ロードショ...,『 戦火 の 馬 』 ロイヤル ・ プレミア に ウィリアム 王子 ＆ キャサリン 妃 が ...
3,movie-enter,香里奈、女子高生100人のガチンコ質問に回答「ラーメンも食べる」女優の香里奈が18日、都内で...,香里奈 、 女子高 生 100 人 の ガチンコ 質問 に 回答 「 ラーメン も 食べる ...
4,movie-enter,ユージの前に立ちはだかったJOY「僕はAKBの高橋みなみを守る」5日、東京・千代田区の内幸町...,ユージ の 前 に 立ちはだかっ た JOY 「 僕 は AKB の 高橋 みなみ を 守る...


In [8]:
y = news_df.label.values
accs_dict = {}
elapsed_times_dict = {}

In [9]:
def train_and_get_oof_accuracies(X, y, params):
    START_TIME = time.time()
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    accuracies = []

    for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
        print(f'Start: fold {i+1}')
        X_train, y_train = X[train_index, :], y[train_index]
        X_valid, y_valid = X[valid_index, :], y[valid_index]
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train,
            y_train,
            eval_set=(X_valid, y_valid),
            early_stopping_rounds=100,
            verbose=100
        )
        y_pred = model.predict(X_valid)
        accuracy = accuracy_score(y_valid, y_pred)
        print(f'Accuracy is {accuracy} \n')
        accuracies.append(accuracy)

    elapsed_time = time.time() - START_TIME
    print(f'Elapsed time is {elapsed_time}.')
    return accuracies, elapsed_time

In [10]:
params = {
    'objective': 'multiclass',
    'num_class': news_df.label.nunique(),
    'n_estimators': 10000,
    'random_seed': 0
}

## BoW

In [11]:
vectorizer = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b')
X = vectorizer.fit_transform(news_df.wakati_text.values)
X = X.toarray()

In [12]:
X.shape

(7367, 71646)

In [13]:
accs_dict['bow'], elapsed_times_dict['bow'] = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.159019
[200]	valid_0's multi_logloss: 0.173628
Early stopping, best iteration is:
[114]	valid_0's multi_logloss: 0.156125
Accuracy is 0.9525423728813559 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.148067
[200]	valid_0's multi_logloss: 0.157974
Early stopping, best iteration is:
[127]	valid_0's multi_logloss: 0.144632
Accuracy is 0.9572591587516961 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.125745
[200]	valid_0's multi_logloss: 0.122956
Early stopping, best iteration is:
[136]	valid_0's multi_logloss: 0.118081
Accuracy is 0.9640190088255262 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.129558
[200]	valid_0's multi_logloss: 0.143817
Early stopping, best iteration is:
[109]	valid_0

## BoW+TruncatedSVD

In [14]:
tsvd = TruncatedSVD(n_components=100)
X_reduced = tsvd.fit_transform(X)

In [15]:
X_reduced.shape

(7367, 100)

In [16]:
accs_dict['bow_tsvd'], elapsed_times_dict['bow_tsvd'] = train_and_get_oof_accuracies(X_reduced, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.427786
[200]	valid_0's multi_logloss: 0.431585
Early stopping, best iteration is:
[149]	valid_0's multi_logloss: 0.408679
Accuracy is 0.8738983050847458 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.424189
[200]	valid_0's multi_logloss: 0.408522
Early stopping, best iteration is:
[166]	valid_0's multi_logloss: 0.399322
Accuracy is 0.8772048846675712 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.411262
[200]	valid_0's multi_logloss: 0.398127
Early stopping, best iteration is:
[147]	valid_0's multi_logloss: 0.387545
Accuracy is 0.8750848608282417 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.404523
[200]	valid_0's multi_logloss: 0.408925
Early stopping, best iteration is:
[132]	valid_0

## TF-IDF

In [17]:
vectorizer = TfidfVectorizer(use_idf=True, token_pattern=u'(?u)\\b\\w+\\b')
X = vectorizer.fit_transform(news_df.wakati_text.values)
X = X.toarray()

In [18]:
X.shape

(7367, 71646)

In [19]:
accs_dict['tfidf'], elapsed_times_dict['tfidf'] = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.168869
[200]	valid_0's multi_logloss: 0.189717
Early stopping, best iteration is:
[101]	valid_0's multi_logloss: 0.168724
Accuracy is 0.9491525423728814 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.164664
[200]	valid_0's multi_logloss: 0.180684
Early stopping, best iteration is:
[105]	valid_0's multi_logloss: 0.163739
Accuracy is 0.9497964721845319 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.145545
[200]	valid_0's multi_logloss: 0.145236
Early stopping, best iteration is:
[117]	valid_0's multi_logloss: 0.141311
Accuracy is 0.9599456890699253 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.140006
[200]	valid_0's multi_logloss: 0.162954
Early stopping, best iteration is:
[108]	valid_0

## TF-IDF+TruncatedSVD

In [20]:
tsvd = TruncatedSVD(n_components=100)
X_reduced = tsvd.fit_transform(X)

In [21]:
X_reduced.shape

(7367, 100)

In [22]:
accs_dict['tfidf_tsvd'], elapsed_times_dict['tfidf_tsvd']  = train_and_get_oof_accuracies(X_reduced, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.270505
Early stopping, best iteration is:
[93]	valid_0's multi_logloss: 0.268477
Accuracy is 0.9159322033898305 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.305707
Early stopping, best iteration is:
[90]	valid_0's multi_logloss: 0.300025
Accuracy is 0.9063772048846676 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.32101
Early stopping, best iteration is:
[97]	valid_0's multi_logloss: 0.319156
Accuracy is 0.8934147997284454 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.296192
Early stopping, best iteration is:
[83]	valid_0's multi_logloss: 0.291762
Accuracy is 0.9124236252545825 

Start: fold 5
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_loglos

## Word2Vec -mean

In [23]:
corpus = [doc.split() for doc in news_df.wakati_text.values]
model_w2v = word2vec.Word2Vec(corpus, size=300, min_count=20, window=10)
model_w2v.save('../model/news_w2v.model')

In [24]:
def get_doc_mean_vector(doc, model):
    doc_vector = np.zeros(model.vector_size)
    words = doc.split()
    word_cnt = 0
    for word in words:
        try:
            word_vector = model.wv[word]
            doc_vector += word_vector
            word_cnt += 1
        except KeyError:
            pass
    doc_vector /= word_cnt
    return doc_vector

In [25]:
X = np.zeros((len(news_df), model_w2v.wv.vector_size))

for i, doc in tqdm_notebook(enumerate(news_df.wakati_text.values)):
    X[i, :] = get_doc_mean_vector(doc, model_w2v)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [26]:
X.shape

(7367, 300)

In [27]:
accs_dict['w2v_mean'], elapsed_times_dict['w2v_mean']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.349747
Early stopping, best iteration is:
[96]	valid_0's multi_logloss: 0.348576
Accuracy is 0.8888135593220339 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.324592
Early stopping, best iteration is:
[93]	valid_0's multi_logloss: 0.323728
Accuracy is 0.8955223880597015 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.370943
Early stopping, best iteration is:
[88]	valid_0's multi_logloss: 0.367494
Accuracy is 0.8839103869653768 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.329032
Early stopping, best iteration is:
[86]	valid_0's multi_logloss: 0.326502
Accuracy is 0.9049558723693143 

Start: fold 5
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_loglo

## SWEM-MAX

In [28]:
def get_doc_swem_max_vector(doc, model):
    words = doc.split()
    word_cnt = 0
    vector_size = model.vector_size
    
    doc_vector = np.zeros((len(words), vector_size))

    for i, word in enumerate(words):
        try:
            word_vector = model.wv[word]
        except KeyError:
            word_vector = np.zeros(vector_size)
        
        doc_vector[i, :] = word_vector

    doc_vector = np.max(doc_vector, axis=0)
    return doc_vector

In [29]:
X = np.zeros((len(news_df), model_w2v.wv.vector_size))

for i, doc in tqdm_notebook(enumerate(news_df.wakati_text.values)):
    X[i, :] = get_doc_swem_max_vector(doc, model_w2v)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [30]:
X.shape

(7367, 300)

In [31]:
accs_dict['swem_max'], elapsed_times_dict['swem_max']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.252278
[200]	valid_0's multi_logloss: 0.282172
Early stopping, best iteration is:
[110]	valid_0's multi_logloss: 0.250547
Accuracy is 0.9179661016949152 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.303795
Early stopping, best iteration is:
[99]	valid_0's multi_logloss: 0.303722
Accuracy is 0.9077340569877883 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.290364
[200]	valid_0's multi_logloss: 0.318144
Early stopping, best iteration is:
[118]	valid_0's multi_logloss: 0.288602
Accuracy is 0.9137813985064495 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.252443
[200]	valid_0's multi_logloss: 0.286379
Early stopping, best iteration is:
[106]	valid_0's multi_logloss: 0.251445
Accuracy is 0.

## SCDV -w2v

In [33]:
#https://github.com/nyk510/scdv-python/blob/master/src/create.py より引用、一部改変
def create_document_vector(documents, w2t, n_embedding):
    """
    学習済みの word topic vector と分かち書き済みの文章, 使用されている単語から
    文章ベクトルを作成するメソッド.
    Args:
        documents(list[list[str]]):
        w2t(dict): 単語 -> 埋め込み次元の dict
        n_embedding(int):
    Returns:
        embedded document vector
    """
    doc_vectors = []

    for doc in documents:
        vector_i = np.zeros(shape=(n_embedding,))
        for w in doc:
            try:
                v = w2t[w]
                vector_i += v
            except KeyError:
                continue
        doc_vectors.append(vector_i)
    return np.array(doc_vectors)

def create_idf_dataframe(documents):
    """
    Args:
        documents(list[str]):
    Returns(pd.DataFrame):
    """

    d = defaultdict(int)

    for doc in documents:
        vocab_i = set(doc)
        for w in list(vocab_i):
            d[w] += 1

    df_idf = pd.DataFrame()
    df_idf['count'] = d.values()
    df_idf['word'] = d.keys()
    df_idf['idf'] = np.log(len(documents) / df_idf['count'])
    return df_idf

def compress_document_vector(doc_vector, p=.04):
    v = np.copy(doc_vector)
    vec_norm = np.linalg.norm(v, axis=1)
    # zero divide しないように
    vec_norm = np.where(vec_norm > 0, vec_norm, 1.)
    v /= vec_norm[:, None]

    a_min = v.min(axis=1).mean()
    a_max = v.max(axis=1).mean()
    threshold = (abs(a_min) + abs(a_max)) / 2. * p
    v[abs(v) < threshold] = .0
    return v

def get_scdv(parsed_docs, word_vec=model_w2v, n_components=60, compress=True):

    n_wv_embed = word_vec.vector_size

    # w2v model と corpus の語彙集合を作成
    vocab_model = set(k for k in word_vec.wv.vocab.keys())
    vocab_docs = set([w for doc in parsed_docs for w in doc])
    out_of_vocabs = len(vocab_docs) - len(vocab_docs & vocab_model)
    print('out of vocabs: {out_of_vocabs}'.format(**locals()))

    # 使う文章に入っているものだけ学習させるため共通集合を取得してその word vector を GMM の入力にする
    use_words = list(vocab_docs & vocab_model)

    # 使う単語分だけ word vector を取得. よって shape = (n_vocabs, n_wv_embed,)
    use_word_vectors = np.array([word_vec[w] for w in use_words])

    # 公式実装: https://github.com/dheeraj7596/SCDV/blob/master/20news/SCDV.py#L32 により tied で学習
    # 共分散行列全部推定する必要が有るほど低次元ではないという判断?
    # -> 多分各クラスの分散を共通化することで各クラスに所属するデータ数を揃えたいとうのがお気持ちっぽい
    clf = GaussianMixture(n_components=n_components, covariance_type='tied', verbose=2)
    clf.fit(use_word_vectors)

    # word probs は各単語のクラスタへの割当確率なので shape = (n_vocabs, n_components,)
    word_probs = clf.predict_proba(use_word_vectors)

    # 単語ごとにクラスタへの割当確率を wv に対して掛け算する
    # shape = (n_vocabs, n_components, n_wv_embed) になる
    word_cluster_vector = use_word_vectors[:, None, :] * word_probs[:, :, None]

    # はじめに文章全体の idf を作成した後, use_word だけの df と left join して
    # 使用している単語の idf を取得
    df_use = pd.DataFrame()
    df_use['word'] = use_words
    df_idf = create_idf_dataframe(parsed_docs)
    df_use = pd.merge(df_use, df_idf, on='word', how='left')
    idf = df_use['idf'].values

    # topic vector を計算するときに concatenation するとあるが
    # 単に 二次元のベクトルに変形して各 vocab に対して idf をかければ OK
    topic_vector = word_cluster_vector.reshape(-1, n_components * n_wv_embed) * idf[:, None]
    # nanで影響が出ないように 0 で埋める
    topic_vector[np.isnan(topic_vector)] = 0
    word_to_topic = dict(zip(use_words, topic_vector))

    n_embedding = topic_vector.shape[1]

    cdv_vector = create_document_vector(parsed_docs, word_to_topic, n_embedding)
    if compress:
        compressed = compress_document_vector(cdv_vector)
        return compressed
    else:
        return cdv_vector

In [34]:
X = get_scdv(news_df.wakati_text.values)

out of vocabs: 2567
Initialization 0




  Iteration 10	 time lapse 0.97443s	 ll change 0.02409
Initialization converged: True	 time lapse 1.62653s	 ll 555.91177


In [35]:
X.shape

(7367, 18000)

In [36]:
accs_dict['scdv_w2v'], elapsed_times_dict['scdv_w2v']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.408476
[200]	valid_0's multi_logloss: 0.455125
Early stopping, best iteration is:
[111]	valid_0's multi_logloss: 0.406118
Accuracy is 0.8684745762711864 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.399024
[200]	valid_0's multi_logloss: 0.433227
Early stopping, best iteration is:
[129]	valid_0's multi_logloss: 0.393422
Accuracy is 0.8799185888738128 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.415187
[200]	valid_0's multi_logloss: 0.455897
Early stopping, best iteration is:
[114]	valid_0's multi_logloss: 0.414618
Accuracy is 0.86693822131704 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.354444
[200]	valid_0's multi_logloss: 0.385248
Early stopping, best iteration is:
[126]	valid_0's

## Doc2Vec - default

In [37]:
corpus = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(news_df.wakati_text.values)]
model = Doc2Vec(vector_size=300)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [38]:
X = np.array([model.infer_vector(doc.split()) for doc in news_df.wakati_text.values])

In [39]:
X.shape

(7367, 300)

In [40]:
accs_dict['d2v_default'], elapsed_times_dict['d2v_default']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.410166
[200]	valid_0's multi_logloss: 0.438682
Early stopping, best iteration is:
[123]	valid_0's multi_logloss: 0.404398
Accuracy is 0.8515254237288136 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.375066
[200]	valid_0's multi_logloss: 0.397589
Early stopping, best iteration is:
[119]	valid_0's multi_logloss: 0.367302
Accuracy is 0.8792401628222524 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.45298
[200]	valid_0's multi_logloss: 0.496435
Early stopping, best iteration is:
[121]	valid_0's multi_logloss: 0.449182
Accuracy is 0.8533604887983707 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.394936
[200]	valid_0's multi_logloss: 0.421278
Early stopping, best iteration is:
[123]	valid_0'

## Doc2Vec - Epochs30

In [41]:
corpus = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(news_df.wakati_text.values)]
model = Doc2Vec(vector_size=300)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=30)

In [42]:
X = np.array([model.infer_vector(doc.split()) for doc in news_df.wakati_text.values])

In [43]:
X.shape

(7367, 300)

In [44]:
accs_dict['d2v_epochs30'], elapsed_times_dict['d2v_epochs30']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.478027
[200]	valid_0's multi_logloss: 0.394044
[300]	valid_0's multi_logloss: 0.414963
Early stopping, best iteration is:
[212]	valid_0's multi_logloss: 0.391872
Accuracy is 0.8745762711864407 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.437851
[200]	valid_0's multi_logloss: 0.340627
[300]	valid_0's multi_logloss: 0.345039
Early stopping, best iteration is:
[236]	valid_0's multi_logloss: 0.337321
Accuracy is 0.8914518317503393 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.452152
[200]	valid_0's multi_logloss: 0.358036
[300]	valid_0's multi_logloss: 0.369286
Early stopping, best iteration is:
[219]	valid_0's multi_logloss: 0.356541
Accuracy is 0.8805159538357095 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	va

## Word2Vec -mean -pretrained fastText

In [45]:
import gensim.models.keyedvectors as keyedvectors
model_fasttext = keyedvectors.KeyedVectors.load_word2vec_format('../model/fasttext.vec')

In [46]:
X = np.zeros((len(news_df), model_fasttext.wv.vector_size))

for i, doc in tqdm_notebook(enumerate(news_df.wakati_text.values)):
    X[i, :] = get_doc_mean_vector(doc, model_fasttext)

  """Entry point for launching an IPython kernel.


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

  import sys





In [47]:
X.shape

(7367, 300)

In [48]:
accs_dict['fasttext_mean'], elapsed_times_dict['fasttext_mean']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.411041
[200]	valid_0's multi_logloss: 0.481018
Early stopping, best iteration is:
[102]	valid_0's multi_logloss: 0.410916
Accuracy is 0.8671186440677966 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.36119
[200]	valid_0's multi_logloss: 0.402293
Early stopping, best iteration is:
[108]	valid_0's multi_logloss: 0.359622
Accuracy is 0.8853459972862958 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.43532
Early stopping, best iteration is:
[98]	valid_0's multi_logloss: 0.434781
Accuracy is 0.858112695179905 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.365556
Early stopping, best iteration is:
[94]	valid_0's multi_logloss: 0.364605
Accuracy is 0.879837067209776 

Start: fold 5
Training unt

## SWEM -MAX -pretraned fastText

In [49]:
X = np.zeros((len(news_df), model_fasttext.wv.vector_size))

for i, doc in tqdm_notebook(enumerate(news_df.wakati_text.values)):
    X[i, :] = get_doc_swem_max_vector(doc, model_fasttext)

  """Entry point for launching an IPython kernel.


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

  # Remove the CWD from sys.path while we load stuff.





In [50]:
X.shape

(7367, 300)

In [51]:
accs_dict['swem_max_fasttext'], elapsed_times_dict['swem_max_fasttext']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.27356
[200]	valid_0's multi_logloss: 0.273228
Early stopping, best iteration is:
[145]	valid_0's multi_logloss: 0.260954
Accuracy is 0.9138983050847458 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.286736
[200]	valid_0's multi_logloss: 0.286457
Early stopping, best iteration is:
[126]	valid_0's multi_logloss: 0.275669
Accuracy is 0.9158751696065129 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.287594
[200]	valid_0's multi_logloss: 0.275379
Early stopping, best iteration is:
[154]	valid_0's multi_logloss: 0.268537
Accuracy is 0.911744738628649 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.293082
[200]	valid_0's multi_logloss: 0.296688
Early stopping, best iteration is:
[135]	valid_0's

## SCDV -pretrained fastText

In [52]:
X = get_scdv(news_df.wakati_text.values, word_vec=model_fasttext)



out of vocabs: 463
Initialization 0
  Iteration 10	 time lapse 3.31651s	 ll change 0.00775
  Iteration 20	 time lapse 3.05870s	 ll change 0.00595
Initialization converged: True	 time lapse 7.90065s	 ll 23.37736


In [53]:
X.shape

(7367, 18000)

In [54]:
accs_dict['scdv_fasttext'], elapsed_times_dict['scdv_fasttext']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.402941
[200]	valid_0's multi_logloss: 0.448402
Early stopping, best iteration is:
[107]	valid_0's multi_logloss: 0.398791
Accuracy is 0.8772881355932204 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.405631
[200]	valid_0's multi_logloss: 0.447015
Early stopping, best iteration is:
[115]	valid_0's multi_logloss: 0.403606
Accuracy is 0.878561736770692 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.427582
[200]	valid_0's multi_logloss: 0.473779
Early stopping, best iteration is:
[118]	valid_0's multi_logloss: 0.422557
Accuracy is 0.8737270875763747 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.380217
[200]	valid_0's multi_logloss: 0.422549
Early stopping, best iteration is:
[115]	valid_0'

## SCDV - pretrained fastText -raw

In [56]:
X = get_scdv(news_df.wakati_text.values, word_vec=model_fasttext, compress=False)



out of vocabs: 463
Initialization 0
  Iteration 10	 time lapse 3.28870s	 ll change 0.01126
  Iteration 20	 time lapse 3.05809s	 ll change 0.00252
Initialization converged: True	 time lapse 6.65311s	 ll 23.50536


In [57]:
X.shape

(7367, 18000)

In [58]:
accs_dict['scdv_fasttext_raw'], elapsed_times_dict['scdv_fasttext_raw']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.302716
[200]	valid_0's multi_logloss: 0.33929
Early stopping, best iteration is:
[104]	valid_0's multi_logloss: 0.301412
Accuracy is 0.9064406779661017 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.257149
[200]	valid_0's multi_logloss: 0.283197
Early stopping, best iteration is:
[112]	valid_0's multi_logloss: 0.254539
Accuracy is 0.9219810040705563 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.279389
[200]	valid_0's multi_logloss: 0.296254
Early stopping, best iteration is:
[132]	valid_0's multi_logloss: 0.277186
Accuracy is 0.9205702647657841 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.273663
[200]	valid_0's multi_logloss: 0.298073
Early stopping, best iteration is:
[121]	valid_0'

## SCDV -w2v -raw

In [59]:
X = get_scdv(news_df.wakati_text.values, word_vec=model_w2v, compress=False)

out of vocabs: 2567
Initialization 0




  Iteration 10	 time lapse 1.10014s	 ll change 0.00740
Initialization converged: True	 time lapse 1.42006s	 ll 556.31595


In [60]:
X.shape

(7367, 18000)

In [61]:
accs_dict['scdv_w2v_raw'], elapsed_times_dict['scdv_w2v_raw']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.331121
[200]	valid_0's multi_logloss: 0.364291
Early stopping, best iteration is:
[108]	valid_0's multi_logloss: 0.328557
Accuracy is 0.8969491525423728 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.316796
[200]	valid_0's multi_logloss: 0.343936
Early stopping, best iteration is:
[125]	valid_0's multi_logloss: 0.311626
Accuracy is 0.8989145183175034 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.329839
[200]	valid_0's multi_logloss: 0.360964
Early stopping, best iteration is:
[109]	valid_0's multi_logloss: 0.326948
Accuracy is 0.8906992532247114 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.283325
[200]	valid_0's multi_logloss: 0.311122
Early stopping, best iteration is:
[116]	valid_0

In [62]:
results = [accs_dict, elapsed_times_dict]
pd.to_pickle(results, '../data/results_news.pkl')