In [1]:
import numpy as np
import pandas as pd
import MeCab
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
from gensim.models import word2vec
from sklearn.mixture import GaussianMixture
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from collections import defaultdict
from tqdm import tqdm, tqdm_pandas, tqdm_notebook
import time
tqdm.pandas(tqdm_notebook)

In [2]:
roomba_df = pd.read_csv('../data/roomba.csv.gz')

In [4]:
#ちょっと前処理
roomba_df = roomba_df[['is_positive', 'tweet_text']]
roomba_df.columns = ['label', 'text']
roomba_df.dropna(inplace=True)

In [5]:
def get_wakati_text(text):
    tagger = MeCab.Tagger('-Owakati')
    wakati_text = tagger.parse(text).strip()
    return wakati_text

In [6]:
roomba_df['wakati_text'] = roomba_df.text.progress_apply(get_wakati_text)

100%|██████████| 1288/1288 [00:00<00:00, 2688.32it/s]


In [8]:
y = roomba_df.label.values
accs_dict = {}
elapsed_times_dict = {}

In [9]:
def train_and_get_oof_accuracies(X, y, params):
    START_TIME = time.time()
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    accuracies = []

    for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
        print(f'Start: fold {i+1}')
        X_train, y_train = X[train_index, :], y[train_index]
        X_valid, y_valid = X[valid_index, :], y[valid_index]
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train,
            y_train,
            eval_set=(X_valid, y_valid),
            eval_metric='auc',
            early_stopping_rounds=100,
            verbose=100
        )
        y_pred = model.predict(X_valid)
        accuracy = accuracy_score(y_valid, y_pred)
        print(f'Accuracy is {accuracy} \n')
        accuracies.append(accuracy)

    elapsed_time = time.time() - START_TIME
    print(f'Elapsed time is {elapsed_time}.')
    return accuracies, elapsed_time

In [10]:
params = {
    'objective': 'binary',
    'n_estimators': 10000,
    'random_seed': 0
}

## BoW

In [11]:
vectorizer = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b')
X = vectorizer.fit_transform(roomba_df.wakati_text.values)
X = X.toarray()

In [12]:
X.shape

(1288, 4331)

In [13]:
accs_dict['bow'], elapsed_times_dict['bow'] = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.770763	valid_0's binary_logloss: 0.623451
Early stopping, best iteration is:
[27]	valid_0's auc: 0.778346	valid_0's binary_logloss: 0.558682
Accuracy is 0.7131782945736435 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.809641	valid_0's binary_logloss: 0.543696
Early stopping, best iteration is:
[49]	valid_0's auc: 0.814275	valid_0's binary_logloss: 0.515239
Accuracy is 0.7325581395348837 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.820805	valid_0's binary_logloss: 0.5511
Early stopping, best iteration is:
[54]	valid_0's auc: 0.822972	valid_0's binary_logloss: 0.515779
Accuracy is 0.7325581395348837 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.793593	valid_0's binary_logloss: 0.582118
Early stopping, best iteration is:
[34

## BoW+TruncatedSVD

In [14]:
tsvd = TruncatedSVD(n_components=100)
X_reduced = tsvd.fit_transform(X)

In [15]:
X_reduced.shape

(1288, 100)

In [16]:
accs_dict['bow_tsvd'], elapsed_times_dict['bow_tsvd'] = train_and_get_oof_accuracies(X_reduced, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.78617	valid_0's binary_logloss: 0.616381
Early stopping, best iteration is:
[54]	valid_0's auc: 0.784726	valid_0's binary_logloss: 0.563807
Accuracy is 0.689922480620155 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.761555	valid_0's binary_logloss: 0.659433
Early stopping, best iteration is:
[43]	valid_0's auc: 0.755477	valid_0's binary_logloss: 0.591858
Accuracy is 0.6666666666666666 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.817646	valid_0's binary_logloss: 0.547301
Early stopping, best iteration is:
[54]	valid_0's auc: 0.808438	valid_0's binary_logloss: 0.530274
Accuracy is 0.7403100775193798 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.805424	valid_0's binary_logloss: 0.550697
Early stopping, best iteration is:
[82

## TF-IDF

In [17]:
vectorizer = TfidfVectorizer(use_idf=True, token_pattern=u'(?u)\\b\\w+\\b')
X = vectorizer.fit_transform(roomba_df.wakati_text.values)
X = X.toarray()

In [18]:
X.shape

(1288, 4331)

In [19]:
accs_dict['tfidf'], elapsed_times_dict['tfidf'] = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.766791	valid_0's binary_logloss: 0.65022
Early stopping, best iteration is:
[29]	valid_0's auc: 0.774254	valid_0's binary_logloss: 0.559433
Accuracy is 0.6821705426356589 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.811928	valid_0's binary_logloss: 0.579482
Early stopping, best iteration is:
[41]	valid_0's auc: 0.8026	valid_0's binary_logloss: 0.526978
Accuracy is 0.7364341085271318 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.820234	valid_0's binary_logloss: 0.572954
Early stopping, best iteration is:
[34]	valid_0's auc: 0.819842	valid_0's binary_logloss: 0.51058
Accuracy is 0.7558139534883721 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.782702	valid_0's binary_logloss: 0.618418
Early stopping, best iteration is:
[31]	

## TF-IDF+TruncatedSVD

In [20]:
tsvd = TruncatedSVD(n_components=100)
X_reduced = tsvd.fit_transform(X)

In [21]:
X_reduced.shape

(1288, 100)

In [22]:
accs_dict['tfidf_tsvd'], elapsed_times_dict['tfidf_tsvd']  = train_and_get_oof_accuracies(X_reduced, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.841237	valid_0's binary_logloss: 0.556817
Early stopping, best iteration is:
[35]	valid_0's auc: 0.839552	valid_0's binary_logloss: 0.496335
Accuracy is 0.7596899224806202 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.817525	valid_0's binary_logloss: 0.561323
Early stopping, best iteration is:
[54]	valid_0's auc: 0.809882	valid_0's binary_logloss: 0.529988
Accuracy is 0.7015503875968992 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.854899	valid_0's binary_logloss: 0.496176
Early stopping, best iteration is:
[50]	valid_0's auc: 0.859052	valid_0's binary_logloss: 0.462638
Accuracy is 0.7713178294573644 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.849047	valid_0's binary_logloss: 0.521497
Early stopping, best iteration is:
[

## Word2Vec -mean

In [23]:
corpus = [doc.split() for doc in roomba_df.wakati_text.values]
model_w2v = word2vec.Word2Vec(corpus, size=300, min_count=20, window=10)
model_w2v.save('../model/roomba_w2v.model')

In [24]:
def get_doc_mean_vector(doc, model):
    doc_vector = np.zeros(model.vector_size)
    words = doc.split()
    word_cnt = 0
    for word in words:
        try:
            word_vector = model.wv[word]
            doc_vector += word_vector
            word_cnt += 1
        except KeyError:
            pass
    doc_vector /= word_cnt
    return doc_vector

In [25]:
X = np.zeros((len(roomba_df), model_w2v.wv.vector_size))

for i, doc in tqdm_notebook(enumerate(roomba_df.wakati_text.values)):
    X[i, :] = get_doc_mean_vector(doc, model_w2v)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [26]:
X.shape

(1288, 300)

In [27]:
accs_dict['w2v_mean'], elapsed_times_dict['w2v_mean']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.674862	valid_0's binary_logloss: 0.769171
Early stopping, best iteration is:
[23]	valid_0's auc: 0.656084	valid_0's binary_logloss: 0.659719
Accuracy is 0.6201550387596899 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.655152	valid_0's binary_logloss: 0.781665
Early stopping, best iteration is:
[10]	valid_0's auc: 0.613325	valid_0's binary_logloss: 0.674009
Accuracy is 0.6124031007751938 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.675674	valid_0's binary_logloss: 0.754755
Early stopping, best iteration is:
[16]	valid_0's auc: 0.668272	valid_0's binary_logloss: 0.642768
Accuracy is 0.6046511627906976 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.667395	valid_0's binary_logloss: 0.785201
Early stopping, best iteration is:
[

## SWEM-MAX

In [28]:
def get_doc_swem_max_vector(doc, model):
    words = doc.split()
    word_cnt = 0
    vector_size = model.vector_size
    
    doc_vector = np.zeros((len(words), vector_size))

    for i, word in enumerate(words):
        try:
            word_vector = model.wv[word]
        except KeyError:
            word_vector = np.zeros(vector_size)
        
        doc_vector[i, :] = word_vector

    doc_vector = np.max(doc_vector, axis=0)
    return doc_vector

In [29]:
X = np.zeros((len(roomba_df), model_w2v.wv.vector_size))

for i, doc in tqdm_notebook(enumerate(roomba_df.wakati_text.values)):
    X[i, :] = get_doc_swem_max_vector(doc, model_w2v)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [30]:
X.shape

(1288, 300)

In [31]:
accs_dict['swem_max'], elapsed_times_dict['swem_max']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.644048	valid_0's binary_logloss: 0.795825
Early stopping, best iteration is:
[8]	valid_0's auc: 0.69743	valid_0's binary_logloss: 0.635734
Accuracy is 0.627906976744186 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.69409	valid_0's binary_logloss: 0.73212
Early stopping, best iteration is:
[28]	valid_0's auc: 0.711663	valid_0's binary_logloss: 0.625126
Accuracy is 0.6705426356589147 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.660809	valid_0's binary_logloss: 0.766551
Early stopping, best iteration is:
[23]	valid_0's auc: 0.686928	valid_0's binary_logloss: 0.640245
Accuracy is 0.6550387596899225 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.688205	valid_0's binary_logloss: 0.728406
Early stopping, best iteration is:
[6]	va

## SCDV -w2v

In [32]:
#https://github.com/nyk510/scdv-python/blob/master/src/create.py より引用、一部改変
def create_document_vector(documents, w2t, n_embedding):
    """
    学習済みの word topic vector と分かち書き済みの文章, 使用されている単語から
    文章ベクトルを作成するメソッド.
    Args:
        documents(list[list[str]]):
        w2t(dict): 単語 -> 埋め込み次元の dict
        n_embedding(int):
    Returns:
        embedded document vector
    """
    doc_vectors = []

    for doc in documents:
        vector_i = np.zeros(shape=(n_embedding,))
        for w in doc:
            try:
                v = w2t[w]
                vector_i += v
            except KeyError:
                continue
        doc_vectors.append(vector_i)
    return np.array(doc_vectors)

def create_idf_dataframe(documents):
    """
    Args:
        documents(list[str]):
    Returns(pd.DataFrame):
    """

    d = defaultdict(int)

    for doc in documents:
        vocab_i = set(doc)
        for w in list(vocab_i):
            d[w] += 1

    df_idf = pd.DataFrame()
    df_idf['count'] = d.values()
    df_idf['word'] = d.keys()
    df_idf['idf'] = np.log(len(documents) / df_idf['count'])
    return df_idf

def compress_document_vector(doc_vector, p=.04):
    v = np.copy(doc_vector)
    vec_norm = np.linalg.norm(v, axis=1)
    # zero divide しないように
    vec_norm = np.where(vec_norm > 0, vec_norm, 1.)
    v /= vec_norm[:, None]

    a_min = v.min(axis=1).mean()
    a_max = v.max(axis=1).mean()
    threshold = (abs(a_min) + abs(a_max)) / 2. * p
    v[abs(v) < threshold] = .0
    return v

def get_scdv(parsed_docs, word_vec=model_w2v, n_components=60, compress=True):

    n_wv_embed = word_vec.vector_size

    # w2v model と corpus の語彙集合を作成
    vocab_model = set(k for k in word_vec.wv.vocab.keys())
    vocab_docs = set([w for doc in parsed_docs for w in doc])
    out_of_vocabs = len(vocab_docs) - len(vocab_docs & vocab_model)
    print('out of vocabs: {out_of_vocabs}'.format(**locals()))

    # 使う文章に入っているものだけ学習させるため共通集合を取得してその word vector を GMM の入力にする
    use_words = list(vocab_docs & vocab_model)

    # 使う単語分だけ word vector を取得. よって shape = (n_vocabs, n_wv_embed,)
    use_word_vectors = np.array([word_vec[w] for w in use_words])

    # 公式実装: https://github.com/dheeraj7596/SCDV/blob/master/20news/SCDV.py#L32 により tied で学習
    # 共分散行列全部推定する必要が有るほど低次元ではないという判断?
    # -> 多分各クラスの分散を共通化することで各クラスに所属するデータ数を揃えたいとうのがお気持ちっぽい
    clf = GaussianMixture(n_components=n_components, covariance_type='tied', verbose=2)
    clf.fit(use_word_vectors)

    # word probs は各単語のクラスタへの割当確率なので shape = (n_vocabs, n_components,)
    word_probs = clf.predict_proba(use_word_vectors)

    # 単語ごとにクラスタへの割当確率を wv に対して掛け算する
    # shape = (n_vocabs, n_components, n_wv_embed) になる
    word_cluster_vector = use_word_vectors[:, None, :] * word_probs[:, :, None]

    # はじめに文章全体の idf を作成した後, use_word だけの df と left join して
    # 使用している単語の idf を取得
    df_use = pd.DataFrame()
    df_use['word'] = use_words
    df_idf = create_idf_dataframe(parsed_docs)
    df_use = pd.merge(df_use, df_idf, on='word', how='left')
    idf = df_use['idf'].values

    # topic vector を計算するときに concatenation するとあるが
    # 単に 二次元のベクトルに変形して各 vocab に対して idf をかければ OK
    topic_vector = word_cluster_vector.reshape(-1, n_components * n_wv_embed) * idf[:, None]
    # nanで影響が出ないように 0 で埋める
    topic_vector[np.isnan(topic_vector)] = 0
    word_to_topic = dict(zip(use_words, topic_vector))

    n_embedding = topic_vector.shape[1]

    cdv_vector = create_document_vector(parsed_docs, word_to_topic, n_embedding)
    if compress:
        compressed = compress_document_vector(cdv_vector)
        return compressed
    else:
        return cdv_vector

In [33]:
X = get_scdv(roomba_df.wakati_text.values)

out of vocabs: 1535
Initialization 0
Initialization converged: True	 time lapse 0.03726s	 ll 1768.58434




In [34]:
X.shape

(1288, 18000)

In [35]:
accs_dict['scdv_w2v'], elapsed_times_dict['scdv_w2v']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.693247	valid_0's binary_logloss: 0.736588
Early stopping, best iteration is:
[24]	valid_0's auc: 0.697159	valid_0's binary_logloss: 0.635992
Accuracy is 0.6550387596899225 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.652985	valid_0's binary_logloss: 0.832846
Early stopping, best iteration is:
[11]	valid_0's auc: 0.660538	valid_0's binary_logloss: 0.653324
Accuracy is 0.5968992248062015 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.711874	valid_0's binary_logloss: 0.683685
Early stopping, best iteration is:
[20]	valid_0's auc: 0.724633	valid_0's binary_logloss: 0.609547
Accuracy is 0.6821705426356589 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.743993	valid_0's binary_logloss: 0.629808
Early stopping, best iteration is:
[

## Doc2Vec - default

In [36]:
corpus = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(roomba_df.wakati_text.values)]
model = Doc2Vec(vector_size=300)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [37]:
X = np.array([model.infer_vector(doc.split()) for doc in roomba_df.wakati_text.values])

In [38]:
X.shape

(1288, 300)

In [39]:
accs_dict['d2v_default'], elapsed_times_dict['d2v_default']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.475806	valid_0's binary_logloss: 0.91858
Early stopping, best iteration is:
[1]	valid_0's auc: 0.5263	valid_0's binary_logloss: 0.691946
Accuracy is 0.5232558139534884 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.462085	valid_0's binary_logloss: 0.883262
Early stopping, best iteration is:
[2]	valid_0's auc: 0.606674	valid_0's binary_logloss: 0.682403
Accuracy is 0.6046511627906976 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.472015	valid_0's binary_logloss: 0.881163
Early stopping, best iteration is:
[4]	valid_0's auc: 0.536471	valid_0's binary_logloss: 0.691915
Accuracy is 0.5193798449612403 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.528637	valid_0's binary_logloss: 0.834033
Early stopping, best iteration is:
[1]	val

## Doc2Vec - Epochs30

In [40]:
corpus = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(roomba_df.wakati_text.values)]
model = Doc2Vec(vector_size=300)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=30)

In [41]:
X = np.array([model.infer_vector(doc.split()) for doc in roomba_df.wakati_text.values])

In [42]:
X.shape

(1288, 300)

In [43]:
accs_dict['d2v_epochs30'], elapsed_times_dict['d2v_epochs30']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.690299	valid_0's binary_logloss: 0.815305
Early stopping, best iteration is:
[17]	valid_0's auc: 0.678021	valid_0's binary_logloss: 0.646876
Accuracy is 0.6395348837209303 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.66117	valid_0's binary_logloss: 0.823108
Early stopping, best iteration is:
[7]	valid_0's auc: 0.652383	valid_0's binary_logloss: 0.658736
Accuracy is 0.6201550387596899 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.741213	valid_0's binary_logloss: 0.670422
Early stopping, best iteration is:
[50]	valid_0's auc: 0.747292	valid_0's binary_logloss: 0.602455
Accuracy is 0.6744186046511628 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.695122	valid_0's binary_logloss: 0.772082
Early stopping, best iteration is:
[18

## Word2Vec -mean -pretrained fastText

In [44]:
import gensim.models.keyedvectors as keyedvectors
model_fasttext = keyedvectors.KeyedVectors.load_word2vec_format('../model/fasttext.vec')

In [45]:
X = np.zeros((len(roomba_df), model_fasttext.wv.vector_size))

for i, doc in tqdm_notebook(enumerate(roomba_df.wakati_text.values)):
    X[i, :] = get_doc_mean_vector(doc, model_fasttext)

  """Entry point for launching an IPython kernel.


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

  import sys





In [46]:
X.shape

(1288, 300)

In [47]:
accs_dict['fasttext_mean'], elapsed_times_dict['fasttext_mean']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.852792	valid_0's binary_logloss: 0.501117
Early stopping, best iteration is:
[56]	valid_0's auc: 0.846112	valid_0's binary_logloss: 0.483581
Accuracy is 0.7596899224806202 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.861278	valid_0's binary_logloss: 0.48214
Early stopping, best iteration is:
[74]	valid_0's auc: 0.850927	valid_0's binary_logloss: 0.480044
Accuracy is 0.7751937984496124 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.872232	valid_0's binary_logloss: 0.453854
Early stopping, best iteration is:
[96]	valid_0's auc: 0.87157	valid_0's binary_logloss: 0.451187
Accuracy is 0.7984496124031008 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.854933	valid_0's binary_logloss: 0.48944
Early stopping, best iteration is:
[68]

## SWEM -MAX -pretraned fastText

In [48]:
X = np.zeros((len(roomba_df), model_fasttext.wv.vector_size))

for i, doc in tqdm_notebook(enumerate(roomba_df.wakati_text.values)):
    X[i, :] = get_doc_swem_max_vector(doc, model_fasttext)

  """Entry point for launching an IPython kernel.


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




  # Remove the CWD from sys.path while we load stuff.


In [49]:
X.shape

(1288, 300)

In [50]:
accs_dict['swem_max_fasttext'], elapsed_times_dict['swem_max_fasttext']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.834015	valid_0's binary_logloss: 0.524641
Early stopping, best iteration is:
[54]	valid_0's auc: 0.83558	valid_0's binary_logloss: 0.495617
Accuracy is 0.7751937984496124 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.866514	valid_0's binary_logloss: 0.48406
Early stopping, best iteration is:
[57]	valid_0's auc: 0.867176	valid_0's binary_logloss: 0.458468
Accuracy is 0.7945736434108527 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.873736	valid_0's binary_logloss: 0.461715
Early stopping, best iteration is:
[75]	valid_0's auc: 0.872232	valid_0's binary_logloss: 0.444634
Accuracy is 0.7751937984496124 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.826781	valid_0's binary_logloss: 0.573995
Early stopping, best iteration is:
[50

## SCDV -pretrained fastText

In [51]:
X = get_scdv(roomba_df.wakati_text.values, word_vec=model_fasttext)

out of vocabs: 139
Initialization 0




  Iteration 10	 time lapse 1.34112s	 ll change 0.01142
Initialization converged: True	 time lapse 1.83151s	 ll 54.34123


In [52]:
X.shape

(1288, 18000)

In [53]:
accs_dict['scdv_fasttext'], elapsed_times_dict['scdv_fasttext']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.815961	valid_0's binary_logloss: 0.5912
Early stopping, best iteration is:
[43]	valid_0's auc: 0.827215	valid_0's binary_logloss: 0.512622
Accuracy is 0.751937984496124 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.856103	valid_0's binary_logloss: 0.513851
Early stopping, best iteration is:
[39]	valid_0's auc: 0.863746	valid_0's binary_logloss: 0.471718
Accuracy is 0.7751937984496124 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.792549	valid_0's binary_logloss: 0.661613
Early stopping, best iteration is:
[39]	valid_0's auc: 0.779911	valid_0's binary_logloss: 0.571151
Accuracy is 0.7015503875968992 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.854265	valid_0's binary_logloss: 0.519531
Early stopping, best iteration is:
[58]

## SCDV -pretrained fastText -raw

In [54]:
X = get_scdv(roomba_df.wakati_text.values, word_vec=model_fasttext, compress=False)

out of vocabs: 139
Initialization 0




  Iteration 10	 time lapse 1.37528s	 ll change 0.00694
  Iteration 20	 time lapse 1.24726s	 ll change 0.00184
Initialization converged: True	 time lapse 2.74699s	 ll 53.99191


In [55]:
X.shape

(1288, 18000)

In [56]:
accs_dict['scdv_fasttext_raw'], elapsed_times_dict['scdv_fasttext_raw']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.820896	valid_0's binary_logloss: 0.599928
Early stopping, best iteration is:
[44]	valid_0's auc: 0.810123	valid_0's binary_logloss: 0.531217
Accuracy is 0.7325581395348837 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.857667	valid_0's binary_logloss: 0.515166
Early stopping, best iteration is:
[70]	valid_0's auc: 0.856103	valid_0's binary_logloss: 0.478357
Accuracy is 0.7868217054263565 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.789661	valid_0's binary_logloss: 0.674258
Early stopping, best iteration is:
[38]	valid_0's auc: 0.767935	valid_0's binary_logloss: 0.583191
Accuracy is 0.7015503875968992 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.845589	valid_0's binary_logloss: 0.560654
Early stopping, best iteration is:
[

## SCDV -w2v -raw

In [57]:
X = get_scdv(roomba_df.wakati_text.values, word_vec=model_fasttext, compress=False)

out of vocabs: 139
Initialization 0




  Iteration 10	 time lapse 1.31958s	 ll change 0.00499
Initialization converged: True	 time lapse 1.56046s	 ll 55.00013


In [58]:
X.shape

(1288, 18000)

In [59]:
accs_dict['scdv_w2v_raw'], elapsed_times_dict['scdv_w2v_raw']  = train_and_get_oof_accuracies(X, y, params)

Start: fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.854538	valid_0's binary_logloss: 0.516207
Early stopping, best iteration is:
[67]	valid_0's auc: 0.860014	valid_0's binary_logloss: 0.477282
Accuracy is 0.7945736434108527 

Start: fold 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.851709	valid_0's binary_logloss: 0.546184
Early stopping, best iteration is:
[52]	valid_0's auc: 0.836844	valid_0's binary_logloss: 0.509116
Accuracy is 0.7403100775193798 

Start: fold 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.797665	valid_0's binary_logloss: 0.644748
Early stopping, best iteration is:
[35]	valid_0's auc: 0.781054	valid_0's binary_logloss: 0.561
Accuracy is 0.7015503875968992 

Start: fold 4
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.821745	valid_0's binary_logloss: 0.635957
Early stopping, best iteration is:
[23]

In [60]:
results = [accs_dict, elapsed_times_dict]
pd.to_pickle(results, '../data/results_roomba.pkl')