# 精度向上のアプローチ
- パラメータを変える(max features, max len, etc..) -> うまくいかなかった
- 前処理で、embeddingできてない9000単語を読み込ませる ->3600くらいまで減って精度が上がった
- ネットワークを変える(層を増やす)
- 特徴を抽出する(textの長さを変える)
- optimizerを変える, lossを変える
- word2vecの学習をして、vctor化する

best score | 0.6296086


In [165]:
import os, re
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [166]:
train_df = pd.read_csv("../input/train.csv")[:130000]
print("Train shape : ",train_df.shape)

Train shape :  (130000, 3)


In [56]:
train_df.head(3)

Unnamed: 0,qid,question_text,target
37165,07439a6aa2d79f58ea8f,What does the world look like through a Sharin...,0
75838,0ed749144ad0a1890eb5,How does the Facebook or Google case apply wit...,0
9733,01e6a38e49d22aec6cae,How do you go no contact when you have a child...,0


In [167]:
##preprocessing
dic = {
    "what's": "what is",
    "i'm": "i am",
    "isn't":"is not",
    "i've": "i have",
    "you've": "you have",
    "don’t": "do not",
    "i’m": "i am",
    "aren't": "are not",
    "won't":"will not",
    "what’s": "what is",
    "trump's": "trump is",
    "they're": "they are",
    "shouldn't": "should not",
    "haven't": "have not",
    "can’t": "cannot",
    "wouldn't": "would not",
    "he's": "he is",
    "it’s": "it is",
    "quorans": "quoran",
    "wasn't": "was not",
    "today's": "todays",
    "someone's": "someones",
    "india's": "indias",
    "one's": "ones",
    "people's": "peoples",
    "who's": "who is",
    "hasn't": "has not",
    "there's": "there is",
    "brexit": "British exit",
    "couldn't": "could not",
    "doesn’t": "does not",
    "isn’t": "is not",
    "she's": "she is",
    'i’ve': "i have"
}
puncts = ',.":)(-!?|;\'$&/[]>%=#*+\\•~@£·_{}©^®`<→°€™›♥←×§″′Â█½à…“★”–●â►−¢²¬░¶↑±¿▾═¦║―¥▓—‹─▒：¼⊕▼▪†■’▀¨▄♫☆é¯♦¤▲è¸¾Ã⋅‘∞∙）↓、│（»，♪╩╚³・╦╣╔╗▬❤ïØ¹≤‡√'

def analyze(txt):
    txt = txt.lower()
    
    for bad_word in dic:
        if bad_word in txt:
            txt = txt.replace(bad_word, dic[bad_word])
            
    for punct in puncts:
        txt = txt.replace(punct, f' {punct} ')
        
    words = []
    for word in txt.split(' '):
        if (re.compile(r'^.*[0-9]+.*$').fullmatch(word) is not None):  # 数字が含まれるものは分割
            for w in re.findall(r'(\d+|\D+)', word):
                words.append(w)
            continue
        if len(word) < 1:  # 0文字（空文字）は除外
            continue
        words.append(word)

    return " ".join(words)

train_df["question_text"] = train_df["question_text"].map(analyze)

## split to train and val
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2018)

## some config values 
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

## fill up the missing values
train_X = train_df["question_text"].fillna("_na_").values
val_X = val_df["question_text"].fillna("_na_").values

## Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)

## Pad the sentences 
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)

## Get the target values
train_y = train_df['target'].values
val_y = val_df['target'].values

In [168]:
ps = nltk.stem.PorterStemmer()
lc = nltk.stem.lancaster.LancasterStemmer()
sb = nltk.stem.snowball.SnowballStemmer('english')

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
true = 0
false = 0
oov_dic = {}
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: 
        true += 1
        embedding_matrix[i] = embedding_vector
        continue
    word_ = word.lower()
    embedding_vector = embeddings_index.get(word_)
    if embedding_vector is not None: 
        true += 1
        embedding_matrix[i] = embedding_vector
        continue
    word_ = word.upper()
    embedding_vector = embeddings_index.get(word_)
    if embedding_vector is not None: 
        true += 1
        embedding_matrix[i] = embedding_vector
        continue
    word_ = word.capitalize()
    embedding_vector = embeddings_index.get(word_)
    if embedding_vector is not None: 
        true += 1
        embedding_matrix[i] = embedding_vector
        continue
    word_ = ps.stem(word)
    embedding_vector = embeddings_index.get(word_)
    if embedding_vector is not None: 
        true += 1
        embedding_matrix[i] = embedding_vector
        continue
    word_ = sb.stem(word)
    embedding_vector = embeddings_index.get(word_)
    if embedding_vector is not None: 
        true += 1
        embedding_matrix[i] = embedding_vector
        continue
    word_ = lc.stem(word)
    embedding_vector = embeddings_index.get(word_)
    if embedding_vector is not None: 
        true += 1
        embedding_matrix[i] = embedding_vector
        continue
    else:
        oov_dic[word] = tokenizer.word_counts[word]
        false += 1

In [169]:
#43111 6888 0.8622372447448949
print(true, false, true / (true + false))

46535 3464 0.9307186143722874


In [25]:
EMBEDDING_FILE = '../input/glove.840B.300d/glove.840B.300d.txt'
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore'))

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

In [54]:
ps = nltk.stem.PorterStemmer()
lc = nltk.stem.lancaster.LancasterStemmer()
sb = nltk.stem.snowball.SnowballStemmer('english')

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
true = 0
false = 0
oov_dic = {}
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: 
        true += 1
        embedding_matrix[i] = embedding_vector
        continue
    word_ = word.lower()
    embedding_vector = embeddings_index.get(word_)
    if embedding_vector is not None: 
        true += 1
        embedding_matrix[i] = embedding_vector
        continue
    word_ = word.upper()
    embedding_vector = embeddings_index.get(word_)
    if embedding_vector is not None: 
        true += 1
        embedding_matrix[i] = embedding_vector
        continue
    word_ = word.capitalize()
    embedding_vector = embeddings_index.get(word_)
    if embedding_vector is not None: 
        true += 1
        embedding_matrix[i] = embedding_vector
        continue
    word_ = ps.stem(word)
    embedding_vector = embeddings_index.get(word_)
    if embedding_vector is not None: 
        true += 1
        embedding_matrix[i] = embedding_vector
        continue
    word_ = sb.stem(word)
    embedding_vector = embeddings_index.get(word_)
    if embedding_vector is not None: 
        true += 1
        embedding_matrix[i] = embedding_vector
        continue
    word_ = lc.stem(word)
    embedding_vector = embeddings_index.get(word_)
    if embedding_vector is not None: 
        true += 1
        embedding_matrix[i] = embedding_vector
        continue
    else:
        oov_dic[word] = tokenizer.word_counts[word]
        false += 1

In [170]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_8 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 128)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 16)                2064      
_________________________________________________________________
dropout_8 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 17        
Total para

In [171]:
model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y))

Train on 117000 samples, validate on 13000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa8fcc0ec18>

In [172]:
pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
f1_scores = []
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    score = metrics.f1_score(val_y, (pred_glove_val_y>thresh).astype(int))
    f1_scores.append(score)
    print("F1 score at threshold {0} is {1}".format(thresh, score))
print("best is {}".format(np.max(f1_scores)))

F1 score at threshold 0.1 is 0.5722402597402597
F1 score at threshold 0.11 is 0.5803497085761865
F1 score at threshold 0.12 is 0.5858414582450191
F1 score at threshold 0.13 is 0.5926565874730021
F1 score at threshold 0.14 is 0.5985081175954367
F1 score at threshold 0.15 is 0.5999101930848675
F1 score at threshold 0.16 is 0.6029143897996357
F1 score at threshold 0.17 is 0.6048014773776547
F1 score at threshold 0.18 is 0.6062587575899112
F1 score at threshold 0.19 is 0.6111375535459305
F1 score at threshold 0.2 is 0.6160541586073501
F1 score at threshold 0.21 is 0.6202158979391561
F1 score at threshold 0.22 is 0.6193548387096774
F1 score at threshold 0.23 is 0.624248496993988
F1 score at threshold 0.24 is 0.6275303643724696
F1 score at threshold 0.25 is 0.6294779938587514
F1 score at threshold 0.26 is 0.6308169596690797
F1 score at threshold 0.27 is 0.6330036439354504
F1 score at threshold 0.28 is 0.6330708661417324
F1 score at threshold 0.29 is 0.6366525423728813
F1 score at threshold 0

In [14]:
del word_index, embeddings_index, all_embs, embedding_matrix, model, inp, x
import gc; gc.collect()
time.sleep(10)

In [15]:
EMBEDDING_FILE = '../input/wiki-news-300d-1M/wiki-news-300d-1M.vec'
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
        
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y))

Train on 117000 samples, validate on 13000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa90ef890b8>

In [17]:
pred_fasttext_val_y = model.predict([val_X], batch_size=1024, verbose=1)
f1_scores = []
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    score = metrics.f1_score(val_y, (pred_fasttext_val_y>thresh).astype(int))
    f1_scores.append(score)
    print("F1 score at threshold {0} is {1}".format(thresh, score))
print("best is {}".format(np.max(f1_scores)))

F1 score at threshold 0.1 is 0.5333333333333334
F1 score at threshold 0.11 is 0.544535200311163
F1 score at threshold 0.12 is 0.5495029821073558
F1 score at threshold 0.13 is 0.5575364667747164
F1 score at threshold 0.14 is 0.5632989690721649
F1 score at threshold 0.15 is 0.5692695214105794
F1 score at threshold 0.16 is 0.5731292517006803
F1 score at threshold 0.17 is 0.5717981888745148
F1 score at threshold 0.18 is 0.5764345159877354
F1 score at threshold 0.19 is 0.5770084332001776
F1 score at threshold 0.2 is 0.5800089645898701
F1 score at threshold 0.21 is 0.5817023213472918
F1 score at threshold 0.22 is 0.5852534562211982
F1 score at threshold 0.23 is 0.588235294117647
F1 score at threshold 0.24 is 0.5900094250706881
F1 score at threshold 0.25 is 0.5920990004759638
F1 score at threshold 0.26 is 0.5937349397590361
F1 score at threshold 0.27 is 0.5949367088607596
F1 score at threshold 0.28 is 0.5990147783251232
F1 score at threshold 0.29 is 0.5991058122205662
F1 score at threshold 0.

In [18]:
del word_index, embeddings_index, all_embs, embedding_matrix, model, inp, x
import gc; gc.collect()
time.sleep(10)

In [19]:
EMBEDDING_FILE = '../input/paragram_300_sl999/paragram_300_sl999.txt'
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
        
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [20]:
model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y))

Train on 117000 samples, validate on 13000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa91d953128>

In [21]:
pred_paragram_val_y = model.predict([val_X], batch_size=1024, verbose=1)
f1_scores = []
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    score = metrics.f1_score(val_y, (pred_paragram_val_y>thresh).astype(int))
    f1_scores.append(score)
    print("F1 score at threshold {0} is {1}".format(thresh, score))
print("best is {}".format(np.max(f1_scores)))

F1 score at threshold 0.1 is 0.5325355272999253
F1 score at threshold 0.11 is 0.5415549597855228
F1 score at threshold 0.12 is 0.5472946671856753
F1 score at threshold 0.13 is 0.55577610162763
F1 score at threshold 0.14 is 0.5617159044921085
F1 score at threshold 0.15 is 0.5679012345679013
F1 score at threshold 0.16 is 0.5721476510067114
F1 score at threshold 0.17 is 0.5765458422174841
F1 score at threshold 0.18 is 0.5838401390095569
F1 score at threshold 0.19 is 0.5869373345101501
F1 score at threshold 0.2 is 0.5882878855610193
F1 score at threshold 0.21 is 0.5924579736483416
F1 score at threshold 0.22 is 0.5951398441082072
F1 score at threshold 0.23 is 0.6014897579143389
F1 score at threshold 0.24 is 0.6029203956665096
F1 score at threshold 0.25 is 0.6038991916310034
F1 score at threshold 0.26 is 0.6075216972034716
F1 score at threshold 0.27 is 0.6108663729809105
F1 score at threshold 0.28 is 0.6126305320735952
F1 score at threshold 0.29 is 0.6130653266331657
F1 score at threshold 0.

In [22]:
del word_index, embeddings_index, all_embs, embedding_matrix, model, inp, x
import gc; gc.collect()
time.sleep(10)

In [23]:
pred_val_y = 0.33*pred_glove_val_y + 0.33*pred_fasttext_val_y + 0.34*pred_paragram_val_y
f1_scores = []
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    score = metrics.f1_score(val_y, (pred_val_y>thresh).astype(int))
    f1_scores.append(score)
    print("F1 score at threshold {0} is {1}".format(thresh, score))
print("best is {}".format(np.max(f1_scores)))

F1 score at threshold 0.1 is 0.5489591364687741
F1 score at threshold 0.11 is 0.5562130177514792
F1 score at threshold 0.12 is 0.5656401944894651
F1 score at threshold 0.13 is 0.572851805728518
F1 score at threshold 0.14 is 0.576027107157984
F1 score at threshold 0.15 is 0.5821152192605331
F1 score at threshold 0.16 is 0.5881838074398249
F1 score at threshold 0.17 is 0.5899152164212405
F1 score at threshold 0.18 is 0.5950338600451467
F1 score at threshold 0.19 is 0.5979760809567618
F1 score at threshold 0.2 is 0.6019598693420438
F1 score at threshold 0.21 is 0.6060606060606061
F1 score at threshold 0.22 is 0.6070226070226071
F1 score at threshold 0.23 is 0.6107317073170732
F1 score at threshold 0.24 is 0.6132542037586548
F1 score at threshold 0.25 is 0.616
F1 score at threshold 0.26 is 0.6167088607594937
F1 score at threshold 0.27 is 0.6193152784874808
F1 score at threshold 0.28 is 0.6214396685655101
F1 score at threshold 0.29 is 0.6227608008429927
F1 score at threshold 0.3 is 0.624733

## References
https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings