## Quora Insincere Questions Challenge

https://www.kaggle.com/c/quora-insincere-questions-classification

In [None]:
#method to install packages directly in jupyterlab 
import sys
!{sys.executable} -m pip install gensim

### Loading and Exploring

In [1]:
import pandas as pd

df = pd.read_csv('data/train.csv')

In [2]:
df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [3]:
vc = df.target.value_counts()

In [4]:
print(vc) 
print(round(vc.values[1] / vc.values[0] * 100,2),'% positive cases.')

0    1225312
1      80810
Name: target, dtype: int64
6.6 % positive cases.


In [5]:
df_pos = df[df.target == 1]
df_neg = df[df.target == 0]

In [6]:
df_pos.question_text.values[20:30]

array(['Why did Pakistan pass the Sikh Marriage Act before India?',
       'How do I train my dogs to kill raccoons?',
       'Is there any woman (other than Muslim) who is happy after marrying a Muslim guy, as there are a lot of answers on Quora about love jihad, and all of them suffered a lot after marrying a Muslim guy?',
       'Do moms have sex with their sons?',
       'Why did the Pope say that "Muslims are a curse to this world"?',
       'How peoples in India seems hypocritical about womans empowerment?',
       'How much more political fumbling will it take for Republicans to turn on Trump?',
       'Why do Americans prefer just to not show up somewhere rather than face even the mildest of conflicts? (My colleagues and I have noted a growing number of people who skip any face to face meetings without any explanation at all.)',
       'Liberals are pro-choice, but the choices must be approved by them first. Is this an accurate description of liberals?',
       'Why are South I

In [7]:
df_neg.question_text.values[:10]

array(['How did Quebec nationalists see their province as a nation in the 1960s?',
       'Do you have an adopted dog, how would you encourage people to adopt and not shop?',
       'Why does velocity affect time? Does velocity affect space geometry?',
       'How did Otto von Guericke used the Magdeburg hemispheres?',
       'Can I convert montra helicon D to a mountain bike by just changing the tyres?',
       'Is Gaza slowly becoming Auschwitz, Dachau or Treblinka for Palestinians?',
       'Why does Quora automatically ban conservative opinions when reported, but does not do the same for liberal views?',
       'Is it crazy if I wash or wipe my groceries off? Germs are everywhere.',
       'Is there such a thing as dressing moderately, and if so, how is that different than dressing modestly?',
       'Is it just me or have you ever been in this phase wherein you became ignorant to the people you once loved, completely disregarding their feelings/lives so you get to have something g

In [8]:
len(df)

1306122

In [9]:
df.isnull().sum()

qid              0
question_text    0
target           0
dtype: int64

### Possible Features Brainstorming

- certain signal words (topics: racist / sexual / politically incorrect content, statements rather than questions)
- sentiments 
- sentence length
- grammar errors
- right punctuation 

### Getting Started with Spacy

- Removing Stopwords 
- Lemmatizing

In [None]:
#Imports 
import spacy 
from spacy.lang.en import English
nlp = English()

#Doc
doc = nlp('''In what manner has Republican backing of "states rights" been hypocritical
and what ways have they actually restricted the ability of states to make their own laws?''')

In [None]:
doc[2:5]

#### Stopwords

In [None]:
from spacy.lang.en import STOP_WORDS

stopwords = list(STOP_WORDS)
print(len(stopwords))
stopwords[:5]

In [None]:
for word in doc: 
    if word.is_stop:
        print(word.text, end=" | ")

#### Lemmas

In [None]:
for word in doc: 
    print(word.text, '--> Lemma:', word.lemma_, end=' | ')

#### Punctuation 


In [None]:
import string

punctuations = string.punctuation
punctuations

### Baseline Model 

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix

#### Count Vectorizing

In [None]:
sample = 'Why are we gonna do Obamas action?'

In [None]:
def spacy_tokenizer(sentence):
    """Tokenize `sentence` with the notebook-level `nlp` pipeline and clean the tokens.

    Every token is replaced by its lower-cased, stripped lemma; tokens whose lemma
    is the "-PRON-" placeholder keep their lower-cased surface form instead.
    Entries contained in the notebook-level `stopwords` or `punctuations` are dropped.

    Returns:
        list of str: the remaining normalized tokens, in document order.
    """
    kept = []
    for token in nlp(sentence):
        if token.lemma_ == "-PRON-":
            normalized = token.lower_
        else:
            normalized = token.lemma_.lower().strip()
        # Skip anything listed as a stop word or found in `punctuations`.
        if normalized in stopwords or normalized in punctuations:
            continue
        kept.append(normalized)
    return kept

In [None]:
tokens = spacy_tokenizer(sample)
tokens

In [None]:
# Vectorization
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1)) 
classifier = LinearSVC()

In [None]:
#Cut df
# Draw a reproducible random subset of 400,000 questions. DataFrame.sample needs
# no extra import (the original called random.sample without importing `random`)
# and, unlike range(1, len(df)), it can also select the first row.
df_cut = df.sample(n=400000, random_state=42)

# Features and Labels
X = df_cut['question_text']
y = df_cut['target']

In [None]:
# Splitting Data Set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [None]:
print('Train: ', len(X_train), '| Val: ', len(X_val), '| Test: ', len(X_test))

In [None]:
# Create the  pipeline tokenize, vectorize, and classify 
pipe_countv = Pipeline([('vectorizer', vectorizer),
                        ('classifier', classifier)])

In [None]:
# Fit our data
pipe_countv.fit(X_train,y_train)

In [None]:
X_val_pred = pipe_countv.predict(X_val)

In [None]:
pd.Series(X_val_pred).value_counts()

In [None]:
print("F1-Score: ", f1_score(y_val,X_val_pred))

#### TFIDF Vectorizing

In [11]:
# Features and Labels
X = df['question_text']
y = df['target']

In [12]:
# Splitting Data Set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [13]:
print('Train: ', len(X_train), '| Val: ', len(X_val), '| Test: ', len(X_test))

Train:  783672 | Val:  261225 | Test:  261225


In [None]:
X_train.tail(5)

In [None]:
#Example 
text = X_train.tail(5) 
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf_vectorizer.fit(text)
print(tfidf_vectorizer.vocabulary_)
print(tfidf_vectorizer.idf_)

vector = tfidf_vectorizer.transform(text)

print(vector.shape)
# Row 0 is the first question (the original printed row 1 under this label).
print('First Question Representation:', vector.toarray()[0])

In [None]:
tfidfvectorizer = TfidfVectorizer(ngram_range=(1,3))
# Fresh estimator: reusing the `classifier` object from the CountVectorizer
# pipeline would refit (and thereby overwrite) the model inside `pipe_countv`,
# and would make this cell depend on that earlier cell having been run.
classifier = LinearSVC()

pipe_tfidf = Pipeline([('vectorizer', tfidfvectorizer),
                        ('classifier', classifier)])

In [None]:
# Fit our data
pipe_tfidf.fit(X_train,y_train)

In [None]:
X_val_pred = pipe_tfidf.predict(X_val)

In [None]:
pd.Series(X_val_pred).value_counts()

In [None]:
print("F1-Score: ", f1_score(y_val,X_val_pred))

In [None]:
confusion_matrix(y_val,X_val_pred)

#### Sample Questions

In [None]:
pd.set_option('display.max_colwidth', -1)
df_pos.head(10)['question_text']

In [None]:
#pos/neg
sample_questions_pos = df_pos.head(50)['question_text']
# The negative sample is used by a later predict cell but was never defined.
sample_questions_neg = df_neg.head(50)['question_text']

In [None]:
pipe_tfidf.predict(sample_questions_pos)

In [None]:
pipe_tfidf.predict(sample_questions_neg)

#### Improving TFIDF - Idea 1: Keeping 3-grams & stopwords, but lemmatizing. 

Useful link for writing custom transformers: https://www.oreilly.com/library/view/applied-text-analysis/9781491963036/ch04.html

In [None]:
import spacy 
from spacy.lang.en import English
from sklearn.base import BaseEstimator, TransformerMixin

class Lemmatizer(TransformerMixin, BaseEstimator):
    """Pipeline step that rewrites each raw text as its space-joined spaCy lemmas.

    Inheriting from BaseEstimator provides get_params/set_params, which
    scikit-learn utilities such as clone() and grid search expect from every
    pipeline step. The transformer is stateless: fit() learns nothing.
    """

    def __init__(self):
        # Blank English pipeline; built once and reused for every document.
        self.nlp = English()

    def lemmatize(self, document):
        """Return the lemmas of an already tokenized spaCy document, joined by spaces."""
        return " ".join(word.lemma_ for word in document)

    def fit(self, X, y=None):
        """No-op required by the transformer API; returns self."""
        return self

    def transform(self, documents):
        """Lemmatize every raw text in `documents`.

        Returns:
            list of str, one lemmatized string per input document. A list (rather
            than the one-shot generator used before) can be iterated repeatedly
            and supports len(), while still working in a plain for-loop.
        """
        return [self.lemmatize(self.nlp(doc)) for doc in documents]

In [None]:
#Test
my_lemmatizer = Lemmatizer() 
for obj in my_lemmatizer.transform(["Why don't you start working?", "What did you do with the drunken sailors?"]): 
    print(obj)

In [None]:
for obj in my_lemmatizer.transform(X_train.head(5)): 
    print(obj)

In [None]:
lemmatizer = Lemmatizer() 
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3))
# Fresh classifier so fitting this pipeline does not overwrite a model that is
# shared with another pipeline.
classifier = LinearSVC()

# Use the vectorizer created in this cell (the original referenced the
# differently spelled `tfidfvectorizer` object belonging to another pipeline).
pipe_tfidf_1 = Pipeline([('lemmatizer', lemmatizer),
                       ('vectorizer', tfidf_vectorizer),
                        ('classifier', classifier)])

In [None]:
# Fit our data
pipe_tfidf_1.fit(X_train,y_train)

In [None]:
X_val_pred_1 = pipe_tfidf_1.predict(X_val)

In [None]:
pd.Series(X_val_pred_1).value_counts()

In [None]:
print("F1-Score: ", f1_score(y_val,X_val_pred_1))

In [None]:
confusion_matrix(y_val,X_val_pred_1)

#### Improving TFIDF - Idea 2: Introducing 4-grams

In [None]:
tfidfvectorizer = TfidfVectorizer(ngram_range=(1,4))
classifier = LinearSVC()

pipe_tfidf_2 = Pipeline([('vectorizer', tfidfvectorizer),
                        ('classifier', classifier)])

In [None]:
# Fit our data
pipe_tfidf_2.fit(X_train,y_train)

In [None]:
X_val_pred_2 = pipe_tfidf_2.predict(X_val)

In [None]:
pd.Series(X_val_pred_2).value_counts()

In [None]:
print("F1-Score: ", f1_score(y_val,X_val_pred_2))

In [None]:
confusion_matrix(y_val,X_val_pred_2)

#### Improving TFIDF - Idea 3: 3 grams without stopwords 

[Sklearn](https://scikit-learn.org/stable/modules/feature_extraction.html#stop-words): "If a string, it is passed to _check_stop_list and the appropriate stop list is returned. ‘english’ is currently the only supported string value. There are several known issues with ‘english’ and you should consider an alternative (see Using stop words)."

In [None]:
tfidfvectorizer = TfidfVectorizer(ngram_range=(1,3), stop_words='english')
classifier = LinearSVC()

pipe_tfidf_3 = Pipeline([('vectorizer', tfidfvectorizer),
                        ('classifier', classifier)])

In [None]:
# Fit our data
pipe_tfidf_3.fit(X_train,y_train)

In [None]:
X_val_pred_3 = pipe_tfidf_3.predict(X_val)

In [None]:
# Count the predicted labels (the original wrapped the pipeline object itself).
pd.Series(X_val_pred_3).value_counts()

In [None]:
print("F1-Score: ", f1_score(y_val,X_val_pred_3))

In [None]:
confusion_matrix(y_val,X_val_pred_3)

**Custom Stopwords?**

Manually edited, see text files.

In [14]:
stopwords = ['the','be','to','of','and','a','an']#,'in','that','it','for','on','with','as','at','this','but','by','from','or','an']

In [15]:
tfidfvectorizer = TfidfVectorizer(ngram_range=(1,4), stop_words=stopwords)
classifier = LinearSVC()

pipe_tfidf_3 = Pipeline([('vectorizer', tfidfvectorizer),
                        ('classifier', classifier)])

In [10]:
# Fit our data
pipe_tfidf_3.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=Tr...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [11]:
X_val_pred_3 = pipe_tfidf_3.predict(X_val)

In [12]:
pd.Series(X_val_pred_3).value_counts()

0    241928
1     19297
dtype: int64

In [13]:
print("F1-Score: ", f1_score(y_val, X_val_pred_3))

F1-Score:  0.6354939752236364


In [14]:
confusion_matrix(y_val,X_val_pred_3)

array([[237048,   8037],
       [  4880,  11260]])

#### --> Current Baseline: TFIDF with custom stopwords & 4-grams

In [None]:
#currently skipped, model only trained on trainset 
pipe_tfidf_3.fit(X,y)

In [16]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(pipe_tfidf_3, 'model_tfidf_4g_train.joblib')

['model_tfidf_4g_train.joblib']

**Export Model to Cloud Storage** <br>
gsutil cp ./model_tfidf_4g_train.joblib gs://green-wares-224816/model.joblib

#### Misses

In [None]:
fp = [] #false positives (predicted 1, label 0)
fn = [] #false negatives (predicted 0, label 1)
c = 0
# The loop variable is named `question` so it does not overwrite the
# notebook-level `sample` string used by other cells.
for question, prediction, label in zip(X_val, X_val_pred_3, y_val):
    if c > 1000: break
    if prediction != label:
        if prediction == 0: 
            fn.append(question)
        else: 
            fp.append(question)
        c += 1

In [None]:
print('FALSE POSITIVES (predicted 1, got 0)')
fp[100:200]

In [None]:
print('FALSE NEGATIVES (predicted 0, got 1)')
fn[100:150]

#### Improving TFIDF - Idea 4: Extra Features -- Sentence Length, Grammar Errors, Part-of-speech tagging? Keeping numbers?

In [None]:
#

#### Improving TFIDF - Idea 5: Named Entities?

e.g. White House vs. Whites

In [None]:
#

#### Classifying via Doc2Vec

https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4

**Step 1: Building Model**

In [None]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument

In [None]:
# Features and Labels
X = df['question_text']
y = df['target']

In [None]:
# Splitting Data Set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [None]:
print('Train: ', len(X_train), '| Val: ', len(X_val), '| Test: ', len(X_test))

In [None]:
sample = 'How did Quebec nationalists see their province as a nation in the 1960s?'

In [None]:
from gensim.parsing.preprocessing import preprocess_string,strip_multiple_whitespaces,strip_punctuation

CUSTOM_FILTERS = [lambda x: x.lower(), strip_multiple_whitespaces,strip_punctuation]
preprocess_string(sample, CUSTOM_FILTERS)

In [None]:
#to be considered: throws away years too
def read_corpus(X_train): 
    """Yield one gensim TaggedDocument per entry of the pandas Series `X_train`.

    Each text is passed through `preprocess_string` with CUSTOM_FILTERS
    (lower-casing, whitespace collapsing, punctuation stripping) and tagged with
    its index label in the Series.
    """
    CUSTOM_FILTERS = [lambda x: x.lower(), strip_multiple_whitespaces,strip_punctuation]
    # Series.items() is the supported spelling; Series.iteritems() was removed in pandas 2.0.
    for i,line in X_train.items():
        yield gensim.models.doc2vec.TaggedDocument(preprocess_string(line, CUSTOM_FILTERS), [i])

In [None]:
train_corpus = list(read_corpus(X))

In [None]:
train_corpus[:10]

In [None]:
import multiprocessing

cores = multiprocessing.cpu_count()
cores

In [None]:
len(train_corpus)

In [None]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)

model_dbow.build_vocab(train_corpus)

In [None]:
# Train in a single call and let gensim decay the learning rate from `alpha` to
# `min_alpha` on its own. The previous loop subtracted 0.002 from `alpha` on each
# of 30 passes; starting from gensim's default alpha of 0.025 this makes the
# learning rate negative after the 13th pass, which corrupts the learned vectors.
model_dbow.train(utils.shuffle(train_corpus), total_examples=len(train_corpus), epochs=30)

In [None]:
model_dbow

**Step 2: Generating new dataset**

In [None]:
sample_vector = model_dbow.infer_vector(['Has', 'Putin', 'become', 'more', 'powerful', 'under', 'Trump'])
print(len(sample_vector))
sample_vector[:10]

**Remark**: numbers are currently thrown away by gensim (e.g. "1960s" vanishes).

In [None]:
#Option 1
def construct_vectors(df):
    """Yield one row `[t[1], t[3], v_1, ..., v_d]` per row of `df`.

    `t[1]` and `t[3]` are the first and third columns of the frame (qid and
    target for the training frame loaded in this notebook — verify if `df`
    changes) and `v_1..v_d` is the vector inferred by the notebook-level
    `model_dbow` for the text in the second column.

    The text is tokenized with the same filters that `read_corpus` uses to build
    the training corpus; `gensim.utils.simple_preprocess`, used here before,
    tokenizes differently (for example it drops numbers), so the model was asked
    to embed tokens that did not match its training vocabulary.
    """
    filters = [lambda x: x.lower(), strip_multiple_whitespaces, strip_punctuation]
    for t in df.itertuples():
        tokens = preprocess_string(t[2], filters)
        yield [t[1],t[3]] + list(model_dbow.infer_vector(tokens))

In [None]:
#shape --> [[qid,target,300d vector], ...]
df_vectors_new = list(construct_vectors(df))

In [None]:
len(df_vectors_new)

In [None]:
df_vectors_df = pd.DataFrame(df_vectors_new)

In [None]:
df_vectors_df.to_csv('data/df_vectors_df.csv', index=False)
#df_vectors_df = pd.read_csv('data/df_vectors_df.csv')

**Step 3: Predicting on new dataset**

In [7]:
import pandas as pd

X = pd.read_csv("data/df_vectors_df.csv", usecols = list(range(2,302)))
X.head()

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,292,293,294,295,296,297,298,299,300,301
0,0.060174,-0.019781,-0.021112,0.070328,-0.015423,-0.013999,-0.041823,-0.049703,-0.080816,-0.078015,...,-0.057844,0.021232,0.078791,-0.093427,-0.061685,-0.051947,-0.073295,0.008427,0.11126,0.168297
1,-0.015748,0.048063,-0.002865,-0.092086,-0.010734,0.010626,0.112335,0.060905,0.063989,0.171241,...,0.065685,-0.006349,-0.129088,0.154547,0.005711,0.100574,0.031654,-0.013652,-0.127452,-0.185372
2,0.051672,-0.012536,-0.041117,0.053704,0.017838,0.054164,-0.086351,-0.09573,-0.024896,-0.149535,...,-0.079134,-0.028452,0.053153,-0.144016,0.046623,-0.048005,-0.019915,0.00719,0.109115,0.091008
3,-0.021526,0.039649,0.000481,-0.053885,-0.033144,-0.016723,0.14144,0.067835,0.018965,0.143157,...,0.103638,-0.015355,-0.091295,0.115877,0.010513,0.105494,0.031881,-0.015946,-0.116619,-0.135234
4,0.034355,-0.039514,-0.089744,0.092465,0.031383,0.013943,-0.087753,-0.089349,-0.046415,-0.112365,...,-0.093137,0.00519,0.061923,-0.104256,-0.019084,-0.044934,-0.044734,0.070037,0.149715,0.143461


In [8]:
# Column 1 of the exported file holds the target (closing parenthesis was missing).
y = pd.read_csv("data/df_vectors_df.csv", usecols = [1])

In [9]:
# Splitting Data Set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [10]:
print('Train: ', len(X_train), '| Val: ', len(X_val), '| Test: ', len(X_test))

Train:  783672 | Val:  261225 | Test:  261225


In [23]:
y_train['1'].value_counts()

0    734858
1     48814
Name: 1, dtype: int64

In [12]:
# Fit our data
from sklearn.svm import LinearSVC

classifier = LinearSVC()
# y_train is a one-column frame here; flatten it to the 1-D label array that
# scikit-learn expects (avoids the column_or_1d conversion warning).
classifier.fit(X_train, y_train.values.ravel())

  y = column_or_1d(y, warn=True)


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [25]:
X_val_pred_doc2vec = classifier.predict(X_val)

In [26]:
pd.Series(X_val_pred_doc2vec).value_counts()

0    261225
dtype: int64

Ideas concerning Doc2Vec: 
1. Inferring a vector on every training example
2. Inferring a vector on multiple bags of training examples?

### Embedding & Neural Network Implementation

I was trying to clean some of my code so I can add more models. However, this can never happen without the awesome kernels from other talented Kagglers. Forgive me if I missed any.

* CLR from: https://www.kaggle.com/hireme/fun-api-keras-f1-metric-cyclical-learning-rate/code
* Based on SRK's kernel: https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings
* Vladimir Demidov's 2DCNN textClassifier: https://www.kaggle.com/yekenot/2dcnn-textclassifier
* Attention layer from Khoi Ngyuen: https://www.kaggle.com/suicaokhoailang/lstm-attention-baseline-0-652-lb
* LSTM model from Strideradu: https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go
* https://www.kaggle.com/danofer/different-embeddings-with-attention-fork
* https://www.kaggle.com/ryanzhang/tfidf-naivebayes-logreg-baseline
* Borrowed some idea from this model: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/52644
* Sentence length seems to be a good feature: https://www.kaggle.com/thebrownviking20/analyzing-quora-for-the-insinceres

Some new things here:

* Take average of embeddings (Unweighted DME) instead of blending predictions: https://arxiv.org/pdf/1804.07983.pdf
* The original paper of this idea comes from: Frustratingly Easy Meta-Embedding – Computing Meta-Embeddings by Averaging Source Word Embeddings
* Modified the code to choose best threshold
* Robust method for blending weights: sort the val score and give the final weight

Some thoughts:

* Although I published a kernel on Transformer, I will not use it
* Too much randomness in CuDNN. You may get different results by just rerunning this kernel
* Blending rocks

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("data/"))

# Any results you write to the current directory are saved as output.

In [None]:
## some config values 
embed_size = 300 # how big is each word vector
max_features = 95000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 70 # max number of words in a question to use

**Load packages and data**

In [None]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import concatenate
from keras.callbacks import *

In [None]:
def load_and_prec():
    """Load the train/test CSVs and turn the question texts into padded index sequences.

    Steps: read "../input/train.csv" and "../input/test.csv", replace missing
    question texts by "_##_", fit a Keras Tokenizer (limited to `max_features`
    words) on the training texts only, convert both sets to integer sequences
    padded to `maxlen`, then shuffle the training rows with a permutation drawn
    after seeding NumPy's global RNG with 2018.

    Returns:
        (train_X, test_X, train_y, word_index) where word_index is the fitted
        tokenizer's word -> index mapping.
    """
    raw_train = pd.read_csv("../input/train.csv")
    raw_test = pd.read_csv("../input/test.csv")
    print("Train shape : ", raw_train.shape)
    print("Test shape : ", raw_test.shape)

    # Missing questions become a placeholder token instead of NaN.
    train_texts = raw_train["question_text"].fillna("_##_").values
    test_texts = raw_test["question_text"].fillna("_##_").values

    # Vocabulary is learned from the training texts only.
    tok = Tokenizer(num_words=max_features)
    tok.fit_on_texts(list(train_texts))

    # Index sequences, padded to a common length.
    train_seq = pad_sequences(tok.texts_to_sequences(train_texts), maxlen=maxlen)
    test_seq = pad_sequences(tok.texts_to_sequences(test_texts), maxlen=maxlen)

    labels = raw_train['target'].values

    # Fixed seed so the shuffle order is the same on every run.
    np.random.seed(2018)
    order = np.random.permutation(len(train_seq))

    return train_seq[order], test_seq, labels[order], tok.word_index

**Load embeddings**

In [None]:
def load_glove(word_index, embedding_file=None, num_words=None):
    """Build an embedding matrix for `word_index` from a GloVe text file.

    Args:
        word_index: mapping word -> integer row index (e.g. `tokenizer.word_index`).
        embedding_file: path of the embedding file, one "word v1 v2 ..." entry per
            line. Defaults to the Kaggle GloVe 840B/300d location.
        num_words: upper bound on the number of rows. Defaults to the
            notebook-level `max_features`.

    Returns:
        Array of shape (min(num_words, len(word_index)), vector size). Row i holds
        the pretrained vector of the word with index i; all other rows keep a
        random draw from a normal distribution with the mean and standard
        deviation of all pretrained values.
    """
    if embedding_file is None:
        embedding_file = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    if num_words is None:
        num_words = max_features

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Context manager so the file handle is always closed.
    with open(embedding_file, encoding='utf8') as f:
        embeddings_index = dict(get_coefs(*line.rstrip("\n").split(" ")) for line in f)

    # np.stack needs a real sequence; a dict view is rejected by current NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(num_words, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        # Guard on the real row count: with a vocabulary smaller than num_words
        # the largest index would otherwise fall outside the matrix.
        if i >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
            
    return embedding_matrix 
    
def load_fasttext(word_index, embedding_file=None, num_words=None):
    """Build an embedding matrix for `word_index` from a fastText .vec text file.

    Args:
        word_index: mapping word -> integer row index (e.g. `tokenizer.word_index`).
        embedding_file: path of the embedding file, one "word v1 v2 ..." entry per
            line. Defaults to the Kaggle wiki-news-300d-1M location.
        num_words: upper bound on the number of rows. Defaults to the
            notebook-level `max_features`.

    Returns:
        Array of shape (min(num_words, len(word_index)), vector size). Row i holds
        the pretrained vector of the word with index i; all other rows keep a
        random draw from a normal distribution with the mean and standard
        deviation of all pretrained values.
    """
    if embedding_file is None:
        embedding_file = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    if num_words is None:
        num_words = max_features

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Context manager so the file handle is always closed. Lines of at most 100
    # characters (such as a short header line) are skipped.
    with open(embedding_file, encoding='utf8') as f:
        embeddings_index = dict(get_coefs(*line.rstrip("\n").split(" ")) for line in f if len(line)>100)

    # np.stack needs a real sequence; a dict view is rejected by current NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(num_words, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        # Guard on the real row count: with a vocabulary smaller than num_words
        # the largest index would otherwise fall outside the matrix.
        if i >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector

    return embedding_matrix

def load_para(word_index, embedding_file=None, num_words=None):
    """Build an embedding matrix for `word_index` from the Paragram text file.

    Args:
        word_index: mapping word -> integer row index (e.g. `tokenizer.word_index`).
        embedding_file: path of the embedding file, one "word v1 v2 ..." entry per
            line. Defaults to the Kaggle paragram_300_sl999 location.
        num_words: upper bound on the number of rows. Defaults to the
            notebook-level `max_features`.

    Returns:
        Array of shape (min(num_words, len(word_index)), vector size). Row i holds
        the pretrained vector of the word with index i; all other rows keep a
        random draw from a normal distribution with the mean and standard
        deviation of all pretrained values.
    """
    if embedding_file is None:
        embedding_file = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    if num_words is None:
        num_words = max_features

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Context manager so the file handle is always closed. Undecodable bytes are
    # dropped (errors='ignore') and lines of at most 100 characters are skipped.
    with open(embedding_file, encoding="utf8", errors='ignore') as f:
        embeddings_index = dict(get_coefs(*line.rstrip("\n").split(" ")) for line in f if len(line)>100)

    # np.stack needs a real sequence; a dict view is rejected by current NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(num_words, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        # Guard on the real row count: with a vocabulary smaller than num_words
        # the largest index would otherwise fall outside the matrix.
        if i >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        # Fixed typo: the original compared against the undefined name `aNone`.
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    
    return embedding_matrix

**Attention layer**

In [None]:
# https://www.kaggle.com/suicaokhoailang/lstm-attention-baseline-0-652-lb

class Attention(Layer):
    """Attention pooling over the step axis of a rank-3 input.

    For every step a scalar score is computed as tanh(x . W + b); the
    exponentiated scores are (optionally) masked, normalized over the steps and
    used as weights for a sum of the inputs over axis 1.

    Arguments:
        step_dim: number of steps the scores are reshaped to. Presumably this has
            to equal input_shape[1] (the bias is created with that length) —
            confirm against callers.
        W_regularizer, b_regularizer: regularizers for the weight vector / bias,
            resolved through `regularizers.get`.
        W_constraint, b_constraint: constraints for the weight vector / bias,
            resolved through `constraints.get`.
        bias: whether a per-step bias is added to the scores.
    """
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Placeholder; the real value is set in build() from the input shape.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weight vector W (and the bias b if enabled) for a rank-3 input shape."""
        assert len(input_shape) == 3

        # One weight per feature (last axis of the input).
        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            # One bias value per step (axis 1 of the input).
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Always return None, i.e. this layer passes no mask on."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to (-1, features_dim), project onto W, then reshape the scores
        # back to (-1, step_dim): one score per step.
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Zero the weights of masked steps before normalizing.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Normalize over the steps; epsilon keeps the division defined when the sum is 0.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Add a trailing axis so the weights broadcast against the features of x.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output shape: (input_shape[0], features_dim) — the step axis is summed out."""
        return input_shape[0],  self.features_dim

**F1 score and CLR**

In [None]:
# https://www.kaggle.com/hireme/fun-api-keras-f1-metric-cyclical-learning-rate/code

class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the
            three built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Fail at construction time; previously an unknown mode left
                # scale_fn unset and only surfaced as an AttributeError in clr().
                raise ValueError(
                    "mode must be 'triangular', 'triangular2' or 'exp_range' "
                    "when scale_fn is None, got {!r}".format(mode))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        """Return the learning rate for the current value of clr_iterations."""
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        # The scaling function sees either the cycle number or the iteration count.
        scale_arg = cycle if self.scale_mode == 'cycle' else self.clr_iterations
        return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(scale_arg)
        
    def on_train_begin(self, logs=None):
        """Set the optimizer's lr to base_lr on a fresh start, else to the current cycle value."""
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        """Advance the iteration counters, record lr and logs in history, and set the next lr.

        NOTE(review): the first argument is named `epoch`, but as a batch-level
        hook it presumably receives the batch index; the value is not used.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        K.set_value(self.model.optimizer.lr, self.clr())
    

def f1(y_true, y_pred):
    '''
    F1 metric built from backend ops, adapted from
    https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras

    Precision and recall are computed from rounded, [0, 1]-clipped values
    of the tensors passed in, so the result is an average over whatever
    batch the metric is called with rather than over the full data set.
    K.epsilon() is added to every denominator to avoid division by zero.
    '''
    eps = K.epsilon()

    # Counts after rounding: predicted-and-actual, actual, and predicted positives.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision_value = hits / (predicted_positives + eps)
    recall_value = hits / (actual_positives + eps)
    return 2*((precision_value*recall_value)/(precision_value+recall_value+eps))


**LSTM models**

In [None]:
def model_lstm_atten(embedding_matrix):
    """Build and compile the recurrent attention classifier.

    Layer stack: a frozen embedding initialised from ``embedding_matrix``,
    spatial dropout, a bidirectional LSTM followed by a bidirectional GRU,
    then attention over both recurrent outputs plus average and max pooling
    of the GRU output, concatenated into a small dense head with a sigmoid
    output.

    Reads the notebook globals ``maxlen``, ``max_features`` and
    ``embed_size`` and the ``Attention`` layer defined elsewhere in the
    notebook.

    Returns:
        The model, compiled with binary cross-entropy, the 'adam' optimizer
        and the ``f1`` metric.
    """
    tokens = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(tokens)
    embedded = SpatialDropout1D(0.1)(embedded)
    lstm_seq = Bidirectional(CuDNNLSTM(40, return_sequences=True))(embedded)
    gru_seq = Bidirectional(CuDNNGRU(40, return_sequences=True))(lstm_seq)

    pooled = [
        Attention(maxlen)(lstm_seq),  # skip connect
        Attention(maxlen)(gru_seq),
        GlobalAveragePooling1D()(gru_seq),
        GlobalMaxPooling1D()(gru_seq),
    ]

    hidden = concatenate(pooled)
    hidden = Dense(16, activation="relu")(hidden)
    hidden = Dropout(0.1)(hidden)
    probability = Dense(1, activation="sigmoid")(hidden)

    classifier = Model(inputs=tokens, outputs=probability)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1])
    return classifier

**Train and predict**

In [None]:
# https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go
def train_pred(model, train_X, train_y, val_X, val_y, epochs=2, callback=None, threshold=0.33):
    """Train ``model`` one epoch at a time, then predict validation and test data.

    After every epoch the validation set is predicted and its F1 score at
    ``threshold`` is printed.

    Args:
        model: compiled model exposing ``fit`` and ``predict``.
        train_X, train_y: training inputs and labels.
        val_X, val_y: validation inputs and labels.
        epochs: number of single-epoch ``fit`` calls; must be at least 1.
        callback: value passed through as ``callbacks`` to ``model.fit``.
        threshold: probability cut-off used for the printed and returned
            F1 score (default 0.33, the value previously hard-coded).

    Returns:
        ``(pred_val_y, pred_test_y, best_score)``. ``pred_val_y`` and
        ``best_score`` come from the LAST epoch; despite its name,
        ``best_score`` is not the maximum over epochs, so it always
        describes the returned ``pred_val_y``.

    Raises:
        ValueError: if ``epochs`` is smaller than 1, because there would be
            no validation predictions or score to return.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    for e in range(epochs):
        model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y), callbacks=callback, verbose=0)
        pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)

        best_score = metrics.f1_score(val_y, (pred_val_y > threshold).astype(int))
        print("Epoch: ", e, "-    Val F1 Score: {:.4f}".format(best_score))

    # NOTE(review): test_X is not a parameter; it is read from the
    # notebook's global namespace, so it must exist before this is called.
    pred_test_y = model.predict([test_X], batch_size=1024, verbose=0)
    print('=' * 60)
    return pred_val_y, pred_test_y, best_score


**Main part: load, train, predict and blend**

In [None]:
# Build the model inputs and the word index. load_and_prec, load_glove and
# load_para are defined in other cells of this notebook (not visible here).
train_X, test_X, train_y, word_index = load_and_prec()
# One embedding matrix per source, each built from the same word_index
# (presumably GloVe and Paragram vectors, judging by the helper names; confirm).
embedding_matrix_1 = load_glove(word_index)
# embedding_matrix_2 = load_fasttext(word_index)
embedding_matrix_3 = load_para(word_index)

In [None]:
## Simple average: http://aclweb.org/anthology/N18-2031

# We have presented an argument for averaging as
# a valid meta-embedding technique, and found experimental
# performance to be close to, or in some cases 
# better than that of concatenation, with the
# additional benefit of reduced dimensionality  


## Unweighted DME in https://arxiv.org/pdf/1804.07983.pdf

# “The downside of concatenating embeddings and 
#  giving that as input to an RNN encoder, however,
#  is that the network then quickly becomes inefficient
#  as we combine more and more embeddings.”
  
# Element-wise average of the two loaded embedding matrices (simple averaging
# as the meta-embedding, per the references quoted above).
embedding_matrix = np.mean((embedding_matrix_1, embedding_matrix_3), axis=0)
embedding_matrix.shape

In [None]:
# https://www.kaggle.com/ryanzhang/tfidf-naivebayes-logreg-baseline

def threshold_search(y_true, y_proba):
    """Scan cut-offs 0.00, 0.01, ..., 0.99 for the highest F1 score.

    Args:
        y_true: ground-truth labels.
        y_proba: predicted scores; must support ``y_proba > threshold``.

    Returns:
        A dict with keys ``'threshold'`` and ``'f1'``. Ties keep the lowest
        threshold, and if no cut-off scores above 0 both values are 0.
    """
    search_result = {'threshold': 0, 'f1': 0}
    for step in range(100):
        candidate = step * 0.01
        candidate_f1 = f1_score(y_true=y_true, y_pred=y_proba > candidate)
        # Strict comparison: an equal score never displaces an earlier one.
        if candidate_f1 > search_result['f1']:
            search_result = {'threshold': candidate, 'f1': candidate_f1}
    return search_result

In [None]:
DATA_SPLIT_SEED = 2018

# train_meta collects, for every training row, the prediction made by the
# fold in which that row was held out; test_meta is the mean of the folds'
# test predictions.
train_meta = np.zeros(train_y.shape)
test_meta = np.zeros(test_X.shape[0])
splits = list(StratifiedKFold(n_splits=4, shuffle=True, random_state=DATA_SPLIT_SEED).split(train_X, train_y))
for idx, (train_idx, valid_idx) in enumerate(splits):
    X_train = train_X[train_idx]
    y_train = train_y[train_idx]
    X_val = train_X[valid_idx]
    y_val = train_y[valid_idx]

    # A fresh schedule for every fold. CyclicLR keeps its iteration counter
    # on the instance and resumes from it in on_train_begin, so one shared
    # callback would start each later fold's new model part-way through the
    # schedule, with the 'exp_range' gamma**iterations decay already applied.
    clr = CyclicLR(base_lr=0.001, max_lr=0.002,
                   step_size=300., mode='exp_range',
                   gamma=0.99994)

    model = model_lstm_atten(embedding_matrix)
    pred_val_y, pred_test_y, best_score = train_pred(model, X_train, y_train, X_val, y_val, epochs=8, callback=[clr])
    train_meta[valid_idx] = pred_val_y.reshape(-1)
    test_meta += pred_test_y.reshape(-1) / len(splits)

In [None]:
# Threshold the fold-averaged test predictions and write the submission file.
sub = pd.read_csv('../input/sample_submission.csv')
# Item assignment always targets the column; attribute assignment would only
# set a Python attribute if 'prediction' were missing. Labels are written as
# 0/1 integers rather than True/False.
sub['prediction'] = (test_meta > 0.33).astype(int)
sub.to_csv("submission.csv", index=False)

In [None]:
# F1 over the full training set using train_meta, which the training loop
# fills with each fold's validation predictions, at the same 0.33 cut-off
# that is applied to the submission.
f1_score(y_true=train_y, y_pred=train_meta > 0.33)