### LSTM-GRNN

Implementation based on paper here:
http://aclweb.org/anthology/D15-1167

Inspiration for code taken from here:
https://github.com/richliao/textClassifier/blob/master/textClassifierHATT.py

In [1]:
import sys, os, re, csv, codecs, gc, numpy as np, pandas as pd
import tensorflow as tf
#from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Permute, GRU, Conv1D, LSTM, Embedding, Dropout, Activation, CuDNNLSTM, CuDNNGRU, concatenate, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, BatchNormalization, SpatialDropout1D, Dot
from keras.optimizers import Adam, RMSprop
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras_tqdm import TQDMNotebookCallback
import keras.backend as K
from keras.callbacks import LearningRateScheduler
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from functools import reduce
from keras.layers import Layer, PReLU, SpatialDropout1D
from keras import initializers
from sklearn.model_selection import cross_val_predict

from nltk.tokenize import word_tokenize, wordpunct_tokenize, TweetTokenizer, MWETokenizer, ToktokTokenizer, sent_tokenize
from nltk.corpus import stopwords

import unicodedata
from collections import Counter
import itertools

np.random.seed(786)

from SentenceTokenizer import SentenceTokenizer
from ZeroMaskedLayer import ZeroMaskedLayer
from AttentionLayer import AttentionLayer
from RocAucEvaluation import RocAucEvaluation

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
path = '../input/'
utility_path = '../utility/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
EMBEDDING_FILE=f'{utility_path}crawl-300d-2M.vec'
TRAIN_DATA_FILE=f'{path}train.csv'
TEST_DATA_FILE=f'{path}test.csv'

In [3]:
def unicodeToAscii(series):
    return series.apply(lambda s: unicodedata.normalize('NFKC', str(s)))

STOP_WORDS = set(stopwords.words( 'english' ))

repl = {
    "&lt;3": " good ",
    ":d": " good ",
    ":dd": " good ",
    ":p": " good ",
    "8)": " good ",
    ":-)": " good ",
    ":)": " good ",
    ";)": " good ",
    "(-:": " good ",
    "(:": " good ",
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    ":/": " bad ",
    ":&gt;": " sad ",
    ":')": " sad ",
    ":-(": " bad ",
    ":(": " bad ",
    ":s": " bad ",
    ":-s": " bad ",
    "&lt;3": " heart ",
    ":d": " smile ",
    ":p": " smile ",
    ":dd": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    #"m": "am",
    #"r": "are",
    #"u": "you",
    "haha": "ha",
    "hahaha": "ha",
}

#https://stackoverflow.com/questions/15175142/how-can-i-do-multiple-substitutions-using-regex-in-python
def one_xlat(match):
        return repl[match.group(0)]
    
rx = re.compile('|'.join(map(re.escape, repl)))
# Lowercase, trim, and remove non-letter characters
def normalizeString(series):
    series = unicodeToAscii(series)
    series = series.str.lower()
    series = series.str.replace(rx, one_xlat)
    series = series.str.replace(r"(\n){1,}", " ")
    series = series.str.replace(r"\'", "")
    series = series.str.replace(r"\-", "")
    series = series.str.replace(r"[^0-9a-zA-Z.,!?]+", " ")
    series = series.str.replace(r"[.]+",".")
    series = series.str.replace(r"[!]+","!")
    series = series.str.replace(r"[?]+",".")
    return series


In [4]:
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

print(train.shape, test.shape)

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

#Get validation folds
train['target_str'] = reduce(lambda x,y: x+y, [train[col].astype(str) for col in list_classes])
train['target_str'] = train['target_str'].replace('110101', '000000').replace('110110','000000')
cvlist1 = list(StratifiedKFold(n_splits=10, random_state=786).split(train, train['target_str'].astype('category')))
cvlist2 = list(StratifiedShuffleSplit(n_splits=5, test_size=0.05, random_state=786).split(train, train['target_str'].astype('category')))

(159571, 8) (153164, 2)




In [5]:
for df in train, test:
    df["comment_text"] = normalizeString(df["comment_text"])

In [6]:
MAX_FEATURES = 200000
MAX_SENTENCE_LEN = 50
MAX_SENTENCES = 15

def custome_tokenizer(text):
    return [TweetTokenizer().tokenize(sent) for sent in  sent_tokenize(text)]

tok = SentenceTokenizer(max_features=MAX_FEATURES, max_sentence_len=MAX_SENTENCE_LEN, max_sentences=MAX_SENTENCES, tokenizer=custome_tokenizer)

In [10]:
c = Counter()
s = Counter()
def cnts(x):
    toks = custome_tokenizer(x)
    s.update([len(toks)])
    c.update([len(sent) for sent in toks])
train.comment_text.apply(lambda x: cnts(x))

0         None
1         None
2         None
3         None
4         None
5         None
6         None
7         None
8         None
9         None
10        None
11        None
12        None
13        None
14        None
15        None
16        None
17        None
18        None
19        None
20        None
21        None
22        None
23        None
24        None
25        None
26        None
27        None
28        None
29        None
          ... 
159541    None
159542    None
159543    None
159544    None
159545    None
159546    None
159547    None
159548    None
159549    None
159550    None
159551    None
159552    None
159553    None
159554    None
159555    None
159556    None
159557    None
159558    None
159559    None
159560    None
159561    None
159562    None
159563    None
159564    None
159565    None
159566    None
159567    None
159568    None
159569    None
159570    None
Name: comment_text, Length: 159571, dtype: object

In [24]:
from scipy.stats import gmean, hmean
gmean([np.array([[1,2],[2,1]]), np.array([[4,8],[4,3]])], axis=0)

array([[2.        , 4.        ],
       [2.82842712, 1.73205081]])

In [7]:
tok.fit_transform(train.comment_text.loc[:5])

ValueError: setting an array element with a sequence.