In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
# Read the training CSV into a DataFrame.
train_csv = '/kaggle/input/textract-2019/textract_train.csv'
df = pd.read_csv(train_csv)

In [None]:
# Quick look at the text column of the training frame.
contents_column = df['Contents']
print(contents_column)

In [None]:
# Model inputs come from 'Contents', targets from 'Label'.
sentences, labels = df['Contents'], df['Label']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the split is stratified on the
# labels and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

In [None]:
# Characters removed from every token: punctuation, brackets, the backslash
# and all ten digits. The backslash is written as an explicit escape ("\\");
# the previous literal relied on the invalid escape sequence "\." and also
# left out the digit 0, so "2010" was cleaned to "00" instead of "".
_CHARS_TO_STRIP = "[,\\.!?'\"0123456789]-():;"
_STRIP_TABLE = str.maketrans('', '', _CHARS_TO_STRIP)


def clean_word(sample):
    """Return ``sample`` with punctuation, brackets, backslashes and digits removed.

    Parameters
    ----------
    sample : str
        A single token (or any string) to clean.

    Returns
    -------
    str
        ``sample`` without any character listed in ``_CHARS_TO_STRIP``.
        The result may be the empty string.
    """
    # One pass over the string instead of one ``replace`` call per character.
    return sample.translate(_STRIP_TABLE)

In [None]:
# Tokenise each training article: split on single spaces, clean every piece
# with clean_word, and drop pieces that end up empty or a lone space.
new_train = [
    [token for token in map(clean_word, article.split(' ')) if token not in ('', ' ')]
    for article in tqdm(X_train)
]

In [None]:
# Tokenise each held-out article the same way as the training articles:
# split on single spaces, clean every piece, discard empty results.
new_test = [
    [token for token in map(clean_word, article.split(' ')) if token not in ('', ' ')]
    for article in tqdm(X_test)
]

In [None]:
# import random
# for i  in range(len(new_train)):
#     random.shuffle(new_train[i])


In [None]:
from gensim.models import Word2Vec
# Length of each word vector; reused later to size the zero-padding rows.
w2v_size = 300

# Configure the model first, then build the vocabulary and train explicitly.
# NOTE(review): `size` is the keyword used by gensim < 4 (gensim 4 renamed it
# to `vector_size`) -- confirm against the installed gensim version.
w2v_model = Word2Vec(size=w2v_size, window=4, min_count=100, workers=2)

# Vocabulary comes from the tokenised training articles only.
w2v_model.build_vocab(new_train)

# 30 passes over the training articles.
w2v_model.train(new_train, epochs=30, total_examples=w2v_model.corpus_count)

In [None]:
def vectorize_articles(articles, keyed_vectors, max_len, vec_size):
    """Convert tokenised articles into one dense array of word vectors.

    Parameters
    ----------
    articles : sequence of list of str
        One token list per article.
    keyed_vectors : mapping-like
        Object indexed by word that returns a vector of length ``vec_size``
        and raises ``KeyError`` for unknown words (here: ``w2v_model.wv``).
    max_len : int
        Number of token positions kept per article. Longer articles are
        truncated to their first ``max_len`` tokens.
    vec_size : int
        Length of each word vector.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(articles), max_len, vec_size)``. Rows for
        out-of-vocabulary words and for positions past the end of a short
        article are all zeros.
    """
    # Preallocating zeros makes padding implicit and avoids building a Python
    # list of per-word arrays. float32 is used because gensim stores its word
    # vectors as float32; the earlier float64 padding rows upcast the whole
    # tensor and doubled its memory footprint.
    vectors = np.zeros((len(articles), max_len, vec_size), dtype=np.float32)
    for row, tokens in enumerate(articles):
        for col, word in enumerate(tokens[:max_len]):
            try:
                vectors[row, col] = keyed_vectors[word]
            except KeyError:
                # Word not in the Word2Vec vocabulary: leave the zero row.
                # Only KeyError is caught so unrelated errors still surface.
                continue
    return vectors


# Maximum number of tokens kept per article.
dim = 200
# Zero vector standing in for unknown words / padding positions; kept as a
# named value for any code that still refers to it.
padding = np.zeros(w2v_size)

# From here on X_train / X_dev refer to the tokenised articles.
X_train = new_train
X_dev = new_test

# vectorizing train data
X_train_vec = vectorize_articles(X_train, w2v_model.wv, dim, w2v_size)

# vectorizing dev data
X_dev_vec = vectorize_articles(X_dev, w2v_model.wv, dim, w2v_size)

In [None]:
# Switch to the names used by the training cells, then print the shape of
# every array that will be passed to the model.
X_train, X_dev, y_dev = X_train_vec, X_dev_vec, y_test
for split in (X_train, X_dev, y_train, y_dev):
    print(split.shape)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, InputLayer, LSTM, Bidirectional, GlobalMaxPooling1D, Conv1D, Dropout, MaxPool1D
# Callbacks are taken from tensorflow.keras as well, so they belong to the same
# Keras implementation as the Sequential model and layers above. Mixing
# standalone `keras` callbacks with a `tensorflow.keras` model fails on some
# TensorFlow/Keras version combinations.
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, CSVLogger, Callback, EarlyStopping

In [None]:
def build_model():
    """Build and compile the stacked-LSTM binary classifier.

    Layers, in order: LSTM(128) returning the full sequence, Dropout(0.5),
    LSTM(64), Dense(32, relu), Dense(16, relu) and a single sigmoid output
    unit. No input shape is declared here, so the model is built on first use.

    Returns
    -------
    Sequential
        Model compiled with the Adam optimizer, binary cross-entropy loss and
        an accuracy metric.
    """
    model = Sequential()
    # The first LSTM must emit one vector per timestep so the second LSTM
    # receives a sequence. This flag has to be the boolean True; the string
    # 'True' only worked because any non-empty string is truthy.
    model.add(LSTM(128, return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(64))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def train_model(model, X_train, y_train, X_test, y_test,
                batch_size=32, epochs=100, checkpoint_path='model.hdf5'):
    """Fit ``model`` with early stopping and return it with the best weights loaded.

    Parameters
    ----------
    model
        Compiled Keras model (e.g. the result of ``build_model()``).
    X_train, y_train
        Training inputs and targets.
    X_test, y_test
        Validation inputs and targets; ``val_loss`` on these drives the
        callbacks below.
    batch_size : int, optional
        Mini-batch size passed to ``fit`` (default 32).
    epochs : int, optional
        Upper bound on the number of epochs (default 100).
    checkpoint_path : str, optional
        File the best checkpoint is written to and reloaded from
        (default ``'model.hdf5'``).

    Returns
    -------
    The same ``model`` object, after loading the checkpoint saved at the
    lowest ``val_loss``.
    """
    callbacks = [
        # Stop once val_loss has not improved for 10 epochs.
        EarlyStopping(patience=10, monitor='val_loss', mode='min'),
        # Keep only the checkpoint with the lowest val_loss seen so far.
        ModelCheckpoint(checkpoint_path, save_best_only=True, monitor='val_loss', mode='min'),
        # Scale the learning rate by 0.3 on a plateau (other settings are the
        # library defaults).
        ReduceLROnPlateau(factor=.3),
    ]
    model.fit(
        X_train, y_train,
        # Keras documents validation_data as a tuple (x_val, y_val); a list is
        # not reliably unpacked into inputs and targets across versions.
        validation_data=(X_test, y_test),
        batch_size=batch_size,
        epochs=epochs,
        callbacks=callbacks,
    )
    # Restore the best checkpoint rather than keeping the last-epoch weights.
    model.load_weights(filepath=checkpoint_path)
    return model

In [None]:
# Build a fresh model, then train it on the vectorised articles. The label
# containers are converted to plain NumPy arrays before being handed to Keras.
m = build_model()
train_targets = np.array(list(y_train))
dev_targets = np.array(list(y_dev))
train_model(m, X_train, train_targets, X_dev, dev_targets)