# Keras Example

## Imports

In [1]:
import re

import numpy as np
import pandas as pd
import keras
import sklearn
import nltk

import gensim

Using Theano backend.


## Utilitary Functions

In [2]:
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

def pre_processing(review):
    review = review.lower()
    review = cleanhtml(review)
    return tokenizer.tokenize(review)

## Load Dataset

### Loading from Processed data

In [3]:
usable_dataset = pd.read_csv("../data/processed/acllib_data.csv")
usable_dataset = usable_dataset.sample(frac=1).reset_index(drop=True)

## Pre processing Data

In [4]:
tokenizer = nltk.tokenize.RegexpTokenizer("[a-z]+")

usable_dataset["REVIEW"] = usable_dataset["REVIEW"].map(pre_processing)
usable_dataset["SCORE"] = usable_dataset["SCORE"].map(lambda x: 0 if x <5 else 1)
usable_dataset.head()

Unnamed: 0,REVIEW,SCORE
0,"[there, has, been, a, political, documentary, ...",1
1,"[literally, every, aspect, of, this, science, ...",0
2,"[gods, i, haven, t, watched, a, movie, this, a...",0
3,"[this, is, one, of, those, movies, that, appea...",0
4,"[being, a, bit, of, a, connoisseur, of, garbag...",1


### Fill sentences

In [5]:
usable_dataset["REVIEW"].map(len).describe()

count    50000.000000
mean       234.139260
std        173.495615
min          6.000000
25%        128.000000
50%        176.000000
75%        284.000000
max       2487.000000
Name: REVIEW, dtype: float64

In [6]:
max_sentence_length = usable_dataset["REVIEW"].map(len).describe()['75%'] + 1
max_sentence_length

285.0

In [7]:
def fill_sentence(sentence):
    tokens_to_fill = int(max_sentence_length - len(sentence))
    
    sentence.append('<END>')
    sentence.extend(['<PAD>']*tokens_to_fill)
    
    return sentence

In [8]:
usable_dataset["REVIEW"] = usable_dataset["REVIEW"].map(fill_sentence)
usable_dataset.loc[0,"REVIEW"]

['there',
 'has',
 'been',
 'a',
 'political',
 'documentary',
 'of',
 'recent',
 'vintage',
 'called',
 'why',
 'we',
 'fight',
 'which',
 'tries',
 'to',
 'examine',
 'the',
 'infamous',
 'military',
 'industrial',
 'complex',
 'and',
 'its',
 'grip',
 'on',
 'this',
 'nation',
 'it',
 'is',
 'considered',
 'both',
 'polemical',
 'and',
 'incisive',
 'in',
 'making',
 'its',
 'case',
 'against',
 'both',
 'that',
 'complex',
 'and',
 'the',
 'war',
 'fiasco',
 'we',
 'are',
 'currently',
 'involved',
 'in',
 'in',
 'iraq',
 'yet',
 'a',
 'far',
 'more',
 'famous',
 'series',
 'of',
 'films',
 'with',
 'the',
 'same',
 'name',
 'was',
 'made',
 'during',
 'world',
 'war',
 'two',
 'by',
 'hollywood',
 'director',
 'frank',
 'capra',
 'although',
 'considered',
 'documentaries',
 'and',
 'having',
 'won',
 'oscars',
 'in',
 'that',
 'category',
 'this',
 'series',
 'of',
 'seven',
 'films',
 'is',
 'really',
 'and',
 'truly',
 'mere',
 'agitprop',
 'more',
 'in',
 'the',
 'vein',
 'of'

## Train Test Validation Split

In [9]:
train_set, validate_set, test_set = np.split(usable_dataset.sample(frac=1), [int(.6*len(usable_dataset)), int(.8*len(usable_dataset))])

## Loading Word2Vec Model

In [10]:
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format("../data/processed/glove.6b.300d.txt")

In [11]:
word2vec_model.similar_by_vector("brazil")

  if np.issubdtype(vec.dtype, np.int):


[('brazilian', 0.7331621050834656),
 ('argentina', 0.672882080078125),
 ('portugal', 0.647482693195343),
 ('paraguay', 0.601586639881134),
 ('paulo', 0.59923255443573),
 ('uruguay', 0.5989081263542175),
 ('venezuela', 0.5980018973350525),
 ('peru', 0.5975527167320251),
 ('ecuador', 0.5785552859306335),
 ('bolivia', 0.5705569386482239)]

### Add utility vectors

In [12]:
word2vec_model.add(['<PAD>','<END>'], [[0.1]*300,[0.2]*300])

In [17]:
word2vec_model.wv.index2word[3]

  """Entry point for launching an IPython kernel.


'of'

## Instanciating model

In [13]:
model = keras.models.Sequential()
model

<keras.engine.sequential.Sequential at 0x7f35d536ae80>

### Adding LSTM Layers

In [14]:
model.add(
    keras.layers.Embedding(
        input_dim=len(word2vec_model.wv.vocab),
        output_dim=300,
        input_length=max_sentence_length
    )
)

model.add(keras.layers.LSTM(300, return_sequences=True))
model.add(keras.layers.LSTM(300, return_sequences=True))
model.add(keras.layers.Dense(150, activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(50, activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(1, activation='sigmoid'))

  This is separate from the ipykernel package so we can avoid doing imports until
