In [3]:
from keras.datasets import imdb
import pandas as pd
import numpy as np
from keras.layers import LSTM, Activation, Dropout, Dense, Input, Bidirectional, GlobalMaxPool1D
from keras.layers.embeddings import Embedding
from keras.models import Model, Sequential
import string
import gensim
import tensorflow as tf
import re
from nltk.tokenize import TweetTokenizer
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from keras.initializers import Constant
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tqdm
import json
import requests
import zipfile
import os

# Rest

In [4]:
train = pd.read_csv('/content/drive/MyDrive/NLP_Course/imdb_train.csv')
dev = pd.read_csv('/content/drive/MyDrive/NLP_Course/imdb_val.csv')
test = pd.read_csv('/content/drive/MyDrive/NLP_Course/imdb_test.csv')

In [5]:
def text_cleaning(text):
  text = re.sub("[^a-zA-Z]", " ", text) 
  text = text.lower()
  return text
  
train['review'] = train['review'].apply(text_cleaning)
dev['review'] = dev['review'].apply(text_cleaning)
test['review'] = test['review'].apply(text_cleaning)

# Trainable Embedding Layer

In [None]:
max_features = 10000
maxlen = 300

In [None]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train.review.tolist() + dev.review.tolist() + test.review.tolist())
x_train = tokenizer.texts_to_sequences(train.review)
x_dev = tokenizer.texts_to_sequences(dev.review)
x_test = tokenizer.texts_to_sequences(test.review)

In [None]:
x_train = pad_sequences(x_train, maxlen=maxlen, padding='post')
x_dev = pad_sequences(x_dev, maxlen=maxlen, padding='post')
x_test = pad_sequences(x_test, maxlen=maxlen, padding='post')

In [None]:
model = Sequential()
model.add(Embedding(max_features, 64))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(Dropout(0.5))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, None, 64)          640000    
_________________________________________________________________
bidirectional_24 (Bidirectio (None, None, 64)          24832     
_________________________________________________________________
dropout_25 (Dropout)         (None, None, 64)          0         
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 64)                0         
_________________________________________________________________
dense_20 (Dense)             (None, 20)                1300      
_________________________________________________________________
dropout_26 (Dropout)         (None, 20)                0         
_________________________________________________________________
dense_21 (Dense)             (None, 1)                

In [None]:
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5,
                              patience=2, min_lr=0.0001, verbose=1)
model_checkpoint = ModelCheckpoint('/content/drive/MyDrive/NLP_Course/trainable_embedding.hdf5', monitor='val_accuracy', save_best_only=True, save_weights_only=False, 
                                   save_freq='epoch', mode='max', verbose=1)
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])

In [None]:
model.fit(x_train, train.sentiment.values, batch_size=32, epochs=20, validation_data=(x_dev, dev.sentiment.values), callbacks=[reduce_lr, model_checkpoint], verbose=1)

In [None]:
model.load_weights('/content/drive/MyDrive/NLP_Course/trainable_embedding.hdf5')

In [None]:
y_pred = model.predict_classes(x_test)



In [None]:
print(f"The accuracy score on the test set is equal to {accuracy_score(test.sentiment.values, y_pred)}")

The accuracy score on the test set is equal to 0.8775


# GloVe

In [5]:
max_features = 10000
maxlen = 300
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train.review.tolist() + dev.review.tolist() + test.review.tolist())
x_train = tokenizer.texts_to_sequences(train.review)
x_dev = tokenizer.texts_to_sequences(dev.review)
x_test = tokenizer.texts_to_sequences(test.review)
x_train = pad_sequences(x_train, maxlen=maxlen, padding='post')
x_dev = pad_sequences(x_dev, maxlen=maxlen, padding='post')
x_test = pad_sequences(x_test, maxlen=maxlen, padding='post')

In [8]:
# Get the Glove matrix
URL = "http://nlp.stanford.edu/data/glove.42B.300d.zip"
def fetch_data(url=URL, target_file='glove.zip', delete_zip=False):
    response = requests.get(url, stream=True)
    #read chunk by chunk
    handle = open(target_file, "wb")
    for chunk in tqdm.tqdm(response.iter_content(chunk_size=512)):
        if chunk:  
            handle.write(chunk)
    handle.close()  
    print("  Download completed ;) :") 
    #extract zip_file
    zf = zipfile.ZipFile(target_file)
    print("1. Extracting {} file".format(target_file))
    zf.extractall()
    if delete_zip:
        print("2. Deleting {} file".format(dataset_name+".zip"))
        os.remove(path=zip_file)
fetch_data()

3667580it [14:55, 4094.98it/s]


  Download completed ;) :
1. Extracting glove.zip file


In [10]:
glove_file = "glove.42B.300d.txt"
EMBEDDING_VECTOR_LENGTH = 300
def construct_embedding_matrix(glove_file, word_index):
    embedding_dict = {}
    with open(glove_file,'r') as f:
        for line in tqdm.tqdm(f):
            values=line.split()
            # get the word
            word=values[0]
            if word in word_index.keys():
                # get the vector
                vector = np.asarray(values[1:], 'float32')
                embedding_dict[word] = vector

    num_words=len(word_index)+1
    #initialize it to 0
    embedding_matrix=np.zeros((num_words, EMBEDDING_VECTOR_LENGTH))

    for word,i in tqdm.tqdm(word_index.items()):
        if i < num_words:
            vect=embedding_dict.get(word, [])
            if len(vect)>0:
                embedding_matrix[i] = vect[:EMBEDDING_VECTOR_LENGTH]
    return embedding_matrix
  
embedding_matrix =  construct_embedding_matrix(glove_file, tokenizer.word_index)

1917494it [00:43, 43937.70it/s]
100%|██████████| 75150/75150 [00:00<00:00, 362176.20it/s]


In [19]:
model = Sequential()
model.add(Embedding(len(tokenizer.word_index)+1, # number of unique tokens
                    EMBEDDING_VECTOR_LENGTH, #number of features
                    embeddings_initializer=Constant(embedding_matrix), # initialize 
                    input_length=maxlen, 
                    trainable=True))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(Dropout(0.5))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 300)          22545300  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 150, 64)           85248     
_________________________________________________________________
dropout_2 (Dropout)          (None, 150, 64)           0         
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 20)                1300      
_________________________________________________________________
dropout_3 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                

In [20]:
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5,
                              patience=2, min_lr=0.0001, verbose=1)
model_checkpoint = ModelCheckpoint('/content/drive/MyDrive/NLP_Course/trainable_glove_embedding.hdf5', monitor='val_accuracy', save_best_only=True, save_weights_only=False, 
                                   save_freq='epoch', mode='max', verbose=1)
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])

In [None]:
model.fit(x_train, train.sentiment.values, batch_size=32, epochs=20, validation_data=(x_dev, dev.sentiment.values), callbacks=[reduce_lr, model_checkpoint], verbose=1)

In [22]:
model.load_weights('/content/drive/MyDrive/NLP_Course/glove_embedding.hdf5')
y_pred = model.predict_classes(x_test)
print(f"The accuracy score on the test set is equal to {accuracy_score(test.sentiment.values, y_pred)}")



The accuracy score on the test set is equal to 0.8785


In [23]:
model.load_weights('/content/drive/MyDrive/NLP_Course/trainable_glove_embedding.hdf5')
y_pred = model.predict_classes(x_test)
print(f"The accuracy score on the test set is equal to {accuracy_score(test.sentiment.values, y_pred)}")



The accuracy score on the test set is equal to 0.882


# IMDB

In [21]:
maxlen = 300

In [5]:
tokenizer = TweetTokenizer()
def tokenize(examples):
  tokens_sentence = []
  for review in examples:
    review = re.sub(' +',' ', review)
    review = review.strip()
    tokens = tokenizer.tokenize(review)
    tokens_sentence.append(tokens)
  return tokens_sentence
train_tokens = tokenize(train['review'].tolist())
dev_tokens = tokenize(dev['review'].tolist())
test_tokens = tokenize(test['review'].tolist())

In [14]:
with open('/content/drive/MyDrive/NLP_Course/vocab.json', 'r') as file:
    vocab = json.load(file)
embedding_matrix = np.load('/content/drive/MyDrive/NLP_Course/input_vecs.npy')

In [17]:
def tokens_to_ints(examples):
  documents_ints = []
  for review in examples:
    sublist = []
    for token in review:
      try:
        idx = vocab[token]
      except KeyError:
        idx = 0
      sublist.append(idx)
    documents_ints.append(sublist)
  return documents_ints
train_ints = tokens_to_ints(train_tokens)
dev_ints = tokens_to_ints(dev_tokens)
test_ints = tokens_to_ints(test_tokens)

In [22]:
x_train = pad_sequences(train_ints, maxlen=maxlen, padding='post')
x_dev = pad_sequences(dev_ints, maxlen=maxlen, padding='post')
x_test = pad_sequences(test_ints, maxlen=maxlen, padding='post')

In [27]:
model = Sequential()
model.add(Embedding(len(vocab)+1, # number of unique tokens
                    64, #number of features
                    embeddings_initializer=Constant(embedding_matrix), # initialize 
                    input_length=maxlen, 
                    trainable=True))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(Dropout(0.5))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 64)           8136768   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 300, 64)           24832     
_________________________________________________________________
dropout_2 (Dropout)          (None, 300, 64)           0         
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 20)                1300      
_________________________________________________________________
dropout_3 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                

In [28]:
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5,
                              patience=2, min_lr=0.0001, verbose=1)
model_checkpoint = ModelCheckpoint('/content/drive/MyDrive/NLP_Course/trainable_imdb_embedding.hdf5', monitor='val_accuracy', save_best_only=True, save_weights_only=False, 
                                   save_freq='epoch', mode='max', verbose=1)
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])

In [None]:
model.fit(x_train, train.sentiment.values, batch_size=32, epochs=20, validation_data=(x_dev, dev.sentiment.values), callbacks=[reduce_lr, model_checkpoint], verbose=1)

In [26]:
model.load_weights('/content/drive/MyDrive/NLP_Course/imdb_embedding.hdf5')
y_pred = model.predict_classes(x_test)
print(f"The accuracy score on the test set is equal to {accuracy_score(test.sentiment.values, y_pred)}")



The accuracy score on the test set is equal to 0.785


In [30]:
model.load_weights('/content/drive/MyDrive/NLP_Course/trainable_imdb_embedding.hdf5')
y_pred = model.predict_classes(x_test)
print(f"The accuracy score on the test set is equal to {accuracy_score(test.sentiment.values, y_pred)}")



The accuracy score on the test set is equal to 0.8425


# 1Billion Dataset

In [6]:
maxlen = 300

In [7]:
tokenizer = TweetTokenizer()
def tokenize(examples):
  tokens_sentence = []
  for review in examples:
    review = re.sub(' +',' ', review)
    review = review.strip()
    tokens = tokenizer.tokenize(review)
    tokens_sentence.append(tokens)
  return tokens_sentence
train_tokens = tokenize(train['review'].tolist())
dev_tokens = tokenize(dev['review'].tolist())
test_tokens = tokenize(test['review'].tolist())

In [8]:
with open('/content/drive/MyDrive/NLP_Course/vocab_1B.json', 'r') as file:
    vocab = json.load(file)
embedding_matrix = np.load('/content/drive/MyDrive/NLP_Course/input_vecs_1B.npy')

In [9]:
def tokens_to_ints(examples):
  documents_ints = []
  for review in examples:
    sublist = []
    for token in review:
      try:
        idx = vocab[token]
      except KeyError:
        idx = 0
      sublist.append(idx)
    documents_ints.append(sublist)
  return documents_ints
train_ints = tokens_to_ints(train_tokens)
dev_ints = tokens_to_ints(dev_tokens)
test_ints = tokens_to_ints(test_tokens)

In [10]:
x_train = pad_sequences(train_ints, maxlen=maxlen, padding='post')
x_dev = pad_sequences(dev_ints, maxlen=maxlen, padding='post')
x_test = pad_sequences(test_ints, maxlen=maxlen, padding='post')

In [24]:
model = Sequential()
model.add(Embedding(len(vocab)+1, # number of unique tokens
                    64, #number of features
                    embeddings_initializer=Constant(embedding_matrix), # initialize 
                    input_length=maxlen, 
                    trainable=True))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(Dropout(0.5))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 300, 64)           35687040  
_________________________________________________________________
bidirectional_4 (Bidirection (None, 300, 64)           24832     
_________________________________________________________________
dropout_8 (Dropout)          (None, 300, 64)           0         
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 64)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 20)                1300      
_________________________________________________________________
dropout_9 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                

In [25]:
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5,
                              patience=2, min_lr=0.0001, verbose=1)
model_checkpoint = ModelCheckpoint('/content/drive/MyDrive/NLP_Course/trainable_1B_embedding.hdf5', monitor='val_accuracy', save_best_only=True, save_weights_only=False, 
                                   save_freq='epoch', mode='max', verbose=1)
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])

In [None]:
model.fit(x_train, train.sentiment.values, batch_size=32, epochs=5, validation_data=(x_dev, dev.sentiment.values), callbacks=[reduce_lr, model_checkpoint], verbose=1)

In [19]:
model.load_weights('/content/drive/MyDrive/NLP_Course/1B_embedding.hdf5')
y_pred = model.predict_classes(x_test)
print(f"The accuracy score on the test set is equal to {accuracy_score(test.sentiment.values, y_pred)}")



The accuracy score on the test set is equal to 0.572


In [29]:
model.load_weights('/content/drive/MyDrive/NLP_Course/trainable_1B_embedding.hdf5')
y_pred = model.predict_classes(x_test)
print(f"The accuracy score on the test set is equal to {accuracy_score(test.sentiment.values, y_pred)}")



The accuracy score on the test set is equal to 0.75
