In [1]:
import tarfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
from os import listdir
import nltk
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Concatenate, TimeDistributed, Dense
from tensorflow.keras.layers import Embedding, GRU
import gensim
import gensim.downloader as model_api
import sklearn.feature_extraction.text as text
from sklearn.decomposition import PCA
from tensorflow import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Input, LSTM
from keras.callbacks import EarlyStopping
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import random

# 1. Sentiment analysis

Using the [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), we want to do a regression model that predict the ratings are on a 1-10 scale. You have an example train and test set in the `dataset` folder.

### 1.1 Regression Model

Use a feedforward neural network and NLP techniques we've seen up to now to train the best model you can on this dataset

### 1.2 RNN model

Train a RNN to do the sentiment analysis regression. The RNN should consist simply of an embedding layer (to make word IDs into word vectors) a recurrent blocks (GRU or LSTM) feeding into an output layer.

In [None]:
# Get the reviews

In [None]:
# 1. Regrssion Model

In [23]:
pos_train_files = listdir("dataset/aclImdb/train/pos")
neg_train_files = listdir("dataset/aclImdb/train/neg")

pos_test_files = listdir("dataset/aclImdb/test/pos")
neg_test_files = listdir("dataset/aclImdb/test/neg")

def get_reviews(target, rev, files): 
    x = []
    x_line = []

    for file in files:
        with open (f"dataset/aclImdb/{target}/{rev}/{file}", encoding="utf8") as opened_file:
            rating = file.split("_")[1].split(".")[0]

            for line in opened_file:
                x_line = []
                x_line.append(line)
                x_line.append(rating)
                x.append(x_line)
                
    return x

train_pos = pd.DataFrame(columns=["review", "rating"], data=get_reviews("train", "pos", pos_train_files))
train_neg = pd.DataFrame(columns=["review", "rating"], data=get_reviews("train", "neg", neg_train_files))

test_pos = pd.DataFrame(columns=["review", "rating"], data=get_reviews("test", "pos", pos_test_files))
test_neg = pd.DataFrame(columns=["review", "rating"], data=get_reviews("test", "neg", neg_test_files))

train_df = pd.concat([train_pos, train_neg], ignore_index=True)
test_df = pd.concat([test_pos, test_neg], ignore_index=True)

In [24]:
train_df.head()

Unnamed: 0,review,rating
0,For a movie that gets no respect there sure ar...,9
1,Bizarre horror movie filled with famous faces ...,8
2,"A solid, if unremarkable film. Matthau, as Ein...",7
3,It's a strange feeling to sit alone in a theat...,8
4,"You probably all already know this by now, but...",10


In [26]:
# Import Lemmatizer from NLTK
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# function that receive a list of words and do lemmatization:
def lemma_stem_text(words_list):
    # Lemmatizer
    text = [lemmatizer.lemmatize(token.lower()) for token in words_list]
    text = [lemmatizer.lemmatize(token.lower(), "v") for token in text]
    return text


from bs4 import BeautifulSoup
import re

#Creating a function for cleaning of data
def clean_text(raw_text):
    # 1. remove HTML tags
    raw_text = BeautifulSoup(raw_text).get_text() 
    
    # 2. removing all non letters from text
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text) 
    
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                           
    
    # 4. Create variable which contain set of stopwords
    stops = set(stopwords.words("english"))
    stops_indo = set(stopwords.words("indonesian"))
    stops.update(stops_indo)
    
    # 5. Remove stop word & returning   
    words_tmp = [w for w in words if not w in stops]

    # 6. Apply lemmatization function
    words_lemm = lemma_stem_text(words_tmp)

    # 7. Finalize
    return [w for w in words_lemm]


clean_words = []
for i in range(len(train_df['review'])):
    res = clean_text(train_df['review'][i])
    res_len = len(res)
    clean_words.append(res)

In [28]:
se = pd.Series(clean_words)
train_df['clean_words'] = se.values

train_df

Unnamed: 0,review,rating,clean_words
0,For a movie that gets no respect there sure ar...,9,"[movie, get, respect, sure, lot, memorable, qu..."
1,Bizarre horror movie filled with famous faces ...,8,"[bizarre, horror, movie, fill, famous, face, s..."
2,"A solid, if unremarkable film. Matthau, as Ein...",7,"[solid, unremarkable, film, matthau, einstein,..."
3,It's a strange feeling to sit alone in a theat...,8,"[strange, feel, sit, alone, theater, occupy, p..."
4,"You probably all already know this by now, but...",10,"[probably, already, know, additional, episode,..."
...,...,...,...
24995,"My comments may be a bit of a spoiler, for wha...",3,"[comment, may, bite, spoiler, worth, stop, car..."
24996,"The ""saucy"" misadventures of four au pairs who...",4,"[saucy, misadventure, four, au, pair, arrive, ..."
24997,"Oh, those Italians! Assuming that movies about...",1,"[oh, italian, assume, movie, aristocrat, weird..."
24998,Eight academy nominations? It's beyond belief....,3,"[eight, academy, nomination, beyond, belief, t..."


In [4]:
sw = stopwords.words("english")
pca = PCA(n_components=1000)

df = train_df.sample(n=1000, random_state=42)
df = df.reset_index(drop=True)
df.rating = df.rating.astype("float")

df.review = df.review.apply(lambda t: " ".join([t for t in t.replace("<br />", "")
                                         .lower()
                                         .split(" ") if not t in sw])
                                         )

tf = text.TfidfVectorizer()
X = tf.fit_transform(df['review'])
X = X.toarray()

X_pca = pca.fit_transform(X)

df["rev_tfidf"] = [x for x in X_pca]

df.head()

Unnamed: 0,review,rating,rev_tfidf
0,great little thriller. expecting type silly ho...,8.0,"[0.13439975755456773, -0.0010709264420262978, ..."
1,"nothing could saved movie, even superman.ten y...",1.0,"[0.0761370059609401, 0.015579074800994696, 0.1..."
2,good movie. typical war flick something bit di...,8.0,"[0.02997489124833944, -0.01540417098875059, -0..."
3,pen richard condon (the manchurian candidate 1...,2.0,"[-0.08039615132699425, -0.0756867182493885, 0...."
4,suppose today film relevance early sofia loren...,4.0,"[-0.034502646601059506, 0.04037390429403681, -..."


In [5]:
model = Sequential()

model.add(Input(shape=X.shape[-1]))
model.add(Dropout(0.2))

model.add(Dense(50))
model.add(Dropout(0.2))

model.add(Dense(50))
model.add(Dropout(0.2))

model.add(Dense(1))

model.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])

In [8]:

from numpy.random import seed
seed(42)
from tensorflow.random import set_seed
set_seed(42)
model.fit(x=X, y=df.rating, batch_size=1, epochs=25) #, callbacks=[loss_stopper]);

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7fad7b70a100>

In [9]:
df_test = test_df.sample(n=1000, random_state=42)
df_test = df_test.reset_index(drop=True)
df_test.rating = df_test.rating.astype("float")

tf = text.TfidfVectorizer()
Xt = tf.fit_transform(df_test['review'])
Xt = Xt.toarray()

Xt = pca.fit_transform(Xt)

preds = model.predict(Xt)

preds = preds.flatten()

for i in range(len(preds)):
    preds[i] = round(preds[i])

accuracy_score(preds, df_test.rating.values)

0.076

In [None]:
# 1.2 RNN

In [10]:
def get_tag(token):
    
    tags = []
    
    for tag in nltk.pos_tag(token):
        tags.append(tag[1])
    
    return tags

df = train_df.sample(n=1000, random_state=42)
df = df.reset_index(drop=True)
df.rating = df.rating.astype("float")

df["rev_token"] = df["review"].apply(lambda x: nltk.word_tokenize(x))
# df["rev_tag"] = df["rev_token"].apply(lambda x: get_tag(x))

df.head()

Unnamed: 0,review,rating,rev_token
0,Great little thriller. I was expecting some ty...,8.0,"[Great, little, thriller, ., I, was, expecting..."
1,"Nothing could have saved this movie, not even ...",1.0,"[Nothing, could, have, saved, this, movie, ,, ..."
2,This was a good movie. It wasn't your typical ...,8.0,"[This, was, a, good, movie, ., It, was, n't, y..."
3,From the pen of Richard Condon (The Manchurian...,2.0,"[From, the, pen, of, Richard, Condon, (, The, ..."
4,I suppose that today this film has relevance b...,4.0,"[I, suppose, that, today, this, film, has, rel..."


In [11]:
def make_lexicon(token_seqs, min_freq=1):
    token_counts = {}
    for seq in token_seqs:
        for token in seq:
            if token in token_counts:
                token_counts[token] += 1
            else:
                token_counts[token] = 1

    lexicon = [token for token, count in token_counts.items() if count >= min_freq]

    lexicon = {token:idx + 2 for idx,token in enumerate(lexicon)}
    lexicon[u'<UNK>'] = 1 
    lexicon_size = len(lexicon)

    return lexicon

rev_lexicon = make_lexicon(df['rev_token'])
# tag_lexicon = make_lexicon(df['rev_tag'])

def get_lexicon_lookup(lexicon):

    lexicon_lookup = {idx: lexicon_item for lexicon_item, idx in lexicon.items()}
    return lexicon_lookup

def tokens_to_idxs(token_seqs, lexicon):
    idx_seqs = [[lexicon[token] if token in lexicon else lexicon['<UNK>'] for token in token_seq] for token_seq in token_seqs]
    return idx_seqs

df['Sentence_Idxs'] = tokens_to_idxs(df['rev_token'], rev_lexicon)
# df['Tag_Idxs'] = tokens_to_idxs(df['rev_tag'], tag_lexicon)

# tags_lexicon_lookup = get_lexicon_lookup(tag_lexicon)

def pad_idx_seqs(idx_seqs, max_seq_len):
    padded_idxs = pad_sequences(sequences=idx_seqs, maxlen=max_seq_len)
    return padded_idxs

max_seq_len = max([len(idx_seq) for idx_seq in df['Sentence_Idxs']])

train_padded_words = pad_idx_seqs(df['Sentence_Idxs'], max_seq_len + 1)
# train_padded_tags = pad_idx_seqs(df['Tag_Idxs'], max_seq_len + 1)

def create_model(seq_input_len, n_input_nodes, n_embedding_nodes, n_hidden_nodes, stateful=False, batch_size=20):
    
    input_layer = Input(shape=(None,))
    
    #Layer 2
    embedding_layer = Embedding(input_dim=n_input_nodes,
                                output_dim=n_embedding_nodes,
                                mask_zero=True)(input_layer) 
    
    # Layer 3
    gru_layer = GRU(units=n_hidden_nodes)(embedding_layer)

    #Layer 4
    output_layer = Dense(units=1)(gru_layer)

    model = Model(inputs=[input_layer], outputs=output_layer)
    model.compile(loss="mean_squared_error", optimizer='adam')
    
    return model

In [12]:
model = create_model(seq_input_len=train_padded_words.shape[-1] - 1,
                     n_input_nodes=len(rev_lexicon) + 1,
                     n_embedding_nodes=300,
                     n_hidden_nodes=500)

model.fit(x=train_padded_words[:,1:], y=df.rating, batch_size=20, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fad6cfc2040>

In [None]:
test_df = pd.concat([test_pos, test_neg], ignore_index=True)

test_df = test_df.sample(n=1000, random_state=42)
test_df = test_df.reset_index(drop=True)
test_df["rev_token"] = test_df["review"].apply(lambda x: nltk.word_tokenize(x))


test_rev_lexicon = make_lexicon(test_df['rev_token'])

test_df['Sentence_Idxs'] = tokens_to_idxs(test_df['rev_token'], test_rev_lexicon)

max_seq_len = max([len(idx_seq) for idx_seq in test_df['Sentence_Idxs']])

test_padded_words = pad_idx_seqs(test_df['Sentence_Idxs'], max_seq_len + 1)

preds = model.predict(test_padded_words[:,1:])

preds = preds.flatten()
for i in range(len(preds)):
    preds[i] = round(preds[i])

accuracy_score(preds, test_df.rating)

In [157]:
# also
import numpy as np
from keras.utils import to_categorical
from keras import models
from keras import layers
from keras.datasets import imdb
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(num_words=10000)
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)
def vectorize(sequences, dimension = 10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results
data = vectorize(data)
targets = np.array(targets).astype("float32")
test_x = data[:10000]
test_y = targets[:10000]
train_x = data[10000:]
train_y = targets[10000:]
model = models.Sequential()
# Input - Layer
model.add(layers.Dense(50, activation = "relu", input_shape=(10000, )))
# Hidden - Layers
model.add(layers.Dropout(0.3, noise_shape=None, seed=None))
model.add(layers.Dense(50, activation = "relu"))
model.add(layers.Dropout(0.2, noise_shape=None, seed=None))
model.add(layers.Dense(50, activation = "relu"))
# Output- Layer
model.add(layers.Dense(1, activation = "sigmoid"))
model.summary()
# compiling the model
model.compile(
 optimizer = "adam",
 loss = "binary_crossentropy",
 metrics = ["accuracy"]
)
results = model.fit(
 train_x, train_y,
 epochs= 2,
 batch_size = 500,
 validation_data = (test_x, test_y)
)
print("Test-Accuracy:", np.mean(results.history["accuracy"]))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 50)                500050    
_________________________________________________________________
dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 50)                2550      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 51        
Total params: 505,201
Trainab

# 2. (evil) XOR Problem

Train an LSTM to solve the XOR problem: that is, given a sequence of bits, determine its parity. The LSTM should consume the sequence, one bit at a time, and then output the correct answer at the sequence’s end. Test the two approaches below:

### 2.1 

Generate a dataset of random <=100,000 binary strings of equal length <= 50. Train the LSTM; what is the maximum length you can train up to with precisison?
    

### 2.2

Generate a dataset of random <=200,000 binary strings, where the length of each string is independently and randomly chosen between 1 and 50. Train the LSTM. Does it succeed? What explains the difference?


In [None]:
# https://vitez.me/lstm-xor 

In [None]:
# 2.1

In [18]:
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Dense, Input, LSTM
from tensorflow.keras.models import Sequential
import numpy as np
import random

In [103]:
SEQ_LEN = 50
COUNT = 100000

In [19]:
bin_pair = lambda x: [x, not(x)]
training = np.array([[bin_pair(random.choice([0, 1])) for _ in range(SEQ_LEN)] for _ in range(COUNT)])
target = np.array([[bin_pair(x) for x in np.cumsum(example[:,0]) % 2] for example in training])

In [105]:
print('shape check:', training.shape, '=', target.shape)

shape check: (100000, 50, 2) = (100000, 50, 2)


In [106]:
model = Sequential()
model.add(Input(shape=(SEQ_LEN, 2), dtype='float32'))
model.add(LSTM(1, return_sequences=True))
model.add(Dense(2, activation='softmax'))

In [107]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(training, target, epochs=10, batch_size=128)
model.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 50, 1)             16        
_________________________________________________________________
dense_3 (Dense)              (None, 50, 2)             4         
Total params: 20
Trainable params: 20
Non-trainable params: 0
_________________________________________________________________


In [108]:
predictions = model.predict(training)
i = random.randint(0, COUNT)
chance = predictions[i,-1,0]
print('randomly selected sequence:', training[i,:,0])
print('prediction:', int(chance > 0.5))
print('confidence: {:0.2f}%'.format((chance if chance > 0.5 else 1 - chance) * 100))
print('actual:', np.sum(training[i,:,0]) % 2)

randomly selected sequence: [0 0 0 1 1 1 1 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1
 1 1 1 0 0 1 1 1 0 1 1 0 0]
prediction: 0
confidence: 100.00%
actual: 0


In [None]:
##########

In [None]:
# 2.2

In [15]:
SEQ_LEN = 50
COUNT = 200000

In [None]:
bin_pair = lambda x: [x, not(x)]
training = np.array([[bin_pair(random.choice([0, 1])) for _ in range(SEQ_LEN)] for _ in range(COUNT)])
target = np.array([[bin_pair(x) for x in np.cumsum(example[:,0]) % 2] for example in training])

In [20]:
model = Sequential()
model.add(Input(shape=(SEQ_LEN, 2), dtype='float32'))
model.add(LSTM(1, return_sequences=True))
model.add(Dense(2, activation='softmax'))

In [21]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(training, target, epochs=10, batch_size=128)
model.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 50, 1)             16        
_________________________________________________________________
dense_5 (Dense)              (None, 50, 2)             4         
Total params: 20
Trainable params: 20
Non-trainable params: 0
_________________________________________________________________


In [22]:
predictions = model.predict(training)
i = random.randint(0, COUNT)
chance = predictions[i,-1,0]
print('randomly selected sequence:', training[i,:,0])
print('prediction:', int(chance > 0.5))
print('confidence: {:0.2f}%'.format((chance if chance > 0.5 else 1 - chance) * 100))
print('actual:', np.sum(training[i,:,0]) % 2)

randomly selected sequence: [1 0 1 0 1 0 1 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1 1 1 1 0
 1 1 1 1 1 1 0 0 0 0 0 1 1]
prediction: 1
confidence: 99.99%
actual: 1


In [None]:

# pd.set_option('display.max_colwidth', 170) #widen pandas rows display

# Get Spacy english core model
# Need to run "python -m spacy download en_core_web_sm" first
encoder = spacy.load("en_core_web_sm")