### Static Embeddings
In this notebook, we examine the CNN-LSTM approach to classification using static embeddings: https://www.sciencedirect.com/science/article/pii/S2667096820300070


https://stackabuse.com/python-for-nlp-movie-sentiment-analysis-using-deep-learning-in-keras/

https://keras.io/examples/nlp/bidirectional_lstm_imdb/

In [1]:
!pip install gensim==3.8.3 --quiet
!pip install tensorflow-datasets --quiet
!pip install -U tensorflow-text==2.8.2 --quiet
!pip install pydot --quiet

[K     |████████████████████████████████| 24.2 MB 2.2 MB/s 
[K     |████████████████████████████████| 4.9 MB 4.3 MB/s 
[?25h

In [15]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

import tensorflow as tf
from tensorflow.keras import layers


import sklearn as sk
import os
import nltk
from nltk.corpus import reuters
from nltk.data import find

import matplotlib.pyplot as plt

import re
import gensim

In [3]:
# Mount Google Drive (Colab only); the project folder in NOTEBOOK_LOC lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:


# Project root on the mounted Drive; the data cells read TSV files from NOTEBOOK_LOC + "/Data/...".
NOTEBOOK_LOC = "/content/drive/MyDrive/Colab Notebooks/W266 Final Project/"

### Load Data

The cell below also computes per-example label weights, formatting the data for weighted BERT.

In [6]:
# Per-fold containers: each index in CV_IDX appends exactly one entry to every list.
TRAIN_TEXTS, VALID_TEXTS = [], []
TRAIN_LABELS, VALID_LABELS = [], []
TRAIN_WEIGHTS = []

# NOTE(review): presumably ordered so that position == stance_id — confirm against the data.
TARGET_NAMES = ["disagree", "neutral", "agree"]

# Cross-validation fold indices to load.
CV_IDX = [4]

for cv_idx in CV_IDX:
  # Every fold has its own training and validation TSV.
  training_data = pd.read_table(f"{NOTEBOOK_LOC}/Data/GWSD_training_{cv_idx}.tsv")
  valid_data = pd.read_table(f"{NOTEBOOK_LOC}/Data/GWSD_val_{cv_idx}.tsv")

  # Per-example label weight: the largest value among the three stance columns.
  training_data["weight"] = training_data[["agree", "neutral", "disagree"]].max(axis=1)

  # Training split: sentences, integer stance ids, and the weights computed above.
  train_texts = training_data["sentence"].to_list()
  train_labels = np.asarray(training_data["stance_id"].astype(int).to_list())
  train_weights = np.asarray(training_data["weight"])

  # Validation split: sentences and integer stance ids only.
  valid_texts = valid_data["sentence"].to_list()
  valid_labels = np.asarray(valid_data["stance_id"].astype(int).to_list())

  TRAIN_TEXTS.append(train_texts)
  TRAIN_LABELS.append(train_labels)
  TRAIN_WEIGHTS.append(train_weights)

  VALID_TEXTS.append(valid_texts)
  VALID_LABELS.append(valid_labels)

# The test split has no fold index in its file name and is read once, outside the loop.
test_data = pd.read_table(f"{NOTEBOOK_LOC}/Data/GWSD_test.tsv")
test_texts = test_data["sentence"].to_list()
test_labels = np.asarray(test_data["stance_id"].astype(int).to_list())

### Embedding Setup

In [9]:
# Download NLTK's pruned word2vec sample package.
nltk.download('word2vec_sample')

# Resolve the on-disk location of the text-format vector file inside that package.
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))

# Load the vectors as gensim KeyedVectors (binary=False: word2vec text format).
# NOTE(review): the name `model` is rebound to the Keras model in the CNN-LSTM cell
# further down, so the embedding cells cannot be re-run after that cell without
# re-running this one first.
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

[nltk_data] Downloading package word2vec_sample to /root/nltk_data...
[nltk_data]   Unzipping models/word2vec_sample.zip.


In [11]:
# Build the static embedding matrix and the word -> row-id map from the loaded
# word2vec KeyedVectors (`model`).

# Read the dimensionality from the model itself instead of probing a specific word,
# so this does not depend on any particular token being in the vocabulary.
EMBEDDING_DIM = model.vector_size

# One row per vocabulary word, plus one extra trailing row that stays all-zero.
# NOTE(review): words are assigned ids 0..len(vocab)-1, so id 0 is a real word and the
# all-zero row is the LAST one (index len(model.vocab)). If sequences are padded with 0
# (the `pad_sequences` default), padding will look up a real word vector — confirm which
# id is meant for padding / out-of-vocabulary tokens.
embedding_matrix = np.zeros((len(model.vocab) + 1, EMBEDDING_DIM))
vocab_dict = {}

# Every key of `model.vocab` has a vector, so no missing-vector check is needed here.
for i, word in enumerate(model.vocab):
    embedding_matrix[i] = model[word]
    vocab_dict[word] = i

In [None]:
# Fixed sequence length; the model's Input layer is declared with this length.
MAX_SEQUENCE_LENGTH = 100

# tensorflow-text is installed by the setup cell but was never imported, which left
# `tf_text` undefined in this cell.
import tensorflow_text as tf_text

# Split the raw sentences on whitespace. The inputs are the sentence lists produced by
# the data-loading cell (`train_texts` holds the last fold listed in CV_IDX); the
# previously referenced `train_examples_batch` / `test_examples_batch` are not defined
# anywhere in this notebook.
tokenizer = tf_text.WhitespaceTokenizer()
train_tokens = tokenizer.tokenize(train_texts)
test_tokens = tokenizer.tokenize(test_texts)



In [None]:
# Alternative tokenization with the Keras Tokenizer, keeping the 5000 most frequent words.
# NOTE: this rebinds `tokenizer`, replacing the WhitespaceTokenizer created in the
# previous cell.
# NOTE(review): the ids produced here come from the tokenizer's own word index, not from
# `vocab_dict`, so they do not line up with the rows of `embedding_matrix` — confirm
# which tokenization is meant to feed the static-embedding model.
tokenizer = Tokenizer(num_words=5000)

# Fit on the training sentences only. Reading from `train_texts` / `test_texts` (instead
# of overwriting an undefined `X_train` / `X_test` in place) makes the cell runnable on a
# fresh kernel and safe to re-run.
tokenizer.fit_on_texts(train_texts)

X_train = tokenizer.texts_to_sequences(train_texts)
X_test = tokenizer.texts_to_sequences(test_texts)

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/502M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


#### Simple CNN-LSTM

In [None]:
# Hyperparameters for the CNN-LSTM experiment.
# NOTE(review): in the visible notebook `num_filters`, `kernel_sizes` and
# `dense_layer_dims` are not read by the model cell (it hard-codes 128 filters, kernel
# size 7 and a 128-unit dense layer), and the fit cell passes `epochs=7` rather than
# `epochs` — confirm whether these should be wired in or removed.
epochs = 10
num_filters = [100, 100, 50, 25]
kernel_sizes = [3, 5, 10, 20]
dense_layer_dims = [100, 30]
dropout_rate = 0.5

In [None]:

# Build and compile the CNN-LSTM classifier on top of the frozen word2vec embeddings.
#
# Fixes relative to the previous version of this cell:
#   * `inputs`, `max_features` and `learning_rate` were never defined (NameError).
#   * A second Embedding layer was applied to the float output of a Dense layer.
#   * GlobalMaxPooling1D removed the time axis that the LSTM layers require.
#   * The head was Dense(1, sigmoid) while the loss is SparseCategoricalCrossentropy
#     over integer class ids; it is now a softmax over all stance classes.
#
# NOTE: `model` is rebound here from the gensim KeyedVectors to the Keras model; re-run
# the word2vec loading cell before re-running the embedding-matrix cell.

# One output unit per stance class.
# NOTE(review): assumes `stance_id` takes values 0..len(TARGET_NAMES)-1 — confirm.
num_classes = len(TARGET_NAMES)

# Adam step size (1e-3 is also the Keras default for Adam).
learning_rate = 1e-3

cnn_input_layer = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int64')

# Lookup table initialised from the pre-trained matrix and kept frozen (static embeddings).
cnn_embedding_layer = layers.Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

x = cnn_embedding_layer(cnn_input_layer)
x = layers.Dropout(dropout_rate)(x)

# Two strided Conv1D layers shorten the sequence while keeping the time axis, so the
# recurrent layers below receive a (steps, channels) sequence per example.
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)

# Two stacked bidirectional LSTMs; only the second one collapses the sequence to a vector.
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)

# Vanilla hidden layer with dropout before the classifier.
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(dropout_rate)(x)

# Class probabilities (softmax), matching from_logits=False in the loss below.
outputs = layers.Dense(num_classes, activation="softmax")(x)


model = tf.keras.Model(cnn_input_layer, outputs)
model.summary()

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                            metrics=['accuracy'])

In [None]:
# Train the compiled model.
# NOTE(review): `train_ds` and `val_ds` are not defined anywhere in the visible notebook;
# they must be built (inputs of length MAX_SEQUENCE_LENGTH plus labels) before this cell
# can run. Also, `epochs=7` here differs from `epochs = 10` in the hyperparameter cell —
# confirm which value is intended.
model.fit(train_ds, validation_data=val_ds, epochs=7)