## Setup

### Libraries and Helper Functions


In [1]:
#@title Installs
!pip install pydot --quiet
!pip install gensim --quiet
!pip install tensorflow==2.15.0 --quiet #15 13
!pip install tf_keras==2.15.0 --quiet
!pip install tensorflow-datasets==4.8 --quiet #8
!pip install tensorflow-text==2.15.0 --quiet #15
!pip install transformers==4.17 --quiet #4.40.2 #4.37.2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
#@title Imports

import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
import tensorflow_text as tf_text
import transformers

from transformers import BertTokenizer, TFBertModel, BertConfig
from transformers import RobertaTokenizer, TFRobertaModel

from transformers import logging
logging.set_verbosity_error()

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt

import re

import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

In [3]:
#@title Global tunable parameters

# Sequence length to truncate/pad
MAX_SEQUENCE_LENGTH = 1000

# hidden layer size after BERT's ouput
HIDDEN_LAYER_SIZE = 512

In [4]:
#@title Utility print function

def print_version(library_name):
    try:
        lib = __import__(library_name)
        version = getattr(lib, '__version__', 'Version number not found')
        print(f"{library_name} version: {version}")
    except ImportError:
        print(f"{library_name} not installed.")
    except Exception as e:
        print(f"An error occurred: {e}")

#confirm versions
print_version('numpy')
print_version('transformers')
print_version('tensorflow')
print_version('keras')
print_version('pandas')
print_version('sklearn')

numpy version: 1.25.2
transformers version: 4.17.0
tensorflow version: 2.15.0
keras version: 2.15.0
pandas version: 2.0.3
sklearn version: 1.2.2


In [5]:
#@title Utility Plot Function

# 4-window plot. Small modification from matplotlib examples.

def make_plot(axs,
              model_history1,
              model_history2,
              model_1_name='model 1',
              model_2_name='model 2',
              ):
    box = dict(facecolor='yellow', pad=5, alpha=0.2)

    for i, metric in enumerate(['loss', 'accuracy']):
        # small adjustment to account for the 2 accuracy measures in the Weighted Averging Model with Attention
        if 'classification_%s' % metric in model_history2.history:
            metric2 = 'classification_%s' % metric
        else:
            metric2 = metric

        y_lim_lower1 = np.min(model_history1.history[metric])
        y_lim_lower2 = np.min(model_history2.history[metric2])
        y_lim_lower = min(y_lim_lower1, y_lim_lower2) * 0.9

        y_lim_upper1 = np.max(model_history1.history[metric])
        y_lim_upper2 = np.max(model_history2.history[metric2])
        y_lim_upper = max(y_lim_upper1, y_lim_upper2) * 1.1

        for j, model_history in enumerate([model_history1, model_history2]):
            model_name = [model_1_name, model_2_name][j]
            model_metric = [metric, metric2][j]
            ax1 = axs[i, j]
            ax1.plot(model_history.history[model_metric])
            ax1.plot(model_history.history['val_%s' % model_metric])
            ax1.set_title('%s - %s' % (metric, model_name))
            ax1.set_ylabel(metric, bbox=box)
            ax1.set_ylim(y_lim_lower, y_lim_upper)

In [6]:
#@title Mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
#@title Read Reddit dataset into a dataframe
rdt_trainfile = '/content/drive/MyDrive/MIDS-266/w266/project/Reddit/both_train.csv'
rdt_tesstfile = '/content/drive/MyDrive/MIDS-266/w266/project/Reddit/both_test.csv'
rdt_train = pd.read_csv(rdt_trainfile)
rdt_test = pd.read_csv(rdt_tesstfile)

# Shuffle all rows
rdt_train = rdt_train.sample(frac=1).reset_index(drop=True)
rdt_test = rdt_test.sample(frac=1).reset_index(drop=True)

train_labels = rdt_train.class_id
test_labels = rdt_test.class_id

temp_train_examples = rdt_train.post
temp_test_examples = rdt_test.post

train_examples_list = []
test_examples_list = []

for i in range(len(temp_train_examples)):
  size = len(temp_train_examples[i])
  lower = int(0.3*MAX_SEQUENCE_LENGTH)
  upper = int(0.7*MAX_SEQUENCE_LENGTH)
  if size > MAX_SEQUENCE_LENGTH:
    train_examples_list.append(temp_train_examples[i][:lower] + temp_train_examples[i][upper:])
  else:
    train_examples_list.append(temp_train_examples[i])

for i in range(len(temp_test_examples)):
  size = len(temp_test_examples[i])
  lower = int(0.3*size)
  upper = int(0.7*size)
  if size > MAX_SEQUENCE_LENGTH:
    test_examples_list.append(temp_test_examples[i][:lower] + temp_test_examples[i][upper:])
  else:
    test_examples_list.append(temp_test_examples[i])

train_examples = np.array(train_examples_list)
test_examples = np.array(test_examples_list)


In [8]:
temp_train_examples[0]

"tw: suicide and depression and drugs or whatever and i felt great for like a day. i mean, how could i not? the hospital treated me terribly and i got very little help while there so like. yeah, i was happy to be a free woman. but now reality hits. i'm still bipolar. they didn't change my meds. i still want to stop existing. all my successes feel like failure. i wish people would stop caring about me because i stopped caring about myself. i still haven't relapsed but tbh that's only a matter of time. i feel totally fucking alone in this. every time i think i'm better, two seconds later i get worse. this is easily the worst depressive episode i've had in probably five or six years. i don't quite want to kill myself but i definitely want to do heroin again. nobody gets it. nobody. i just want to throw my life away. i literally want to ruin my life because i don't think i deserve happiness. i feel destined to a lifetime of pain and hard drugs. this fucking sucks."

In [9]:
train_examples[0]

"tw: suicide and depression and drugs or whatever and i felt great for like a day. i mean, how could i not? the hospital treated me terribly and i got very little help while there so like. yeah, i was happy to be a free woman. but now reality hits. i'm still bipolar. they didn't change my meds. i still want to stop existing. all my successes feel like failure. i wish people would stop caring about me because i stopped caring about myself. i still haven't relapsed but tbh that's only a matter of time. i feel totally fucking alone in this. every time i think i'm better, two seconds later i get worse. this is easily the worst depressive episode i've had in probably five or six years. i don't quite want to kill myself but i definitely want to do heroin again. nobody gets it. nobody. i just want to throw my life away. i literally want to ruin my life because i don't think i deserve happiness. i feel destined to a lifetime of pain and hard drugs. this fucking sucks."

In [10]:
#@title Inspect data and label characteristics

print(f'Training set labels shape: {train_labels.shape}')
print(f'Test set labels shape: {test_labels.shape}')

print(f'Training set examples shape: {train_examples.shape}')
print(f'Test set examples shape: {test_examples.shape}')

print(f'Distribution of the length of all title')
print(rdt_train["title"].str.len().describe())

print('Distribution of the length of all posts')
print(rdt_train["post"].str.len().describe())

print(f'Labels min : {rdt_train.class_id.min()} max : {rdt_train.class_id.max()}')


Training set labels shape: (13727,)
Test set labels shape: (1488,)
Training set examples shape: (13727,)
Test set examples shape: (1488,)
Distribution of the length of all title
count    13727.000000
mean        67.111751
std         48.713468
min          2.000000
25%         34.000000
50%         54.000000
75%         85.000000
max        306.000000
Name: title, dtype: float64
Distribution of the length of all posts
count    13727.000000
mean      1065.297734
std       1373.414370
min        123.000000
25%        362.000000
50%        658.000000
75%       1236.500000
max      38168.000000
Name: post, dtype: float64
Labels min : 0 max : 5


In [11]:
#@title Training/Test Sets for BERT-based models

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bert_model = TFBertModel.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

Next, we will preprocess our train and test data for use in the BERT model. We need to convert our documents into vocab IDs, like we did above with the Word2Vec vocabulary. But this time we'll use the BERT tokenizer, which has a different vocabulary specific to the BERT model we're going to use.

In [12]:
type(train_examples_list)

list

In [13]:
#@title BERT Tokenization of training and test data

#train_examples_str = [x.decode('utf-8') for x in train_examples.numpy()]
#test_examples_str = [x.decode('utf-8') for x in test_examples.numpy()]
train_examples_str = [x for x in train_examples]
test_examples_str = [x for x in test_examples]

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bert_train_tokenized = bert_tokenizer(train_examples_str,
              max_length=MAX_SEQUENCE_LENGTH,
              truncation=True,
              padding='max_length',
              return_tensors='tf')
bert_train_inputs = [bert_train_tokenized.input_ids,
                     bert_train_tokenized.token_type_ids,
                     bert_train_tokenized.attention_mask]
bert_train_labels = np.array(train_labels)

bert_test_tokenized = bert_tokenizer(test_examples_str,
              max_length=MAX_SEQUENCE_LENGTH,
              truncation=True,
              padding='max_length',
              return_tensors='tf')
bert_test_inputs = [bert_test_tokenized.input_ids,
                     bert_test_tokenized.token_type_ids,
                     bert_test_tokenized.attention_mask]
bert_test_labels = np.array(test_labels)

In [14]:
bert_train_inputs[0][3]

<tf.Tensor: shape=(1000,), dtype=int32, numpy=
array([  101,   178,  1178,  1474,  1142,  1272,  1111,  1199,  2255,
         178,  1108,  3737,  1106,  1301,  1111,  1373,  1159,   119,
       13280,  1136,  1612,  1191,   178,  1108, 10529,   117,  1315,
        6884,   117,  1137,  1315,  1996,  1107,  1139,  1319,  2191,
         118, 13532,   119,   178,  1855,  1105,  1155,  1159,  1822,
        1105,   178,  1921,  4120,   178,  1577,   787,   189,  1202,
        1122,  1118,  1991,   119,  7606,  5098,  2014,  1139,  1297,
        1105,   178,   787,   182,  5221,  9473,  1111,  1343,  1150,
        1202,  1115,  1413,  1104,  1250,   119,  1177,  1274,   787,
         189,  1129,  3737,  1122,  2762,   787,   189,   789,  4348,
         789,   119,   119,   119,  1122,   787,   188,  4348,  1106,
        1519,  1800,  1494,  1128,  1494,  3739,   102,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,  

In [15]:
bert_train_tokenized.attention_mask

<tf.Tensor: shape=(13727, 1000), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>

In [16]:
#@title Distribution of labels in training and test sets
for i in range(np.max(bert_train_labels)):
  pos_indices = np.where(bert_train_labels == i)
  pct_positive = len(pos_indices[0])/len(bert_train_labels)
  print(f'Labels {i} in training set: {len(pos_indices[0])}/{len(bert_train_labels)} ({pct_positive})')

for i in range(np.max(bert_test_labels)):
  pos_indices = np.where(bert_test_labels == i)
  pct_positive = len(pos_indices[0])/len(bert_test_labels)
  print(f'Labels {i} in test set: {len(pos_indices[0])}/{len(bert_train_labels)} ({pct_positive})')


Labels 0 in training set: 2465/13727 (0.17957310410140598)
Labels 1 in training set: 2422/13727 (0.17644059153493116)
Labels 2 in training set: 2407/13727 (0.17534785459313762)
Labels 3 in training set: 2450/13727 (0.17848036715961244)
Labels 4 in training set: 2001/13727 (0.14577110803525897)
Labels 0 in test set: 248/13727 (0.16666666666666666)
Labels 1 in test set: 248/13727 (0.16666666666666666)
Labels 2 in test set: 248/13727 (0.16666666666666666)
Labels 3 in test set: 248/13727 (0.16666666666666666)
Labels 4 in test set: 248/13727 (0.16666666666666666)


In [17]:
#@title RoBERTa Tokenization of training and test data

#train_examples_str = [x.decode('utf-8') for x in train_examples.numpy()]
#test_examples_str = [x.decode('utf-8') for x in test_examples.numpy()]
train_examples_str = [x for x in train_examples]
test_examples_str = [x for x in test_examples]

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_train_tokenized = roberta_tokenizer(train_examples_str,
              max_length=MAX_SEQUENCE_LENGTH,
              truncation=True,
              padding='max_length',
              return_tensors='tf')
roberta_train_inputs = [roberta_train_tokenized.input_ids,
                     roberta_train_tokenized.attention_mask]
roberta_train_labels = np.array(train_labels)

roberta_test_tokenized = roberta_tokenizer(test_examples_str,
              max_length=MAX_SEQUENCE_LENGTH,
              truncation=True,
              padding='max_length',
              return_tensors='tf')
roberta_test_inputs = [roberta_test_tokenized.input_ids,
                     roberta_test_tokenized.attention_mask]
roberta_test_labels = np.array(test_labels)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [18]:
#@title Train BERT model using CLS token
def create_bert_cls_model(bert_base_model,
                          max_sequence_length=MAX_SEQUENCE_LENGTH,
                          hidden_size = 100,
                          dropout=0.3,
                          learning_rate=0.00005):
    """
    Build a simple classification model with BERT. Use the CLS Token output for classification purposes.
    """

    bert_base_model.trainable = True

    input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    bert_out = bert_base_model(bert_inputs)

    # The first token of every example is a CLS token
    cls_embedding = bert_out[0][:, 0, :]
    print(cls_embedding.shape)

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cls_embedding)

    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    classification = tf.keras.layers.Dense(6, activation='softmax',name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])

    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                                 metrics='accuracy')

    return classification_model

config_l = BertConfig.from_pretrained("bert-base-cased")
config_l.num_max_position_embeddingslabels = 1024
bert_model = TFBertModel.from_pretrained('bert-base-cased', config = config_l)
bert_avg_model = create_bert_cls_model(bert_model, hidden_size=HIDDEN_LAYER_SIZE)

bert_avg_model.summary()

bert_avg_model_history = bert_avg_model.fit(
    bert_train_inputs,
    bert_train_labels,
    validation_data=(bert_test_inputs, bert_test_labels),
    batch_size=12,
    epochs=2)


(None, 768)
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask_layer (Inpu  [(None, 1000)]               0         []                            
 tLayer)                                                                                          
                                                                                                  
 input_ids_layer (InputLaye  [(None, 1000)]               0         []                            
 r)                                                                                               
                                                                                                  
 token_type_ids_layer (Inpu  [(None, 1000)]               0         []                            
 tLayer)                                                                          



Epoch 2/2


In [19]:
#@title Train RoBERTa model - use CLS token
def create_roberta_cls_model(bert_base_model,
                          max_sequence_length=MAX_SEQUENCE_LENGTH,
                          hidden_size = 100,
                          dropout=0.3,
                          learning_rate=0.00005):
    """
    Build a simple classification model with BERT. Use the CLS Token output for classification purposes.
    """

    bert_base_model.trainable = True

    input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='input_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'attention_mask': attention_mask}

    bert_out = bert_base_model(bert_inputs)

    # The first token of every example is a CLS token
    cls_embedding = bert_out[0][:, 0, :]
    print(cls_embedding.shape)

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cls_embedding)

    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    classification = tf.keras.layers.Dense(6, activation='softmax',name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=[classification])

    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                                 metrics='accuracy')

    return classification_model


config_l = BertConfig.from_pretrained("roberta-base")
config_l.num_max_position_embeddingslabels = 1024

roberta_model = TFRobertaModel.from_pretrained('roberta-base', config = config_l)
roberta_classification_model = create_roberta_cls_model(roberta_model, hidden_size=HIDDEN_LAYER_SIZE)

roberta_classification_model.summary()

roberta_classification_model_history = roberta_classification_model.fit(
    roberta_train_inputs,
    roberta_train_labels,
    validation_data=(roberta_test_inputs, roberta_test_labels),
    batch_size=12,
    epochs=2
)

Downloading:   0%|          | 0.00/627M [00:00<?, ?B/s]

(None, 768)
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask_layer (Inpu  [(None, 1000)]               0         []                            
 tLayer)                                                                                          
                                                                                                  
 input_ids_layer (InputLaye  [(None, 1000)]               0         []                            
 r)                                                                                               
                                                                                                  
 tf_roberta_model (TFRobert  TFBaseModelOutputWithPooli   1246456   ['attention_mask_layer[0][0]',
 aModel)                     ngAndCrossAttentions(last_   32         'input_ids_



Epoch 2/2


In [20]:
#fig, axs = plt.subplots(2, 2)
#fig.subplots_adjust(left=0.2, wspace=0.6)
#make_plot(axs,
#          dan_shuffled_history,
#          wan_history,
#          model_1_name='dan',
#          model_2_name='wan')

#fig.align_ylabels(axs[:, 1])
#fig.set_size_inches(18.5, 10.5)
#plt.show()