In [1]:
#@title Installs
!pip install pydot --quiet
!pip install gensim --quiet
!pip install tensorflow==2.15.0 --quiet #15 13
!pip install tf_keras==2.15.0 --quiet
!pip install tensorflow-datasets==4.8 --quiet #8
!pip install tensorflow-text==2.15.0 --quiet #15
!pip install transformers==4.17 --quiet #4.40.2 #4.37.2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import transformers
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import random
from sklearn.metrics import accuracy_score, f1_score

In [3]:
#@title Imports

import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
import tensorflow_text as tf_text
import transformers

from transformers import BertTokenizer, TFBertModel, BertConfig
from transformers import RobertaTokenizer, TFRobertaModel

from transformers import logging
logging.set_verbosity_error()

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt

import re

import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

In [4]:
#@title Utility print function

def print_version(library_name):
    try:
        lib = __import__(library_name)
        version = getattr(lib, '__version__', 'Version number not found')
        print(f"{library_name} version: {version}")
    except ImportError:
        print(f"{library_name} not installed.")
    except Exception as e:
        print(f"An error occurred: {e}")

#confirm versions
print_version('numpy')
print_version('transformers')
print_version('tensorflow')
print_version('keras')
print_version('pandas')
print_version('sklearn')

numpy version: 1.25.2
transformers version: 4.17.0
tensorflow version: 2.15.0
keras version: 2.15.0
pandas version: 2.0.3
sklearn version: 1.2.2


In [5]:
#@title Global tunable parameters

# Sequence length to truncate/pad
MAX_SEQUENCE_LENGTH = 1000

# hidden layer size after BERT's ouput
HIDDEN_LAYER_SIZE = 512

In [6]:
#@title Utility Plot Function

# 4-window plot. Small modification from matplotlib examples.

def make_plot(axs,
              model_history1,
              model_history2,
              model_1_name='model 1',
              model_2_name='model 2',
              ):
    box = dict(facecolor='yellow', pad=5, alpha=0.2)

    for i, metric in enumerate(['loss', 'accuracy']):
        # small adjustment to account for the 2 accuracy measures in the Weighted Averging Model with Attention
        if 'classification_%s' % metric in model_history2.history:
            metric2 = 'classification_%s' % metric
        else:
            metric2 = metric

        y_lim_lower1 = np.min(model_history1.history[metric])
        y_lim_lower2 = np.min(model_history2.history[metric2])
        y_lim_lower = min(y_lim_lower1, y_lim_lower2) * 0.9

        y_lim_upper1 = np.max(model_history1.history[metric])
        y_lim_upper2 = np.max(model_history2.history[metric2])
        y_lim_upper = max(y_lim_upper1, y_lim_upper2) * 1.1

        for j, model_history in enumerate([model_history1, model_history2]):
            model_name = [model_1_name, model_2_name][j]
            model_metric = [metric, metric2][j]
            ax1 = axs[i, j]
            ax1.plot(model_history.history[model_metric])
            ax1.plot(model_history.history['val_%s' % model_metric])
            ax1.set_title('%s - %s' % (metric, model_name))
            ax1.set_ylabel(metric, bbox=box)
            ax1.set_ylim(y_lim_lower, y_lim_upper)

In [7]:
#@title Mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
#@title Read Reddit dataset into a dataframe
rdt_trainfile = '/content/drive/MyDrive/MIDS/w266/02_final_project/mids-w266-project/Reddit/both_train.csv'
rdt_tesstfile = '/content/drive/MyDrive/MIDS/w266/02_final_project/mids-w266-project/Reddit/both_test.csv'
rdt_train = pd.read_csv(rdt_trainfile)
rdt_test = pd.read_csv(rdt_tesstfile)

# Shuffle all rows
rdt_train = rdt_train.sample(frac=1).reset_index(drop=True)
rdt_test = rdt_test.sample(frac=1).reset_index(drop=True)

train_labels = rdt_train.class_id
test_labels = rdt_test.class_id

temp_train_examples = rdt_train.post
temp_test_examples = rdt_test.post

train_examples_list = []
test_examples_list = []

for i in range(len(temp_train_examples)):
  size = len(temp_train_examples[i])
  lower = int(0.3*MAX_SEQUENCE_LENGTH)
  upper = int(0.7*MAX_SEQUENCE_LENGTH)
  if size > MAX_SEQUENCE_LENGTH:
    train_examples_list.append(temp_train_examples[i][:lower] + temp_train_examples[i][upper:])
  else:
    train_examples_list.append(temp_train_examples[i])

for i in range(len(temp_test_examples)):
  size = len(temp_test_examples[i])
  lower = int(0.3*size)
  upper = int(0.7*size)
  if size > MAX_SEQUENCE_LENGTH:
    test_examples_list.append(temp_test_examples[i][:lower] + temp_test_examples[i][upper:])
  else:
    test_examples_list.append(temp_test_examples[i])

train_examples = np.array(train_examples_list)
test_examples = np.array(test_examples_list)


In [9]:
df1 = pd.read_csv("/content/drive/MyDrive/MIDS/w266/02_final_project/mids-w266-project/Reddit/both_train.csv")
df2 = pd.read_csv("/content/drive/MyDrive/MIDS/w266/02_final_project/mids-w266-project/Reddit/both_test.csv")

In [10]:
df1

Unnamed: 0,ID,title,post,class_name,class_id
0,691324c4-5c30-44e0-b9e4-45b4f0715e21,a question about the third conditional.,i was making questions for my students and i r...,none,5
1,d4295391-9ca5-4398-b7c8-687e4a984ef1,the epitome of my life,i've recently requested testing accommodations...,adhd,0
2,58937fa5-3c2c-426b-8255-5a140fbab675,what are your favourites offbeat destinations ...,**cambodia** * koh rong: amazing beaches and a...,none,5
3,7daf364c-3b33-4cbe-be37-a214edf9a73e,synesthesia survey (what colour is each month ...,synesthesia. what is synesthesia? according to...,none,5
4,22518271-4bb4-4caf-b683-7305da519288,"science ama series: i’m phil baran, and i’m he...",i’m phil baran and i teach organic chemistry a...,none,5
...,...,...,...,...,...
13722,386a4117-0324-4b25-8330-b1a6857ccaa6,trying to wake up and leave the house every mo...,"i don't often succeed and when i do, it's like...",bipolar,2
13723,4dd92d63-1231-410d-b049-5d8c430c2f36,"""hey man, what'd you do this past weekend?""",fuck i hate this question. fuck off. i got hom...,depression,3
13724,dbde1d3e-e527-4cf8-8045-2093aec2a784,"how to use "" whereby "" correctly?",hi. i'm not a native speaker and not a good en...,none,5
13725,5355fad6-5bb4-4092-8428-9b8527a6d261,"i'm sorry, another depression post",i'm sorry to be such a downer all the time. i ...,bipolar,2


In [11]:
df2

Unnamed: 0,ID,title,post,class_name,class_id
0,b07b978c-7760-4932-85cb-5797b5e74168,"basic needs for neurotypicals : food, clothing...",most people able to be productive and function...,adhd,0
1,b2868fd2-e885-400e-b780-446c9581b1f1,"""you're just an iphone in an android world.: d...",i'm 23f and recently quit my job as a features...,adhd,0
2,3191b3b1-d355-41b6-9588-caeb3e169c9b,"""what kind of music do you listen to?"" no answer",i just got finished with an interview and as t...,adhd,0
3,a3d60895-ef7c-4b57-b5be-632c9fd62150,couldn’t help crying while reading a post that...,off course i’ve forgotten what i exactly read....,adhd,0
4,66fae18e-e744-495f-b7a0-b985543f5052,shout-out to all the parents with adhd childre...,"i have six siblings, and at least 4 of us have...",adhd,0
...,...,...,...,...,...
1483,5855d35b-bf7d-4a49-8030-b185f430f05a,i went on a solo motorcycle ride from tamil na...,i quit my job early this year and moved back t...,none,5
1484,a2faea38-8b93-43da-a75b-cb7c0ba1ae0d,hello and namaste to r/india... from r/nepal. ...,there are lots of trolls trying to create a di...,none,5
1485,7ad72b7f-eba4-4c17-90b0-1038dc144a8b,public api for the largest video game database...,&amp;#x200b; rawg is the largest video game da...,none,5
1486,e73c198f-5e8e-4c84-aedf-333b576f9939,acs ama: hi reddit! my name is dr. john m news...,hi reddit! i’m [dr. john m. and i serve as ceo...,none,5


In [12]:
frames = [df1, df2]
corpus = pd.concat(frames, ignore_index=True)

In [13]:
corpus["text"] = corpus.title + " " + corpus.post

In [14]:
corpus

Unnamed: 0,ID,title,post,class_name,class_id,text
0,691324c4-5c30-44e0-b9e4-45b4f0715e21,a question about the third conditional.,i was making questions for my students and i r...,none,5,a question about the third conditional. i was ...
1,d4295391-9ca5-4398-b7c8-687e4a984ef1,the epitome of my life,i've recently requested testing accommodations...,adhd,0,the epitome of my life i've recently requested...
2,58937fa5-3c2c-426b-8255-5a140fbab675,what are your favourites offbeat destinations ...,**cambodia** * koh rong: amazing beaches and a...,none,5,what are your favourites offbeat destinations ...
3,7daf364c-3b33-4cbe-be37-a214edf9a73e,synesthesia survey (what colour is each month ...,synesthesia. what is synesthesia? according to...,none,5,synesthesia survey (what colour is each month ...
4,22518271-4bb4-4caf-b683-7305da519288,"science ama series: i’m phil baran, and i’m he...",i’m phil baran and i teach organic chemistry a...,none,5,"science ama series: i’m phil baran, and i’m he..."
...,...,...,...,...,...,...
15210,5855d35b-bf7d-4a49-8030-b185f430f05a,i went on a solo motorcycle ride from tamil na...,i quit my job early this year and moved back t...,none,5,i went on a solo motorcycle ride from tamil na...
15211,a2faea38-8b93-43da-a75b-cb7c0ba1ae0d,hello and namaste to r/india... from r/nepal. ...,there are lots of trolls trying to create a di...,none,5,hello and namaste to r/india... from r/nepal. ...
15212,7ad72b7f-eba4-4c17-90b0-1038dc144a8b,public api for the largest video game database...,&amp;#x200b; rawg is the largest video game da...,none,5,public api for the largest video game database...
15213,e73c198f-5e8e-4c84-aedf-333b576f9939,acs ama: hi reddit! my name is dr. john m news...,hi reddit! i’m [dr. john m. and i serve as ceo...,none,5,acs ama: hi reddit! my name is dr. john m news...


In [15]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
torch.cuda.get_device_name(0)

'Tesla T4'

In [16]:
bertCorpus = corpus[["text","class_id"]]
bertCorpus

Unnamed: 0,text,class_id
0,a question about the third conditional. i was ...,5
1,the epitome of my life i've recently requested...,0
2,what are your favourites offbeat destinations ...,5
3,synesthesia survey (what colour is each month ...,5
4,"science ama series: i’m phil baran, and i’m he...",5
...,...,...
15210,i went on a solo motorcycle ride from tamil na...,5
15211,hello and namaste to r/india... from r/nepal. ...,5
15212,public api for the largest video game database...,5
15213,acs ama: hi reddit! my name is dr. john m news...,5


In [17]:
text = bertCorpus.text.values
labels = bertCorpus.class_id.values

In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [19]:
#Complete Tokenization
inputIds = [tokenizer.encode(element, add_special_tokens= True) for element in text]

In [20]:
print('Max text length from corpus: ', max([len(ele) for ele in inputIds]))

Max text length from corpus:  8766


In [21]:
#Truncating
inputIdsTrunc = pad_sequences(inputIds, maxlen=512, dtype="long", value=0, truncating="post", padding="post")

In [22]:
print('Max text length after Truncating: ', max([len(ele) for ele in inputIdsTrunc]))

Max text length after Truncating:  512


In [23]:
#Attention Masks
attentionMasks = [[int(tokenId>0) for tokenId in ele] for ele in inputIdsTrunc]

In [24]:
#Train-test Split
trainInputs, validationInputs, trainLabels, validationLabels = train_test_split(inputIdsTrunc, labels, random_state=2021, test_size=0.2)
trainMasks, validationMasks, _, _ = train_test_split(attentionMasks, labels, random_state=2021, test_size=0.2)

In [25]:
#Data-Type Conversion to Torch Tensor
trainInputs = torch.tensor(trainInputs)
validationInputs = torch.tensor(validationInputs)

trainLabels = torch.tensor(trainLabels)
validationLabels = torch.tensor(validationLabels)

trainMasks = torch.tensor(trainMasks)
validationMasks = torch.tensor(validationMasks)

In [26]:
batch_size = 8
#Train DataLoader
trainData = TensorDataset(trainInputs, trainMasks, trainLabels)
trainSampler = RandomSampler(trainData)
train_dataloader = DataLoader(trainData, sampler=trainSampler, batch_size=batch_size)
#Test DataLoader
valData = TensorDataset(validationInputs, validationMasks, validationLabels)
valSampler = RandomSampler(valData)
val_dataloader = DataLoader(valData, sampler=valSampler, batch_size=batch_size)

In [27]:
#model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 10, output_attentions = False, output_hidden_states = False)
model.cuda()

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [28]:
#optimizer
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)



In [29]:
epochs = 4
train_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = train_steps)

In [30]:
seed_val = 500
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [31]:
loss_values = []
for i in range(epochs):
    print("")
    print('Epoch: {}'.format(i + 1))
    print('Training...')
    total_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):
        batch_input_ids = batch[0].to(device)
        batch_input_mask = batch[1].to(device)
        batch_labels = batch[2].to(device)
        print('Step: {}'.format(step))
        model.zero_grad()
        outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask, labels=batch_labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    loss_values.append(total_loss / len(train_dataloader) )

    print("\n  Average training loss: {0:.2f}".format(total_loss / len(train_dataloader)))

print("\nTraining complete!")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Step: 1107
Step: 1108
Step: 1109
Step: 1110
Step: 1111
Step: 1112
Step: 1113
Step: 1114
Step: 1115
Step: 1116
Step: 1117
Step: 1118
Step: 1119
Step: 1120
Step: 1121
Step: 1122
Step: 1123
Step: 1124
Step: 1125
Step: 1126
Step: 1127
Step: 1128
Step: 1129
Step: 1130
Step: 1131
Step: 1132
Step: 1133
Step: 1134
Step: 1135
Step: 1136
Step: 1137
Step: 1138
Step: 1139
Step: 1140
Step: 1141
Step: 1142
Step: 1143
Step: 1144
Step: 1145
Step: 1146
Step: 1147
Step: 1148
Step: 1149
Step: 1150
Step: 1151
Step: 1152
Step: 1153
Step: 1154
Step: 1155
Step: 1156
Step: 1157
Step: 1158
Step: 1159
Step: 1160
Step: 1161
Step: 1162
Step: 1163
Step: 1164
Step: 1165
Step: 1166
Step: 1167
Step: 1168
Step: 1169
Step: 1170
Step: 1171
Step: 1172
Step: 1173
Step: 1174
Step: 1175
Step: 1176
Step: 1177
Step: 1178
Step: 1179
Step: 1180
Step: 1181
Step: 1182
Step: 1183
Step: 1184
Step: 1185
Step: 1186
Step: 1187
Step: 1188
Step: 1189
Step: 1190
Step: 1191


In [32]:
predictions = []
true_labels = []
model.eval()
for batch in val_dataloader:
    batch = tuple(t.to(device) for t in batch)
    batchInputIds, batchInput_mask, batchLabels = batch
    with torch.no_grad():
        outputs = model(batchInputIds, token_type_ids=None, attention_mask=batchInput_mask)

    out = outputs[0]

    out = out.detach().cpu().numpy()
    labelIds = batchLabels.to('cpu').numpy()

    predictions.append(out)
    true_labels.append(labelIds)

print('DONE.')

DONE.


In [33]:
finalPredictions = [ele for predList in predictions for ele in predList]
finalPredictions = np.argmax(finalPredictions, axis=1).flatten()
finalTrueLabels = [ele for trueList in true_labels for ele in trueList]

In [34]:
accuracy_score(finalPredictions, finalTrueLabels)

0.8708511337495892

In [35]:
f1_score(finalPredictions, finalTrueLabels, average = "weighted")

0.8704899811291447