In [1]:
#@title Mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Setup

In [2]:
#@title Installs
#!pip install -U sentence-transformers


In [3]:
#@title Imports

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


import sklearn as sk
import os
import nltk
from nltk.data import find
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import wordnet


import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import roc_curve, roc_auc_score

import re

import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

#from sentence_transformers import SentenceTransformer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [4]:
#@title Global tunable parameters

# Base path to store trained snapshot and results
BASE_PATH = 'drive/MyDrive/MIDS-266/w266/project'

In [5]:
!ls /content/drive/MyDrive/MIDS-266/w266/project/saved_models/

1  2  3


# Utility library

In [6]:
#@title Utility print function

def print_version(library_name):
    try:
        lib = __import__(library_name)
        version = getattr(lib, '__version__', 'Version number not found')
        print(f"{library_name} version: {version}")
    except ImportError:
        print(f"{library_name} not installed.")
    except Exception as e:
        print(f"An error occurred: {e}")

#confirm versions
print_version('numpy')
print_version('pandas')
print_version('sklearn')

numpy version: 1.26.4
pandas version: 2.1.4
sklearn version: 1.3.2


In [7]:
#@title Utility Plot Function

# 4-window plot. Small modification from matplotlib examples.

def make_plot(axs,
              model_history1,
              model_history2,
              model_1_name='model 1',
              model_2_name='model 2',
              ):
    box = dict(facecolor='yellow', pad=5, alpha=0.2)

    for i, metric in enumerate(['loss', 'accuracy']):
        # small adjustment to account for the 2 accuracy measures in the Weighted Averging Model with Attention
        if 'classification_%s' % metric in model_history2.history:
            metric2 = 'classification_%s' % metric
        else:
            metric2 = metric

        y_lim_lower1 = np.min(model_history1.history[metric])
        y_lim_lower2 = np.min(model_history2.history[metric2])
        y_lim_lower = min(y_lim_lower1, y_lim_lower2) * 0.9

        y_lim_upper1 = np.max(model_history1.history[metric])
        y_lim_upper2 = np.max(model_history2.history[metric2])
        y_lim_upper = max(y_lim_upper1, y_lim_upper2) * 1.1

        for j, model_history in enumerate([model_history1, model_history2]):
            model_name = [model_1_name, model_2_name][j]
            model_metric = [metric, metric2][j]
            ax1 = axs[i, j]
            ax1.plot(model_history.history[model_metric])
            ax1.plot(model_history.history['val_%s' % model_metric])
            ax1.set_title('%s - %s' % (metric, model_name))
            ax1.set_ylabel(metric, bbox=box)
            ax1.set_ylim(y_lim_lower, y_lim_upper)

In [8]:

def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            lemma_tolower = lemma.name().lower() # convert to lowercase
            lemma_to_lower = lemma_tolower.replace('_', ' ') # remove all underscore characters used as bridge between words
            if lemma_tolower != word: # some lemma words are same as the word being searches, ignore them
              if lemma_tolower not in synonyms:
                #print(lemma_to_lower)
                synonyms.append(lemma_to_lower)
    return set(synonyms)



In [9]:
# Test get_synonyms
# Example usage
word = "depression"
synonyms = get_synonyms(word)
print(f"Synonyms of '{word}': {synonyms}")

Synonyms of 'depression': {'economic crisis', 'low', 'clinical depression', 'natural depression', 'impression', 'imprint', 'great depression', 'slump', 'depressive disorder'}


# Dataset preparation

In [27]:
#@title Glean class-name and id information into globals
rdt_trainfile = f'{BASE_PATH}/Reddit/both_train.csv'
rdt_train = pd.read_csv(rdt_trainfile)
rdt_testfile = f'{BASE_PATH}/Reddit/both_test.csv'
rdt_test = pd.read_csv(rdt_testfile)
mh_cls_names = list(rdt_train.class_name.unique())
mh_cls_id = list(rdt_train.class_id.unique())

mh_dict_id_to_name = {}
mh_dict_name_to_id = {}
for idx, value in enumerate(mh_cls_id):
    mh_dict_id_to_name[value] = mh_cls_names[idx]
for idx, value in enumerate(mh_cls_names):
    mh_dict_name_to_id[value] = mh_cls_id[idx]

In [28]:
rdt_test.value_counts('class_name')

Unnamed: 0_level_0,count
class_name,Unnamed: 1_level_1
adhd,248
anxiety,248
bipolar,248
depression,248
none,248
ptsd,248


In [11]:
#@title Read mismatch dataset into a global dataframe
print('Enter the model to pick up mismatch file from:  ')
print('1. BERT')
print('2. Distillbert')
print('3. RoBERTa')
input_model = input()
if input_model == '':
  input_model = int('1')
else:
  input_model = int(input_model)

BASE_PATH = '/content/drive/MyDrive/MIDS-266/w266/project'
mismatches_file = f'{BASE_PATH}/saved_models/{input_model}/mismatches.csv'
print(f'Mismatch file located at: {mismatches_file}')
res_df = pd.read_csv(mismatches_file)
res_df.head()


Enter the model to pick up mismatch file from:  
1. BERT
2. Distillbert
3. RoBERTa
1
Mismatch file located at: /content/drive/MyDrive/MIDS-266/w266/project/saved_models/1/mismatches.csv


Unnamed: 0,actual,predicted,test_id,text,prob_adhd,prob_anxiety,prob_bipolar,prob_depression,prob_ptsd,prob_none
0,anxiety,bipolar,0,been agoraphobic and nearly housebound for abo...,0.046899,0.033616,0.467618,0.420386,0.017133,0.014347
1,depression,depression,1,can depression make you less smart? sorry if t...,0.018443,0.004,0.049376,0.920667,0.002527,0.004987
2,anxiety,anxiety,2,even the tiniest success is still kinda succes...,0.007255,0.929369,0.022199,0.020004,0.019004,0.00217
3,adhd,adhd,3,difficulty expressing emotions? why is it so h...,0.595535,0.169443,0.111073,0.079633,0.02859,0.015726
4,none,none,4,read terms &amp; conditions of linking aadhaar...,1e-05,2.8e-05,8e-06,8e-06,1.9e-05,0.999927


In [12]:
#@title Inspect the result dataframe
errs = res_df[(res_df['actual'] == 'bipolar') & (res_df['predicted'] != 'bipolar') ]
errs.sort_values(by='prob_bipolar', ascending=False)

Unnamed: 0,actual,predicted,test_id,text,prob_adhd,prob_anxiety,prob_bipolar,prob_depression,prob_ptsd,prob_none
237,bipolar,depression,237,it's my 31st birthday i didn't care to do anyt...,0.078031,0.018507,0.417828,0.455235,0.017411,0.012989
228,bipolar,ptsd,228,taking ownership do any of you have or had tro...,0.014099,0.072408,0.400505,0.060723,0.412998,0.039267
1332,bipolar,depression,1332,for the first time since i was 9 i can say i a...,0.010555,0.009596,0.368446,0.601927,0.005757,0.003720
983,bipolar,anxiety,983,at what point is it psychosis? over the past y...,0.035321,0.433761,0.360509,0.047731,0.086805,0.035872
337,bipolar,depression,337,"louisiana flood victim, and i'm episodic i don...",0.028041,0.014946,0.356986,0.574532,0.020024,0.005471
...,...,...,...,...,...,...,...,...,...,...
368,bipolar,anxiety,368,my new cat i got a cat almost a month ago. i g...,0.006003,0.977705,0.004388,0.005301,0.005510,0.001093
1294,bipolar,ptsd,1294,do you guys get nightmares? i had the worst on...,0.000049,0.000423,0.000775,0.002167,0.996254,0.000333
65,bipolar,ptsd,65,forfeited all my attendance points today. thir...,0.000102,0.000119,0.000455,0.000551,0.998102,0.000670
187,bipolar,ptsd,187,therapy during episodes. what do you do and is...,0.000011,0.000013,0.000136,0.000193,0.999595,0.000052


## Inspect patterns in the post - part 1
Check if the class name string of each class-id is present in the posts. If so, what percentage of the posts have these strings associated with the true class. This will give an idea of the influence of these words on the ability of the NN to predict correctly.

In [13]:
def count_classname_string_in_df (df, mh_cls_names):
  occurance_of_matched_actual_classname_in_text = 0
  total_matches = 0

  for true_class in mh_cls_names:
    valids = df[(df['actual'] == true_class) & (df['predicted'] == true_class) ]
    if true_class == 'none':
      continue
    print(f'Matches for string {true_class} in true class {true_class} : {len(valids)}')
    total_matches += len(valids)
    for index, row in valids.iterrows():
      actual_class = row['actual']
      text = row['text']
      if actual_class in text:
        occurance_of_matched_actual_classname_in_text += 1

  return total_matches, occurance_of_matched_actual_classname_in_text

In [14]:
#@title Occurence of class-name string matching the actual class in test dataset

total_matches, occurances = count_classname_string_in_df(res_df, mh_cls_names)

print(f'\nTotal Matches                             : {total_matches}')
print()
print(f'Occurance of string matching (actual) classname in posts  : {occurances}')
print(f'Percentage of string matching (actual) classname in posts : {occurances/total_matches*100}')


Matches for string adhd in true class adhd : 191
Matches for string depression in true class depression : 191
Matches for string ptsd in true class ptsd : 188
Matches for string anxiety in true class anxiety : 191
Matches for string bipolar in true class bipolar : 179

Total Matches                             : 940

Occurance of string matching (actual) classname in posts  : 552
Percentage of string matching (actual) classname in posts : 58.723404255319146


Observation: Roughly 56% of the test dataset have the class-names appear in the corresponding posts. And the ratio of appearance seems almost uniform across all class types.


In [15]:
#@title Occurence of class-name string matching the actual class in train dataset

train_df = pd.DataFrame()
train_df['actual'] = rdt_train['class_name']
train_df['text'] = rdt_train['post']
train_df['predicted'] = rdt_train['class_name']

total_matches, occurances = count_classname_string_in_df(train_df, mh_cls_names)

print(f'\nTotal Matches                             : {total_matches}')
print()
print(f'Occurance of string matching (actual) classname in posts  : {occurances}')
print(f'Percentage of string matching (actual) classname in posts : {occurances/total_matches*100}')


Matches for string adhd in true class adhd : 2465
Matches for string depression in true class depression : 2450
Matches for string ptsd in true class ptsd : 2001
Matches for string anxiety in true class anxiety : 2422
Matches for string bipolar in true class bipolar : 2407

Total Matches                             : 11745

Occurance of string matching (actual) classname in posts  : 4914
Percentage of string matching (actual) classname in posts : 41.839080459770116


Observation: Roughly 42% of the training dataset has the class-names appear in the corresponding posts. And the ratio of appearance seems almost uniform across all class types.


## Inspect patterns in the post - part 2
Check if synonyms of a class name string is present in the posts. If so, what percentage of the posts have these strings in the (right) actual class. This will give an idea of the influence of these words on the ability of the NN to predict correctly.

In [16]:
def count_classname_synonym_in_df (df, mh_cls_names):
  occurance_of_actual_classname_synonym = 0
  total_matches = 0
  synonym_list = []

  for true_class in mh_cls_names:
    valids = df[(df['actual'] == true_class) & (df['predicted'] == true_class)]
    if true_class == 'none':
      continue
    print(f'Matches for string {true_class} in true class {true_class} : {len(valids)}')
    total_matches += len(valids)

    for index, row in valids.iterrows():
      actual_class = row['actual']
      text = row['text']
      syn = list(get_synonyms(actual_class))
      #print(f'synonym list of {actual_class} : {syn}')
      if any(word in text for word in syn):
        if word == actual_class:
          continue
        occurance_of_actual_classname_synonym += 1
        #print(f'{word} found in text for {actual_class}')

  return total_matches, occurance_of_actual_classname_synonym


In [17]:
#@title Occurence of class-name synonyms matching the actual class in test dataset

total_matches, occurances = count_classname_synonym_in_df(res_df, mh_cls_names)

print(f'\nTotal Matches                             : {total_matches}')
print()
print(f'Occurance of synonyms (actual) classname in posts  : {occurances}')
print(f'Percentage of synonyms (actual) classname in posts : {occurances/total_matches*100}')


Matches for string adhd in true class adhd : 191
Matches for string depression in true class depression : 191
Matches for string ptsd in true class ptsd : 188
Matches for string anxiety in true class anxiety : 191
Matches for string bipolar in true class bipolar : 179

Total Matches                             : 940

Occurance of synonyms (actual) classname in posts  : 34
Percentage of synonyms (actual) classname in posts : 3.6170212765957444


Observation: Only 3.5% of the test dataset have synonyms present in the right class.

In [18]:
#@title Occurence of class-name synonyms matching the actual class in train dataset

train_df = pd.DataFrame()
train_df['actual'] = rdt_train['class_name']
train_df['text'] = rdt_train['post']
train_df['predicted'] = rdt_train['class_name']

total_matches, occurances = count_classname_synonym_in_df(train_df, mh_cls_names)

print(f'\nTotal Matches                             : {total_matches}')
print()
print(f'Occurance of synonyms in (actual) classname in training dataset  : {occurances}')
print(f'Percentage of synonyms in (actual) classname in training dataset : {occurances/total_matches*100}')


Matches for string adhd in true class adhd : 2465
Matches for string depression in true class depression : 2450
Matches for string ptsd in true class ptsd : 2001
Matches for string anxiety in true class anxiety : 2422
Matches for string bipolar in true class bipolar : 2407

Total Matches                             : 11745

Occurance of synonyms in (actual) classname in training dataset  : 410
Percentage of synonyms in (actual) classname in training dataset : 3.4908471690080884


Observation: Only 3.5% of the synonyms appear in the post of the actual class

## Inspects patterns in the post - part 3
Check if class name string is present in the posts. If so, what percentage of the posts have these strings in the (wrong) predicted class. This will give an idea of the influence of these words on the ability of the NN to predict correctly.

In [19]:
#@title Presence of mismatched class-name in misclassified posts of test dataset
occurance_of_mismatched_predicted_classname_in_text = 0
total_mismatches = 0
for true_class in mh_cls_names:
  errs = res_df[(res_df['actual'] == true_class) & (res_df['predicted'] != true_class) ]
  if true_class == 'none':
    continue
  print(f'Misclassified classname string found in predicted class; True Class: {true_class:<20} : {len(errs)}')
  total_mismatches += len(errs)
  for index, row in errs.iterrows():
    predicted_class = row['predicted']
    text = row['text']
    if predicted_class in text:
      #print(f'mismatched classname {predicted_class} seen in text')
      occurance_of_mismatched_predicted_classname_in_text += 1

print(f'\nTotal mismatches                             : {total_mismatches}')
print()
print(f'Occurance of mismatched (predicted) classname  : {occurance_of_mismatched_predicted_classname_in_text}')
print(f'Percentage of mismatched (predicted) classname : {occurance_of_mismatched_predicted_classname_in_text/total_mismatches*100}')


Misclassified classname string found in predicted class; True Class: adhd                 : 57
Misclassified classname string found in predicted class; True Class: depression           : 57
Misclassified classname string found in predicted class; True Class: ptsd                 : 60
Misclassified classname string found in predicted class; True Class: anxiety              : 57
Misclassified classname string found in predicted class; True Class: bipolar              : 69

Total mismatches                             : 300

Occurance of mismatched (predicted) classname  : 57
Percentage of mismatched (predicted) classname : 19.0


Observation: About 18% of the test dataset have a different class-name string in a misclassifed sample

In [20]:
#@title Presence of mismatched class-synonyms in misclassified posts of test dataset
occurance_of_mismatched_predicted_class_synonym_in_text = 0
total_mismatches = 0
for true_class in mh_cls_names:
  errs = res_df[(res_df['actual'] == true_class) & (res_df['predicted'] != true_class) ]
  if true_class == 'none':
    continue
  print(f'Predicted class string found in Mismatch for true class {true_class:<20} : {len(errs)}')
  total_mismatches += len(errs)
  for index, row in errs.iterrows():
    predicted_class = row['predicted']
    text = row['text']
    if predicted_class in text:
      syn = list(get_synonyms(predicted_class))
      #print(f'synonym list of {predicted_class} : {syn}')
      if any(word in text for word in syn):
        if word == predicted_class:
          continue
        #print(f'mismatched class-synonym {predicted_class} seen in text')
        occurance_of_mismatched_predicted_class_synonym_in_text += 1


print(f'\nTotal mismatches                             : {total_mismatches}')
print()
print(f'Occurance of mismatched (predicted) class synonym  : {occurance_of_mismatched_predicted_class_synonym_in_text}')
print(f'Percentage of mismatched (predicted) class synonym : {occurance_of_mismatched_predicted_class_synonym_in_text/total_mismatches*100}')


Predicted class string found in Mismatch for true class adhd                 : 57
Predicted class string found in Mismatch for true class depression           : 57
Predicted class string found in Mismatch for true class ptsd                 : 60
Predicted class string found in Mismatch for true class anxiety              : 57
Predicted class string found in Mismatch for true class bipolar              : 69

Total mismatches                             : 300

Occurance of mismatched (predicted) class synonym  : 0
Percentage of mismatched (predicted) class synonym : 0.0


Observation: About 0% of the test dataset have synonym of class-name string in a misclassifed sample

In [21]:
# Load https://huggingface.co/sentence-transformers/all-mpnet-base-v2
model = SentenceTransformer("all-mpnet-base-v2", similarity_fn_name="cosine")
embeddings = model.encode([
    errs.text.values[0],
    errs.text.values[1],
    errs.text.values[2],
    errs.text.values[3]
])
similarities = model.similarity(embeddings, embeddings)

NameError: name 'SentenceTransformer' is not defined

In [None]:
similarities

In [None]:
from sentence_transformers import CrossEncoder

model = CrossEncoder("cross-encoder/stsb-roberta-base")
scores = model.predict( [(reference.text.values[0], reference.text.values[1]),
                        (reference.text.values[0], reference.text.values[2]),
                        (reference.text.values[0], errs.text.values[0]),
                        (reference.text.values[0], errs.text.values[1])])
scores

In [24]:
#@title Inspect data and label characteristics

print(f'Training set labels shape: {train_labels.shape}')
print(f'Validation set labels shape: {val_labels.shape}')
print(f'Test set labels shape: {test_labels.shape}')

print(f'Training set examples shape: {train_examples.shape}')
print(f'Validation set examples shape: {val_examples.shape}')
print(f'Test set examples shape: {test_examples.shape}')

print(f'Distribution of the length of all title')
print(rdt_train["title"].str.len().describe())

print('Distribution of the length of all posts')
print(rdt_train["post"].str.len().describe())

print(f'Labels min : {rdt_train.class_id.min()} max : {rdt_train.class_id.max()}')


NameError: name 'train_labels' is not defined

In [None]:
#@ Histogram of all post length
plt.hist(rdt_train["post"].str.len(), bins=10, range=(0, 10000))
plt.title('Distribution of post length')
plt.xlabel('Post length')
plt.ylabel('Number of posts')
plt.show()

In [22]:
#@title Distribution of labels in training and test sets
for i in range(np.max(train_labels)):
  pos_indices = np.where(train_labels == i)
  pct_positive = len(pos_indices[0])/len(train_labels)
  print(f'Labels {i} in training set: {len(pos_indices[0])}/{len(train_labels)} ({pct_positive})')

for i in range(np.max(val_labels)):
  pos_indices = np.where(val_labels == i)
  pct_positive = len(pos_indices[0])/len(val_labels)
  print(f'Labels {i} in validation set: {len(pos_indices[0])}/{len(val_labels)} ({pct_positive})')

for i in range(np.max(test_labels)):
  pos_indices = np.where(test_labels == i)
  pct_positive = len(pos_indices[0])/len(test_labels)
  print(f'Labels {i} in test set: {len(pos_indices[0])}/{len(test_labels)} ({pct_positive})')


NameError: name 'train_labels' is not defined

In [23]:
mh_cls_names = list(rdt_train.class_name.unique())
mh_cls_names
mh_cls_id = list(rdt_train.class_id.unique())
mh_cls_id

mh_dict_id_to_name = {}
mh_dict_name_to_id = {}
for idx, value in enumerate(mh_cls_id):
    mh_dict_id_to_name[value] = mh_cls_names[idx]
for idx, value in enumerate(mh_cls_names):
    mh_dict_name_to_id[value] = mh_cls_id[idx]

print(mh_dict_id_to_name)
print(mh_dict_name_to_id)


{5: 'none', 0: 'adhd', 3: 'depression', 4: 'ptsd', 1: 'anxiety', 2: 'bipolar'}
{'none': 5, 'adhd': 0, 'depression': 3, 'ptsd': 4, 'anxiety': 1, 'bipolar': 2}


# Main Control loop

In [None]:
!ls /content/drive/MyDrive/MIDS-266/w266/project/saved_models/
