#### DepressionReddit_BERT_FineTuning

In [None]:
#Installing latest version of transformers.
!pip install transformers -U

In [2]:
#import pandas and numpy
import pandas as pd
import numpy as np

In [3]:
#Mounting to google drive
from google.colab import drive
drive.mount('/content/drive/') #force_remount=True
%cd drive/MyDrive/MyProjects/Depression_BERT_FineTuning

Mounted at /content/drive/
/content/drive/MyDrive/MyProjects/Depression_BERT_FineTuning


In [4]:
#Reading .csv file current directory
df = pd.read_csv("depression_dataset_reddit_cleaned.csv")
df.head()

Unnamed: 0,clean_text,is_depression
0,we understand that most people who reply immed...,1
1,welcome to r depression s check in post a plac...,1
2,anyone else instead of sleeping more when depr...,1
3,i ve kind of stuffed around a lot in my life d...,1
4,sleep is my greatest and most comforting escap...,1


In [7]:
#How many rows does a class holds
df.is_depression.value_counts()

0    3900
1    3831
Name: is_depression, dtype: int64

In [8]:
len(df)

7731

In [9]:
from sklearn.model_selection import  train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer

In [10]:
#loading pretrained bert tokenizer and model
tokenizer = BertTokenizer.from_pretrained("Bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("Bert-base-uncased")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at Bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
model = model.to("cuda")

In [None]:
model

In [12]:
#Preparing data
X = list(df['clean_text'])
y = list(df['is_depression'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

In [13]:
X_train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [14]:
len(X_train), len(X_test)

(6184, 1547)

In [15]:
# Create torch dataset
import torch
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [16]:
train_data = Dataset(X_train_tokenized, y_train)
test_data = Dataset(X_test_tokenized, y_test)

In [None]:
train_data[6]

In [18]:
#Compute metrics
def compute_metrics(p):
  print(type(p))
  pred, labels = p
  pred = np.argmax(pred, axis=1)

  accuracy = accuracy_score(y_true=labels, y_pred=pred)
  recall = recall_score(y_true=labels, y_pred=pred)
  precision = precision_score(y_true=labels, y_pred=pred)
  f1 = f1_score(y_true=labels, y_pred=pred)

  return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [19]:
# Define Trainer
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=1,
    per_device_train_batch_size=8

)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics
)

In [20]:
#Training Custom model
trainer.train()



Step,Training Loss
500,0.1379


TrainOutput(global_step=773, training_loss=0.11937456513438156, metrics={'train_runtime': 568.6951, 'train_samples_per_second': 10.874, 'train_steps_per_second': 1.359, 'total_flos': 1627078766346240.0, 'train_loss': 0.11937456513438156, 'epoch': 1.0})

In [21]:
#Evaluation
trainer.evaluate()

<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 0.0849134773015976,
 'eval_accuracy': 0.9773755656108597,
 'eval_precision': 0.9765625,
 'eval_recall': 0.9778357235984355,
 'eval_f1': 0.9771986970684039,
 'eval_runtime': 50.5758,
 'eval_samples_per_second': 30.588,
 'eval_steps_per_second': 3.836,
 'epoch': 1.0}

In [23]:
#Saving custom model
trainer.save_model('FineTunedBertModel')

In [22]:
np.set_printoptions(suppress=True)

In [42]:
model = BertForSequenceClassification.from_pretrained("FineTunedBertModel")

In [46]:
#Testing with single document
#text = "I will get suicidal thoughts sometimes"
text = "the real reason why you're sad? you're attached to people who have been distant with you. you're paying attention to people who ignore you. you make time for people who are 'too busy' for you. you're too caring to people who are care less when it comes to you. let those people go"
#inputs = tokenizer(text, padding = True, truncation = True, return_tensors='pt').to('cuda')
inputs = tokenizer.encode_plus(text, max_length=512, padding='max_length', truncation=True, return_tensors='pt')
outputs = model(**inputs)
# predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions = outputs.logits.argmax().item()
print(predictions)
# predictions = predictions.cpu().detach().numpy()
# predictions

1
