In [7]:
import os
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, random_split, DataLoader
import matplotlib.pyplot as plt
import torch.nn.functional as F
from collections import OrderedDict, Counter
from timeit import default_timer as timer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset

In [8]:
if torch.cuda.is_available(): 
    dev = "cuda" 
else: 
    dev = "cpu" 
device = torch.device(dev) 
print(device)

train_on_gpu = torch.cuda.is_available()
print(f'Train on gpu: {train_on_gpu}')

# Number of gpus
if train_on_gpu:
    gpu_count = torch.cuda.device_count()
    print(f'{gpu_count} gpus detected.')
    if gpu_count > 1:
        multi_gpu = True
    else:
        multi_gpu = False

cuda
Train on gpu: True
1 gpus detected.


In [9]:
df = pd.read_csv('Datasets/Mental Health Dataset.csv')


In [10]:

long_posts_df = df[df['posts'].str.split().str.len() > 200]

#eject all posts with more than 200 words
short_posts_df = df[df['posts'].str.split().str.len() <= 200]
print(short_posts_df.head())
print(short_posts_df.info())

print(short_posts_df['intensity'].value_counts())


duplicate = short_posts_df[(short_posts_df['intensity'] == 1) | (short_posts_df['intensity'] == -2)]
short_posts_df = pd.concat([short_posts_df, duplicate])

print(short_posts_df['intensity'].value_counts())


                                                posts predicted  intensity
4   gmos now link to leukemia http nsnbc I 2013 07...   neutral          0
5   here is a link for an interesting article and ...   neutral          0
8   the third know human retrovirus xmrv seem to b...   neutral          0
9   leukemia survivor meet his bone marrow donor w...   neutral          0
14  new video from patient power study bring posit...   neutral          0
<class 'pandas.core.frame.DataFrame'>
Index: 6332 entries, 4 to 10389
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   posts      6332 non-null   object
 1   predicted  6332 non-null   object
 2   intensity  6332 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 197.9+ KB
None
intensity
 0    2921
-1    2307
-2     673
 1     431
Name: count, dtype: int64
intensity
 0    2921
-1    2307
-2    1346
 1     862
Name: count, dtype: int64


In [11]:
new_df = short_posts_df.drop('intensity', axis=1)

print(new_df.head())
print(new_df.info())

tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-mini', do_lower_case=True)

tokenized = tokenizer.batch_encode_plus(
    new_df['posts'].tolist(),
    add_special_tokens=False,
    max_length=200,
    padding='longest',
    truncation=True,
    return_tensors='pt',
    return_attention_mask=False,
    return_token_type_ids=False
)


                                                posts predicted
4   gmos now link to leukemia http nsnbc I 2013 07...   neutral
5   here is a link for an interesting article and ...   neutral
8   the third know human retrovirus xmrv seem to b...   neutral
9   leukemia survivor meet his bone marrow donor w...   neutral
14  new video from patient power study bring posit...   neutral
<class 'pandas.core.frame.DataFrame'>
Index: 7436 entries, 4 to 10388
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   posts      7436 non-null   object
 1   predicted  7436 non-null   object
dtypes: object(2)
memory usage: 174.3+ KB
None


In [12]:
print(tokenized.keys())

dict_keys(['input_ids'])


In [13]:


le = LabelEncoder()
tokenized['intensity'] = le.fit_transform(short_posts_df['intensity'])
tokenized['intensity'] = torch.tensor(tokenized['intensity'])


tokenized['input_ids'] = tokenized['input_ids'].to(device)
tokenized['intensity'] = tokenized['intensity'].to(device)
print(tokenized['input_ids'].shape) 
print(tokenized['intensity'].shape)

print(tokenized['intensity'])
counter = Counter(tokenized['intensity'].tolist())
for label, count in counter.items():
    print(f"Class {label}: {count} occurrences")
    
print(tokenized['intensity'].dtype)



total_dataset = TensorDataset(tokenized['input_ids'], tokenized['intensity'])
print(total_dataset.tensors[0].shape)
print(total_dataset.tensors[1].shape)


train_size = int(0.7 * len(total_dataset))
val_size = len(total_dataset) - train_size
test_size = int(0.5 * val_size)
val_size = val_size - test_size
train_dataset, val_dataset, test_dataset = random_split(total_dataset, [train_size, val_size, test_size])

# Class 0 is most negative, class 1 is negative, class 2 is neutral, class 3 is positive

torch.Size([7436, 200])
torch.Size([7436])
tensor([2, 2, 2,  ..., 3, 3, 0], device='cuda:0')
Class 2: 2921 occurrences
Class 1: 2307 occurrences
Class 3: 862 occurrences
Class 0: 1346 occurrences
torch.int64
torch.Size([7436, 200])
torch.Size([7436])


In [14]:
print(total_dataset.tensors[0][0:50])

tensor([[13938,  2891,  2085,  ...,     0,     0,     0],
        [ 2182,  2003,  1037,  ...,     0,     0,     0],
        [ 1996,  2353,  2113,  ...,     0,     0,     0],
        ...,
        [ 2204,  2851,  2035,  ...,     0,     0,     0],
        [ 1037,  2095,  3283,  ...,     0,     0,     0],
        [ 2026,  2905,  1999,  ...,     0,     0,     0]], device='cuda:0')


In [15]:
hyperpameters = {'batch_size': 150, 'learning_rate': 0.001, 'epochs': 50, 'optimizer': 'Adam'} 


In [16]:

train_dataloader = DataLoader(
    train_dataset,
    batch_size = hyperpameters['batch_size'],  
    shuffle=True
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=hyperpameters['batch_size'],  
    shuffle=True
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=hyperpameters['batch_size'],  
    shuffle=True
)

criterion = nn.CrossEntropyLoss()

model = BertForSequenceClassification.from_pretrained(
                                      'prajjwal1/bert-mini', 
                                      num_labels = 4,
                                      output_attentions = False,
                                      output_hidden_states = False
                                     )

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:

for param in model.parameters():
    param.requires_grad = False


classifier = nn.Sequential(
    nn.Linear(256, 4),
    nn.Softmax(dim=1)
)

model.classifier = classifier


model = model.to(device)
model.cuda()
print(model)
print(model.device)
print(type(model))

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-12, e

In [18]:
adamW = torch.optim.AdamW(model.parameters())
hyperpameters['optimizer'] = adamW

In [19]:
def train(model, train_dataloader, val_dataloader, test_dataloader, epochs, optimizer, criterion, train_on_gpu, save_path, save=False):
    history = []
    valid_loss_min = np.Inf
    overall_start = timer()
    
    for epoch in range(epochs):
        start = timer()
        
        train_loss = 0.0
        valid_loss = 0.0
        
        valid_acc = 0
        train_acc = 0
        
        model.train()
        
        for index, (data, target) in enumerate(train_dataloader):
            
            if train_on_gpu:
                data, target = data.to(device), target.to(device)
            
            optimizer.zero_grad()
            
            output = model(data).logits
            
            loss = criterion(output, target)
            loss.backward()
            
            optimizer.step()
            
            train_loss += loss.item() * data.size(0)
            
            _, pred = torch.max(output, dim=1)
            correct_tensor = pred.eq(target.data.view_as(pred))

            accuracy = torch.mean(correct_tensor.type(torch.FloatTensor))

            train_acc += accuracy.item() * data.size(0)
            
            print(f'Epoch: {epoch}\t{100 * (index + 1) / len(train_dataloader):.2f}% complete. {timer() - start:.2f} seconds elapsed in epoch.', end='\r')
        
        with torch.no_grad():
            model.eval()
            for data, target in val_dataloader:
                    # Tensors to gpu
                if train_on_gpu:
                    data, target = data.cuda(), target.cuda()

                    
                output = model(data).logits

                    
                loss = criterion(output, target)
                    
                valid_loss += loss.item() * data.size(0)

                # Calculate validation accuracy
                _, pred = torch.max(output, dim=1)
                correct_tensor = pred.eq(target.data.view_as(pred))
                accuracy = torch.mean(correct_tensor.type(torch.FloatTensor))
                valid_acc += accuracy.item() * data.size(0)
            
            train_loss = train_loss / len(train_dataloader.dataset)
            valid_loss = valid_loss / len(val_dataloader.dataset)

            # Calculate average accuracy
            train_acc = train_acc / len(train_dataloader.dataset)
            valid_acc = valid_acc / len(val_dataloader.dataset)
            history.append([train_loss, valid_loss, train_acc, valid_acc])
            
            print(f'\nEpoch: {epoch} \tTraining Loss: {train_loss:.4f} \tValidation Loss: {valid_loss:.4f}')
            print(f'\t\tTraining Accuracy: {100 * train_acc:.2f}%\t Validation Accuracy: {100 * valid_acc:.2f}%')
    model.optimizer = optimizer
    total_time = timer() - overall_start
    print(f'{total_time:.2f} total seconds elapsed. {total_time / (epoch):.2f} seconds per epoch.')
    history = pd.DataFrame(
        history,
        columns=['train_loss', 'valid_loss', 'train_acc', 'valid_acc'])
    return model, history


In [20]:
model, history = train(model, train_dataloader, val_dataloader, test_dataloader, hyperpameters['epochs'], hyperpameters['optimizer'], criterion, train_on_gpu, 'model.pt')

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch: 0	100.00% complete. 3.21 seconds elapsed in epoch.
Epoch: 0 	Training Loss: 1.3094 	Validation Loss: 1.2799
		Training Accuracy: 40.29%	 Validation Accuracy: 44.27%
Epoch: 1	100.00% complete. 2.71 seconds elapsed in epoch.
Epoch: 1 	Training Loss: 1.2661 	Validation Loss: 1.2648
		Training Accuracy: 45.63%	 Validation Accuracy: 46.95%
Epoch: 2	100.00% complete. 2.73 seconds elapsed in epoch.
Epoch: 2 	Training Loss: 1.2548 	Validation Loss: 1.2539
		Training Accuracy: 47.03%	 Validation Accuracy: 45.88%
Epoch: 3	100.00% complete. 2.64 seconds elapsed in epoch.
Epoch: 3 	Training Loss: 1.2484 	Validation Loss: 1.2474
		Training Accuracy: 48.24%	 Validation Accuracy: 47.94%
Epoch: 4	100.00% complete. 2.66 seconds elapsed in epoch.
Epoch: 4 	Training Loss: 1.2446 	Validation Loss: 1.2435
		Training Accuracy: 49.13%	 Validation Accuracy: 48.03%
Epoch: 5	100.00% complete. 2.63 seconds elapsed in epoch.
Epoch: 5 	Training Loss: 1.2407 	Validation Loss: 1.2414
		Training Accuracy: 48.8

In [21]:
# torch.save(model, 'Models/BERT_mini_whole.pth')

In [22]:
# model = torch.load('Models/BERT_mini_whole.pth')

In [23]:
model.eval()
text = "6 month ago today I lose my mom she die after a five year long battle with leukemia which you think would give I time to prepare but after four year in remission three week in the hospital and then her death I was just not prepared for this it defiantly do not feel like that long ago that I see she I still cry every day is that normal I miss her alot she was my good friend most people think I m fine because I work really hard at act normal but it is so hard I just miss she so much"
tokenized = tokenizer.encode_plus(text, add_special_tokens=False, max_length=200, padding='longest', return_tensors='pt', return_attention_mask=False, return_token_type_ids=False)
tokenized['input_ids'] = tokenized['input_ids'].to(device)
output = model(tokenized['input_ids'])
print(output.logits[0])

prediction = torch.argmax(output.logits[0]).item()
print(prediction)

tensor([9.6343e-01, 3.5388e-02, 1.1803e-03, 5.1527e-06], device='cuda:0',
       grad_fn=<SelectBackward0>)
0


In [24]:

prediction = torch.argmax(output.logits).item()
print(prediction)

0
