Start

In [105]:
import os
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, random_split, DataLoader
import matplotlib.pyplot as plt
import torch.nn.functional as F
from collections import OrderedDict, Counter
from timeit import default_timer as timer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset


In [52]:
if torch.cuda.is_available(): 
    dev = "cuda" 
else: 
    dev = "cpu" 
device = torch.device(dev) 
print(device)

train_on_gpu = torch.cuda.is_available()
print(f'Train on gpu: {train_on_gpu}')

# Number of gpus
if train_on_gpu:
    gpu_count = torch.cuda.device_count()
    print(f'{gpu_count} gpus detected.')
    if gpu_count > 1:
        multi_gpu = True
    else:
        multi_gpu = False

cuda
Train on gpu: True
1 gpus detected.


In [53]:
df = pd.read_csv('Datasets/Mental Health Dataset.csv')


In [107]:

long_posts_df = df[df['posts'].str.split().str.len() > 300]

#eject all posts with more than 300 words
short_posts_df = df[df['posts'].str.split().str.len() <= 300]
print(short_posts_df.head())
print(short_posts_df.info())


                                                posts predicted  intensity
4   gmos now link to leukemia http nsnbc I 2013 07...   neutral          0
5   here is a link for an interesting article and ...   neutral          0
8   the third know human retrovirus xmrv seem to b...   neutral          0
9   leukemia survivor meet his bone marrow donor w...   neutral          0
10  melt down can not stop the water work today I ...   neutral          0


In [98]:
new_df = short_posts_df.drop('intensity', axis=1)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenized = tokenizer.batch_encode_plus(
    new_df['posts'].tolist(),
    add_special_tokens=False,
    max_length=250,
    padding='longest',
    truncation=True,
    return_tensors='pt',
    return_attention_mask=False,
    return_token_type_ids=False
)


In [106]:

le = LabelEncoder()
tokenized['intensity'] = le.fit_transform(short_posts_df['intensity'])

tokenized['intensity'] = torch.tensor(tokenized['intensity'])


tokenized['input_ids'] = tokenized['input_ids'].to(device)
tokenized['intensity'] = tokenized['intensity'].to(device)
print(tokenized['input_ids'].shape) 
print(tokenized['intensity'].shape)

print(tokenized['intensity'])
counter = Counter(tokenized['intensity'].tolist())
for label, count in counter.items():
    print(f"Class {label}: {count} occurrences")


torch.Size([8104, 391])
torch.Size([8104])
tensor([2, 2, 2,  ..., 0, 2, 1], device='cuda:0')
Class 2: 3432 occurrences
Class 1: 3189 occurrences
Class 0: 914 occurrences
Class 3: 569 occurrences


In [57]:
total_dataset = TensorDataset(tokenized['input_ids'], tokenized['intensity'])
train_size = int(0.7 * len(total_dataset))
val_size = len(total_dataset) - train_size
test_size = int(0.5 * val_size)
val_size = val_size - test_size
print(train_size, val_size, test_size)
train_dataset, val_dataset, test_dataset = random_split(total_dataset, [train_size, val_size, test_size])


5672 1216 1216


In [94]:
model = BertForSequenceClassification.from_pretrained(
                                      'bert-base-uncased', 
                                      num_labels = 4,
                                      output_attentions = False,
                                      output_hidden_states = False
                                     )



for param in model.parameters():
    param.requires_grad = False

model.classifier = nn.Linear(in_features=768, out_features=4, bias=True)

model = model.to(device)

print(model)
print(model.device)
print(type(model))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [86]:
hyperpameters = {'batch_size': 60, 'learning_rate': 0.001, 'epochs': 2, 'optimizer': 'Adam'} 
adam = torch.optim.Adam(model.parameters(), lr=hyperpameters['learning_rate'])
hyperpameters['optimizer'] = adam

In [87]:
train_dataloader = DataLoader(
    train_dataset,
    batch_size = hyperpameters['batch_size'],  
    shuffle=True
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=hyperpameters['batch_size'],  
    shuffle=True
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=hyperpameters['batch_size'],  
    shuffle=True
)

criterion = nn.CrossEntropyLoss()

In [96]:
def train(model, train_dataloader, val_dataloader, test_dataloader, epochs, optimizer, criterion, train_on_gpu, save_path, save=False):
    history = []
    valid_loss_min = np.Inf
    overall_start = timer()
    
    for epoch in range(epochs):
        start = timer()
        
        train_loss = 0.0
        valid_loss = 0.0
        
        valid_acc = 0
        train_acc = 0
        
        model.train()
        
        for index, (data, target) in enumerate(train_dataloader):
            
            if train_on_gpu:
                data, target = data.to(device), target.to(device)
            
            optimizer.zero_grad()
            
            print(data.device)
            print(model.device)
            print(data.shape)
            output = model(data).logits
            
            loss = criterion(output, target)
            loss.backward()
            
            optimizer.step()
            
            train_loss += loss.item() * data.size(0)
            
            _, pred = torch.max(output, dim=1)
            correct_tensor = pred.eq(target.data.view_as(pred))

            accuracy = torch.mean(correct_tensor.type(torch.FloatTensor))

            train_acc += accuracy.item() * data.size(0)
            
            print(f'Epoch: {epoch}\t{100 * (index + 1) / len(train_dataloader):.2f}% complete. {timer() - start:.2f} seconds elapsed in epoch.', end='\r')
        
        with torch.no_grad():
            model.eval()
            for data, target in val_dataloader:
                    # Tensors to gpu
                if train_on_gpu:
                    data, target = data.cuda(), target.cuda()

                    
                output = model(data).logits

                    
                loss = criterion(output, target)
                    
                valid_loss += loss.item() * data.size(0)

                # Calculate validation accuracy
                _, pred = torch.max(output, dim=1)
                correct_tensor = pred.eq(target.data.view_as(pred))
                accuracy = torch.mean(correct_tensor.type(torch.FloatTensor))
                    
            valid_acc += accuracy.item() * data.size(0)
            train_loss = train_loss / len(train_dataloader.dataset)
            valid_loss = valid_loss / len(val_dataloader.dataset)

            # Calculate average accuracy
            train_acc = train_acc / len(train_dataloader.dataset)
            valid_acc = valid_acc / len(val_dataloader.dataset)
            history.append([train_loss, valid_loss, train_acc, valid_acc])
            
            print(f'\nEpoch: {epoch} \tTraining Loss: {train_loss:.4f} \tValidation Loss: {valid_loss:.4f}')
            print(f'\t\tTraining Accuracy: {100 * train_acc:.2f}%\t Validation Accuracy: {100 * valid_acc:.2f}%')
    model.optimizer = optimizer
    total_time = timer() - overall_start
    print(f'{total_time:.2f} total seconds elapsed. {total_time / (epoch):.2f} seconds per epoch.')
    history = pd.DataFrame(
        history,
        columns=['train_loss', 'valid_loss', 'train_acc', 'valid_acc'])
    return model, history

In [97]:
model, history = train(model, train_dataloader, val_dataloader, test_dataloader, hyperpameters['epochs'], hyperpameters['optimizer'], criterion, train_on_gpu, 'model.pt')

cuda:0
cuda:0
torch.Size([60, 391])
cuda:0 0	1.05% complete. 1.22 seconds elapsed in epoch.
cuda:0
torch.Size([60, 391])
cuda:0 0	2.11% complete. 2.36 seconds elapsed in epoch.
cuda:0
torch.Size([60, 391])
cuda:0 0	3.16% complete. 3.49 seconds elapsed in epoch.
cuda:0
torch.Size([60, 391])
cuda:0 0	4.21% complete. 4.63 seconds elapsed in epoch.
cuda:0
torch.Size([60, 391])
cuda:0 0	5.26% complete. 5.76 seconds elapsed in epoch.
cuda:0
torch.Size([60, 391])
cuda:0 0	6.32% complete. 6.89 seconds elapsed in epoch.
cuda:0
torch.Size([60, 391])
cuda:0 0	7.37% complete. 8.02 seconds elapsed in epoch.
cuda:0
torch.Size([60, 391])
cuda:0 0	8.42% complete. 9.15 seconds elapsed in epoch.
cuda:0
torch.Size([60, 391])
cuda:0 0	9.47% complete. 10.28 seconds elapsed in epoch.
cuda:0
torch.Size([60, 391])
cuda:0 0	10.53% complete. 11.42 seconds elapsed in epoch.
cuda:0
torch.Size([60, 391])
cuda:0 0	11.58% complete. 12.58 seconds elapsed in epoch.
cuda:0
torch.Size([60, 391])
cuda:0 0	12.63% complete

In [92]:
torch.save(model.classifier.state_dict(), 'Models/BERT_last_layer.pth')