<a href="https://colab.research.google.com/github/maryamyazdi/news_transcriptions/blob/text-classification/classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [70]:
!pip install -q transformers

In [79]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
import nltk
import re
import transformers
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig

In [80]:
# Setup GPU usage (if available)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [81]:
init_df = pd.read_csv('stories.csv')
init_df

Unnamed: 0,body,topic
0,,['39822b5f-e37e-43e8-b997-7142fe55c3ea']
1,,['0d817400-3f5d-41e0-929c-c31fdbe75d31']
2,,['83a09c6b-5f2f-421f-ae50-b38acca7e008']
3,,['6fbf954a-03f9-4782-a65f-783271c9c447']
4,hello and welcome to BBC News a woman who gave...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5..."
...,...,...
5176,News. More local help will soon be on the way....,"['9ff54ded-904b-4e0c-85ce-a3617f5cb913', '9632..."
5177,"with March 1, we start what is called Meteorol...",['9a06646a-e1df-4fca-888e-69658420556b']
5178,overseas. A massive Russian convoy is headed t...,['9ff54ded-904b-4e0c-85ce-a3617f5cb913']
5179,"And this morning, the National Hockey League s...","['9ff54ded-904b-4e0c-85ce-a3617f5cb913', 'b492..."


In [82]:
def pre_process(df: pd.DataFrame):

  # Remove rows having empty value for ``body``
  df = df[df['body'].str.strip().astype(bool)]
  df.reset_index(drop = True, inplace = True)

  # Convert list-like string to lists
  df['topic'] = df['topic'].str.findall(r'[a-zA-Z0-9-]+')

  # Convert list of topis to onehot vectors
  mlb = MultiLabelBinarizer()

  onehot_df = pd.DataFrame(mlb.fit_transform(df['topic']),columns=mlb.classes_, index=df.index)
  df['topic'] = onehot_df[onehot_df.columns].values.tolist()
  return df

df = pre_process(init_df)

In [83]:
# Remove stop words (experimented to cause performance enhancement)
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

remove_stop_words = lambda x: ' '.join([word for word in x.lower().split() if word not in (stop)])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [84]:
df['body'] = list(map(remove_stop_words, df['body']))
df

Unnamed: 0,body,topic
0,hello welcome bbc news woman gave key evidence...,"[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
1,news north hollywood. 14 yearold girl found de...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
2,homelessness city's greatest failure. message ...,"[0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,minneapolis police officer kim potter guilty d...,"[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
4,judy update wildfires wiped entire neighborhoo...,"[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0]"
...,...,...
5147,news. local help soon way. group volunteers yo...,"[0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
5148,"march 1, start called meteorological spring. k...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
5149,overseas. massive russian convoy headed toward...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
5150,"morning, national hockey league says suspendin...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]"


In [85]:
# Define some key variables for training stage

MAX_LEN = 256   # Set to 256 due to the 'mean' of tokens is 300
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 1e-05
N_CLASSES = len(df.topic[0])
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [86]:
class StoriesDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.body = dataframe.body
        self.targets = self.data.topic
        self.max_len = max_len

    def __len__(self):
        return len(self.body)

    def __getitem__(self, index):
        body = str(self.body[index])
        body = " ".join(body.split())

        inputs = self.tokenizer.encode_plus(
            body,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # Construct a tensor, then remove dimensions of size 1 with squeeze
            'ids': torch.squeeze(inputs['input_ids'].clone().detach()),
            'mask': torch.squeeze(inputs['attention_mask'].clone().detach()),
            'token_type_ids': torch.squeeze(inputs['token_type_ids'].clone().detach()),
            'targets': torch.squeeze(torch.tensor(self.targets[index], dtype=torch.float))
        }

In [87]:
# Split 80% data for training stage and leftover 20% for testing
train_size = 0.8
train_df = df.sample(frac=train_size, random_state=200).reset_index(drop=True)
test_df = df.drop(train_df.index).reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_df.shape))
print("TEST Dataset: {}".format(test_df.shape))


# Each element in trainig_set or test_set is a dictionary consists of 4 tensors
training_set = StoriesDataset(train_df, tokenizer, MAX_LEN)
testing_set = StoriesDataset(test_df, tokenizer, MAX_LEN)

FULL Dataset: (5152, 2)
TRAIN Dataset: (4122, 2)
TEST Dataset: (1030, 2)


In [88]:
# Define dataLoader and load training and testing datasets

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 8
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'num_workers': 8
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [89]:
# Creating the network

class BERTClass(torch.nn.Module):
    def __init__(self, n_classes: int):
        super(BERTClass, self).__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased',return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, n_classes)
    
    def forward(self, ids, mask, token_type_ids):
        bert_output= self.bert(input_ids = ids, attention_mask = mask, token_type_ids = token_type_ids)
        dropout_output = self.dropout(bert_output.pooler_output)
        output = self.classifier(dropout_output)
        return output

# Linear layer with 15 dimensions due to 15 different classes (topics)
model = BERTClass(N_CLASSES)

model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=

In [90]:
# Use BCE to evaluate the probability of categories indiviually

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [91]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [92]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()   # Clear out the gradients
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()   
        loss.backward()   # Back-propagation to the first layer
        optimizer.step()  # Update the model parameters

In [None]:
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  0.7346986532211304
Epoch: 1, Loss:  0.24991366267204285
Epoch: 2, Loss:  0.1672009825706482


In [95]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            # Activation function : 'sigmoid'
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [96]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.4242718446601942
F1 Score (Micro) = 0.6951456310679612
F1 Score (Macro) = 0.2595824302684295
Accuracy Score = 0.4242718446601942
F1 Score (Micro) = 0.6951456310679612
F1 Score (Macro) = 0.2595824302684295
Accuracy Score = 0.4242718446601942
F1 Score (Micro) = 0.6951456310679612
F1 Score (Macro) = 0.2595824302684295


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [106]:
!mkdir -p ./Model
PATH = "./Model/state_dict_model.pt"

# Save
torch.save(model.state_dict(), PATH)