<a href="https://colab.research.google.com/github/maryamyazdi/transc/blob/text-classification/classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m77.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
import nltk
import re
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [28]:
# Setting up GPU usage (if available)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
init_df = pd.read_csv('stories.csv')
init_df

Unnamed: 0,body,topic
0,,['39822b5f-e37e-43e8-b997-7142fe55c3ea']
1,,['0d817400-3f5d-41e0-929c-c31fdbe75d31']
2,,['83a09c6b-5f2f-421f-ae50-b38acca7e008']
3,,['6fbf954a-03f9-4782-a65f-783271c9c447']
4,hello and welcome to BBC News a woman who gave...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5..."
...,...,...
5176,News. More local help will soon be on the way....,"['9ff54ded-904b-4e0c-85ce-a3617f5cb913', '9632..."
5177,"with March 1, we start what is called Meteorol...",['9a06646a-e1df-4fca-888e-69658420556b']
5178,overseas. A massive Russian convoy is headed t...,['9ff54ded-904b-4e0c-85ce-a3617f5cb913']
5179,"And this morning, the National Hockey League s...","['9ff54ded-904b-4e0c-85ce-a3617f5cb913', 'b492..."


In [5]:
# Pre-processing the domain data (initial dataset)

nltk.download('stopwords')
from nltk.corpus import stopwords

df = init_df.copy()

# Remove empty body rows
df = df[df['body'].str.strip().astype(bool)]
df.reset_index(drop = True, inplace = True)

# Removing stop words
stop = stopwords.words('english')
df['body'] = df['body'].apply(lambda x: ' '.join([word for word in x.lower().split() if word not in (stop)]))

# Reformat 'topic' column to binary vectors
unique_topics = {}
a = set()
for index in df.index:
  df['topic'][index] = re.findall("[a-zA-Z0-9-]+",df['topic'][index])
  for id in df['topic'][index]:
    a.add(id)

unique_topics = {topic: index for (index,topic) in enumerate(a)}
count = {x:0 for x in unique_topics.keys()}

for index in df.index:
  e = [0] * len(unique_topics)
  for id in df.topic[index]:
    e[unique_topics[id]]=1
    count[id] +=1 
  df['topic'][index] = e   

df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,body,topic
0,hello welcome bbc news woman gave key evidence...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
1,news north hollywood. 14 yearold girl found de...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,homelessness city's greatest failure. message ...,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
3,minneapolis police officer kim potter guilty d...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,judy update wildfires wiped entire neighborhoo...,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...
5147,news. local help soon way. group volunteers yo...,"[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
5148,"march 1, start called meteorological spring. k...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5149,overseas. massive russian convoy headed toward...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5150,"morning, national hockey league says suspendin...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"


In [6]:
unique_topics

{'9ff54ded-904b-4e0c-85ce-a3617f5cb913': 0,
 'f5cdd7f2-9d4d-4ba5-9925-00c1701e30fa': 1,
 '6fbf954a-03f9-4782-a65f-783271c9c447': 2,
 '9a06646a-e1df-4fca-888e-69658420556b': 3,
 '74e2fab8-689f-4e17-9a1c-e1f92e084f55': 4,
 'aa1edc37-1a01-414a-bcf7-8517e7c7053d': 5,
 'ebf2991e-4b7d-44c6-927b-a261a7b21d2c': 6,
 '39822b5f-e37e-43e8-b997-7142fe55c3ea': 7,
 'ca197b81-ca86-4792-8c25-2ba7cd4195b5': 8,
 'b49207eb-96eb-4b73-b534-adc0ef85022a': 9,
 'e7cbe38d-c987-4113-aa94-fd77eda451d5': 10,
 '96326734-fd82-4350-b45c-513e7eb9147c': 11,
 'a58b4b70-1b59-4240-917d-a2165a0ce2f0': 12,
 '0d817400-3f5d-41e0-929c-c31fdbe75d31': 13,
 '83a09c6b-5f2f-421f-ae50-b38acca7e008': 14}

In [7]:
count

{'9ff54ded-904b-4e0c-85ce-a3617f5cb913': 1825,
 'f5cdd7f2-9d4d-4ba5-9925-00c1701e30fa': 92,
 '6fbf954a-03f9-4782-a65f-783271c9c447': 559,
 '9a06646a-e1df-4fca-888e-69658420556b': 409,
 '74e2fab8-689f-4e17-9a1c-e1f92e084f55': 253,
 'aa1edc37-1a01-414a-bcf7-8517e7c7053d': 163,
 'ebf2991e-4b7d-44c6-927b-a261a7b21d2c': 278,
 '39822b5f-e37e-43e8-b997-7142fe55c3ea': 391,
 'ca197b81-ca86-4792-8c25-2ba7cd4195b5': 250,
 'b49207eb-96eb-4b73-b534-adc0ef85022a': 388,
 'e7cbe38d-c987-4113-aa94-fd77eda451d5': 22,
 '96326734-fd82-4350-b45c-513e7eb9147c': 1076,
 'a58b4b70-1b59-4240-917d-a2165a0ce2f0': 53,
 '0d817400-3f5d-41e0-929c-c31fdbe75d31': 183,
 '83a09c6b-5f2f-421f-ae50-b38acca7e008': 1793}

In [10]:
pd.DataFrame(count.items(), columns=['topic', 'frequency']);

In [11]:
# Defining some key variables for training stage

MAX_LEN = 256
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [38]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.body = dataframe.body
        self.targets = self.data.topic
        self.max_len = max_len

    def __len__(self):
        return len(self.body)

    def __getitem__(self, index):
        body = str(self.body[index])
        body = " ".join(body.split())

        inputs = self.tokenizer.encode_plus(
            body,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            # truncation=True,
            # return_attention_mask=True,
            return_tensors='pt'
        )
        # split the three tensors
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']


        return {
            'ids': ids.clone().detach(),
            # 'ids': torch.tensor(ids, dtype=torch.long),
            'mask': mask.clone().detach(),
            'token_type_ids': token_type_ids.clone().detach(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [39]:
train_size = 0.8
train_df = df.sample(frac=train_size, random_state=200).reset_index(drop=True)
test_df = df.drop(train_df.index).reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_df.shape))
print("TEST Dataset: {}".format(test_df.shape))

training_set = CustomDataset(train_df, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_df, tokenizer, MAX_LEN)

FULL Dataset: (5152, 2)
TRAIN Dataset: (4122, 2)
TEST Dataset: (1030, 2)


In [42]:
training_set[0]['mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [15]:
# DataLoader

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [21]:
# Creating the network

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 15)
        # 15 dimensions for Linear layer due to 15 different topic categories
    
    def forward(self, ids, mask, token_type_ids):
        _, pooled_output= self.l1(input_ids = ids, attention_mask = mask, token_type_ids = token_type_ids)
        dropout_output = self.l2(pooled_output)
        output = self.l3(dropout_output)
        return output

model = BERTClass()
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [17]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [18]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [19]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
for epoch in range(EPOCHS):
    train(epoch)