In [1]:
# Row count for the optional random subsample. The `.sample(n=N_SAMPLE)` line in
# the data-loading cell is commented out, so this value is only echoed by the
# final timing message.
N_SAMPLE = 10_000

In [2]:
import os
import random
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import torch

# https://towardsdatascience.com/bert-text-classification-using-pytorch-723dfb8b6b5b

# Preliminaries

from torchtext.data import Field, TabularDataset, BucketIterator, Iterator
from sklearn.model_selection import train_test_split

# Models

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Wall-clock start of the notebook run; the last cell subtracts it from
# datetime.now() to report the total elapsed time.
startTime = datetime.now()

In [3]:
# NOTE: despite its name, `train_test_ratio` is passed as `train_size` to the
# first train_test_split, so 0.10 keeps 10% of the rows for train+valid and
# leaves 90% as the test set.
train_test_ratio = 0.10
# Passed as `train_size` to the second split: 80% of the rows kept above go to
# training, the remaining 20% to validation.
train_valid_ratio = 0.80
# Number of whitespace-separated words kept per document; also reused as the
# BERT maximum sequence length further down.
first_n_words = 512

#nlp = spacy.load("en_core_web_sm", disable = ['ner', 'tagger', 'parser', 'textcat'])
#nlp.max_length = 10000000

# One chain instead of repeated re-assignment: bracket access avoids relying on
# attribute-style lookup for columns named `len` and `id`, and no column is
# assigned on a filtered frame (which is what triggers SettingWithCopyWarning).
df_ml = (
    pd.read_csv("temp_vn.csv", encoding="latin1", parse_dates=True)
    .loc[lambda frame: frame["len"] != -1]      # drop rows whose `len` is the -1 sentinel
    .loc[lambda frame: frame["id"].notna()]     # drop rows without an id
    .astype({"id": int})
    #.sample(n=N_SAMPLE)  # random subsample to get going quicker
    .assign(text="zzz")                         # placeholder, overwritten by the file-reading loop
    .reset_index(drop=True)                     # labels 0..n-1, so label == position afterwards
)

In [4]:
# Bare expression: renders the frame as the cell output (pandas truncates the
# middle rows, as the `...` row in the output shows).
df_ml

Unnamed: 0,id,version_number,bill_id,signed,partisan_lean,len,sc_id,text
0,2119301,1,1092981,0,0.236118,8102,562-1,zzz
1,2708948,2,1092981,0,0.236118,9545,562-1,zzz
2,2708949,3,1092981,0,0.236118,10714,562-1,zzz
3,2708950,4,1092981,0,0.236118,1239,562-1,zzz
4,2708951,5,1092981,0,0.236118,133,562-1,zzz
...,...,...,...,...,...,...,...,...
496245,2740636,1,1346334,0,0.525000,163,666-2,zzz
496246,2740995,1,1346470,0,0.525000,879,666-2,zzz
496247,2741134,1,1346569,0,0.525000,3603,666-2,zzz
496248,2741135,1,1346570,0,0.525000,345,666-2,zzz


In [5]:
def trim_string(x):
    """Return the leading words of the raw text file belonging to one row.

    Parameters
    ----------
    x : int
        Positional row number in the module-level ``df_ml``. The row's ``id``
        value names the file ``../data/raw/<id>.txt``.

    Returns
    -------
    str
        At most ``first_n_words`` whitespace-separated words of the file,
        re-joined with single spaces.

    Raises
    ------
    OSError
        If the text file for that id cannot be opened.
    """
    # Use the argument itself. The previous body ignored `x` and read the
    # module-level loop variable `i`, so it only worked when called from a
    # global `for i in ...` loop.
    doc_id = df_ml['id'].iloc[x]
    fname = '../data/raw/' + str(doc_id) + '.txt'

    # Context manager so the handle is closed for every file read.
    with open(fname) as handle:
        # maxsplit bounds the work: anything after the first `first_n_words`
        # words stays in one trailing chunk, which the slice below discards.
        words = handle.read().split(maxsplit=first_n_words)

    return ' '.join(words[:first_n_words])

#def clipped_text(i):

    
#    tokens = nlp(d1)
#    str_tokens = [token.orth_ for token in tokens[:512]]
#    df_ml.at[i, 'text'] = " ".join(str_tokens)
#    return(1)
    
# Fill the `text` column one row at a time and time the whole pass.
# `df_ml` had its index reset when it was loaded, so the label `i` used by
# `.at[i, 'text']` and the row position `i` passed to trim_string refer to the
# same row.
startTime2 = datetime.now()
for i in range(len(df_ml)):
    df_ml.at[i, 'text'] = trim_string(i)
# Elapsed wall-clock time for reading every file.
print(datetime.now()-startTime2)

# 43 min to process all the datafiles

0:43:23.727699


In [17]:
# Sanity check after the text fill: column dtypes and non-null counts.
df_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496250 entries, 0 to 496249
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              496250 non-null  int64  
 1   version_number  496250 non-null  int64  
 2   bill_id         496250 non-null  int64  
 3   signed          496250 non-null  int64  
 4   partisan_lean   496250 non-null  float64
 5   len             496250 non-null  int64  
 6   sc_id           496250 non-null  object 
 7   text            496250 non-null  object 
dtypes: float64(1), int64(5), object(2)
memory usage: 30.3+ MB


In [7]:
#raw_data_path = '~/Downloads/news.csv'

#destination_folder = 'bert_cl/'



In [8]:
import pandas as pd


In [9]:
# Keep only the two columns written to the train/valid/test CSVs: the `signed`
# label and the trimmed document text.
bert_columns = ['signed', 'text']
df_bert = df_ml[bert_columns]
df_bert.head()


Unnamed: 0,signed,text
0,0,HOUSE BILL 1 By Staples SENATE BILL 16 By Dick...
1,0,Senate Government Operations Committee 1 Amend...
2,0,Senate State and Local Government Committee 1 ...
3,0,"Senate Finance, Ways, and Means Committee 1 Am..."
4,0,Amendment No. 4 to SB0016 Dickerson Signature ...


In [10]:
destination_folder = 'bert_classification'

# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs(destination_folder, exist_ok=True)

# Train-test split
# `train_test_ratio` is the TRAIN share here: that fraction of df_bert becomes
# df_train_full and the remainder becomes df_test.
df_train_full, df_test = train_test_split(df_bert, train_size = train_test_ratio, random_state = 1)

# Train-valid split
df_train, df_valid = train_test_split(df_train_full, train_size = train_valid_ratio, random_state = 1)

# Write preprocessed data
# The file names must match the ones TabularDataset.splits reads back later.
for split_name, split_frame in (('train', df_train), ('valid', df_valid), ('test', df_test)):
    split_frame.to_csv(destination_folder + '/' + split_name + '.csv', index=False)

In [11]:
# Preprocess and prepare

# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs (slowly) on a machine without CUDA. Kept as a plain
# string, as before, because later cells pass it to `.to(device)` and
# `map_location=device`.
device = "cuda" if torch.cuda.is_available() else "cpu"


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Model parameter
MAX_SEQ_LEN = first_n_words
NUM_EPOCHS = 20
BATCH_SIZE = 4
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Fields

# The label is read as a float here; the training/evaluation code casts it to
# long before handing it to the model.
label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
# NOTE(review): `tokenizer.encode` is called without truncation, which is what
# produces the "Token indices sequence length is longer than ..." warning for
# long documents. `fix_length=MAX_SEQ_LEN` presumably pads/cuts the id list
# afterwards - TODO confirm that cutting off the tail of the encoded sequence
# (including its closing special token) is acceptable.
text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False, include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)
# Order matters: it must match the CSV column order written earlier (label, then text).
fields = [('label', label_field), ('text', text_field)]

# TabularDataset

train, valid, test = TabularDataset.splits(path=destination_folder, train='train.csv', validation='valid.csv',
                                           test='test.csv', format='CSV', fields=fields, skip_header=True)

# Iterators

train_iter = BucketIterator(train, batch_size=BATCH_SIZE, sort_key=lambda x: len(x.text),
                            device=device, train=True, sort=True, sort_within_batch=True)
valid_iter = BucketIterator(valid, batch_size=BATCH_SIZE, sort_key=lambda x: len(x.text),
                            device=device, train=True, sort=True, sort_within_batch=True)
test_iter = Iterator(test, batch_size=BATCH_SIZE, device=device, train=False, shuffle=False, sort=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (654 > 512). Running this sequence through the model will result in indexing errors


In [12]:
# Bare expression: shows the repr of the test iterator to confirm it was built.
test_iter

<torchtext.data.iterator.Iterator at 0x7f7d547fdbb0>

In [13]:
class BERT(nn.Module):
    """Thin nn.Module wrapper around a pretrained BertForSequenceClassification."""

    def __init__(self):
        super().__init__()
        # Loads the "bert-base-uncased" checkpoint with the library's default
        # classification-head configuration.
        self.encoder = BertForSequenceClassification.from_pretrained("bert-base-uncased")

    def forward(self, text, label):
        """Run the encoder on `text` with `label` and return its first two outputs.

        The two values are returned in the order the encoder yields them; the
        callers in this notebook treat the first as the loss and the second as
        the per-class scores.
        """
        encoder_out = self.encoder(text, labels=label)
        loss, text_fea = encoder_out[:2]
        return loss, text_fea

In [14]:
# Save and Load Functions

def save_checkpoint(save_path, model, valid_loss):
    """Write the model weights and the validation loss to `save_path`.

    Parameters
    ----------
    save_path : str or None
        Destination file. When None the function does nothing.
    model : torch.nn.Module
        Model whose ``state_dict()`` is stored under ``'model_state_dict'``.
    valid_loss : float
        Stored under ``'valid_loss'``.

    Returns
    -------
    None
    """
    # Identity test: `== None` can be hijacked by objects overriding __eq__.
    if save_path is None:
        return

    state_dict = {'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}

    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

def load_checkpoint(load_path, model):
    """Restore `model` weights from a file written by `save_checkpoint`.

    Parameters
    ----------
    load_path : str or None
        Checkpoint file. When None the function does nothing and returns None.
    model : torch.nn.Module
        Model that receives the stored ``'model_state_dict'`` (modified in place).

    Returns
    -------
    The ``'valid_loss'`` value stored in the checkpoint, or None when
    `load_path` is None.
    """
    # Identity test: `== None` can be hijacked by objects overriding __eq__.
    if load_path is None:
        return

    # Tensors are mapped onto the module-level `device`.
    # NOTE: torch.load unpickles the file - only load checkpoints you trust.
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
    """Write the loss history to `save_path`.

    Parameters
    ----------
    save_path : str or None
        Destination file. When None the function does nothing.
    train_loss_list, valid_loss_list, global_steps_list : list
        Stored under the keys ``'train_loss_list'``, ``'valid_loss_list'`` and
        ``'global_steps_list'``.

    Returns
    -------
    None
    """
    # Identity test: `== None` can be hijacked by objects overriding __eq__.
    if save_path is None:
        return

    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}

    torch.save(state_dict, save_path)
    # The message used to say "Model saved", which was wrong for a metrics file.
    print(f'Metrics saved to ==> {save_path}')


def load_metrics(load_path):
    """Read back the loss history written by `save_metrics`.

    Parameters
    ----------
    load_path : str or None
        Metrics file. When None the function returns None, so a caller that
        unpacks three values must pass a real path.

    Returns
    -------
    tuple or None
        ``(train_loss_list, valid_loss_list, global_steps_list)``.
    """
    # Identity test: `== None` can be hijacked by objects overriding __eq__.
    if load_path is None:
        return

    # NOTE: torch.load unpickles the file - only load files you trust.
    state_dict = torch.load(load_path, map_location=device)
    # The message used to say "Model loaded", which was wrong for a metrics file.
    print(f'Metrics loaded from <== {load_path}')

    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']


In [15]:
def _batch_loss(model, labels, text):
    """Move one batch onto `device` as long tensors and return the model's loss."""
    # Cast and move in one step; `.type(torch.LongTensor)` would first copy the
    # tensor to the CPU before it is moved to `device`.
    labels = labels.to(device=device, dtype=torch.long)
    text = text.to(device=device, dtype=torch.long)
    loss, _ = model(text, labels)
    return loss


def train(model,
          optimizer,
          criterion = nn.BCELoss(),
          train_loader = train_iter,
          valid_loader = valid_iter,
          num_epochs = NUM_EPOCHS,
          eval_every = len(train_iter) // 2,
          file_path = destination_folder,
          best_valid_loss = float("Inf")):
    """Fine-tune `model`, validating periodically and checkpointing the best state.

    Parameters
    ----------
    model : callable
        Called as ``model(text, labels)``; must return ``(loss, other)``.
    optimizer : torch.optim.Optimizer
        Optimizer over the model's parameters.
    criterion :
        Not used by this function - the loss comes from the model's own
        output. Kept only so existing calls keep working.
    train_loader, valid_loader :
        Iterables yielding ``((labels, text), _)`` batches.
    num_epochs : int
        Number of passes over `train_loader`.
    eval_every : int
        Run validation every this many training steps. Values below 1 are
        raised to 1.
    file_path : str
        Folder receiving ``model.pt`` and ``metrics.pt``.
    best_valid_loss : float
        Validation loss to beat before the first checkpoint is written.

    Notes
    -----
    The defaults are evaluated once, when this ``def`` is executed, so they
    are bound to whatever `train_iter`, `valid_iter`, `NUM_EPOCHS` and
    `destination_folder` were at that moment.
    """
    # With fewer than two training batches the default `len(train_iter) // 2`
    # is 0, and `global_step % 0` below would raise ZeroDivisionError.
    eval_every = max(1, eval_every)

    # initialize running values
    running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        for (labels, text), _ in train_loader:
            loss = _batch_loss(model, labels, text)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            if global_step % eval_every != 0:
                continue

            # evaluation step: full pass over the validation loader, no gradients
            valid_running_loss = 0.0
            model.eval()
            with torch.no_grad():
                for (labels, text), _ in valid_loader:
                    valid_running_loss += _batch_loss(model, labels, text).item()
            model.train()

            # averages over the steps since the previous evaluation / over all validation batches
            average_train_loss = running_loss / eval_every
            average_valid_loss = valid_running_loss / len(valid_loader)
            train_loss_list.append(average_train_loss)
            valid_loss_list.append(average_valid_loss)
            global_steps_list.append(global_step)

            # resetting running values
            running_loss = 0.0

            # print progress
            print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                  .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                          average_train_loss, average_valid_loss))

            # checkpoint only when the validation loss improves
            if best_valid_loss > average_valid_loss:
                best_valid_loss = average_valid_loss
                save_checkpoint(file_path + '/' + 'model.pt', model, best_valid_loss)
                save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)

    # Final metrics write so the full history is on disk even if the last
    # evaluations did not improve on the best loss.
    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')

# Build the classifier on the selected device and fine-tune it with Adam at a
# learning rate of 2e-5; every other training option uses train()'s defaults.
model = BERT()
model = model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=2e-5)

train(model=model, optimizer=optimizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch [1/20], Step [4962/198500], Train Loss: 0.5403, Valid Loss: 0.5612
Model saved to ==> bert_classification/model.pt
Model saved to ==> bert_classification/metrics.pt
Epoch [1/20], Step [9924/198500], Train Loss: 0.5772, Valid Loss: 0.5893
Epoch [2/20], Step [14886/198500], Train Loss: 0.5503, Valid Loss: 0.5906
Epoch [2/20], Step [19848/198500], Train Loss: 0.5915, Valid Loss: 0.5859
Epoch [3/20], Step [24810/198500], Train Loss: 0.5497, Valid Loss: 0.5886
Epoch [3/20], Step [29772/198500], Train Loss: 0.5922, Valid Loss: 0.6022
Epoch [4/20], Step [34734/198500], Train Loss: 0.5486, Valid Loss: 0.5865
Epoch [4/20], Step [39696/198500], Train Loss: 0.5890, Valid Loss: 0.5836
Epoch [5/20], Step [44658/198500], Train Loss: 0.5520, Valid Loss: 0.5867
Epoch [5/20], Step [49620/198500], Train Loss: 0.5890, Valid Loss: 0.5836


KeyboardInterrupt: 

In [None]:
#train(model=model, optimizer=optimizer)

In [None]:
# Reload the saved loss history and draw both curves against the global step.
train_loss_list, valid_loss_list, global_steps_list = load_metrics(destination_folder + '/metrics.pt')
for loss_curve, curve_label in ((train_loss_list, 'Train'), (valid_loss_list, 'Valid')):
    plt.plot(global_steps_list, loss_curve, label=curve_label)
plt.xlabel('Global Steps')
plt.ylabel('Loss')
plt.legend()
plt.show() 

In [None]:

def evaluate(model, test_loader):
    """Score `model` on `test_loader`, print a report and draw a confusion matrix.

    `test_loader` yields ``((labels, text), _)`` batches. For each batch the
    predicted class is the argmax along dimension 1 of the second value the
    model returns. The report and the matrix both list label 1 first and
    label 0 second, which is why the tick labels read 'Signed' then 'Failed'.
    """
    predicted = []
    expected = []

    model.eval()
    with torch.no_grad():
        for (labels, text), _ in test_loader:
            labels = labels.to(device=device, dtype=torch.long)
            text = text.to(device=device, dtype=torch.long)

            # The first returned value (the loss) is not needed here.
            _, class_scores = model(text, labels)
            predicted += torch.argmax(class_scores, 1).tolist()
            expected += labels.tolist()

    print('Classification Report:')
    print(classification_report(expected, predicted, labels=[1,0], digits=4))

    cm = confusion_matrix(expected, predicted, labels=[1,0])
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax=ax, cmap='Blues', fmt="d")

    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')

    tick_names = ['Signed', 'Failed']
    ax.xaxis.set_ticklabels(tick_names)
    ax.yaxis.set_ticklabels(tick_names)
    
# Fresh model instance, overwritten with the best checkpoint saved during
# training, then scored on the held-out test iterator.
best_model = BERT()
best_model = best_model.to(device)

best_checkpoint_path = destination_folder + '/model.pt'
load_checkpoint(best_checkpoint_path, best_model)

evaluate(best_model, test_iter)

In [None]:
# Hand-computed ratio of two of the four counts to their total (6689 / 9000,
# about 0.743). Presumably these are confusion-matrix cells copied from an
# evaluation run - TODO confirm which run they came from.
n_matching = 5966 + 723
n_all = n_matching + 638 + 1673
n_matching / n_all

In [None]:
# 10000 samples, 5 epochs, 128 seq_len:  accuracy = 0.743
# 100,000 samples, 5 epochs, 128 seq_len: accuracy = 0.818
# 10,000 samples, 5 epochs, 512 seq_len, accuracy = 0.747
# 10,000 samples, 5 epochs, 512 seq len, accuracy = 0.687
# Total wall-clock time since the import cell ran. Measured once so both lines
# report the same value.
elapsed = datetime.now() - startTime
print(elapsed)
# NOTE: N_SAMPLE is only meaningful when the `.sample(n=N_SAMPLE)` line in the
# data-loading cell is enabled; otherwise the full dataset was used.
# The closing parenthesis of print() was missing here (SyntaxError).
print("sequence length:{}, N samples:{}, time:{}".format(MAX_SEQ_LEN, N_SAMPLE, elapsed))