In [None]:
import numpy as np
import pandas as pd

from pathlib import Path
from typing import *

import torch
import torch.optim as optim

In [100]:
from fastai import *
from fastai.vision import *
from fastai.text import *
from fastai.callbacks import *

In [101]:
import fastai
fastai.__version__

'1.0.60'

In [123]:
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)

config = Config(
    testing=False,
    bert_model_name="bert-base-uncased",
    max_lr=1e-2,
    epochs=1,
    use_fp16=False,
    bs=32,
    discriminative=False,
    max_seq_len=256,
)

In [103]:
from pytorch_pretrained_bert import BertTokenizer
bert_tok = BertTokenizer.from_pretrained(
    config.bert_model_name,
)

In [104]:
def _join_texts(texts:Collection[str], mark_fields:bool=False, sos_token:Optional[str]=BOS):
    """Borrowed from fast.ai source"""
    if not isinstance(texts, np.ndarray): texts = np.array(texts)
    if is1d(texts): texts = texts[:,None]
    df = pd.DataFrame({i:texts[:,i] for i in range(texts.shape[1])})
    text_col = f'{FLD} {1} ' + df[0].astype(str) if mark_fields else df[0].astype(str)
    if sos_token is not None: text_col = f"{sos_token} " + text_col
    for i in range(1,len(df.columns)):
        #text_col += (f' {FLD} {i+1} ' if mark_fields else ' ') + df[i]
        text_col += (f' {FLD} {i+1} ' if mark_fields else ' ') + df[i].astype(str)
    return text_col.values

In [105]:
class FastAiBertTokenizer(BaseTokenizer):
    """Wrapper around BertTokenizer to be compatible with fast.ai"""
    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int=128, **kwargs):
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, *args, **kwargs):
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Limits the maximum sequence length"""
        return ["[CLS]"] + self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2] + ["[SEP]"]

In [106]:
location = "C:\\Users\\osrivast\\Desktop\\Everything_work\\Research\\BERTvsULMFit\\dataset"
src_fol = Path(location)
#Decide the train size
file_train = "trainset.xlsx"
datafile = file_train

train = pd.read_excel(datafile)
train.shape



(1649, 2)

In [107]:
train.shape

(1649, 2)

In [108]:
file_test = "testsetBERT.xlsx"
datafile_test = file_test

test = pd.read_excel(datafile_test)
test.shape

(576, 2)

In [109]:
train['text'] = train['text'].str.replace("[^a-zA-Z]", " ")

import nltk
nltk.download('stopwords')


from nltk.corpus import stopwords 
stop_words = stopwords.words('english')
tokenized_doc = train['text'].apply(lambda x: x.split())



# remove stop-words 
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

# de-tokenization 
detokenized_doc = [] 
for i in range(len(train)): 
    t = ' '.join(tokenized_doc[i]) 
    detokenized_doc.append(t) 
    
train['text'] = detokenized_doc   

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/osrivastava/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [110]:
from sklearn.model_selection import train_test_split
#We know the training size here
train, val = train_test_split(train, stratify = train['category'], test_size = 0.2, random_state = 12)
train.shape, val.shape

((1319, 2), (330, 2))

In [111]:
fastai_bert_vocab = Vocab(list(bert_tok.vocab.keys()))

In [112]:
fastai_tokenizer = Tokenizer(tok_func=FastAiBertTokenizer(bert_tok, max_seq_len=config.max_seq_len), pre_rules=[], post_rules=[])

In [113]:
#label_cols = ['category']
label_cols = ['business', 'entertainment', 'politics', 'sport', 'tech']

In [114]:
class BertTokenizeProcessor(TokenizeProcessor):
    def __init__(self, tokenizer):
        super().__init__(tokenizer=tokenizer, include_bos=False, include_eos=False)

class BertNumericalizeProcessor(NumericalizeProcessor):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, vocab=Vocab(list(bert_tok.vocab.keys())), **kwargs)

def get_bert_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """
    Constructing preprocessors for BERT
    We remove sos/eos tokens since we add that ourselves in the tokenizer.
    We also use a custom vocabulary to match the numericalization with the original BERT model.
    """
    return [BertTokenizeProcessor(tokenizer=tokenizer),
            NumericalizeProcessor(vocab=vocab)]

In [115]:
class BertDataBunch(TextDataBunch):
    @classmethod
    def from_df(cls, path:PathOrStr, train_df:DataFrame, valid_df:DataFrame, test_df:Optional[DataFrame]=None,
                tokenizer:Tokenizer=None, vocab:Vocab=None, classes:Collection[str]=None, text_cols:IntsOrStrs=1,
                label_cols:IntsOrStrs=0, label_delim:str=None, **kwargs) -> DataBunch:
        "Create a `TextDataBunch` from DataFrames."
        p_kwargs, kwargs = split_kwargs_by_func(kwargs, get_bert_processor)
        # use our custom processors while taking tokenizer and vocab as kwargs
        processor = get_bert_processor(tokenizer=tokenizer, vocab=vocab, **p_kwargs)
        if classes is None and is_listy(label_cols) and len(label_cols) > 1: classes = label_cols
        src = ItemLists(path, TextList.from_df(train_df, path, cols=text_cols, processor=processor),
                        TextList.from_df(valid_df, path, cols=text_cols, processor=processor))
        src = src.label_for_lm() if cls==TextLMDataBunch else src.label_from_df(cols=label_cols, classes=classes)
        if test_df is not None: src.add_test(TextList.from_df(test_df, path, cols=text_cols))
        return src.databunch(**kwargs)

In [116]:
databunch = TextDataBunch.from_df(".", train, val, 
                                  test, tokenizer=fastai_tokenizer, 
                                  vocab=fastai_bert_vocab, 
                                  bs=32, include_bos=False,
     include_eos=False, collate_fn=partial(pad_collate, pad_first=False))

In [117]:
from pytorch_pretrained_bert.modeling import BertConfig, BertForSequenceClassification
bert_model = BertForSequenceClassification.from_pretrained(config.bert_model_name, num_labels = 5)

In [118]:
#loss_func = nn.BCEWithLogitsLoss()
#loss_func = nn.NLLLoss

loss_func = nn.CrossEntropyLoss()

In [119]:
from fastai.callbacks import *

learner = Learner(
    databunch, bert_model,
    loss_func=loss_func,
)

In [121]:
#LR for LM and epochs
#Takes like 10 mins
learner.fit_one_cycle(5, 1e-2, moms=(0.8,0.7), wd=0.1)

epoch,train_loss,valid_loss,time
0,2.175191,2.630991,18:41
1,2.91663,1.806864,11:49
2,2.739515,1.919592,13:39
3,2.258651,1.727643,12:14
4,1.909431,1.605483,12:17


In [124]:
learner_safety = learner
#Takes 15 mins per epoch
#LR for classifier and epoch
learner.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,time
0,2.053079,1.708304,10:23


In [125]:
fastai.__version__

'1.0.60'

In [145]:
preds, y = learner.get_preds()

In [147]:
predictions = np.argmax(preds, axis = 1)

In [146]:
pd.crosstab(preds, y)

col_0,0,1,2,3,4
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(tensor(0.1394), tensor(0.0791), tensor(0.2443), tensor(0.2557), tensor(0.2815))",0,1,0,1,2
"(tensor(0.1394), tensor(0.0791), tensor(0.2443), tensor(0.2557), tensor(0.2815))",0,0,0,1,1
"(tensor(0.1394), tensor(0.0791), tensor(0.2443), tensor(0.2557), tensor(0.2815))",5,8,4,3,4
"(tensor(0.1394), tensor(0.0791), tensor(0.2443), tensor(0.2557), tensor(0.2815))",1,1,0,0,0
"(tensor(0.1394), tensor(0.0791), tensor(0.2443), tensor(0.2557), tensor(0.2815))",0,0,0,0,1
"(tensor(0.1394), tensor(0.0791), tensor(0.2443), tensor(0.2557), tensor(0.2815))",1,0,1,0,2
"(tensor(0.1394), tensor(0.0791), tensor(0.2443), tensor(0.2557), tensor(0.2815))",1,1,0,1,0
"(tensor(0.1394), tensor(0.0791), tensor(0.2443), tensor(0.2557), tensor(0.2815))",0,0,1,0,0
"(tensor(0.1394), tensor(0.0791), tensor(0.2443), tensor(0.2557), tensor(0.2815))",1,2,1,1,1
"(tensor(0.1394), tensor(0.0791), tensor(0.2443), tensor(0.2557), tensor(0.2815))",5,3,3,6,7


In [148]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

accuracy = accuracy_score(predictions, y)
recall = recall_score(predictions, y, average='macro')
f1 = f1_score(predictions, y, average='macro')

In [149]:
# AUC_ROC_CURVE for text classifier
from fastai import metrics
metrics.auc_roc_score(predictions, y)

tensor(0.5000)

In [150]:
accuracy, recall, f1

(0.17575757575757575, 0.03515151515151515, 0.059793814432989686)