In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!git clone https://github.com/vonsovsky/bert-sentiment.git

Cloning into 'bert-sentiment'...
remote: Enumerating objects: 38, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 38 (delta 16), reused 27 (delta 11), pack-reused 0[K
Unpacking objects: 100% (38/38), done.


In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 5.2 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 66.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 51.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 22.6 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled P

In [4]:
%cd bert-sentiment/

/content/bert-sentiment


In [68]:
import re
import os

import numpy as np
import pandas as pd
import torch
from sklearn import metrics

from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler
from tqdm import tqdm, trange
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer

In [20]:
PAD_TOKEN_LABEL_ID = CrossEntropyLoss().ignore_index

BATCH_SIZE = 16
LEARNING_RATE_MODEL = 1e-5
LEARNING_RATE_CLASSIFIER = 1e-3
WARMUP_STEPS = 0
GRADIENT_ACCUMULATION_STEPS = 1
MAX_GRAD_NORM = 1.0
SEED = 42
NO_CUDA = False

In [54]:
filename = "/content/drive/MyDrive/bert/citypulse.dublin_city_council.test.csv"

data, y_hat = read_dublin_data(filename)


In [None]:
# col_names = ['sentiment', 'id', 'date', 'query', 'user', 'text']
# df_train = pd.read_csv("/content/drive/MyDrive/data/Sentiment140-train.csv", encoding="latin-1", header = None, names = col_names)
# df_test = pd.read_csv("/content/drive/MyDrive/data/Sentiment140-test.csv", encoding="latin-1", header = None, names = col_names)
# df_dublin = pd.read_csv("/content/drive/MyDrive/data/citypulse.dublin_city_council.test.csv", encoding="latin-1" )

In [50]:
def rpad(array, n):
    current_len = len(array)
    if current_len > n:
        return array[:n]
    extra = n - current_len
    return array + ([0] * extra)


def convert_to_embedding(tokenizer, sentences_with_labels):
    for sentence, label in sentences_with_labels:
        tokens = tokenizer.tokenize(sentence)
        tokens = tokens[:250]
        bert_sent = rpad(tokenizer.convert_tokens_to_ids(["CLS"] + tokens + ["SEP"]), n=256)
        yield torch.tensor(bert_sent), torch.tensor(label, dtype=torch.int64)


def parse_line(line):
    line = line.strip().lower()
    line = line.replace("&nbsp;", " ")
    line = re.sub(r'<br(\s\/)?>', ' ', line)
    line = re.sub(r' +', ' ', line)  # merge multiple spaces into one

    return line


def convert_sentiment_dublin(x):
    if x == "neutral":
        return -1
    elif x == "negative":
        return 0
    else:
        return 1


def read_imdb_data(filename):
    data = []
    for line in open(filename, 'r', encoding="utf-8"):
        data.append(parse_line(line))

    return data


def read_dublin_data(filename):
    data = []
    df = pd.read_csv(filename, encoding="latin-1" )
    df['sentiment'] = df['sentiment'].apply(convert_sentiment_dublin)
    data = list(df['text'].values)
    y = np.zeros(len(df))
    y = df['sentiment'].values
    return data, y


def prepare_dataloader(tokenizer, sampler=RandomSampler, train=False):
    #filename = "/content/drive/MyDrive/bert/Sentiment140-train.csv" if train else "/content/drive/MyDrive/bert/Sentiment140-test.csv"
    filename = "/content/drive/MyDrive/bert/citypulse.dublin_city_council.test.csv"

    data, y = read_dublin_data(filename)
    
    sentences_with_labels = zip(data, y.tolist())

    dataset = list(convert_to_embedding(tokenizer, sentences_with_labels))

    sampler_func = sampler(dataset) if sampler is not None else None
    dataloader = DataLoader(dataset, sampler=sampler_func, batch_size=BATCH_SIZE)

    return dataloader

In [60]:
class Transformers:
    model = None

    def __init__(self, tokenizer):
        self.pad_token_label_id = PAD_TOKEN_LABEL_ID
        self.device = torch.device("cuda" if torch.cuda.is_available() and not NO_CUDA else "cpu")
        self.tokenizer = tokenizer

    def predict(self, sentence):
        if self.model is None or self.tokenizer is None:
            self.load()

        embeddings = list(convert_to_embedding([(sentence, -1)]))
        preds = self._predict_tags_batched(embeddings)
        return preds

    def evaluate(self, dataloader):
        from sklearn.metrics import classification_report
        y_pred = self._predict_tags_batched(dataloader)
        y_true = np.zeros(len(y_hat))
        #y_true = y_hat

        score = classification_report(y_true, y_pred)
        print(score)
        return y_true, y_pred

    def _predict_tags_batched(self, dataloader):
        preds = []
        self.model.eval()
        for batch in tqdm(dataloader, desc="Computing NER tags"):
            batch = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                outputs = self.model(batch[0])
                _, is_neg = torch.max(outputs[0], 1)
                preds.extend(is_neg.cpu().detach().numpy())

        return preds

    def train(self, dataloader, model, epochs):
        assert self.model is None  # make sure we are not training after load() command
        model.to(self.device)
        self.model = model

        t_total = len(dataloader) // GRADIENT_ACCUMULATION_STEPS * epochs

        # Prepare optimizer and schedule (linear warmup and decay)
        optimizer_grouped_parameters = [
            {"params": model.bert.parameters(), "lr": LEARNING_RATE_MODEL},
            {"params": model.classifier.parameters(), "lr": LEARNING_RATE_CLASSIFIER}
        ]
        optimizer = AdamW(optimizer_grouped_parameters)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=t_total)

        # Train!
        print("***** Running training *****")
        print("Training on %d examples", len(dataloader))
        print("Num Epochs = %d", epochs)
        print("Total optimization steps = %d", t_total)

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(epochs, desc="Epoch")
        self._set_seed()
        for _ in train_iterator:
            epoch_iterator = tqdm(dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):
                model.train()
                batch = tuple(t.to(self.device) for t in batch)
                outputs = model(batch[0], labels=batch[1])
                loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

                if GRADIENT_ACCUMULATION_STEPS > 1:
                    loss = loss / GRADIENT_ACCUMULATION_STEPS

                loss.backward()

                tr_loss += loss.item()
                if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

                    scheduler.step()  # Update learning rate schedule
                    optimizer.step()
                    model.zero_grad()
                    global_step += 1

        self.model = model

        return global_step, tr_loss / global_step

    def _set_seed(self):
        torch.manual_seed(SEED)
        if self.device == 'gpu':
            torch.cuda.manual_seed_all(SEED)

    def load(self, model_dir='weights/'):
        self.tokenizer = BertTokenizer.from_pretrained(model_dir)
        self.model = BertForSequenceClassification.from_pretrained(model_dir)
        self.model.to(self.device)

In [63]:
def train(epochs=20, output_dir="weights/"):
    num_labels = 2  # negative and positive reviews
    config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

    dataloader = prepare_dataloader(tokenizer, train=True)
    predictor = Transformers(tokenizer)
    predictor.train(dataloader, model, epochs)

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

def evaluate(model_dir="weights/"):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    dataloader = prepare_dataloader(tokenizer, train=False, sampler=None)
    predictor = Transformers(tokenizer)
    predictor.load(model_dir=model_dir)
    y_true, y_pred = predictor.evaluate(dataloader)
    return y_true, y_pred



path = '/content/drive/MyDrive/bert/weights/'
#os.makedirs(path, exist_ok=True)
#train(epochs=10, output_dir=path)
y_true, y_pred = evaluate(model_dir=path)

Computing NER tags:  18%|█▊        | 33/188 [00:09<00:44,  3.45it/s][A
Computing NER tags:  18%|█▊        | 34/188 [00:09<00:44,  3.46it/s][A
Computing NER tags:  19%|█▊        | 35/188 [00:10<00:44,  3.45it/s][A
Computing NER tags:  19%|█▉        | 36/188 [00:10<00:44,  3.44it/s][A
Computing NER tags:  20%|█▉        | 37/188 [00:10<00:43,  3.45it/s][A
Computing NER tags:  20%|██        | 38/188 [00:11<00:43,  3.43it/s][A
Computing NER tags:  21%|██        | 39/188 [00:11<00:43,  3.41it/s][A
Computing NER tags:  21%|██▏       | 40/188 [00:11<00:44,  3.35it/s][A
Computing NER tags:  22%|██▏       | 41/188 [00:11<00:43,  3.38it/s][A
Computing NER tags:  22%|██▏       | 42/188 [00:12<00:43,  3.39it/s][A
Computing NER tags:  23%|██▎       | 43/188 [00:12<00:42,  3.39it/s][A
Computing NER tags:  23%|██▎       | 44/188 [00:12<00:42,  3.38it/s][A
Computing NER tags:  24%|██▍       | 45/188 [00:13<00:42,  3.40it/s][A
Computing NER tags:  24%|██▍       | 46/188 [00:13<00:41,  3.39i

              precision    recall  f1-score   support

         0.0       1.00      0.85      0.92      3000
         1.0       0.00      0.00      0.00         0

    accuracy                           0.85      3000
   macro avg       0.50      0.43      0.46      3000
weighted avg       1.00      0.85      0.92      3000




  _warn_prf(average, modifier, msg_start, len(result))


In [83]:
y_pred_bi = np.array(y_pred)[np.in1d(y_hat, [0,1])]
y_hat_bi = np.array(y_hat)[np.in1d(y_hat, [0,1])]
print(metrics.confusion_matrix(y_hat_bi, y_pred_bi))
print(metrics.classification_report(y_hat_bi, y_pred_bi))
print("Accuracy Score: %.3f" % metrics.accuracy_score(y_hat_bi, y_pred_bi))

[[805 195]
 [897 103]]
              precision    recall  f1-score   support

           0       0.47      0.81      0.60      1000
           1       0.35      0.10      0.16      1000

    accuracy                           0.45      2000
   macro avg       0.41      0.45      0.38      2000
weighted avg       0.41      0.45      0.38      2000

Accuracy Score: 0.454
