In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 24.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 47.4MB/s 
Collecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 51.6MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |█

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import transformers
from transformers import BertModel, BertTokenizer,AutoTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F


In [4]:
from transformers import DistilBertTokenizer, DistilBertModel

In [6]:
import os
import time
from tqdm import notebook
from functools import partial

In [24]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [7]:
class ReviewDataset(Dataset):
    def __init__(self, poems, targets, tokenizer, max_len):
        self.poems = poems
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len 
    
    def __len__(self):
        return len(self.poems)
    
    def __getitem__(self, item):
        poem = str(self.poems[item])
        target = self.targets[item]
        
        encoding = self.tokenizer.encode_plus(
            poem,
            add_special_tokens = True,
            max_length = self.max_len,
            return_token_type_ids = False,
            pad_to_max_length = True,
            return_attention_mask = True,
            return_tensors = 'pt',)
        
        output = {
            'poem_text': poem,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype = torch.long)
        }
        
        return output

def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = ReviewDataset(
            poems = df.clean_content.to_numpy(), #use clean_content for now
            targets = df.label.to_numpy(),
            tokenizer = tokenizer,
            max_len = max_len)
    
    return DataLoader(
        ds,
        batch_size = batch_size,
        shuffle = True
    )

In [8]:
cd /content/drive/MyDrive/Colab Notebooks/NLP proj

/content/drive/MyDrive/Colab Notebooks/NLP proj


In [None]:
! ls

BERT.ipynb  data_thresh40


### Loading Data

In [9]:
pwd_train_40 = '/content/drive/MyDrive/Colab Notebooks/NLP proj/data_thresh40/train_data.csv'
pwd_val_40 = '/content/drive/MyDrive/Colab Notebooks/NLP proj/data_thresh40/val_data.csv'
pwd_test_40 = '/content/drive/MyDrive/Colab Notebooks/NLP proj/data_thresh40/test_data.csv'


In [10]:
train_df = pd.read_csv(pwd_train_40)
val_df = pd.read_csv(pwd_val_40)
test_df = pd.read_csv(pwd_test_40)

In [None]:
train_df.head()

In [11]:
train_df['source_len'] = train_df['clean_content'].apply(lambda x: len(x.replace('\n',' ').split()))


In [12]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(train_df['author'])
y_train=le.transform(train_df['author'])
y_val=le.transform(val_df['author'])
y_test=le.transform(test_df['author'])

In [13]:
train_df['label'] = y_train
val_df['label'] = y_val
test_df['label'] = y_test

In [14]:
MAX_LEN = 512 #int(train_df['source_len'].quantile(0.95)) ###
BATCH_SIZE = 64
MAX_LEN

512

In [None]:
train_df.author.nunique()

12

In [15]:
nclasses = 12
PRE_TRAINED_MODEL_NAME = 'distilbert-base-cased'

In [40]:
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer_auto = AutoTokenizer.from_pretrained('bert-base-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




In [16]:
tokenizer_distil = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [17]:
train_data_loader = create_data_loader(train_df, tokenizer_distil, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(val_df, tokenizer_distil, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_df, tokenizer_distil, MAX_LEN, BATCH_SIZE)


In [None]:
ds = ReviewDataset(
            poems = train_df.clean_content.to_numpy(), #use clean_content for now
            targets = train_df.label.to_numpy(),
            tokenizer = tokenizer_auto,
            max_len = MAX_LEN)
    
ds

<__main__.ReviewDataset at 0x7f71b4aef2b0>

In [None]:
ds[0]



{'attention_mask': tensor([1, 1, 1,  ..., 1, 1, 1]),
 'input_ids': tensor([  101,   783,  1108,  ..., 16516, 19726,   102]),
 'poem_text': '—was this\nthat one fairest river lovd\nto blend murmur nurs song\nand alder shade rocki falls\nand ford shallow sent voice\nthat flowd along dream didst thou\no derwent travel green plains\nnear sweet birthplac didst thou beauteous stream\nmak ceaseless music night day\nwhich steadi cadenc tempering\nour human wayward composd thoughts\nto infant soft give me\namong fret dwell mankind\na knowledg dim earnest calm\nthat natur breath among hill groves\nwhen left mountain towers\nof cockermouth beauteous river came\nbehind father hous passd close by\nalong margin terrac walk\nh playmat dear lovd\noh mani time five year child\na nake boy one delight rill\na littl millrac severd stream\nmad one long bath summer day\nbaskd sun plung baskd again\naltern summer day coursd\nov sandi field leap groves\nof yellow grunsel crag hill\nth wood distant skiddaw lof

### Classifer/Model

In [18]:
class AuthorClassifier(nn.Module):
    def __init__(self, n_classes=12, dropout=0.3):
        super(AuthorClassifier, self).__init__()
        self.bert = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.dropout = nn.Dropout(p=dropout)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
          input_ids=input_ids,
          attention_mask=attention_mask
        )
        output = self.dropout(pooled_output)
        output = self.out(output)
        return output

In [19]:
def train_epoch(
  model, 
  data_loader, 
  loss_fn, 
  optimizer, 
  device, 
  scheduler, 
  n_examples
    ):
    model = model.train()

    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask
        )

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)


In [20]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()

    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)

In [21]:
def get_predictions(model, data_loader):
    model = model.eval()

    poem_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:

            texts = d["doc_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            probs = F.softmax(outputs, dim=1)

            poem_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(probs)
            real_values.extend(targets)

    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()
    return poem_texts, predictions, prediction_probs, real_values

In [29]:
class_names = train_df.author.unique().tolist()
sample_txt = train_df.clean_content[0]

In [49]:
tokens = tokenizer_distil.tokenize(sample_txt)
token_ids = tokenizer_distil.convert_tokens_to_ids(tokens)
distil_encoding = tokenizer_distil.encode_plus(
                  sample_txt,
                  max_length=MAX_LEN,
                  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
                  return_token_type_ids=False,
                  pad_to_max_length=True,
                  return_attention_mask=True,
                  return_tensors='pt',  # Return PyTorch tensors
                )





In [50]:
data = next(iter(train_data_loader))



In [51]:
distilbert_model = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

last_hidden_state, pooled_output = distilbert_model(
                input_ids=encoding['input_ids'], 
                attention_mask=encoding['attention_mask'])

ValueError: ignored

In [25]:
model = AuthorClassifier(nclasses)
model = model.to(device)

In [26]:
#model = AuthorClassifier(nclasses)
#model = model.to(device)

#input_ids = data['input_ids'].to(device)
#attention_mask = data['attention_mask'].to(device)


EPOCHS = 10

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=0,
num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)


history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
    model,
    train_data_loader,    
    loss_fn, 
    optimizer, 
    device, 
    scheduler, 
    len(train_df)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn, 
    device, 
    len(df_val)
    )

    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc
    
test_acc, _ = eval_model(
      model,
      test_data_loader,
      loss_fn,
      device,
      len(df_test)
    )
print('\nTest Accuracy:\n')
print(test_acc.item())

y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
model,
test_data_loader
)

print(classification_report(y_test, y_pred, target_names=class_names))


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/10
----------




RuntimeError: ignored