In [None]:
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import spacy
import os

base_directory = 'nlp/stanfordSentimentTreebank/'
sentences = pd.read_csv('nlp/stanfordSentimentTreebank/datasetSentences.txt', index_col="sentence_index",
                                sep="\t")
splits = pd.read_csv(os.path.join(base_directory, "datasetSplit.txt"), index_col="sentence_index")
sentences = sentences.join(splits)
sentences = sentences.sort_values(by=['splitset_label', 'sentence_index'])
sentences.reset_index(inplace=True)

sents_df = pd.DataFrame()

for i,data in enumerate(['nlp/trees/train.txt', 'nlp/trees/test.txt', 'nlp/trees/dev.txt']):
  new = pd.read_csv(data, sep="\t", header=None)
  new = new.apply(lambda it: str(it).split()[1][1], axis=1).to_frame().rename(columns={0: 'label'})
  new['set'] = i+1
  sents_df = sents_df.append(new)

sents_df.reset_index(inplace=True, drop=True)
sents_df = pd.concat([sentences, sents_df], axis=1)
sents_df.label = sents_df.label.astype(float)
sents_df.to_pickle('nlp/sents.pkl')


In [None]:
import pandas as pd

phrases = pd.read_csv('nlp/stanfordSentimentTreebank/dictionary.txt', header=None,
                                sep="|").rename(columns={0: 'sentence', 1: 'phrase_id'})
phrase_sentiments = pd.read_csv('nlp/stanfordSentimentTreebank/sentiment_labels.txt',
                                sep="|").rename(columns={'phrase ids': 'phrase_id', 'sentiment values': 'label'})
phrase_df = pd.merge(phrases, phrase_sentiments, on='phrase_id')
phrase_df.to_pickle('nlp/phrases.pkl')

In [5]:
import pandas as pd
import numpy as np

phrase_df = pd.read_pickle('nlp/phrases.pkl')
sents_df = pd.read_pickle('nlp/sents.pkl')

def factorize(value):
  if value < 0.2:
    return 0
  elif 0.2 <= value < 0.4:
    return 1
  elif 0.4<= value < 0.6:
    return 2
  elif 0.6<= value < 0.8:
    return 3
  elif 0.8<= value:
    return 4

phrase_df['label'] = phrase_df['label'].apply(factorize)

In [54]:
import spacy
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, accuracy_score, f1_score

def predict(model, tokenizer, sentence, max_len):
    labels = {0: 'very negative', 1: 'negative', 2: 'neutral', 3: 'positive', 4: 'very positive'}
    model.eval()
    model.to(device)
    input = tokenizer.encode_plus(
      sentence,
      add_special_tokens=True,
      max_length=max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    with torch.no_grad():
      values = []
      input_ids = input["input_ids"].to(device)
      attention_mask = input["attention_mask"].to(device)

      outputs = model(input_ids, attention_mask)
      preds = outputs.logits.argmax().item()

      return labels[preds]

def evaluate(model, test_set):
  test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=128)

  ## Evaluate performance on validation set
  y_pred = []
  y_test = []
  with torch.no_grad():
    model.eval()
    valid_loss = 0
    loss_counter = 0
    for batch, d in enumerate(test_dataloader):
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["labels"].argmax(-1).cpu()

      outputs = model(input_ids, attention_mask)
      preds = outputs.logits.argmax(-1).cpu()

      y_pred.extend(preds)
      y_test.extend(targets)


  print(y_pred, y_test)
  print(confusion_matrix(y_test, y_pred))
  precision, recall, fscore, support  = precision_recall_fscore_support(y_test, y_pred)
  print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))
  print("f1: {}".format(f1_score(y_test, y_pred, average='weighted')))
  print('precision: {}'.format(precision))
  print('recall: {}'.format(recall))
  print('fscore: {}'.format(fscore))
  print('support: {}'.format(support))

In [7]:
import torch
import pandas as pd
import spacy
from torch import nn
from torch.nn import functional as F
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.preprocessing import LabelBinarizer


class SentimentDataset(torch.utils.data.Dataset):
  def __init__(self, dataset, tokenizer) -> None:
    super().__init__()

    df = dataset.reset_index()
    self.labels = list(LabelBinarizer().fit_transform(df['label'].round(0)))
    sents = df['sentence'].to_list()

    self.encodings = tokenizer(sents, truncation=True, padding=True)
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, index):
    item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[index], dtype=torch.float)

    return item

class SentimentClassifier(nn.Module):

  def __init__(self, n_classes):
    super(SentimentClassifier, self).__init__()
    self.bert = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=n_classes)
  
  def forward(self, input_ids, attention_mask):
    output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    return output[0]

In [None]:
## Manual training with pyTorch

import torch
import numpy as np
import time

def train_model(model, train_set, validation_set, max_epochs, batch_size):

  evaluation_data = {
      'train_loss': [],
      'validation_loss': [],
  }

  # Define model and loss functions
  model = model.to(device)
  optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
  criterion = nn.CrossEntropyLoss().to(device)

  # Define dataloaders
  dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size)
  validation_dataloader = torch.utils.data.DataLoader(validation_set, batch_size=64)

  # Train loop
  for epoch in range(max_epochs):
    start_time = time.time()
    model.train()

    ## Train in batches
    epoch_loss = 0
    loss_counter = 0
    for batch, d in enumerate(dataloader):
      if batch%100 == 0:
        print(f'Epoch {epoch} - Batch {batch} of {int(len(train_set)/batch_size)}')
      optimizer.zero_grad()

      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["labels"].to(device)

      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
      _, preds = torch.max(outputs, dim=1)
      loss = criterion(outputs, targets)

      loss.backward()
      nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
      optimizer.step()

      epoch_loss += loss.item()
      loss_counter +=1

    epoch_loss /= loss_counter
    

    ## Evaluate performance on validation set
    print(f'Epoch {epoch} - Evaluating validation set')
    with torch.no_grad():
      model.eval()
      valid_loss = 0
      loss_counter = 0
      for batch, d in enumerate(validation_dataloader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = criterion(outputs, targets)

        valid_loss += loss.item()
        loss_counter +=1

      valid_loss /= loss_counter

    ## update evaluation data
    evaluation_data['train_loss'].append(epoch_loss)
    evaluation_data['validation_loss'].append(valid_loss)

    print("Train Epoch {}: Time {}s |  Loss - {} | Validation loss - {}".format(epoch, int(time.time() - start_time), epoch_loss, valid_loss))
    print("----------------------------------------")
  return evaluation_data


In [46]:
## Train with Trainer
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

def compute_metrics(pred):
  labels = pred.label_ids.argmax(-1)
  preds = pred.predictions.argmax(-1)

  return {
      'accuracy': accuracy_score(labels, preds),
      'f1': f1_score(labels, preds, average='weighted')
  }


device = "cuda"
distilbert_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased", do_lower_case=True, model_max_length=512, batched=True)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=5, problem_type="multi_label_classification").to(device)


df = phrase_df.sample(frac=1)
phrase_train_dataset, phrase_test_dataset = train_test_split(df, test_size=0.2, shuffle=True)
phrase_test_dataset, phrase_validation_dataset = train_test_split(phrase_test_dataset, test_size=0.5)

phrase_train_dataset = SentimentDataset(phrase_train_dataset, distilbert_tokenizer)
phrase_validation_dataset = SentimentDataset(phrase_validation_dataset, distilbert_tokenizer)
phrase_test_dataset = SentimentDataset(phrase_test_dataset, distilbert_tokenizer)

training_args = TrainingArguments(
    seed=0,
    output_dir='nlp/bert/phrases/results',          # output directory
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='nlp/bert/phrases/logs',            # directory for storing logs
    logging_steps=500,
    no_cuda=False,
    load_best_model_at_end=True,
    evaluation_strategy="steps"
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=phrase_train_dataset,         # training dataset
    eval_dataset=phrase_validation_dataset,             # evaluation dataset
    compute_metrics=compute_metrics
)

trainer.train()



loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.20430bd8e10ef77a7d2977accef

Step,Training Loss,Validation Loss,Accuracy,F1
500,0.3939,0.304649,0.638355,0.617146
1000,0.3031,0.29726,0.642117,0.630199
1500,0.2946,0.281509,0.665817,0.661126
2000,0.2919,0.280302,0.667614,0.668088
2500,0.2867,0.278043,0.671836,0.651935
3000,0.2776,0.279748,0.669119,0.670899
3500,0.2752,0.272573,0.680572,0.680049
4000,0.2755,0.266595,0.685253,0.683794
4500,0.2687,0.264857,0.69169,0.68432
5000,0.2719,0.262613,0.693989,0.691101


***** Running Evaluation *****
  Num examples = 23924
  Batch size = 64
Saving model checkpoint to nlp/bert/phrases/results/checkpoint-500
Configuration saved in nlp/bert/phrases/results/checkpoint-500/config.json
Model weights saved in nlp/bert/phrases/results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 23924
  Batch size = 64
Saving model checkpoint to nlp/bert/phrases/results/checkpoint-1000
Configuration saved in nlp/bert/phrases/results/checkpoint-1000/config.json
Model weights saved in nlp/bert/phrases/results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 23924
  Batch size = 64
Saving model checkpoint to nlp/bert/phrases/results/checkpoint-1500
Configuration saved in nlp/bert/phrases/results/checkpoint-1500/config.json
Model weights saved in nlp/bert/phrases/results/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 23924
  Batch size = 64
Saving model checkpoint to nlp/bert/p

TrainOutput(global_step=11962, training_loss=0.26136551515610795, metrics={'train_runtime': 6554.3315, 'train_samples_per_second': 58.4, 'train_steps_per_second': 1.825, 'total_flos': 6536482594428600.0, 'train_loss': 0.26136551515610795, 'epoch': 2.0})

In [40]:
train = sents_df[sents_df.splitset_label == 1]
test = sents_df[sents_df.splitset_label == 2]
dev = sents_df[sents_df.splitset_label == 3]

sents_train_dataset = SentimentDataset(train, distilbert_tokenizer)
sents_validation_dataset = SentimentDataset(dev, distilbert_tokenizer)
sents_test_dataset = SentimentDataset(test, distilbert_tokenizer)

trainer.predict(phrase_test_dataset)

***** Running Prediction *****
  Num examples = 2392
  Batch size = 64


PredictionOutput(predictions=array([[-5.3413424 , -3.4993482 , -1.093116  ,  0.7655581 , -3.1847749 ],
       [-6.4875865 , -4.6604743 ,  2.3212569 , -2.3729525 , -5.8557615 ],
       [-2.3275664 , -0.7624236 , -1.0487303 , -1.5854385 , -3.6403081 ],
       ...,
       [-4.030131  , -0.7505966 ,  0.33073053, -3.065243  , -5.710458  ],
       [-6.044316  , -4.6209316 ,  3.121001  , -3.2444077 , -5.768075  ],
       [-1.1600947 ,  0.29896525, -2.2655993 , -4.1514773 , -5.298276  ]],
      dtype=float32), label_ids=array([[0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.]], dtype=float32), metrics={'test_loss': 0.2983350455760956, 'test_accuracy': 0.6500836120401338, 'test_f1': 0.6450424637728529, 'test_runtime': 8.2458, 'test_samples_per_second': 290.086, 'test_steps_per_second': 4.608})

In [13]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments
device = "cuda"
distilbert_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased", do_lower_case=True, model_max_length=512, batched=True)
model = DistilBertForSequenceClassification.from_pretrained("nlp/bert/phrases/results/checkpoint-11000", num_labels=5, problem_type="multi_label_classification").to(device)

train = sents_df[sents_df.splitset_label == 1]
test = sents_df[sents_df.splitset_label == 2]
dev = sents_df[sents_df.splitset_label == 3]

sents_train_dataset = SentimentDataset(train, distilbert_tokenizer)
sents_validation_dataset = SentimentDataset(dev, distilbert_tokenizer)
sents_test_dataset = SentimentDataset(test, distilbert_tokenizer)

def compute_metrics(pred):
  labels = pred.label_ids.argmax(-1)
  preds = pred.predictions.argmax(-1)

  return {
      'accuracy': accuracy_score(labels, preds),
      'f1': f1_score(labels, preds, average='weighted'),
      'confusion': str(confusion_matrix(labels, preds))
  }

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    compute_metrics=compute_metrics
)

trainer.predict(sents_test_dataset)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 2210
  Batch size = 8


PredictionOutput(predictions=array([[-5.8134    , -0.76526517,  0.3534432 , -2.4080489 , -7.5354104 ],
       [-8.696852  , -5.9402914 , -2.24961   ,  1.4550132 , -2.2659087 ],
       [-9.246331  , -7.2609954 , -4.455598  , -0.42864442,  0.35292432],
       ...,
       [-9.368487  , -7.0722756 , -3.8692892 ,  0.14449883, -0.25039145],
       [-3.8700454 , -0.8647524 ,  0.12746282, -2.109626  , -5.8655405 ],
       [ 0.91129225, -1.170349  , -3.744451  , -6.723289  , -7.957592  ]],
      dtype=float32), label_ids=array([[0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.]], dtype=float32), metrics={'test_loss': 0.2642548382282257, 'test_accuracy': 0.6954751131221719, 'test_f1': 0.6912560441433127, 'test_confusion': '[[169 108   2   0   0]\n [ 81 480  59  12   1]\n [  6 105 190  85   3]\n [  0   6  25 391  88]\n [  0   0   4  88 307]]', 'test_runtime': 11.9404, 'te

In [30]:
evaluate(model, sents_test_dataset)

[tensor(2), tensor(3), tensor(4), tensor(3), tensor(4), tensor(3), tensor(3), tensor(3), tensor(3), tensor(4), tensor(4), tensor(4), tensor(4), tensor(3), tensor(3), tensor(2), tensor(4), tensor(4), tensor(4), tensor(4), tensor(4), tensor(3), tensor(3), tensor(3), tensor(3), tensor(3), tensor(3), tensor(1), tensor(3), tensor(4), tensor(4), tensor(2), tensor(2), tensor(3), tensor(1), tensor(4), tensor(3), tensor(4), tensor(1), tensor(2), tensor(3), tensor(0), tensor(3), tensor(4), tensor(3), tensor(2), tensor(4), tensor(4), tensor(3), tensor(3), tensor(3), tensor(3), tensor(3), tensor(4), tensor(3), tensor(3), tensor(3), tensor(3), tensor(3), tensor(4), tensor(2), tensor(3), tensor(3), tensor(2), tensor(3), tensor(3), tensor(4), tensor(4), tensor(4), tensor(3), tensor(2), tensor(4), tensor(4), tensor(4), tensor(3), tensor(2), tensor(4), tensor(4), tensor(3), tensor(3), tensor(3), tensor(4), tensor(3), tensor(3), tensor(4), tensor(4), tensor(4), tensor(3), tensor(3), tensor(3), tensor(1)

In [55]:
predict(model, distilbert_tokenizer, "I don't think that the movie, which my father told me about last night when we returned home, was amazing", 512)



'negative'

In [56]:
predict(model, distilbert_tokenizer, "I think that the movie, which my father told me about last night when we returned home, was amazing", 512)



'very positive'

In [146]:
predict(model, distilbert_tokenizer, "The chicken crossed the road, it got struck by a car, but it survived and it's rich ", 512)



'positive'