#### The pre-trained sentiment model used below was taken from the extra-credit homework.

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 4.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 321 kB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 47.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 66.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 59.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [2]:
import torch
from google.colab import drive
# Mount Google Drive so the saved model and the TSV data files are reachable.
drive.mount('/content/drive', force_remount=True)

# Everything below runs on the GPU; fail early with a readable message if none is attached.
assert torch.cuda.is_available(), "CUDA GPU required: enable one under Runtime > Change runtime type"
device = torch.device("cuda")

# Absolute location under the mount point, so file lookups do not depend on
# the notebook's current working directory.
path = '/content/drive/MyDrive/CS685'

Mounted at /content/drive


# Sentiment Analysis


In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import pandas as pd
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
import os

# Directory (under `path`) holding the fine-tuned checkpoint.
saved_model = os.path.join(path, "bert-base-cased-finetuned-sst")

# local_files_only=True: load strictly from that directory, never from the Hub.
tokenizer = AutoTokenizer.from_pretrained(
    saved_model,
    local_files_only=True,
)
model = AutoModelForSequenceClassification.from_pretrained(
    saved_model,
    local_files_only=True,
).to(device)
config = AutoConfig.from_pretrained(
    saved_model,
    local_files_only=True,
)

In [4]:
class GetDataset(Dataset):
  """Tokenized view of a dataframe with 'sentence' and 'label' columns.

  All sentences are tokenized once, up front, in a single padded batch
  (``padding=True``, ``return_tensors='pt'``); the labels are stored next to
  the tokenizer output under the key ``'labels'``.

  Args:
    dataframe: frame providing the 'sentence' (text) and 'label' columns.
    tokenizer: callable applied to the list of sentences; its result must
      support ``['input_ids']``, ``.update(...)`` and ``.keys()``.
  """

  def __init__(self, dataframe, tokenizer):
    # Read from the `dataframe` argument, not from a notebook-global `df`,
    # so the dataset always reflects the frame it was constructed with.
    texts = dataframe['sentence'].tolist()
    labels = dataframe['label'].tolist()

    self.n_examples = len(labels)
    self.inputs = tokenizer(texts, add_special_tokens=True, padding=True, return_tensors='pt')
    # Last dimension of the padded id tensor = common (padded) sequence length.
    self.sequence_len = self.inputs['input_ids'].shape[-1]
    self.inputs.update({'labels': torch.tensor(labels)})

  def __len__(self):
    """Return the number of examples (rows of the source dataframe)."""
    return self.n_examples

  def __getitem__(self, item):
    """Return every stored field (tokenizer outputs and 'labels') at index `item`."""
    return {key: self.inputs[key][item] for key in self.inputs.keys()}


In [5]:
def predict_sentiment(dataloader, device_, model):
  """Run `model` over every batch of `dataloader` and collect labels.

  Each batch is a dict of tensors that includes a 'labels' entry. All tensors
  are cast to long and moved to `device_` before being passed to the model as
  keyword arguments. The second element of the model output is treated as the
  logits, and the predicted class is their argmax over the last axis.

  Returns:
    (true_labels, predictions_labels): two flat Python lists, in batch order.
  """
  model.eval()

  gold, guessed = [], []

  with torch.no_grad():
    for raw_batch in tqdm(dataloader, total=len(dataloader)):
      # Record the reference labels before the batch leaves the CPU.
      gold.extend(raw_batch['labels'].numpy().flatten().tolist())

      on_device = {name: tensor.long().to(device_) for name, tensor in raw_batch.items()}

      outputs = model(**on_device)
      # First element is the loss (unused here); second is the logits.
      _, logits = outputs[:2]
      guessed.extend(logits.argmax(dim=-1).flatten().tolist())

  return gold, guessed

### Accuracy of original data

In [6]:
# Evaluate the classifier on the original SST sentences.
gt_tsv = os.path.join(path, "sst_gt_data.tsv")
df = pd.read_csv(gt_tsv, sep='\t')

sentm_dataset = GetDataset(dataframe=df, tokenizer=tokenizer)
# shuffle=False keeps the dataset's example order.
sentm_dataloader = DataLoader(sentm_dataset, batch_size=128, shuffle=False)

true_label, pred_label = predict_sentiment(sentm_dataloader, device, model)

  0%|          | 0/7 [00:00<?, ?it/s]

In [7]:
# Accuracy = share of examples whose predicted label equals the true label.
n_correct = sum(1 for gold, guess in zip(true_label, pred_label) if gold == guess)
print(f"Accuracy = {n_correct / len(true_label)}")

Accuracy = 0.9197247706422018


### Accuracy of noisy data

In [257]:
# Evaluate the classifier on the unnormalized (noisy) sentences.
noisy_tsv = os.path.join(path, "sst_unnormalized_data.tsv")
df = pd.read_csv(noisy_tsv, sep='\t')

sentm_dataset = GetDataset(dataframe=df, tokenizer=tokenizer)
# shuffle=False keeps the dataset's example order.
sentm_dataloader = DataLoader(sentm_dataset, batch_size=128, shuffle=False)

true_label, pred_label = predict_sentiment(sentm_dataloader, device, model)

  0%|          | 0/7 [00:00<?, ?it/s]

In [258]:
# Accuracy = share of examples whose predicted label equals the true label.
n_correct = sum(1 for gold, guess in zip(true_label, pred_label) if gold == guess)
print(f"Accuracy = {n_correct / len(true_label)}")

Accuracy = 0.7981651376146789


### Accuracy of noisy data normalized by the masked language model (MLM) pipeline

In [329]:
# Evaluate the classifier on the MLM-normalized sentences.
mlm_tsv = os.path.join(path, "mlm_sst_normalized.tsv")
df = pd.read_csv(mlm_tsv, sep='\t')

sentm_dataset = GetDataset(dataframe=df, tokenizer=tokenizer)
# shuffle=False keeps the dataset's example order.
sentm_dataloader = DataLoader(sentm_dataset, batch_size=128, shuffle=False)

true_label, pred_label = predict_sentiment(sentm_dataloader, device, model)

  0%|          | 0/7 [00:00<?, ?it/s]

In [330]:
# Accuracy = share of examples whose predicted label equals the true label.
n_correct = sum(1 for gold, guess in zip(true_label, pred_label) if gold == guess)
print(f"Accuracy = {n_correct / len(true_label)}")

Accuracy = 0.8555045871559633


### Accuracy of noisy data normalized by fine-tuned T5

In [343]:
# Evaluate the classifier on the T5-normalized sentences.
t5_tsv = os.path.join(path, "t5_sst_normalized.tsv")
df = pd.read_csv(t5_tsv, sep='\t')

sentm_dataset = GetDataset(dataframe=df, tokenizer=tokenizer)
# shuffle=False keeps the dataset's example order.
sentm_dataloader = DataLoader(sentm_dataset, batch_size=128, shuffle=False)

true_label, pred_label = predict_sentiment(sentm_dataloader, device, model)

  0%|          | 0/7 [00:00<?, ?it/s]

In [344]:
# Accuracy = share of examples whose predicted label equals the true label.
n_correct = sum(1 for gold, guess in zip(true_label, pred_label) if gold == guess)
print(f"Accuracy = {n_correct / len(true_label)}")

Accuracy = 0.8623853211009175
