Reference: https://shivanandroy.com/fine-tune-t5-transformer-with-pytorch/

In [1]:
!pip install sentencepiece
!pip install transformers
!pip install torch

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 21.9 MB/s eta 0:00:01[K     |▌                               | 20 kB 14.4 MB/s eta 0:00:01[K     |▉                               | 30 kB 10.1 MB/s eta 0:00:01[K     |█                               | 40 kB 8.5 MB/s eta 0:00:01[K     |█▍                              | 51 kB 3.9 MB/s eta 0:00:01[K     |█▋                              | 61 kB 4.4 MB/s eta 0:00:01[K     |██                              | 71 kB 4.4 MB/s eta 0:00:01[K     |██▏                             | 81 kB 5.0 MB/s eta 0:00:01[K     |██▍                             | 92 kB 5.2 MB/s eta 0:00:01[K     |██▊                             | 102 kB 4.2 MB/s eta 0:00:01[K     |███                             | 112 kB 4.2 MB/s eta 0:00:01[K     |███▎                            | 122 kB 4.2 MB/s eta 0:00:01[K     |███▌       

In [2]:
import torch
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Mounted at /content/drive


# Fine-tuning T5 for Text Normalization

In [3]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
import matplotlib
import matplotlib.pyplot as plt
import nltk
import difflib
nltk.download('punkt')

%matplotlib inline 

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1)
path = 'drive/MyDrive/CS685'

class GetDataset(Dataset):
  def get_masked_text(self, idx, text, len):
    text = str(text[idx])
    # converting tab to space
    text = " ".join(text.split())
    encoded_text = self.tokenizer.batch_encode_plus(
        [text],
        max_length=len,
        pad_to_max_length=True,
        truncation=True,
        padding="max_length",
        return_tensors="pt")
    ids = encoded_text['input_ids'].squeeze()
    mask = encoded_text["attention_mask"].squeeze()
    return ids.to(dtype=torch.long), mask.to(dtype=torch.long)
  
  def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.source_len = source_len    
    self.source_text = self.data[source_text]
    self.target_len = target_len
    self.target_text = self.data[target_text]
  
  def __len__(self):
    return len(self.target_text)

  def __getitem__(self, idx):
    source_ids, source_mask = self.get_masked_text(idx, self.source_text, self.source_len)
    target_ids, target_mask = self.get_masked_text(idx, self.target_text, self.target_len)
    return {"source_ids": source_ids.to(device), "source_mask": source_mask.to(device),
            "target_ids": target_ids.to(device), "target_mask": target_mask.to(device)}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.




In [7]:
model_params = {
    "MODEL": "t5-base",
    "TRAIN_BATCH_SIZE": 8,
    "VALID_BATCH_SIZE": 8,
    "TRAIN_EPOCHS": 3,
    "VAL_EPOCHS": 1,
    "TEST_EPOCHS": 1,
    "LEARNING_RATE": 5e-5, #2e-5, 5e-6
    "MAX_SOURCE_TEXT_LENGTH": 512,
    "MAX_TARGET_TEXT_LENGTH": 512,
    "SEED": 42,
}
test_params = {
    "batch_size": model_params["VALID_BATCH_SIZE"],
    "shuffle": False,
    "num_workers": 0,
}


In [None]:
def get_dissimilar_spans(orig_words, gt_words, pred_words):
  gt_matcher = difflib.SequenceMatcher(a=orig_words, b=gt_words)
  pred_matcher = difflib.SequenceMatcher(a=gt_words, b=pred_words)
  orig_spans = []
  gt_spans = []
  pred_spans = []
  mismatch_spans = []
  for codes in gt_matcher.get_opcodes():
    op,a_start,a_end,b_start,b_end = codes
    if op == 'replace':
      orig_spans.append(" ".join(orig_words[a_start:a_end]))
      gt_spans.append(" ".join(gt_words[b_start:b_end]))

  for codes in pred_matcher.get_opcodes():
    op,a_start,a_end,b_start,b_end = codes
    if op == 'replace':
      pred_spans.append(" ".join(pred_words[b_start:b_end]))
      mismatch_spans.append(" ".join(gt_words[a_start:a_end]))
  
  return orig_spans, gt_spans, pred_spans, mismatch_spans

def get_stats_for_predictions(orig_text, gt_text, pred_text):
  orig_words = nltk.word_tokenize(orig_text)
  gt_words = nltk.word_tokenize(gt_text)
  pred_words = nltk.word_tokenize(pred_text)
  orig_words = [word.lower().strip() for word in orig_words]
  gt_words = [word.lower().strip() for word in gt_words]
  pred_words = [word.lower().strip() for word in pred_words]
  correct_preds = []
  wrong_preds = []
  changed_orig_words = []
  changed_gt_words = []
  replaced_word_cnt = 0
  correct_pred_cnt = 0
  if len(orig_words)!= len(gt_words):
    print(orig_text)
    print(gt_text)
  elif len(gt_words)!=len(pred_words):
    orig_spans, gt_spans, pred_spans, mismatch_spans = get_dissimilar_spans(orig_words, gt_words, pred_words)
    wrong_preds = pred_spans
    changed_orig_words = orig_spans
    changed_gt_words = gt_spans
    replaced_word_cnt = len(gt_spans)
    correct_pred_cnt = len(gt_spans) - len(mismatch_spans)
    correct_preds = list(set(gt_spans)-set(mismatch_spans))
  else:
    for i in range(len(orig_words)):
      orig_word = orig_words[i]
      gt_word = gt_words[i]
      pred_word = pred_words[i]
      if orig_word != gt_word:
        changed_orig_words.append(orig_word)
        changed_gt_words.append(gt_word)
        replaced_word_cnt = replaced_word_cnt+1
        if pred_word == gt_word:
          correct_preds.append(pred_word)
          correct_pred_cnt = correct_pred_cnt+1
        else:
          wrong_preds.append(pred_word)

  return {"replaced_gt_words":changed_gt_words,
          "replaced_original_words": changed_orig_words,
          "replaced_word_count": replaced_word_cnt,
          "correct_predictions": correct_preds,
          "correct_prediction_count": correct_pred_cnt,
          "wrong_predictions": wrong_preds}

In [None]:
def get_accuracy_df(input_df, pred_df):
  df = pd.concat([input_df, pred_df], axis=1).drop(columns=['gt_text'])
  df['text'] = df['text'].apply(lambda x : x.replace('denoise_text: ',''))
  df["Stats"] = df.apply(lambda x: get_stats_for_predictions(x["text"], x["GT_Text"],x["Predicted_Text"]), axis = 1)
  df = pd.concat([df.drop(['Stats'], axis=1), df['Stats'].apply(pd.Series)], axis=1)
  return df

In [None]:
def train(epoch, tokenizer, model, loader, optimizer):
  model.train()
  loss_history = []
  for i, data in enumerate(loader, 0):
    y = data["target_ids"]
    y_ids = y[:, :-1].contiguous()
    labels = y[:, 1:].clone().detach()
    labels[y[:, 1:] == tokenizer.pad_token_id] = -100
    ids = data["source_ids"]
    mask = data["source_mask"]
    outputs = model(
        input_ids=ids,
        attention_mask=mask,
        decoder_input_ids=y_ids,
        labels=labels
        )
    loss = outputs[0]
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if i % 50 == 0:
      print("Epoch: {}, Step: {}, Loss: {}".format(epoch,i,loss))
      loss_history.append(loss)
  # plt.plot(range(len(loss_history)), loss_history)


In [9]:
def predict_from_model(epoch, tokenizer, model, loader):
  model.eval()
  predictions = []
  actuals = []
  with torch.no_grad():
    for i, data in enumerate(loader, 0):
      y = data['target_ids']
      ids = data['source_ids']
      mask = data['source_mask']

      # predicting correct sentences from noisy input
      generated_ids = model.generate(
          input_ids = ids,
          attention_mask = mask,
          max_length=512,
          num_beams=2, #3,5
          repetition_penalty=2.5,
          length_penalty=1.0,
          early_stopping=True
          )
      preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
      target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in y]
      preds = [x.lstrip(". ").lstrip(': ') for x in preds]
      if i%150==0:
        print('Completed {} Steps'.format(i))
      
      predictions.extend(preds)
      actuals.extend(target)
  return predictions, actuals

In [None]:
def T5Trainer(
    dataframe, source_text, target_text, model_params, output_dir
):

    torch.manual_seed(model_params["SEED"])
    print(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"]).to(device)

    dataframe = dataframe[[source_text, target_text]]
    display(dataframe.head(2))

    train_size = 0.8
    train_dataset = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
    val_dataset = dataframe.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    train_dataset.to_csv(os.path.join(output_dir, "train_set.csv"), index=False, sep='\t')
    val_dataset.to_csv(os.path.join(output_dir, "val_set.csv"), index=False, sep='\t')

    print(f"FULL Dataset: {dataframe.shape}")
    print(f"TRAIN Dataset: {train_dataset.shape}")
    print(f"TEST Dataset: {val_dataset.shape}\n")

    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0,
    }

    val_params = {
        "batch_size": model_params["VALID_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
    }
    training_set = GetDataset(
        train_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )
    val_set = GetDataset(
        val_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )

    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    print(f"Starting Fine Tuning...\n")

    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train(epoch, tokenizer, model, training_loader, optimizer)

    print(f"Saving Model...\n")
    # Saving the model after training
    path = os.path.join(output_dir, "model_files")
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)

    # evaluating validation dataset
    print(f"Starting Validation...\n")
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = predict_from_model(epoch, tokenizer, model, val_loader)
        final_df = pd.DataFrame({"Predicted_Text": predictions, "GT_Text": actuals})
        accuracy_df = get_accuracy_df(val_dataset, final_df)
        print(f"Validation accuracy: {accuracy_df['correct_prediction_count'].sum()/accuracy_df['replaced_word_count'].sum()}")
        final_df.to_csv(os.path.join(output_dir, "predictions.csv"), index=False, sep='\t')
    
    print(f"Validation Completed...\n")
    print(f"""Model saved at {os.path.join(output_dir, "model_files")}\n""")
    print(f"""Generation on Validation data saved at {os.path.join(output_dir,'predictions.csv')}\n""")

In [None]:
df = pd.read_csv(os.path.join(path, "xsum_train_val.csv"),sep='\t')
df['text'] = 'denoise_text: '+df['text']


In [None]:
import time

output_dir = os.path.join(path,"t5_trained_model_"+str(time.time()))
os.mkdir(output_dir)
T5Trainer(
    dataframe=df,
    source_text="text",
    target_text="gt_text",
    model_params=model_params,
    output_dir=output_dir)

[Model]: Loading t5-base...



Unnamed: 0,text,gt_text
0,"denoise_text: A haul of wht is belved to be cocan wid a street value of £120,000 has been uncovered by engineerS dsantlng a scraPped jumooo jet.","A haul of what is believed to be cocaine with a street value of £120,000 has been uncovered by engineers dismantling a scrapped jumbo jet."
1,denoise_text: Sevennnnn council-run car hms earmarked for closure t save £1.9m have been takEn over by a private company.,Seven council-run care homes earmarked for closure to save £1.9m have been taken over by a private company.


FULL Dataset: (60000, 2)
TRAIN Dataset: (48000, 2)
TEST Dataset: (12000, 2)

Starting Fine Tuning...

Epoch: 0, Step: 0, Loss: 8.920101165771484
Epoch: 0, Step: 50, Loss: 3.739185333251953
Epoch: 0, Step: 100, Loss: 2.454301118850708
Epoch: 0, Step: 150, Loss: 2.605936288833618
Epoch: 0, Step: 200, Loss: 1.9391080141067505
Epoch: 0, Step: 250, Loss: 2.093918561935425
Epoch: 0, Step: 300, Loss: 1.5826873779296875
Epoch: 0, Step: 350, Loss: 1.878882646560669
Epoch: 0, Step: 400, Loss: 1.5180747509002686
Epoch: 0, Step: 450, Loss: 1.6507668495178223
Epoch: 0, Step: 500, Loss: 1.0915071964263916
Epoch: 0, Step: 550, Loss: 1.3303786516189575
Epoch: 0, Step: 600, Loss: 1.51410710811615
Epoch: 0, Step: 650, Loss: 1.119321346282959
Epoch: 0, Step: 700, Loss: 1.1974446773529053
Epoch: 0, Step: 750, Loss: 1.1575895547866821
Epoch: 0, Step: 800, Loss: 0.8258547782897949
Epoch: 0, Step: 850, Loss: 1.0285224914550781
Epoch: 0, Step: 900, Loss: 0.9652339220046997
Epoch: 0, Step: 950, Loss: 0.8514559

## Denoising Test Data

In [None]:
test_df = pd.read_csv(os.path.join(path, "xsum_test.csv"),sep='\t')
test_df['text'] = 'denoise_text: '+test_df['text']

saved_model = os.path.join(path, "t5_trained_model_v1")
tokenizer = T5Tokenizer.from_pretrained(saved_model, local_files_only=True)
model = T5ForConditionalGeneration.from_pretrained(saved_model, local_files_only=True).to(device)



test_set = GetDataset(
        test_df,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text="text",
        target_text="gt_text",
    )

test_loader = DataLoader(test_set, **test_params)
predictions, actuals = predict_from_model(model_params["TEST_EPOCHS"], tokenizer, model, test_loader)

Completed 0 Steps
Completed 150 Steps
Completed 300 Steps
Completed 450 Steps
Completed 600 Steps
Completed 750 Steps
Completed 900 Steps
Completed 1050 Steps
Completed 1200 Steps
Completed 1350 Steps
Completed 1500 Steps


In [None]:
test_preds  = pd.DataFrame({"Predicted_Text": predictions, "GT_Text": actuals})
test_preds.to_csv(os.path.join(path, "xsum_test_pred_t5.csv"), index=False, sep='\t')

## Evaluation on Test Data

In [None]:
test_df = pd.read_csv(os.path.join(path, "xsum_test.csv"),sep='\t')
test_preds = pd.read_csv(os.path.join(path, "xsum_test_pred_t5.csv"), sep='\t')
acc_df = get_accuracy_df(test_df, test_preds)

print(f"Total incorrect tokens: {acc_df['replaced_word_count'].sum()}\n Total correct predictions: {acc_df['correct_prediction_count'].sum()} \nTest accuracy: {acc_df['correct_prediction_count'].sum()/acc_df['replaced_word_count'].sum()}")
display(acc_df.head(2))

Total incorrect tokens: 63595
 Total correct predictions: 55858 
Test accuracy: 0.8783394920984354


Unnamed: 0,text,Predicted_Text,GT_Text,replaced_gt_words,replaced_original_words,replaced_word_count,correct_predictions,correct_prediction_count,wrong_predictions
0,Bangor City MAnageR Kevin Nicholson sayS it Would be a hg achievement if they overturn a 1-0 first-leg dficit against Lyngby BK.,Bangor City manager Kevin Nicholson says it would be a huge achievement if they overturn a 1-0 first-leg deficit against Lyngby BK.,Bangor City manager Kevin Nicholson says it would be a huge achievement if they overturn a 1-0 first-leg deficit against Lyngby BK.,"[huge, deficit]","[hg, dficit]",2,"[huge, deficit]",2,[]
1,dee actor who played Darth Vader in the original Star Wars filmss says hopeufls from Bristol auditioning for the nww film should disgis their accents.,The actor who played Darth Vader in the original Star Wars films says hopefuls from Bristol auditioning for the new film should disgise their accents.,The actor who played Darth Vader in the original Star Wars films says hopefuls from Bristol auditioning for the new film should disguise their accents.,"[the, films, hopefuls, new, disguise]","[dee, filmss, hopeufls, nww, disgis]",5,"[the, films, hopefuls, new]",4,[disgise]


#### Correctly denoised sentences

In [None]:
acc_df[acc_df['replaced_word_count']==acc_df['correct_prediction_count']]\
  .drop(columns=['replaced_original_words','replaced_word_count',
               'correct_prediction_count','wrong_predictions']).head(5)

Unnamed: 0,text,Predicted_Text,GT_Text,replaced_gt_words,correct_predictions
0,Bangor City MAnageR Kevin Nicholson sayS it Would be a hg achievement if they overturn a 1-0 first-leg dficit against Lyngby BK.,Bangor City manager Kevin Nicholson says it would be a huge achievement if they overturn a 1-0 first-leg deficit against Lyngby BK.,Bangor City manager Kevin Nicholson says it would be a huge achievement if they overturn a 1-0 first-leg deficit against Lyngby BK.,"[huge, deficit]","[huge, deficit]"
2,"Catalans Dragons earned a narrow vic2ry at Hull FC, despite four tRies from Airlie Birds wingEr Tom Lineham.","Catalans Dragons earned a narrow victory at Hull FC, despite four goals from Airlie Birds winger Tom Lineham.","Catalans Dragons earned a narrow victory at Hull FC, despite four tries from Airlie Birds winger Tom Lineham.",[victory],[victory]
3,"A driverr whoooo crashed into a shop, klilingg a teenage passenger, after failig t stopppppp for police has been jiled for eight years.","A driver who crashed into a shop, killing a teenage passenger, after failing to stop for police has been jailed for eight years.","A driver who crashed into a shop, killing a teenage passenger, after failing to stop for police has been jailed for eight years.","[driver, who, killing, failing, to, stop, jailed]","[driver, who, killing, failing, to, stop, jailed]"
4,"Kenyan police have bEen Involved in aaa firece gun battle with militant Islamists in north-eastern Garissa County, leaving onee officer wounded, a government spokesman has told the BBC.","Kenyan police have been involved in a fierce gun battle with militant Islamists in north-eastern Garissa County, leaving one officer wounded, a government spokesman has told the BBC.","Kenyan police have been involved in a fierce gun battle with militant Islamists in north-eastern Garissa County, leaving one officer wounded, a government spokesman has told the BBC.","[a, fierce, one]","[a, fierce, one]"
9,The care provided for vulnrble chIldrEn in Leeds has been praised by gvrnment inspectors fve years after the SrviceS were rated inadeqUate.,The care provided for vulnerable children in Leeds has been praised by government inspectors five years after the services were rated inadequate.,The care provided for vulnerable children in Leeds has been praised by government inspectors five years after the services were rated inadequate.,"[vulnerable, government, five, services]","[vulnerable, government, five, services]"


#### Incorrectly denoised sentences

In [None]:
incorrect_df = acc_df[acc_df['replaced_word_count']!=acc_df['correct_prediction_count']]\
  .drop(columns=['replaced_original_words','replaced_word_count',
               'correct_prediction_count'])
incorrect_df.head(10)

Unnamed: 0,text,Predicted_Text,GT_Text,replaced_gt_words,correct_predictions,wrong_predictions
1,dee actor who played Darth Vader in the original Star Wars filmss says hopeufls from Bristol auditioning for the nww film should disgis their accents.,The actor who played Darth Vader in the original Star Wars films says hopefuls from Bristol auditioning for the new film should disgise their accents.,The actor who played Darth Vader in the original Star Wars films says hopefuls from Bristol auditioning for the new film should disguise their accents.,"[the, films, hopefuls, new, disguise]","[the, films, hopefuls, new]",[disgise]
5,"Leicester Haveee complained t the Football Association about abuse, provcoatoin and intimidation thy say their players, fans and stafff suffered during Sturdays match at Millwall.","Leicester have complained to the Football Association about abuse, provision and intimidation they say their players, fans and staff suffered during Saturdays match at Millwall.","Leicester have complained to the Football Association about abuse, provocation and intimidation they say their players, fans and staff suffered during Saturdays match at Millwall.","[have, to, provocation, they, staff, saturdays]","[have, to, they, staff, saturdays]",[provision]
6,Ruth Davidson hasss wrned de SNP aganist using Brexit as an excuse to strtt yet anthr fratricidal conflict.,Ruth Davidson has warned the SNP activist using Brexit as an excuse to strike yet another religious conflict.,Ruth Davidson has warned the SNP against using Brexit as an excuse to start yet another fratricidal conflict.,"[has, warned, the, against, start, another]","[has, warned, the, another]","[activist, strike]"
7,"The lss fff midfielder NGolo Kante to Chelsea maens Leicester wiilllllllll Not repe8 their shock title win of last season, sys formr Foxes striker Gary Lineker.","The loss of midfielder NGolo Kante to Chelsea means Leicester will not replicate their shock title win of last season, says former Foxes striker Gary Lineker.","The loss of midfielder NGolo Kante to Chelsea means Leicester will not repeat their shock title win of last season, says former Foxes striker Gary Lineker.","[loss, of, means, will, repeat, says, former]","[loss, of, means, will, says, former]",[replicate]
8,Cch John Bracewell says the looming decision over Test status is puttng his Ireland team under huge pressure innn the one-day series against England.,Coach John Bracewell says the looming decision over Test status is placing his Ireland team under huge pressure in the one-day series against England.,Coach John Bracewell says the looming decision over Test status is putting his Ireland team under huge pressure in the one-day series against England.,"[coach, putting, in]","[coach, in]",[placing]
10,"Satuurdays singr Rochelle Humes is the New host of The X Factor spin-off sHOw The Xtra Factor, whre she hass been prtnered with Kiss FM DJ Melvin Odoom.","Saturdays singer Rochelle Humes is the new host of The X Factor spin-off Show The Extra Factor, where she has been replaced with Kiss FM DJ Melvin Odoom.","Saturdays singer Rochelle Humes is the new host of The X Factor spin-off show The Xtra Factor, where she has been partnered with Kiss FM DJ Melvin Odoom.","[saturdays, singer, where, has, partnered]","[saturdays, singer, where, has]",[replaced]
12,A seacrh hass resuemd of the River Ness in Inverness after police received reports of a man falling from a bridge oN Sunday.,A seacrooper has resurfaced of the River Ness in Inverness after police received reports of a man falling from a bridge on Sunday.,A search has resumed of the River Ness in Inverness after police received reports of a man falling from a bridge on Sunday.,"[search, has, resumed]",[has],"[seacrooper, resurfaced]"
17,Owen Williams kikced 17 points s Leicester Tigers inflicted a fisrt defat of d season on Bath.,Owen Williams scored 17 points as Leicester Tigers inflicted a first defeat of the season on Bath.,Owen Williams kicked 17 points as Leicester Tigers inflicted a first defeat of the season on Bath.,"[kicked, as, first, defeat, the]","[as, first, defeat, the]",[scored]
18,"Inexperienced refrees have gone berserk nn imposnnng yellow cards afer a neww rule Was intrdcddd in 2017, sys ex-Wales sr Jonathan Davies.","Inexperienced red cards have gone berserk in imposing yellow cards after a new rule was introduced in 2017, says ex-Wales striker Jonathan Davies.","Inexperienced referees have gone berserk in imposing yellow cards after a new rule was introduced in 2017, says ex-Wales star Jonathan Davies.","[referees, in imposing, after, new, introduced, says, star]","[after, in imposing, introduced, says, new]","[red cards, striker]"
19,The famous Spiegleetntttt has lost its hme during the Edinburgh Festival.,The famous Spiegleetint has lost its name during the Edinburgh Festival.,The famous Spiegeltent has lost its home during the Edinburgh Festival.,"[spiegeltent, home]",[],"[spiegleetint, name]"


In [None]:
# writing incorrect examples for analysis
incorrect_df.to_csv(os.path.join(path, "xsum_test_wrong_pred_t5.csv"), index=False, sep='\t')

## Normalizing SMS data

In [10]:
saved_model = os.path.join(path, "t5_trained_model_v1")
tokenizer = T5Tokenizer.from_pretrained(saved_model, local_files_only=True)
model = T5ForConditionalGeneration.from_pretrained(saved_model, local_files_only=True).to(device)

sms_df = pd.read_csv(os.path.join(path, "sms_data.tsv"),sep='\t')
sms_df['text'] = 'denoise_text: '+sms_df['text']
sms_df['gt_text']=""

sms_set = GetDataset(
        sms_df,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text="text",
        target_text="gt_text",
    )

data_loader = DataLoader(sms_set, **test_params)
predictions, actuals = predict_from_model(model_params["TEST_EPOCHS"], tokenizer, model, data_loader)

sms_preds  = pd.DataFrame({"Predicted_Text": predictions})
sms_preds.to_csv(os.path.join(path, "sms_test_pred_t5.csv"), index=False, sep='\t')


Completed 0 Steps


## Normalizing data for Sentiment Analysis

In [15]:
saved_model = os.path.join(path, "t5_trained_model_v1")
tokenizer = T5Tokenizer.from_pretrained(saved_model, local_files_only=True)
model = T5ForConditionalGeneration.from_pretrained(saved_model, local_files_only=True).to(device)

sst_df = pd.read_csv(os.path.join(path, "sst_unnormalized_data.tsv"),sep='\t')
sst_df['sentence'] = 'denoise_text: '+sst_df['sentence']

sst_set = GetDataset(
        sst_df,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text="sentence",
        target_text="label",
    )

data_loader = DataLoader(sst_set, **test_params)
predictions, actuals = predict_from_model(model_params["TEST_EPOCHS"], tokenizer, model, data_loader)

sst_preds  = pd.DataFrame({"sentence": predictions, "label": actuals})
sst_preds.to_csv(os.path.join(path, "t5_sst_normalized.csv"), index=False, sep='\t')


Completed 0 Steps
