In [1]:
!pip install strsimpy

Collecting strsimpy
  Downloading strsimpy-0.2.1-py3-none-any.whl (45 kB)
[?25l[K     |███████▏                        | 10 kB 6.0 MB/s eta 0:00:01[K     |██████████████▎                 | 20 kB 9.8 MB/s eta 0:00:01[K     |█████████████████████▍          | 30 kB 12.6 MB/s eta 0:00:01[K     |████████████████████████████▋   | 40 kB 14.9 MB/s eta 0:00:01[K     |████████████████████████████████| 45 kB 1.3 MB/s 
[?25hInstalling collected packages: strsimpy
Successfully installed strsimpy-0.2.1


In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### Normalize text without Language Model

We are using string similarity (Normalized Levenshtein score) to replace out-of-vocabulary words.

In [3]:
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from nltk.corpus import words
import string
import nltk
nltk.download('punkt')
nltk.download('words')

normalized_levenshtein = NormalizedLevenshtein()

def levensthein_score(word1, word2):
  return normalized_levenshtein.similarity(word1, word2)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
vocab = [x.lower() for x in words.words()]
len(vocab    )

236736

In [None]:
from nltk.tokenize import word_tokenize

def normalize_data(text):
  words = word_tokenize(text)
  
  normalized_text=''
  for word in words:
    if word in string.punctuation:
      normalized_text = normalized_text+word
    elif word not in vocab:
      word=word.lower()
      max_score=0
      replace_word=word
      for v in vocab:
        score = levensthein_score(word, v)
        if score>max_score:
          max_score=score
          replace_word=v
      normalized_text = normalized_text + ' ' + replace_word
    else:
      normalized_text = normalized_text + ' ' + word
  return normalized_text.strip()


### Evaluation on Test Data

In [4]:
import pandas as pd
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
path = 'drive/MyDrive/CS685'

input_df = pd.read_csv(os.path.join(path, "xsum_test_baseline.csv"),sep='\t')
input_df.head(2)

Unnamed: 0,text,gt_text
0,Bangor City MAnageR Kevin Nicholson sayS it Would be a hg achievement if they overturn a 1-0 first-leg dficit against Lyngby BK.,Bangor City manager Kevin Nicholson says it would be a huge achievement if they overturn a 1-0 first-leg deficit against Lyngby BK.
1,dee actor who played Darth Vader in the original Star Wars filmss says hopeufls from Bristol auditioning for the nww film should disgis their accents.,The actor who played Darth Vader in the original Star Wars films says hopefuls from Bristol auditioning for the new film should disguise their accents.


In [None]:
test_df = input_df
test_df['normalized_text'] = test_df['text'].apply(normalize_data)

In [None]:
test_df.to_csv(os.path.join(path, "xsum_test_pred_baseline.csv"), index=False, sep='\t')

In [None]:
test_df.head(5)

Unnamed: 0,text,gt_text,normalized_text
0,Bangor City MAnageR Kevin Nicholson sayS it Would be a hg achievement if they overturn a 1-0 first-leg dficit against Lyngby BK.,Bangor City manager Kevin Nicholson says it would be a huge achievement if they overturn a 1-0 first-leg deficit against Lyngby BK.,angor city manager kevin dicolon stays it would be a hag achievement if they overturn a jean-pierre firstling deficit against lyingly ak.
1,dee actor who played Darth Vader in the original Star Wars filmss says hopeufls from Bristol auditioning for the nww film should disgis their accents.,The actor who played Darth Vader in the original Star Wars films says hopefuls from Bristol auditioning for the new film should disguise their accents.,dee actor who splayed dearth evader in the original star warse fillmass stays hopefully from bristol partitioning for the naw film should disgig their accentus.
2,"Catalans Dragons earned a narrow vic2ry at Hull FC, despite four tRies from Airlie Birds wingEr Tom Lineham.","Catalans Dragons earned a narrow victory at Hull FC, despite four tries from Airlie Birds winger Tom Lineham.","catalan dragon learned a narrow vicarly at hull c, despite four triens from airlike bird winger tom gingham."
3,"A driverr whoooo crashed into a shop, klilingg a teenage passenger, after failig t stopppppp for police has been jiled for eight years.","A driver who crashed into a shop, killing a teenage passenger, after failing to stop for police has been jailed for eight years.","a driver hoodoo crasher into a shop, ailing a teenage passenger, after failing t stopgap for police hals been oiled for eight year."
4,"Kenyan police have bEen Involved in aaa firece gun battle with militant Islamists in north-eastern Garissa County, leaving onee officer wounded, a government spokesman has told the BBC.","Kenyan police have been involved in a fierce gun battle with militant Islamists in north-eastern Garissa County, leaving one officer wounded, a government spokesman has told the BBC.","aenean police have been involved in aa bireme gun battle with militant islamist in northeastern carissa county, leaving donee officer wounded, a government spokesman hals told the bac."


In [7]:
test_df = pd.read_csv(os.path.join(path, "xsum_test_pred_baseline.csv"),sep='\t')

In [12]:
import difflib
def get_dissimilar_spans(orig_words, gt_words, pred_words):
  gt_matcher = difflib.SequenceMatcher(a=orig_words, b=gt_words)
  pred_matcher = difflib.SequenceMatcher(a=gt_words, b=pred_words)
  orig_spans = []
  gt_spans = []
  pred_spans = []
  mismatch_spans = []
  for codes in gt_matcher.get_opcodes():
    op,a_start,a_end,b_start,b_end = codes
    if op == 'replace':
      orig_spans.append(" ".join(orig_words[a_start:a_end]))
      gt_spans.append(" ".join(gt_words[b_start:b_end]))

  for codes in pred_matcher.get_opcodes():
    op,a_start,a_end,b_start,b_end = codes
    if op == 'replace':
      pred_spans.append(" ".join(pred_words[b_start:b_end]))
      mismatch_spans.append(" ".join(gt_words[a_start:a_end]))
  
  return orig_spans, gt_spans, pred_spans, mismatch_spans

def get_stats_for_predictions(orig_text, gt_text, pred_text):
  orig_words = nltk.word_tokenize(orig_text)
  gt_words = nltk.word_tokenize(gt_text)
  pred_words = nltk.word_tokenize(pred_text)
  orig_words = [word.lower().strip() for word in orig_words]
  gt_words = [word.lower().strip() for word in gt_words]
  pred_words = [word.lower().strip() for word in pred_words]
  correct_preds = []
  wrong_preds = []
  changed_orig_words = []
  changed_gt_words = []
  replaced_word_cnt = 0
  correct_pred_cnt = 0
  if len(orig_words)!= len(gt_words):
    print(orig_text)
    print(gt_text)
  elif len(gt_words)!=len(pred_words):
    orig_spans, gt_spans, pred_spans, mismatch_spans = get_dissimilar_spans(orig_words, gt_words, pred_words)
    wrong_preds = pred_spans
    changed_orig_words = orig_spans
    changed_gt_words = gt_spans
    replaced_word_cnt = len(gt_spans)
    correct_pred_cnt = len(gt_spans) - len(mismatch_spans)
    correct_preds = list(set(gt_spans)-set(mismatch_spans))
  else:
    for i in range(len(orig_words)):
      orig_word = orig_words[i]
      gt_word = gt_words[i]
      pred_word = pred_words[i]
      if orig_word != gt_word:
        changed_orig_words.append(orig_word)
        changed_gt_words.append(gt_word)
        replaced_word_cnt = replaced_word_cnt+1
        if pred_word == gt_word:
          correct_preds.append(pred_word)
          correct_pred_cnt = correct_pred_cnt+1
        else:
          wrong_preds.append(pred_word)

  return {"replaced_gt_words":changed_gt_words,
          "replaced_original_words": changed_orig_words,
          "replaced_word_count": replaced_word_cnt,
          "correct_predictions": correct_preds,
          "correct_prediction_count": correct_pred_cnt,
          "wrong_predictions": wrong_preds}

def get_accuracy_df(input_df, pred_df):
  pred_df = pred_df.drop(columns=['gt_text', 'text'])
  df = pd.concat([input_df, pred_df], axis=1)
  df["Stats"] = df.apply(lambda x: get_stats_for_predictions(x["text"], x["gt_text"],x["normalized_text"]), axis = 1)
  df = pd.concat([df.drop(['Stats'], axis=1), df['Stats'].apply(pd.Series)], axis=1)
  return df

In [14]:
stat_df = get_accuracy_df(input_df,test_df)
stat_df.head(5)

Unnamed: 0,text,gt_text,normalized_text,replaced_gt_words,replaced_original_words,replaced_word_count,correct_predictions,correct_prediction_count,wrong_predictions
0,Bangor City MAnageR Kevin Nicholson sayS it Would be a hg achievement if they overturn a 1-0 first-leg dficit against Lyngby BK.,Bangor City manager Kevin Nicholson says it would be a huge achievement if they overturn a 1-0 first-leg deficit against Lyngby BK.,angor city manager kevin dicolon stays it would be a hag achievement if they overturn a jean-pierre firstling deficit against lyingly ak.,"[huge, deficit]","[hg, dficit]",2,[deficit],1,[hag]
1,dee actor who played Darth Vader in the original Star Wars filmss says hopeufls from Bristol auditioning for the nww film should disgis their accents.,The actor who played Darth Vader in the original Star Wars films says hopefuls from Bristol auditioning for the new film should disguise their accents.,dee actor who splayed dearth evader in the original star warse fillmass stays hopefully from bristol partitioning for the naw film should disgig their accentus.,"[the, films, hopefuls, new, disguise]","[dee, filmss, hopeufls, nww, disgis]",5,[],0,"[dee, fillmass, hopefully, naw, disgig]"
2,"Catalans Dragons earned a narrow vic2ry at Hull FC, despite four tRies from Airlie Birds wingEr Tom Lineham.","Catalans Dragons earned a narrow victory at Hull FC, despite four tries from Airlie Birds winger Tom Lineham.","catalan dragon learned a narrow vicarly at hull c, despite four triens from airlike bird winger tom gingham.",[victory],[vic2ry],1,[],0,[vicarly]
3,"A driverr whoooo crashed into a shop, klilingg a teenage passenger, after failig t stopppppp for police has been jiled for eight years.","A driver who crashed into a shop, killing a teenage passenger, after failing to stop for police has been jailed for eight years.","a driver hoodoo crasher into a shop, ailing a teenage passenger, after failing t stopgap for police hals been oiled for eight year.","[driver, who, killing, failing, to, stop, jailed]","[driverr, whoooo, klilingg, failig, t, stopppppp, jiled]",7,"[driver, failing]",2,"[hoodoo, ailing, t, stopgap, oiled]"
4,"Kenyan police have bEen Involved in aaa firece gun battle with militant Islamists in north-eastern Garissa County, leaving onee officer wounded, a government spokesman has told the BBC.","Kenyan police have been involved in a fierce gun battle with militant Islamists in north-eastern Garissa County, leaving one officer wounded, a government spokesman has told the BBC.","aenean police have been involved in aa bireme gun battle with militant islamist in northeastern carissa county, leaving donee officer wounded, a government spokesman hals told the bac.","[a, fierce, one]","[aaa, firece, onee]",3,[],0,"[aa, bireme, donee]"


In [18]:
print(f"Total incorrect tokens: {stat_df['replaced_word_count'].sum()}\n Total correct predictions: {stat_df['correct_prediction_count'].sum()} \nTest accuracy: {stat_df['correct_prediction_count'].sum()/stat_df['replaced_word_count'].sum()}")

Total incorrect tokens: 232
 Total correct predictions: 72 
Test accuracy: 0.3103448275862069
