In [None]:
import pandas as pd
import re
from google.colab import drive, files
# Mount Google Drive at /content/drive; the cells below read their inputs from,
# and write their outputs to, paths under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
!pip3 install transformers
!cp /content/drive/MyDrive/fake-news-explainability/utils_fake_news.py .

In [None]:
# Execute the helper script copied from Drive in the previous cell; %run leaves
# the script's top-level names in this notebook's namespace.
# NOTE(review): later cells call `torch.save` and `encode_dataframe`, neither of
# which is imported or defined in this notebook, so both are presumably provided
# by this script -- confirm.
%run utils_fake_news.py

In [None]:
# Ordered rewrite rules used by negate(): each key is a text fragment and each
# value is the fragment that flips its polarity.
#
# * Order matters. negate() applies only the first key (in insertion order) that
#   it finds, so every negative form ("isn't", "is not ") is listed before the
#   positive form ("is ") that would otherwise match first.
# * Keys written with \\' match text in which the apostrophe is preceded by a
#   literal backslash (escaped quotes left over in the raw data).
# * A key that ends with a space must map to a value that ends with a space,
#   otherwise the two neighbouring words are glued together.
negate_dict = {"isn't":"is",
    "isn\\'t":"is",
    "is not ":"is ",
    "is ":"is not ",
    "didn't":"did",
    "didn\\'t":"did",
    # The value keeps the trailing space: "did not go" -> "did go", not "didgo".
    "did not ":"did ",
    "does not have":"has",
    "doesn't have":"has",
    "doesn\\'t have":"has",
    "has ":"does not have ",
    "shouldn't":"should",
    "shouldn\\'t":"should",
    "should not":"should",
    "should":"should not",
    "wouldn't":"would",
    "wouldn\\'t":"would",
    "would not":"would",
    "would":"would not",
    "mustn't":"must",
    "mustn\\'t":"must",
    "must not":"must",
    "must ":"must not ",
    "can't":"can",
    "can\\'t":"can",
    "cannot":"can",
    " can ":" cannot "}

# Verb endings that take "-es" rather than "-s" in the third person singular
# (stress -> stresses, fix -> fixes, watch -> watches, push -> pushes, go -> goes).
IRREGULAR_ES_VERB_ENDINGS = [
    "ss",
    "x",
    "ch",
    "sh",
    "o",
]

def negate(sentence):
  """Flip the polarity of `sentence`, or return None when no rule applies.

  The rules in `negate_dict` are tried in insertion order. The first key that
  occurs in the sentence wins: every occurrence of that key is replaced by its
  value and the result is returned immediately (no further rules are applied).

  If no dictionary rule matches, the first "doesn't <verb>" / "does not <verb>"
  is rewritten to the third-person form of the verb via `replace_doesnt`.

  Args:
    sentence: the text to negate (the notebook passes lower-cased statements).

  Returns:
    The rewritten sentence, or None if neither the dictionary nor the
    "doesn't" pattern matched.
  """
  for key, replacement in negate_dict.items():
    # Keys are word fragments, so plain substring matching would let "is " fire
    # inside "this ", "has " inside "perhaps " and "should" inside "shoulder".
    # Require a word boundary on every side of the key that is a letter/digit;
    # sides that are already a space (e.g. " can ") need no extra anchor.
    lead = r'(?<!\w)' if key[:1].isalnum() else ''
    trail = r'(?!\w)' if key[-1:].isalnum() else ''
    pattern = lead + re.escape(key) + trail

    if re.search(pattern, sentence):
      # A callable replacement inserts the value verbatim, with no backslash
      # or group-reference processing.
      return re.sub(pattern, lambda _match, text=replacement: text, sentence)

  # doesn't work -> works
  doesnt_regex = r'(doesn\'t|doesn\\\'t|does not) (?P<verb>\w+)'

  if re.search(doesnt_regex, sentence):
    # Only the first occurrence is rewritten; count is passed by keyword because
    # the positional form is deprecated in recent Python versions.
    return re.sub(doesnt_regex, replace_doesnt, sentence, count=1)

  return None

def __is_consonant(letter):
  """Return True unless `letter` is one of a, e, i, o, u, y."""
  return letter not in ('a', 'e', 'i', 'o', 'u', 'y')

def replace_doesnt(matchobj):
  """Build the replacement text for a "doesn't <verb>" match.

  Intended as the `repl` callable of `re.sub`: the whole match
  ("doesn't work") is replaced by the returned third-person singular form
  of the verb ("works").

  Args:
    matchobj: a regex match whose second group is the bare verb.

  Returns:
    The verb with "-ies", "-es" or "-s" attached:
      * consonant + "y"  -> drop the "y", add "ies"  (fly -> flies)
      * an ending listed in IRREGULAR_ES_VERB_ENDINGS -> add "es"
      * anything else    -> add "s"
  """
  verb = matchobj.group(2)

  # The length check keeps verb[-2] in range when the captured "verb" is the
  # single letter "y" (previously an IndexError).
  if len(verb) > 1 and verb.endswith("y") and __is_consonant(verb[-2]):
    return "{0}ies".format(verb[0:-1])

  # str.endswith accepts a tuple of suffixes, so one call covers every ending.
  if verb.endswith(tuple(IRREGULAR_ES_VERB_ENDINGS)):
    return "{0}es".format(verb)

  return "{0}s".format(verb)

def replace_verb(matchobj):
  """Turn a matched "<subject><verb><whitespace>" into its "does not" form.

  Written as a `re.sub` replacement callable. The match must expose three
  groups: the subject text (1), the verb (2) and the whitespace that follows
  the verb (3).
  NOTE(review): the comments below imply that group 2 arrives without its final
  "s" ("flie", "stresse"); the regex that feeds this function is not in this
  part of the notebook -- confirm against the caller.

  Returns:
    "<subject>does not <base verb><whitespace>".
  """
  subject, stem, trailing = matchobj.group(1, 2, 3)

  # flies -> fly, but not die -> dy
  if len(stem) > 3 and stem.endswith("ie"):
    stem = "{0}y".format(stem[0:-2])

  # stresses -> stress
  # At most one suffix can apply: once the "e" is removed the stem no longer
  # ends in "e", so a single check over all suffixes is enough.
  es_suffixes = tuple("{0}e".format(ending) for ending in IRREGULAR_ES_VERB_ENDINGS)
  if stem.endswith(es_suffixes):
    stem = stem[0:-1]

  return "{0}does not {1}{2}".format(subject, stem, trailing)

### Fake News Dataset

In [None]:
# Load the raw fake-news training CSV, expose the `title` column as `statement`,
# discard rows without a statement and drop the columns this notebook never uses.
df = (
    pd.read_csv("/content/drive/MyDrive/fake-news-explainability/Data/Raw/fake_news_train.csv")
    .rename(columns={'title':'statement'})
    .dropna(subset=['statement'])
    .reset_index(drop=True)
    .drop(['author','text'], axis=1)
)

# Lower-case every statement and turn curly apostrophes into straight ones so
# the negation rules can match; then build a frame with the negated statements
# (negate() yields None wherever no rule applied).
df['statement'] = df['statement'].map(lambda title: title.lower().replace('’',"'"))
df_neg = df.assign(statement=df['statement'].apply(negate))

# Keep only the rows that were actually negated. df_pos holds the untouched
# originals of exactly those rows, so both frames line up row for row.
df_neg = df_neg[df_neg['statement'].notna()]
df_pos = df.loc[df_neg.index].reset_index(drop=True)
df_neg = df_neg.reset_index(drop=True)

# A negated statement gets the opposite label: 1 becomes 0, anything else becomes 1.
df_neg['label'] = df_neg['label'].map(lambda label: 1 if label != 1 else 0)

# Save encoded versions for FakeBERT
# torch.save(encode_dataframe(df['statement'], df['label']),
#            '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/fake_news/training/fake_news.pt')
# torch.save(encode_dataframe(df_pos['statement'], df_pos['label']),
#            '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/fake_news/evaluation/fake_news_pos.pt')
# torch.save(encode_dataframe(df_neg['statement'], df_neg['label']),
#            '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/fake_news/evaluation/fake_news_neg.pt')

# Write the original/negated pair as CSV (without the index) for FakeBERT TF-IDF
for frame, csv_path in (
    (df_pos, '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/fake_news/evaluation/fake_news_pos.csv'),
    (df_neg, '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/fake_news/evaluation/fake_news_neg.csv'),
):
  frame.to_csv(csv_path, index=False)

### LIAR Dataset

In [None]:
# Read in train data (tab-separated, no header row -- column names are assigned below)
df = pd.read_csv("/content/drive/MyDrive/fake-news-explainability/Data/Raw/liar_train.tsv",
                 delimiter='\t',
                 header=None)
df.columns = ['ID','label','statement','subject','speaker',
              'job_title','state','party','barely_true_count',
              'false_count','half_true_count','mostly_true_count',
              'pants_on_fire_count','context']
# Drop rows without a statement and keep only the columns used below
df = df.dropna(subset=['statement']).reset_index(drop=True)
df = df[['ID','statement','label']]

# Label: collapse the six LIAR ratings into two classes -- 1 for the
# false-leaning ratings (barely-true, false, pants-fire) and 0 for the
# true-leaning ones (half-true, mostly-true, true).
# This must stay identical to the mapping in the test-split cell; otherwise the
# model is trained on labels that disagree with the ones it is evaluated on
# ('pants-fire' is the most strongly false rating and must not be encoded as 0).
liar_encode = {'barely-true':1, 'false':1, 'pants-fire':1,
               'half-true':0, 'mostly-true':0, 'true':0}
# Indexing the dict (rather than .map(dict)) raises KeyError on an unexpected rating
df['label'] = df['label'].apply(lambda x: liar_encode[x])

# Save the encoded training split.
# NOTE(review): `torch` and `encode_dataframe` are not imported or defined in
# this notebook; presumably they come from the %run of utils_fake_news.py.
torch.save(encode_dataframe(df['statement'], df['label']),
           '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/liar/training/liar_train.pt')

In [None]:
# Load the LIAR test split: tab-separated with no header row, so the column
# names are attached by hand.
df = pd.read_csv("/content/drive/MyDrive/fake-news-explainability/Data/Raw/liar_test.tsv",
                 sep='\t',
                 header=None)
df.columns = [
    'ID', 'label', 'statement', 'subject', 'speaker',
    'job_title', 'state', 'party',
    'barely_true_count', 'false_count', 'half_true_count',
    'mostly_true_count', 'pants_on_fire_count', 'context',
]
# Rows without a statement are useless here; afterwards only three columns are needed.
df = (
    df.dropna(subset=['statement'])
    .reset_index(drop=True)
    [['ID','statement','label']]
)

# Binarise the six ratings: 1 for the false-leaning ones, 0 for the true-leaning
# ones. Looking the rating up by key raises KeyError on an unexpected value.
liar_encode = {'barely-true':1, 'false':1, 'pants-fire':1,
               'half-true':0, 'mostly-true':0, 'true':0}
df['label'] = df['label'].map(liar_encode.__getitem__)

# Lower-case and straighten curly apostrophes so the negation rules can match,
# then build the negated counterpart (None wherever negate() found no rule).
df['statement'] = df['statement'].map(lambda text: text.lower().replace('’',"'"))
df_neg = df.assign(statement=df['statement'].apply(negate))

# Keep only the statements that were negated, plus their originals in df_pos,
# aligned row for row.
df_neg = df_neg[df_neg['statement'].notna()]
df_pos = df.loc[df_neg.index].reset_index(drop=True)
df_neg = df_neg.reset_index(drop=True)

# Negation flips the class: 1 becomes 0, anything else becomes 1.
df_neg['label'] = df_neg['label'].map(lambda label: 1 if label != 1 else 0)

# Encode and save the full split, the negatable originals and their negations,
# in that order.
for frame, destination in (
    (df, '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/liar/training/liar_test.pt'),
    (df_pos, '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/liar/evaluation/liar_test_pos.pt'),
    (df_neg, '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/liar/evaluation/liar_test_neg.pt'),
):
  torch.save(encode_dataframe(frame['statement'], frame['label']), destination)

In [None]:
# Read in validation data (tab-separated, no header row -- column names are assigned below)
df = pd.read_csv("/content/drive/MyDrive/fake-news-explainability/Data/Raw/liar_valid.tsv",
                 delimiter='\t',
                 header=None)
df.columns = ['ID','label','statement','subject','speaker',
              'job_title','state','party','barely_true_count',
              'false_count','half_true_count','mostly_true_count',
              'pants_on_fire_count','context']
# Drop rows without a statement and keep only the columns used below
df = df.dropna(subset=['statement']).reset_index(drop=True)
df = df[['ID','statement','label']]

# Label: collapse the six LIAR ratings into two classes -- 1 for the
# false-leaning ratings (barely-true, false, pants-fire) and 0 for the
# true-leaning ones (half-true, mostly-true, true).
# This must stay identical to the mapping in the test-split cell so that
# validation and test labels mean the same thing ('pants-fire' is the most
# strongly false rating and must not be encoded as 0).
liar_encode = {'barely-true':1, 'false':1, 'pants-fire':1,
               'half-true':0, 'mostly-true':0, 'true':0}
# Indexing the dict (rather than .map(dict)) raises KeyError on an unexpected rating
df['label'] = df['label'].apply(lambda x: liar_encode[x])

# Clean and negate: lower-case, straighten curly apostrophes, then build the
# negated counterpart (negate() returns None where no rule applied)
df['statement'] = df['statement'].apply(lambda x: x.lower().replace('’',"'"))
df_neg = df.copy()
df_neg['statement'] = df_neg['statement'].apply(negate)

# Keep only the statements that were negated; df_pos holds their untouched
# originals, aligned row for row with df_neg
df_neg = df_neg.loc[~df_neg.statement.isnull()]
df_pos = df.loc[df_neg.index].reset_index(drop=True)
df_neg = df_neg.reset_index(drop=True)

# Relabel: a negated statement gets the opposite class
df_neg['label'] = df_neg['label'].apply(lambda x: 0 if x==1 else 1)

# Save the full split, the negatable originals and their negations.
# NOTE(review): `torch` and `encode_dataframe` are not imported or defined in
# this notebook; presumably they come from the %run of utils_fake_news.py.
torch.save(encode_dataframe(df['statement'], df['label']),
           '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/liar/training/liar_valid.pt')
torch.save(encode_dataframe(df_pos['statement'], df_pos['label']),
           '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/liar/evaluation/liar_valid_pos.pt')
torch.save(encode_dataframe(df_neg['statement'], df_neg['label']),
           '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/liar/evaluation/liar_valid_neg.pt')