In [None]:
!pip3 install -U spacy
!python -m spacy download en_core_web_sm
!pip3 install fuzzywuzzy
!pip3 install transformers
!cp /content/drive/MyDrive/fake-news-explainability/utils_fake_news.py .
%run utils_fake_news.py

Collecting spacy
  Downloading spacy-3.1.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
[K     |████████████████████████████████| 5.9 MB 4.1 MB/s 
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 49.8 MB/s 
Collecting pathy>=0.3.5
  Downloading pathy-0.6.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.7 MB/s 
[?25hCollecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (451 kB)
[K     |████████████████████████████████| 451 kB 64.1 MB/s 
[?25hCollecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.6-py3-none-any.whl (17 kB)
Collecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.0-py3-none-any.whl (27 kB)
Collecting thinc<8.1.0,>=8.0.12
  Downloading thinc-8.0.12-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (628 kB)
[K     |███████

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
import pandas as pd
import spacy
from fuzzywuzzy import fuzz, process
import random
import itertools

# Shared spaCy pipeline (small English model), loaded once and reused by
# find_entities below.
nlp = spacy.load("en_core_web_sm")


def find_entities(text, type='PERSON'):
    """Return the multi-word named entities of one label found in ``text``.

    Args:
        text: String to run through the spaCy pipeline.
        type: Entity label to keep (compared against ``ent.label_``).

    Returns:
        List of entity strings, in document order, whose label equals ``type``
        and whose text splits into more than one whitespace-separated token.
    """
    matches = []
    for span in nlp(text).ents:
        if span.label_ != type:
            continue
        # Single-token mentions (e.g. a bare surname) are skipped.
        if len(span.text.split()) <= 1:
            continue
        matches.append(span.text)
    return matches



In [None]:
# Alternative input: the LIAR validation split (tab-separated, no header row).
# df = pd.read_csv('/content/drive/MyDrive/fake-news-explainability/Data/Raw/liar_valid.tsv', 
#                  delimiter='\t',
#                  header=None)
df = pd.read_csv('/content/drive/MyDrive/fake-news-explainability/Data/Raw/fake_news.csv')
# Column names are assigned positionally. The identifier column must be the
# lowercase 'id': every later cell selects df[['id', ...]], and the previous
# spelling 'ID' made those selections raise KeyError.
# NOTE(review): this list assumes the CSV has exactly these 14 columns in this
# order - confirm for fake_news.csv.
df.columns = ['id','label','statement','subject','speaker',
              'job_title','state','party','barely_true_count',
              'false_count','half_true_count','mostly_true_count',
              'pants_on_fire_count','context']

# Find names within statements, keep statements with names
df['entities'] = df['statement'].apply(find_entities)
df = df.loc[df.entities.apply(len)>0].reset_index(drop=True)

### Manual Encoding

In [None]:
# Fuzzy-match scores below this value are flagged for manual re-checking.
RECHECK_THRESHOLD = 95

# Unique entity strings found across all statements. Sorted because the
# iteration order of a set of strings can differ between kernel sessions,
# which would otherwise reshuffle the rows of the review table on every run.
names = sorted(set(itertools.chain.from_iterable(df['entities'])))
df_pol = pd.DataFrame({'name':names})

df_pol_raw = pd.read_csv('/content/drive/MyDrive/fake-news-explainability/politicians_raw.csv')
# Each 'fuzzy' cell is unpacked below as (matched Name, score, row label of
# that Name in df_pol_raw).
df_pol['fuzzy'] = df_pol['name'].apply(lambda s: process.extractOne(s, df_pol_raw['Name']))
df_pol['match'] = df_pol['fuzzy'].apply(lambda x: x[0])
df_pol['score'] = df_pol['fuzzy'].apply(lambda x: x[1])
df_pol['index'] = df_pol['fuzzy'].apply(lambda x: x[2])
# 1 = needs a manual look, 0 = accepted as-is.
df_pol['recheck'] = (df_pol['score'] < RECHECK_THRESHOLD).astype(int)
# Optional export of the review table (the frame built here is `df_pol`):
# df_pol.to_csv('politicians.csv', index=False)

In [None]:
# `df_names` is never assigned in the visible notebook cells (the manual-encoding
# cell builds `df_pol`), so the previous `df_pol = df_names` raised NameError on
# a fresh kernel. Expose the review table under the old name instead.
df_names = df_pol

### Automated Processing

In [None]:
# Reclassified data, keep only verified names (rows whose `recheck` flag is 0).
df_pol = pd.read_csv('/content/drive/MyDrive/fake-news-explainability/politicians.csv',
                 encoding='latin-1')
df_pol = df_pol.loc[df_pol.recheck==0].reset_index(drop=True)

# Add party to names
df_raw = pd.read_csv('/content/drive/MyDrive/fake-news-explainability/politicians_raw.csv')
# Collapse every party label containing the keyword into one canonical label.
# Order is preserved from the original code (Republican first, then Democratic).
# na=False: a missing Political_party would otherwise put NaN into the mask,
# and .loc refuses to index with a mask that contains NaN.
for party_keyword, party_label in (('Republican', 'Republican Party'),
                                   ('Democratic', 'Democratic Party')):
    keyword_mask = df_raw.Political_party.str.contains(party_keyword, na=False)
    df_raw.loc[keyword_mask, 'Political_party'] = party_label
# `index` holds the row label of the matched politician in politicians_raw.csv.
df_pol['match'] = df_pol['index'].apply(lambda i: df_raw['Name'][i])
df_pol['party'] = df_pol['index'].apply(lambda i: df_raw['Political_party'][i])

# Canonicalise detected names: raw entity string -> matched politician name.
# Entities without a verified match are dropped from `entities_clean`.
entity_dict = dict(zip(df_pol['name'], df_pol['match']))
df['entities_clean'] = df['entities'].map(
    lambda found: [entity_dict[raw] for raw in found if raw in entity_dict]
)

# Look up the party of every canonical name mentioned in a statement.
party_dict = dict(zip(df_pol['match'], df_pol['party']))
df['party'] = df['entities_clean'].map(
    lambda people: [party_dict[person] for person in people if person in party_dict]
)

# Keep a statement only when its matched names all share exactly one party,
# then collapse the per-name party list to that single value.
single_party = df['party'].map(lambda parties: len(set(parties)) == 1)
df = df.loc[single_party].reset_index(drop=True)
df['party'] = df['party'].map(lambda parties: parties[0])
df = df[['id', 'label', 'statement', 'entities', 'entities_clean', 'party']]

# Collect the matched names per party; `party_dict` is rebound here and from
# now on maps party -> list of names.
df_party = (
    df_pol.groupby('party')['match']
    .agg(list)
    .reset_index(name='name')
)
party_dict = dict(zip(df_party['party'], df_party['name']))

# Restrict to the two parties and record which one replacements come from
# (always the other one).
df = df.loc[df['party'].isin(['Democratic Party','Republican Party'])].reset_index(drop=True)
opposite_party = {'Republican Party': 'Democratic Party',
                  'Democratic Party': 'Republican Party'}
df['replacement_party'] = df['party'].map(opposite_party)

# Seed for Python's `random` module so the sampled replacement names (and thus
# the generated statements) are the same on every run.
NAME_SWAP_SEED = 42


def replace_names(text, lst1, lst2):
    """Swap each name in ``lst1`` for the name at the same position in ``lst2``.

    Defined in this cell because the notebook's own definition sits in a later
    cell, so on a fresh kernel the call below would raise NameError.

    All substitutions happen in a single pass over ``text``, so a name that
    was just inserted is never rewritten by a later pair. Longer names are
    tried first, duplicates in ``lst1`` keep their first pairing, and empty
    strings in ``lst1`` are ignored.
    """
    import re  # local import: the notebook's import cell does not load `re`

    swaps = {}
    for old_name, new_name in zip(lst1, lst2):
        if old_name and old_name not in swaps:
            swaps[old_name] = new_name
    if not swaps:
        return text
    alternatives = sorted(swaps, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(name) for name in alternatives))
    return pattern.sub(lambda found: swaps[found.group(0)], text)


# Generate list of replacement candidates: one name from the opposite party per
# detected entity. random.sample raises ValueError if a statement has more
# entities than that party has names.
random.seed(NAME_SWAP_SEED)
df['replacement_names'] = [
    random.sample(party_dict[target_party], len(found))
    for target_party, found in zip(df['replacement_party'], df['entities'])
]

# Replace
df['statement_new'] = [
    replace_names(statement, found, substitutes)
    for statement, found, substitutes in zip(df['statement'], df['entities'], df['replacement_names'])
]

In [None]:
def replace_names(text, lst1, lst2):
    """Swap each name in ``lst1`` for the name at the same position in ``lst2``.

    Args:
        text: String in which names are replaced.
        lst1: Names to look for.
        lst2: Replacement names, paired with ``lst1`` by position; extra items
            in the longer list are ignored.

    Returns:
        ``text`` with every occurrence of each ``lst1`` name replaced.

    All substitutions happen in one pass, so a name that was just inserted is
    never rewritten by a later pair (sequential ``str.replace`` calls turned
    "A praised B" with A->B, B->C into "C praised C"). Longer names are tried
    first so a name that is a prefix of another cannot clobber it, duplicates
    in ``lst1`` keep their first pairing, and empty strings in ``lst1`` are
    ignored.
    """
    import re  # local import: the notebook's import cell does not load `re`

    swaps = {}
    for old_name, new_name in zip(lst1, lst2):
        if old_name and old_name not in swaps:
            swaps[old_name] = new_name
    if not swaps:
        return text
    alternatives = sorted(swaps, key=len, reverse=True)
    # re.escape: names are matched literally, never as regex syntax.
    pattern = re.compile('|'.join(re.escape(name) for name in alternatives))
    return pattern.sub(lambda found: swaps[found.group(0)], text)

In [None]:
# Two evaluation frames with the same columns (id, label, statement):
# df_orig carries the untouched statement, df_new the name-swapped one.
df_orig = df.loc[:, ['id','label','statement']]
df_new  = df[['id','label']].assign(statement=df['statement_new'])

In [None]:
# Encoding used when the input is the LIAR split (string labels -> 0/1):
# liar_encode = {'barely-true':1, 'false':1, 'pants-fire':1, 
#                'half-true':0, 'mostly-true':0, 'true':0}
# df_orig['label'] = df_orig['label'].apply(lambda x: liar_encode[x])
# df_new['label']  = df_new['label'].apply(lambda x: liar_encode[x])

# Invert the label of the name-swapped statements: 1 becomes 0, and every
# other value becomes 1.
df_new['label']  = df_new['label'].map(lambda value: int(value != 1))


In [None]:
# Persist both frames as CSV, without the row index.
for frame, destination in (
    (df_orig, '/content/drive/MyDrive/fake-news-explainability/Data/Raw/fake_news_name_orig.csv'),
    (df_new, '/content/drive/MyDrive/fake-news-explainability/Data/Raw/fake_news_name_new.csv'),
):
    frame.to_csv(destination, index=False)

In [None]:
# Encode statement/label pairs and save one .pt file per frame.
# NOTE(review): `torch` and `encode_dataframe` are not imported or defined in
# this notebook; presumably they come from `%run utils_fake_news.py` - confirm.
for frame, target in (
    (df_orig, '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/fake_news/evaluation/fake_news_name_orig.pt'),
    (df_new, '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/fake_news/evaluation/fake_news_name_new.pt'),
):
    encoded = encode_dataframe(frame['statement'], frame['label'])
    torch.save(encoded, target)



In [None]:
# Reload the LIAR original / name-swapped pair from Drive, rebinding df_orig
# and df_new.
# NOTE(review): the cells above write fake_news_name_*.csv, not these
# liar_valid_name_*.csv files, so they presumably come from an earlier run with
# the LIAR input - confirm they exist before running.
df_orig, df_new = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/fake-news-explainability/Data/Raw/liar_valid_name_orig.csv',
        '/content/drive/MyDrive/fake-news-explainability/Data/Raw/liar_valid_name_new.csv',
    )
)

In [None]:
# Keep only original statements labelled 0 and name-swapped statements
# labelled 1.
df_orig = df_orig.loc[df_orig.label==0].reset_index(drop=True)
df_new  = df_new.loc[df_new.label==1].reset_index(drop=True)

# index=False matches the other CSV writes in this notebook; without it the
# freshly reset 0..n-1 index is written as an extra unnamed first column.
df_orig.to_csv('/content/drive/MyDrive/fake-news-explainability/Data/Raw/liar_valid_name_orig_filtered.csv', index=False)
df_new.to_csv('/content/drive/MyDrive/fake-news-explainability/Data/Raw/liar_valid_name_new_filtered.csv', index=False)

In [None]:
# Encode the filtered LIAR frames and save one .pt file per frame.
# NOTE(review): `torch` and `encode_dataframe` are not imported or defined in
# this notebook; presumably they come from `%run utils_fake_news.py` - confirm.
for frame, target in (
    (df_orig, '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/liar/evaluation/liar_valid_name_orig_filtered.pt'),
    (df_new, '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/liar/evaluation/liar_valid_name_new_filtered.pt'),
):
    encoded = encode_dataframe(frame['statement'], frame['label'])
    torch.save(encoded, target)

