In [1]:
import pandas as pd
import numpy as np
import datetime

from google.colab import drive
drive.mount('/drive')

Mounted at /drive


In [19]:
## From colab-setup, replace if changes were made to this file

parent_dir = '/drive/MyDrive/spotify-misinformation'

preprocessing_output_dir = f"{parent_dir}/preprocessing-output"
matched_claims_output_dir = f"{parent_dir}/matched-claims-output"

# Define file paths for where podcast claims and fact checked claims are located

fact_checked_claims_fp = f"{preprocessing_output_dir}/politifact_filtered.csv"
transcript_claims_fp = f"{preprocessing_output_dir}/podcast_claims_context_2.tsv"

# Define filepath for matched claims

matched_claims_fp = f"{matched_claims_output_dir}/matched_claims_context_2.txt"


## Labeling outputs

labeling_output_dir = f"{parent_dir}/labeling-output"
blank_manually_labeled_datasets_dir = f"{labeling_output_dir}/blank-manual-labeling-output"

In [17]:
# Read data
## Recommend using 'High-RAM' and 'GPU' runtime

claims_df = pd.read_csv(fact_checked_claims_fp)

with open(transcript_claims_fp, 'r') as f:
  temp = f.readlines()
  pc_claims = [".".join(line.split("\t")[2:]) for line in temp]
  # print(pc_claims[0])

def get_kb_claim(kb_idx):
  return claims_df['Statement'][int(kb_idx)]

def get_pc_claim(pc_idx):
  return pc_claims[int(pc_idx)]

def get_kb_claim_date(kb_idx):
  return claims_df['Date'][int(kb_idx)]

## Read matched claims dataset
columns = ['Fact Checked Claim Index', 'Podcast Claim Index', 'Cosine Similarity Score']
mc_df = pd.read_csv(matched_claims_fp, names = columns)

# Find top 3000 paired claims ranked by cosine similarity score
sorted_mc_df = mc_df.sort_values(by=['Cosine Similarity Score'], ascending=False)[:3000]

In [18]:
## Print matched claims cosine similarity statistics

print(len(mc_df))
print(mc_df['Cosine Similarity Score'].max())
print(mc_df['Cosine Similarity Score'].mean())
print(mc_df['Cosine Similarity Score'].min())
print(mc_df['Cosine Similarity Score'].std())
print(mc_df['Cosine Similarity Score'].mode())

20
0.4015714228153229
0.22909172102808953
0.20195361971855166
0.042990087505111525
0     0.201954
1     0.202215
2     0.203250
3     0.206484
4     0.207729
5     0.208423
6     0.212744
7     0.212927
8     0.213876
9     0.215912
10    0.218087
11    0.224139
12    0.227115
13    0.228180
14    0.230161
15    0.235457
16    0.238248
17    0.243855
18    0.249506
19    0.401571
dtype: float64


# Getting dataset statistics

In [5]:
claims_df = pd.read_csv(fact_checked_claims_fp)

words = []

for x in claims_df['Statement'].tolist():
  words += x.split(" ")

print(len(words))
print(len(set(words)))

484
345


In [6]:
word_avg = [len(x.split(" ")) for x in claims_df['Statement'].tolist()]
print(sum(word_avg) / len(word_avg))

16.133333333333333


In [7]:
claims_df.iloc[len(claims_df) - 1]

Statement    Says Vice President Kamala Harris called the u...
Link         https://www.politifact.com/factchecks/2021/dec...
Date                                             ember 8, 2021
Source                                         Instagram posts
Label                                               pants-fire
Name: 29, dtype: object

In [8]:

with open(transcript_claims_fp, 'r') as f:
  temp = f.readlines()
  pc_claims = [",".join(line.split(",")[2:]) for line in temp]

print(len(pc_claims))

42


In [9]:
w = set()
for x in pc_claims:
  for y in x.split(" "):
    w.add(y)

print(len(w))

185


# Finding top matches and saving for manual labeling

In [20]:
# print(int(sorted_mc_df.iloc[0]['Fact Checked Claim Index']))
# print(int(sorted_mc_df.iloc[0]['Podcast Claim Index']))

column_names = ['Fact Checked Claim Index', 'Podcast Claim Index', 'Fact Checked Claim', 'Podcast Claim']
labeling_mc = []

for row in sorted_mc_df.iloc:
  kb_index = int(row['Fact Checked Claim Index'])
  pc_index = int(row['Podcast Claim Index'])

  kb_claim = get_kb_claim(kb_index)
  pc_claim = get_pc_claim(pc_index)

  labeling_mc.append([kb_index, pc_index, kb_claim, pc_claim])

labeling_mc_df = pd.DataFrame(labeling_mc, columns=column_names)
labeling_mc_df.head()

Unnamed: 0,Fact Checked Claim Index,Podcast Claim Index,Fact Checked Claim,Podcast Claim
0,15,21,"""Canada joins the no jab, no food trend""","I ain't gonna wait for the Mauri Mauri on me,..."
1,5,27,“Ivanka Trump is joining the Democrats to run ...,"Eh, eh, she cannot suck on my dick. Hey, I'ma..."
2,21,1,“An organ recovery medical team pays tribute t...,"SEP. yeah, I'm going to put it back in the sid..."
3,14,27,“Hillary replaces Kamala Harris.”,"Eh, eh, she cannot suck on my dick. Hey, I'ma..."
4,6,21,"At Lions Gate Hospital in Vancouver, “13 babie...","I ain't gonna wait for the Mauri Mauri on me,..."


In [21]:
labeling_mc_df['Stance Agreement'] = -1

In [22]:
random_df = labeling_mc_df.sample(frac=1).reset_index(drop=True)

labeling_phase = 'top_3000_context_2_single_sentence_'
labeling_dir = f"{blank_manually_labeled_datasets_dir}/{labeling_phase}"

overlap_df = random_df[0:600]
overlap_df.to_csv(labeling_dir + 'omar_overlap.csv', index=False)
overlap_df.to_csv(labeling_dir + 'abhijeet_overlap.csv', index=False)
overlap_df.to_csv(labeling_dir + 'jon_overlap.csv', index=False)

jon_df = random_df[600:1400]
jon_df.to_csv(labeling_dir + 'jon_individual.csv', index=False)

omar_df = random_df[1400:2200]
omar_df.to_csv(labeling_dir + 'omar_individual.csv', index=False)

ahibjeet_df = random_df[2200:]
ahibjeet_df.to_csv(labeling_dir + 'abhijeet_individual.csv', index=False)

# Interrater Scoring

In [24]:
import pandas as pd
import numpy as np

from google.colab import auth
auth.authenticate_user()

import gspread
from oauth2client.client import GoogleCredentials

from nltk.metrics.agreement import AnnotationTask
from google.colab import drive
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [25]:
## Assumes that the manually labeled data is a Google Spreadsheet 
## (after clicking 'Open in Sheets' for the csv and manually changing stance agreement to our manual label)

gc = gspread.authorize(GoogleCredentials.get_application_default())

def df_from_google_sheets(path, columns):

  worksheet = gc.open(path).sheet1

  rows = worksheet.get_all_values()

  rows = [x[:5] for x in rows]

  return pd.DataFrame.from_records(rows[1:], columns=columns)

## Scoring

In [None]:
## Reads overlap files of manually labeled data

parent_dir = '/drive/MyDrive/spotify-misinformation'
labeling_output_dir = f"{parent_dir}/labeling-output"
blank_manually_labeled_datasets_dir = f"{labeling_output_dir}/blank-manual-labeling-output"

labeling_phase = 'top_3000_context_2_single_sentence_'
labeling_dir = f"{blank_manually_labeled_datasets_dir}/{labeling_phase}"

column_names = ['Fact Checked Claim Index', 'Podcast Claim Index', 'Fact Checked Claim', 'Podcast Claim', 'Stance Agreement']

jon_overlap = df_from_google_sheets(labeling_phase + 'jon_overlap', columns=column_names)
omar_overlap = df_from_google_sheets(labeling_phase + 'omar_overlap', columns=column_names)
ahibjeet_overlap = df_from_google_sheets(labeling_phase + 'abhijeet_overlap', columns=column_names)

# get first k rows that we've labeled so far

jo_200 = jon_overlap['Stance Agreement'][:600]
oo_200 = omar_overlap['Stance Agreement'][:600]
ao_200 = ahibjeet_overlap['Stance Agreement'][:600]

val = '1'
avg_agree_count = (np.sum(jo_200 == val) + np.sum(oo_200 == val) + np.sum(ao_200 == val)) / 3
print('Avg agree count', avg_agree_count)

val = '2'
avg_disagree_count = (np.sum(jo_200 == val) + np.sum(oo_200 == val) + np.sum(ao_200 == val)) / 3
print('Avg disagree count', avg_disagree_count)

val = '3'
avg_unrelated_count = (np.sum(jo_200 == val) + np.sum(oo_200 == val) + np.sum(ao_200 == val)) / 3
print('Avg unrelated count', avg_unrelated_count)

data = []
data += [('Jon', idx, x) for idx, x in enumerate(jo_200)]
data += [('Omar', idx, x) for idx, x in enumerate(oo_200)]
data += [('Abhijeet', idx, x) for idx, x in enumerate(ao_200)]

task = AnnotationTask(data=data)

print('Kappa score', task.kappa())
print('Multi Kappa score', task.multi_kappa())
print('Krippendorf Alpha', task.alpha())

In [None]:
print('Scores between Jon and Omar')
data = []
data += [('Jon', idx, x) for idx, x in enumerate(jo_200)]
data += [('Omar', idx, x) for idx, x in enumerate(oo_200)]

task = AnnotationTask(data=data)

print('Observed Agreement', task.Ao('Jon', 'Omar'))
print('Kappa score', task.kappa())
print('Multi Kappa score', task.multi_kappa())
print('Krippendorf Alpha', task.alpha())

In [None]:
print('Scores between Jon and Ahbijeet')
data = []
data += [('Jon', idx, x) for idx, x in enumerate(jo_200)]
data += [('Abhijeet', idx, x) for idx, x in enumerate(ao_200)]

task = AnnotationTask(data=data)

print('Observed Agreement', task.Ao('Jon', 'Abhijeet'))
print('Kappa score', task.kappa())
print('Multi Kappa score', task.multi_kappa())
print('Krippendorf Alpha', task.alpha())

In [None]:
print('Scores between Ahbijeet and Omar')
data = []
data += [('Omar', idx, x) for idx, x in enumerate(oo_200)]
data += [('Abhijeet', idx, x) for idx, x in enumerate(ao_200)]

task = AnnotationTask(data=data)

print('Observed Agreement', task.Ao('Abhijeet', 'Omar'))
print('Kappa score', task.kappa())
print('Multi Kappa score', task.multi_kappa())
print('Krippendorf Alpha', task.alpha())