In [4]:
!pip install -U sentence-transformers

import pandas as pd
import time

# SentenceTransformer produces the sentence embeddings; util supplies the
# cosine-similarity helper (util.pytorch_cos_sim) used by the matching cell.
from sentence_transformers import SentenceTransformer, util
# Load the pretrained 'all-MiniLM-L6-v2' model and place it on the GPU.
# device='cuda' means this cell needs a GPU-enabled runtime.
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')

# Mount Google Drive at /drive; every data path in this notebook lives under it.
from google.colab import drive
drive.mount('/drive')

Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 3.6 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 10.6 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 52.8 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 44.6 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 631 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |█████████████████████

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Mounted at /drive


In [5]:
class PodcastClaims:
  """A podcast claim sentence bundled with its surrounding context sentences.

  Args:
    show_uri: Show URI, stored unchanged.
    episode_uri: Episode URI, stored unchanged.
    transcript_claims: Sequence of sentence strings. The middle element is
      taken as the claim itself and the remaining elements as its context.
      May be empty.
    index: Caller-supplied identifier for this record, stored unchanged.

  Attributes:
    center_claim: The element at position ``len(transcript_claims) // 2``
      (for an even length this is the upper of the two middle elements),
      or ``""`` when ``transcript_claims`` is empty.
    context_claim: All elements of ``transcript_claims`` joined with ``"."``.
  """

  def __init__(self, show_uri, episode_uri, transcript_claims, index):
    self.show_uri = show_uri
    self.episode_uri = episode_uri
    # A row without any sentence columns yields an empty sequence; indexing
    # its "middle" would raise IndexError, so fall back to an empty claim.
    if len(transcript_claims) > 0:
      self.center_claim = transcript_claims[len(transcript_claims) // 2]
    else:
      self.center_claim = ""
    self.context_claim = ".".join(transcript_claims)
    self.index = index

In [8]:
## Copied from colab-setup; re-copy this cell if that file changes

# Root folder of the project on the mounted Drive
parent_dir = '/drive/MyDrive/spotify-misinformation'

# Sub-folders holding the preprocessing results and the matching results
preprocessing_output_dir = parent_dir + "/preprocessing-output"
matched_claims_output_dir = parent_dir + "/matched-claims-output"

# Inputs: the fact checked claims (CSV) and the podcast claims (TSV)
fact_checked_claims_fp = "/".join([preprocessing_output_dir, "politifact_filtered.csv"])
transcript_claims_fp = "/".join([preprocessing_output_dir, "podcast_claims_context_2.tsv"])

# Output: where the matched claim pairs are written
matched_claims_fp = "/".join([matched_claims_output_dir, "matched_claims_context_2.txt"])

# Utilities 

Helper snippets used for testing, and for finding where to resume if the runtime disconnects while matching claims.

In [9]:
### Find where claims stopped if runtime gets disconnected
### If disconnected, uncomment the lines marked 'Runtime Disconnected' in the loop below
### and paste the printed highest_ts_claim in place of the hard-coded index
# highest_ts_claim = 0

# with open(matched_claims_fp) as f:
#   for line in f:
#     temp = line.split(",")
#     if int(temp[1]) > highest_ts_claim:
#       highest_ts_claim = int(temp[1])

# print(highest_ts_claim)

In [2]:
## Find amount of transcript claims

# count = 0

# with open(transcript_claims_fp) as f:
#   for line in f:
#     count += 1

# print(count)

In [3]:
## Testing, how to parse podcast claims tsv file

# with open(transcript_claims_fp) as f:
#   for line in f:
#     tl = line.strip().split('\t')
#     claim = PodcastClaims(tl[0], tl[1], tl[2:], 1)
#     print(claim.center_claim)
#     break

# Match Claims Single Sentence

Computes the similarity score between each fact-checked claim and a single podcast sentence (the center claim).

In [11]:
claims_df = pd.read_csv(fact_checked_claims_fp)

claims = claims_df['Statement'].tolist()
claim_embeddings = model.encode(claims, convert_to_tensor=True)

batch_size = 50000

## Embeddings with lower similarity usually aren't semantically related
## Used to limit paired claims stored that aren't semantically related
similarity_threshold = 0.4


def _write_matches(batch, out_file, kb_embeddings, threshold):
  """Score one batch of podcast claims against the fact checked claims.

  Encodes the ``center_claim`` of every item in ``batch``, computes the cosine
  similarity against ``kb_embeddings`` and writes one line per pair whose
  score is strictly greater than ``threshold``, in the form
  ``"<fact checked claim row>, <podcast claim index>, <score>"``.

  Args:
    batch: List of PodcastClaims collected since the last flush. May be empty.
    out_file: Open, writable text file that receives the matched pairs.
    kb_embeddings: Tensor of embeddings for the fact checked claims.
    threshold: Minimum (exclusive) cosine similarity for a pair to be stored.
  """
  # Nothing to do for an empty batch (empty input file, or the last line fell
  # exactly on a batch boundary). Skipping also avoids calling model.encode on
  # an empty list, which is not guaranteed to work with convert_to_tensor=True.
  if not batch:
    return

  ## NOTE: Here the center_claim from the podcast is compared
  ts_claims = [x.center_claim for x in batch]
  ts_embeddings = model.encode(ts_claims, convert_to_tensor=True)

  cosine_scores = util.pytorch_cos_sim(ts_embeddings, kb_embeddings)

  # Pull all matching indices and their scores out of the score matrix at once
  # instead of indexing a 0-dim tensor per match; the written text is the same.
  ts_idxs, kb_idxs = (cosine_scores > threshold).nonzero(as_tuple=True)
  scores = cosine_scores[ts_idxs, kb_idxs]

  for ts_idx, kb_idx, score in zip(ts_idxs.tolist(), kb_idxs.tolist(), scores.tolist()):
    out_file.write(f"{kb_idx}, {batch[ts_idx].index}, {score}\n")


start_time = time.time()

with open(transcript_claims_fp) as all_transcripts, open(matched_claims_fp, 'w') as matched_claims:

  eps = []

  for idx, line in enumerate(all_transcripts):

    ### Runtime Disconnected
    ## Remember to change file open to append instead of write
    # if idx <= 12450000:
    #   continue

    tl = line.strip().split('\t')
    eps.append(PodcastClaims(tl[0], tl[1], tl[2:], idx))

    if idx % batch_size == 0 and idx != 0:
      # every batch size, compute cosine similarities and store the matches
      _write_matches(eps, matched_claims, claim_embeddings, similarity_threshold)

      eps = []

      print(idx, time.time() - start_time)

  # save the last bit of claims (no-op when nothing is left over)
  _write_matches(eps, matched_claims, claim_embeddings, similarity_threshold)

print(time.time() - start_time)

0.03060460090637207


# Match Claims Full Context

Computes the similarity score between a fact-checked claim and a paragraph from the podcast. A paragraph consists of a central sentence plus the sentences surrounding it; how many surrounding sentences are included depends on the context size.

We ended up comparing only the single-sentence podcast claims (previous section), since similarity scores suffered when the full context was used. The code is left here, commented out, in case it's needed later.

In [None]:
# matched_claims_fp = f"{matched_claims_output_dir}/matched_claims_context_2_full_context.txt"

# claims_df = pd.read_csv(fact_checked_claims_fp)

# claims = claims_df['Statement'].tolist()
# claim_embeddings = model.encode(claims, convert_to_tensor=True)

# batch_size = 50000
# similarity_threshold = 0.4

# start_time = time.time()

# with open(transcript_claims_fp) as all_transcripts:

#   with open(matched_claims_fp, 'w') as matched_claims:

#     eps = []

#     for idx, line in enumerate(all_transcripts):

#       # used for when runtime stops
#       # remember to change file open to append instead of write
#       # if idx <= 12450000:
#       #   continue

#       tl = line.strip().split('\t')
#       eps.append(PodcastClaims(tl[0], tl[1], tl[2:], idx))

#       if idx % batch_size == 0 and idx != 0:
#         # every batch size, compute cosine similarites 

#         ## NOTE: Here the context_claim from the podcast is compared
#         ##       This contains the podcast claim + surrounding context
#         ts_claims = [x.context_claim for x in eps]
#         ts_embeddings = model.encode(ts_claims, convert_to_tensor=True)

#         cosine_scores = util.pytorch_cos_sim(ts_embeddings, claim_embeddings)

#         for ts_idx, kb_idx in (cosine_scores > similarity_threshold).nonzero():
#           matched_claims.write(f"{kb_idx}, {eps[ts_idx].index}, {cosine_scores[ts_idx][kb_idx]}\n")

#         eps = []

#         print(idx, time.time() - start_time)
        
#     # save the last bit of claims

#     ## NOTE: Here the context_claim from the podcast is compared
#     ts_claims = [x.context_claim for x in eps]
#     ts_embeddings = model.encode(ts_claims, convert_to_tensor=True)

#     cosine_scores = util.pytorch_cos_sim(ts_embeddings, claim_embeddings)

#     for ts_idx, kb_idx in (cosine_scores > similarity_threshold).nonzero():
#       matched_claims.write(f"{kb_idx}, {eps[ts_idx].index}, {cosine_scores[ts_idx][kb_idx]}\n")

# print(time.time() - start_time)

50000 45.155678272247314
100000 96.85403060913086
150000 140.59924817085266
200000 182.4154679775238
250000 228.54540419578552
300000 273.93321204185486
350000 326.9467101097107
400000 380.4247193336487
450000 443.7802860736847
500000 504.08126306533813
550000 552.4252882003784
600000 600.1172091960907
650000 656.4657881259918
700000 703.1634609699249
750000 748.4571068286896
800000 796.2829892635345
850000 845.3559839725494
900000 893.3341336250305
950000 946.8825554847717
1000000 1000.5712215900421
1050000 1045.2510960102081
1100000 1087.2823753356934
1150000 1137.8694097995758
1200000 1192.3323059082031
1250000 1235.9014730453491
1300000 1275.3370125293732
1350000 1322.8403897285461
1400000 1365.1133885383606
1450000 1419.7984688282013
1500000 1464.277381181717
1550000 1509.7717423439026
1600000 1557.661364555359
1650000 1611.306000471115
1700000 1656.435460805893
1750000 1699.0211038589478
1800000 1742.9788930416107
1850000 1786.111077785492
1900000 1832.8172554969788
1950000 1869.