In [8]:
import os
# Select which GPU index CUDA exposes to this process; device indices are
# ordered by PCI bus ID.
gpu_avail = 0
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES=str(gpu_avail),
)

In [19]:
from datasets import load_dataset, Dataset, load_from_disk
import random
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, InputExample
import torch
from sklearn.preprocessing import StandardScaler
import numpy as np
import json
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from collections import Counter, defaultdict
import pickle
# Dataset directory and the dataset prefix used to build its file names
# (e.g. f'{name}_{split}_Claims.json' inside `path`).
path, name = '../COVID_pub', 'COVID'

In [11]:
# Load the claims JSON of every requested split into a dict keyed by split name.
Claims_total = {}
for split in ['train']:
    claims_file = os.path.join(path, f'{name}_{split}_Claims.json')
    with open(claims_file, 'r') as file:
        Claims_total[split] = json.load(file)

In [12]:
# Use the first visible CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


In [13]:
# Sentence encoder used to embed the claims, placed on the selected device.
sbert_model_name = 'all-MiniLM-L12-v2'
sbert = SentenceTransformer(sbert_model_name, device=device)

  raise LocalEntryNotFoundError(
  return torch.load(checkpoint_file, map_location="cpu")


In [17]:
# Flatten the train split's per-key claim lists into a single list,
# keeping dict order and the order inside each list.
claims_train = []
for sublist in Claims_total['train'].values():
    claims_train.extend(sublist)
len(claims_train)

26772

In [16]:
%%time
# Embed every training claim with the SBERT encoder.
embeddings_train = sbert.encode(
    claims_train,
    batch_size=4096,
    show_progress_bar=True,
    device=device,
)

Batches: 100%|██████████| 7/7 [00:03<00:00,  2.21it/s]


CPU times: user 11.3 s, sys: 1.32 s, total: 12.7 s
Wall time: 3.74 s


In [18]:
# Alias the train embeddings under the generic name that the pickling and
# scaling cells read; the trailing expression displays how many there are.
embeddings = embeddings_train
len(embeddings)

26772

In [20]:
# Persist the raw (unscaled) embeddings to the working directory.
with open('SBERT_COVID_embeddings.pkl', mode='wb') as pkl_file:
    pickle.dump(embeddings, pkl_file)

In [21]:
# Standardise the embeddings feature-wise before clustering; the fitted
# scaler is kept in `scaler`.
scaler = StandardScaler()
scaler.fit(embeddings)
embeddings_scaled = scaler.transform(embeddings)

In [22]:
# Run DBSCAN for each candidate eps and report the cluster structure.
for val in [11]:
    dbscan = DBSCAN(eps=val, min_samples=3)
    clusters = dbscan.fit_predict(embeddings_scaled)

    # Label -1 is DBSCAN's noise bucket; it is not counted as a cluster.
    cluster_counts = Counter(clusters)
    n_noise = cluster_counts.get(-1, 0)
    n_clusters = len(cluster_counts) - (1 if n_noise else 0)

    print("-"*30)
    print(val)
    print(f'Number of clusters: {n_clusters}')
    print(f'Number of points classified as noise: {n_noise}')

    # Sizes of the 20 largest labels (the noise bucket is included here).
    n_points = list(cluster_counts.values())
    n_points_sorted = sorted(n_points, reverse=True)
    print(n_points_sorted[:20])

    # The silhouette score needs at least two clusters.
    if n_clusters <= 1:
        print("Not enough clusters to calculate the silhouette score.")
    else:
        silhouette_avg = silhouette_score(embeddings_scaled, clusters)
        print(f"Silhouette Coefficient: {silhouette_avg:.3f}")

------------------------------
11
Number of clusters: 1347
Number of points classified as noise: 15771
[15771, 3239, 253, 158, 143, 105, 84, 80, 63, 61, 57, 53, 45, 43, 40, 37, 37, 36, 35, 35]
Silhouette Coefficient: -0.071


In [23]:
# Concatenate the sentence lists of every requested split. Later cells index
# this list with positions taken from the cluster label array.
Sentences_total = []
for split in ['train']:
    sentences_file = os.path.join(path, f'{name}_{split}_Sentences.json')
    with open(sentences_file, 'r') as file:
        Sentences_total.extend(json.load(file))

In [27]:
# Map each training sentence to its label; when a sentence occurs more than
# once, the last label seen wins.
dataset = load_from_disk(path)
train_split = dataset['train']
sen2label = dict(zip(train_split['sentences'], train_split['label']))
len(sen2label)

10662

In [31]:
# Print the real/fake make-up of the most populated labels (noise is skipped).
for cluster, num in Counter(clusters).most_common(50):
    if cluster == -1:
        continue

    print('-'*40)
    print(f'Cluster:{cluster} / Data points:{num}')

    # Positions of the points assigned to this cluster, and their sentences.
    member_idx = np.flatnonzero(clusters == cluster)
    matched_sentences = [Sentences_total[i] for i in member_idx]
    unique_sentences = set(matched_sentences)

    print(f'Matched Sentences:{len(matched_sentences)} / Unique Matched Sentences:{len(unique_sentences)}')

    # Labels are counted once per unique sentence; anything not 'real' counts as fake.
    cnt_r = sum(1 for sen in unique_sentences if sen2label[sen] == 'real')
    cnt_f = len(unique_sentences) - cnt_r

    print(f'Real:{cnt_r} / Fake:{cnt_f}')

----------------------------------------
Cluster:1 / Data points:3239
Matched Sentences:3239 / Unique Matched Sentences:1178
Real:1010 / Fake:168
----------------------------------------
Cluster:5 / Data points:253
Matched Sentences:253 / Unique Matched Sentences:108
Real:108 / Fake:0
----------------------------------------
Cluster:8 / Data points:158
Matched Sentences:158 / Unique Matched Sentences:153
Real:153 / Fake:0
----------------------------------------
Cluster:10 / Data points:143
Matched Sentences:143 / Unique Matched Sentences:89
Real:89 / Fake:0
----------------------------------------
Cluster:93 / Data points:105
Matched Sentences:105 / Unique Matched Sentences:80
Real:80 / Fake:0
----------------------------------------
Cluster:68 / Data points:84
Matched Sentences:84 / Unique Matched Sentences:81
Real:81 / Fake:0
----------------------------------------
Cluster:9 / Data points:80
Matched Sentences:80 / Unique Matched Sentences:52
Real:52 / Fake:0
-----------------------

In [32]:
# For clusters whose unique sentences all share one label, record
# cluster id -> number of data points and cluster id -> number of unique sentences.
only_fake, only_fake_unique = {}, {}
only_real, only_real_unique = {}, {}
for cluster, num in tqdm(Counter(clusters).most_common()):
    if cluster == -1:
        continue

    # Unique sentences behind the points assigned to this cluster.
    member_idx = np.where(clusters == cluster)[0]
    unique_sentences = {Sentences_total[i] for i in member_idx}

    # Anything not labelled 'real' counts as fake.
    cnt_r = sum(1 for sen in unique_sentences if sen2label[sen] == 'real')
    cnt_f = len(unique_sentences) - cnt_r

    if cnt_f > 0 and cnt_r == 0:
        only_fake[cluster] = num
        only_fake_unique[cluster] = len(unique_sentences)
    elif cnt_r > 0 and cnt_f == 0:
        only_real[cluster] = num
        only_real_unique[cluster] = len(unique_sentences)

100%|██████████| 1348/1348 [00:00<00:00, 17693.39it/s]


In [33]:
# Number of fake-only and real-only clusters; each "_unique" dict is filled
# alongside its counterpart, so the paired lengths match.
len(only_fake), len(only_fake_unique), len(only_real), len(only_real_unique)

(571, 571, 735, 735)

In [34]:
# Keep a point's label only if its cluster is single-label (fake-only or
# real-only); noise and mixed clusters are relabelled -1.
new_clusters = [
    d if (d != -1 and (d in only_fake or d in only_real)) else -1
    for d in tqdm(clusters)
]

100%|██████████| 26772/26772 [00:00<00:00, 644470.18it/s]


In [35]:
# Sanity check: the number of points that kept a label should equal the total
# number of data points in the single-label clusters.
len(new_clusters) - new_clusters.count(-1), sum(only_fake.values()) + sum(only_real.values())

(7392, 7392)

In [37]:
### Keep only the clusters that have >= 10 unique sentences
cluster_fake = {k: v for k, v in only_fake_unique.items() if v >= 10}
cluster_real = {k: v for k, v in only_real_unique.items() if v >= 10}
len(cluster_fake), len(cluster_real), sum(cluster_fake.values()) + sum(cluster_real.values())

(7, 47, 1535)

In [38]:
# Cluster ids as strings, in dict order, ready to be written out as JSON.
fake_clusters = list(map(str, cluster_fake))
real_clusters = list(map(str, cluster_real))

In [39]:
# Write the retained real-only and fake-only cluster ids next to the dataset.
for out_name, cluster_ids in (
    ('real_clusters.json', real_clusters),
    ('fake_clusters.json', fake_clusters),
):
    with open(os.path.join(path, out_name), 'w') as file:
        json.dump(cluster_ids, file)

In [40]:
# Final cluster set: fake-only clusters followed by real-only clusters
# (cluster id -> number of unique sentences).
cluster_fin = {**cluster_fake, **cluster_real}
len(cluster_fin)

54

In [85]:
# Space-separated list of the final cluster ids (each id is followed by a space).
s = "".join(f"{k} " for k in cluster_fin.keys())
s

'33 136 48 281 170 319 217 5 8 10 93 68 9 124 69 74 125 53 13 473 27 112 100 159 24 128 318 39 73 77 330 221 414 138 156 411 25 450 94 856 0 14 459 97 536 42 45 15 75 214 167 180 23 131 '

In [42]:
# Seed torch and Python's `random` module; the test-split cell below draws
# its samples with `random.sample`.
random_seed = 123
torch.manual_seed(random_seed)
random.seed(random_seed)

In [43]:
# Collect the unique sentences of every retained cluster.
# sorted() pins the order: iterating a set of strings depends on per-process
# hash randomisation, so without it the seeded random.sample below would draw
# a different test split after every kernel restart.
cluster_w_sentences = dict()
for cluster in tqdm(cluster_fin.keys()):
    indexes = np.where(clusters == cluster)[0]
    matched_sentences = sorted(set(Sentences_total[i] for i in indexes))
    cluster_w_sentences[cluster] = matched_sentences

# Hold out 20% (rounded down) of each cluster's unique sentences for testing.
cluster_w_test_sentences = dict()
for cluster, sens in cluster_w_sentences.items():
    test_sens = random.sample(sens, int(0.2*len(sens)))
    cluster_w_test_sentences[cluster] = test_sens

100%|██████████| 54/54 [00:00<00:00, 9523.69it/s]


In [44]:
# Total number of held-out test sentences across all clusters.
sum(len(test_sens) for test_sens in cluster_w_test_sentences.values())

287

In [46]:
## Sentences held out for testing; they are excluded from contrastive learning
deleted_sens = [sen for sens in cluster_w_test_sentences.values() for sen in sens]

In [48]:
## Remove the test sentences from new_clusters by relabelling them -1
# Membership is tested against a set: looking each of the ~27k sentences up in
# the `deleted_sens` list made this loop quadratic for no benefit.
deleted_lookup = set(deleted_sens)

# Snapshot of the labels before filtering; `new_clusters` itself is edited in place.
new_clusters_ = new_clusters.copy()
cnt = 0  # number of positions relabelled (every occurrence of a test sentence)
for ind, (sen, _) in tqdm(enumerate(zip(Sentences_total, new_clusters_))):
    if sen in deleted_lookup:
        new_clusters[ind] = -1
        cnt += 1
cnt

26772it [00:00, 151521.77it/s]


1222

In [53]:
# Convert the labels and the dict keys to strings so they can be written as JSON.
new_clusters = list(map(str, new_clusters))
cluster_w_test_sentences = {
    str(cluster): test_sens for cluster, test_sens in cluster_w_test_sentences.items()
}

In [57]:
# `cluster_w_tv_sentences` was dumped below without being defined anywhere in
# the notebook, so a fresh Restart & Run All stopped here with a NameError.
# NOTE(review): rebuilt as "each cluster's unique sentences minus its test
# sentences" - confirm this matches the definition from the removed cell.
cluster_w_tv_sentences = dict()
for cluster, sens in cluster_w_sentences.items():
    # The test dict is keyed by str once the string-conversion cell has run;
    # fall back to the raw key in case it has not.
    held_out = set(
        cluster_w_test_sentences.get(str(cluster), cluster_w_test_sentences.get(cluster, []))
    )
    # Keys are stringified because the raw cluster ids are not JSON-serialisable keys.
    cluster_w_tv_sentences[str(cluster)] = [sen for sen in sens if sen not in held_out]

with open(os.path.join(path, 'Cluster_w_test_sentences.json'), 'w') as file:
    json.dump(cluster_w_test_sentences,file)
with open(os.path.join(path, 'Cluster_w_train_valid_sentences.json'), 'w') as file:
    json.dump(cluster_w_tv_sentences,file)

In [58]:
# Keep the (sentence, cluster, claim) triples whose cluster label is not "-1".
kept_rows = [
    (sen, cluster, claim)
    for sen, cluster, claim in zip(Sentences_total, new_clusters, claims_train)
    if cluster != "-1"
]
sen_for_contrastive = [row[0] for row in kept_rows]
cluster_for_contrastive = [row[1] for row in kept_rows]
claims_for_contrastive = [row[2] for row in kept_rows]

In [60]:
# Count the remaining points per cluster and mark clusters with fewer than 3
# points for removal.
dict_c = dict(Counter(cluster_for_contrastive))
deleted_clusters = [c for c, v in dict_c.items() if v < 3]
len(deleted_clusters)

28

In [61]:
# Drop the triples that belong to the under-populated clusters.
# NOTE: this rebinds `sentences`, `clusters` and `claims`; `clusters` no longer
# holds the DBSCAN label array after this cell runs.
surviving_rows = [
    row
    for row in zip(sen_for_contrastive, cluster_for_contrastive, claims_for_contrastive)
    if row[1] not in deleted_clusters
]
sentences = [row[0] for row in surviving_rows]
clusters = [row[1] for row in surviving_rows]
claims = [row[2] for row in surviving_rows]
len(sentences), len(clusters), len(claims)

(6570, 6570, 6570)

In [62]:
# Write the three parallel lists used for contrastive learning next to the dataset.
for out_name, payload in (
    ('Sentences_for_contrastive.json', sentences),
    ('Clusters_for_contrastive.json', clusters),
    ('Claims_for_contrastive.json', claims),
):
    with open(os.path.join(path, out_name), 'w') as file:
        json.dump(payload, file)