In [None]:
import csv
import os
import pickle
from tqdm import tqdm 
from nltk import tokenize
from gensim.parsing.preprocessing import remove_stopwords

In [None]:
import sys
sys.path.append('../')
from utils import *
from graphUtils import *

In [None]:
# Load the pre-computed PolitiFact artefacts.
# As used further down in this notebook:
#   facts_info   - indexed by the entries of `facts`; each value is passed to normalize_text
#   facts        - iterable of fact keys
#   all_claims   - loaded here; not referenced again in the visible cells
#   ground_truth - mapping whose keys are claims and whose values are the gold facts
# WARNING: pickle.load can execute arbitrary code while unpickling; only load trusted files.
def _load_pickle(path):
    """Unpickle the object stored at `path`, closing the file handle afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


facts_info = _load_pickle('../../data/politifact/politi_factsInfo.pkl')
facts = _load_pickle('../../data/politifact/politi_facts.pkl')
all_claims = _load_pickle('../../data/politifact/politi_claims.pkl')
ground_truth = _load_pickle('../../data/politifact/politi_GT.pkl')

# GRAPH CREATION

In [None]:
import networkx as nx

# Undirected graph holding one node per claim plus one node per n-gram token
# extracted from the (normalised, stop-word-free) claim text.
G = nx.Graph()
K = 3  # second argument handed to find_all_n_grams
i = 0
nodes_labels = {}  # claim node name -> label (identical to the node name)
claim_ids = {}     # claim node name -> original claim key
id_claim = {}      # original claim key -> claim node name

for i, claim in enumerate(tqdm(list(ground_truth.keys())), start=1):
    node = remove_stopwords(normalize_text(claim))

    # Claim nodes are named CLM1, CLM2, ... in ground_truth iteration order.
    node_name = 'CLM' + str(i)
    G.add_node(node_name, label=node_name, type='Claim')
    nodes_labels[node_name] = node_name
    claim_ids[node_name] = claim
    id_claim[claim] = node_name

    # Spaces inside an n-gram become underscores; longest n-grams are inserted first.
    n_grams = sorted(
        (gram.replace(' ', '_') for gram in find_all_n_grams(node, K)),
        key=len,
        reverse=True,
    )

    for token in n_grams:
        G.add_node(token, label=token, type='Token')
        if not G.has_edge(node_name, token):
            G.add_edge(node_name, token)

In [None]:
# Add one node per fact and connect it to the token nodes it shares with the claims.
# NOTE(review): fact_ids stores the *normalised* fact text, whereas claim_ids stores the raw
# claim key; the evaluation cell compares fact_ids values against ground_truth entries --
# confirm both sides use the same representation.
i = 0
fact_ids = {}   # fact node name -> normalised fact text
id_fact = {}    # normalised fact text -> fact node name
node_maps = []

for i, fact in enumerate(tqdm(facts), start=1):
    node = remove_stopwords(normalize_text(facts_info[fact]))

    # Fact nodes are named FCT1, FCT2, ... in iteration order of `facts`.
    name = 'FCT' + str(i)
    fact_ids[name] = node
    id_fact[node] = name

    G.add_node(name, label=name, type='Fact')

    n_grams = sorted(
        (gram.replace(' ', '_') for gram in find_all_n_grams(node, K)),
        key=len,
        reverse=True,
    )

    for token in n_grams:
        # Only link to nodes that already exist in the graph; facts never introduce new tokens.
        if G.has_node(token) and not G.has_edge(name, token):
            G.add_edge(name, token)


In [None]:
# (node count, edge count) of the claim/fact/token graph
G.number_of_nodes(), G.number_of_edges()

# Expansion with ConceptNet

In [None]:
import conceptnet_lite
# Connect conceptnet_lite to the database file at this relative path; the Label / edges_for
# queries in the expansion cell go through this connection.
# NOTE(review): check the conceptnet_lite docs for what connect() does when the file is
# missing (it may try to download the database) before running this on a new machine.
conceptnet_lite.connect("../../data/conceptnet.db")

In [None]:
from conceptnet_lite import Label, edges_for
from tqdm import tqdm

# Expand every Token node with the concepts ConceptNet links it to.
# Iterate over a snapshot of the node names: nodes added during the expansion are not
# themselves expanded.
for node in tqdm(list(G.nodes())):
    if G.nodes[node]['type'] != 'Token':
        continue

    # Underscores were introduced when the n-grams were built; query with spaces instead.
    try:
        concepts = Label.get(text=G.nodes[node]['label'].replace('_', ' '), language='en').concepts
    except Label.DoesNotExist:
        # The token has no English label in ConceptNet: nothing to expand.
        continue

    for e in edges_for(concepts, same_language=True):
        # Take the endpoint of the ConceptNet edge that is not the current token.
        new_node = e.end.text if e.start.text == node else e.start.text
        rel = e.relation.name

        # normalize_text comes from `from utils import *`; the module name `utils` itself is
        # not bound in this notebook, so it must be called unqualified.
        for n in normalize_text(new_node).split():
            if not G.has_node(n):
                G.add_node(n, label=n, type='Token')
            # Link the token to every word of the related concept, tagged with the relation.
            G.add_edge(node, n, type=rel)

# Single pruning pass: drop nodes whose degree is below 2 at the moment they are visited.
# Removing a node lowers its neighbours' degrees, so the result depends on visiting order.
for n in list(G.nodes()):
    if G.degree(n) < 2:
        G.remove_node(n)

In [None]:
# (node count, edge count) after the ConceptNet expansion and degree pruning
G.number_of_nodes(), G.number_of_edges()

# Compression

## SSumM

In [None]:
# Give every node a dense integer id (0, 1, 2, ... in G's node iteration order) so the
# graph can be written as an integer edge list, and keep the reverse lookup as well.
node_ids = {node_name: idx for idx, node_name in enumerate(G.nodes)}
i = len(node_ids)
inv_nodes = dict(zip(node_ids.values(), node_ids.keys()))

In [None]:
# Write the graph as a tab-separated integer edge list (one "<id>\t<id>" line per edge).
# The context manager closes the file, so every line is flushed to disk before any
# external tool reads it.
with open('../../politi_edgelist', 'w') as file:
    for e in G.edges():
        file.write(str(node_ids[e[0]]) + '\t' + str(node_ids[e[1]]) + '\n')

In [None]:
# Read the summary file into a list of lines with surrounding whitespace
# (including the trailing newline) stripped.
with open('../SSumM/output/summary_politi_edgelist.txt') as f:
    sum_grapph = [raw_line.strip() for raw_line in f]

In [None]:
super_nodes, super_edges = {}, []
edge_weights = {}

# The '<Superedge info>' marker line separates the supernode section from the superedge section.
superedge_marker = sum_grapph.index('<Superedge info>')

# Supernode section (line 0 is skipped): "<supernode id>\t<member id>\t<member id>..."
# Member ids are translated back to node names through inv_nodes.
for i in range(1, superedge_marker):
    idd, *member_ids = sum_grapph[i].split('\t')
    super_nodes[idd] = [inv_nodes[int(member)] for member in member_ids]

# Superedge section: the first three tab-separated fields are the two supernode ids
# and a weight (kept as the raw string read from the file).
for i in range(superedge_marker + 1, len(sum_grapph)):
    e = sum_grapph[i].split('\t')
    src, dst, weight = e[0], e[1], e[2]

    # Store the weight symmetrically so it can be looked up from either endpoint.
    edge_weights.setdefault(src, {})[dst] = weight
    edge_weights.setdefault(dst, {})[src] = weight

    super_edges.append((src, dst))



In [None]:
SG = nx.Graph()

# Prefixes that mark a supernode whose first member is a claim or fact node. Claim and fact
# nodes are named 'CLM<i>' / 'FCT<i>' when the graph is built, so those prefixes are checked
# in addition to the original 'Claim' / 'Fact' ones.
ENTITY_PREFIXES = ('Claim', 'Fact', 'CLM', 'FCT')

# One summary-graph node per supernode id.
# NOTE(review): every node gets type='node', so the Claim/Fact/Token types of the original
# graph are not available on SG -- confirm that later cells filtering on 'type' are not
# meant to run on this graph.
for node in super_nodes:
    members = super_nodes[node]
    joined = ' '.join(members)
    if joined.startswith(ENTITY_PREFIXES):
        # Keep the full member list in the label when it starts with a claim/fact node.
        name = joined
    elif members:
        # Otherwise the first member represents the whole supernode.
        name = members[0]
    else:
        # Supernode without members: fall back to its id instead of raising IndexError.
        name = node

    SG.add_node(node, label=name, type='node')

for e in super_edges:
    SG.add_edge(e[0], e[1])

In [None]:
# From here on the pipeline works on the summary graph.
G = SG
G.number_of_nodes(), G.number_of_edges()

## Bridge Removal

In [None]:
# Bind G1 to the current graph so this cell runs on a fresh kernel. The pruning is done in
# place, which means the graph consumed by the following cells (G) is the pruned one.
G1 = G

# Single pass over a snapshot of the node names:
#   degree < 2  -> dangling node, removed
#   degree == 2 -> pass-through node, removed after connecting its two neighbours directly
for n in tqdm(list(G1.nodes())):
    deg = G1.degree(n)
    if deg < 2:
        G1.remove_node(n)
    elif deg == 2:
        ns = list(nx.neighbors(G1, n))
        # A self-loop alone also yields degree 2 but only one neighbour (the node itself);
        # there is nothing to bridge in that case, the node is simply dropped.
        if len(ns) == 2 and not G1.has_edge(ns[0], ns[1]):
            G1.add_edge(ns[0], ns[1])
        G1.remove_node(n)

In [None]:
# (node count, edge count) of G1
G1.number_of_nodes(), G1.number_of_edges()

# MSP

In [None]:
G1 = nx.Graph()

from random import choice

# Number of random (fact, claim) pairs to sample: a quarter of the node count.
L = int(len(G.nodes())/4)
sp = []

# G is not modified below, so the candidate lists are built once instead of per iteration.
fact_nodes = [n for n in G.nodes() if G.nodes[n].get('type') == 'Fact']
claim_nodes = [n for n in G.nodes() if G.nodes[n].get('type') == 'Claim']
if L > 0 and not (fact_nodes and claim_nodes):
    raise ValueError("G has no nodes of type 'Fact' and/or 'Claim' to sample shortest paths between")

# G1 becomes the union of all unweighted shortest paths between the sampled pairs.
# nx.add_path adds the path's nodes without attributes, so G1 nodes carry no 'type'/'label'.
for _ in tqdm(range(L), position=0):
    first = choice(fact_nodes)
    second = choice(claim_nodes)
    try:
        # all_shortest_paths is lazy: a missing path is only reported while iterating.
        for path in nx.all_shortest_paths(G, first, second, weight=None):
            nx.add_path(G1, path)
    except nx.NetworkXNoPath:
        # The pair lies in different components; it still counts as one of the L attempts
        # so the loop always terminates.
        continue

In [None]:
# From here on the pipeline works on the shortest-path graph.
G = G1
G.number_of_nodes(), G.number_of_edges()

# Random Walks

In [None]:
# Each random walk returned by generate_random_walks becomes one training document.
random_paths = generate_random_walks(G, 100, l=40)
docs = [walk for walk in random_paths]

## Word Embedding Models

In [None]:
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from nltk.tokenize import word_tokenize

In [None]:
from gensim.parsing.preprocessing import remove_stopwords
from tqdm import tqdm 
# Tokenise every walk document into the list of tokens Word2Vec trains on.
tagged_data = [word_tokenize(document) for document in tqdm(docs, position=0)]


In [None]:
# Train a Word2Vec model on the tokenised random walks.
# NOTE(review): %env changes the environment of the already-running kernel, while Python
# reads PYTHONHASHSEED at interpreter start-up -- this presumably does not affect the current
# process; confirm whether the variable has to be set before Jupyter is launched.
%env PYTHONHASHSEED=0
# NOTE(review): max_epochs is not referenced below; training runs for model.epochs.
max_epochs = 10
# Embedding dimensionality, passed to Word2Vec as `size`.
vec_size = 100

# sg=1 selects skip-gram; tokens seen fewer than min_count=10 times are left out of the
# vocabulary, which is why the ranking cell checks `cl_id not in model.wv`.
# NOTE(review): gensim documents that a fixed seed is only fully reproducible with a single
# worker thread; workers=4 may give run-to-run differences -- confirm this is acceptable.
# NOTE(review): the `size` keyword belongs to the gensim 3.x API (renamed `vector_size` in
# gensim 4) -- confirm the installed version.
model = Word2Vec(size=vec_size, min_count=10, window=20, sg=1, seed=0, workers = 4)

model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

print("Model is Ready")

# Test

In [None]:
# For every claim that has a graph node and an embedding, store the output of distance_w2v
# (called with the model, the claim node name, fact_ids and 50000) under the claim node name.
claim_facts = {}
for claim in tqdm(ground_truth):
    if claim in id_claim:
        cl_id = id_claim[claim]
        filtered_facts = {}  # not used afterwards

        # Claim nodes missing from the Word2Vec vocabulary cannot be ranked and are skipped.
        if cl_id in model.wv:
            claim_facts[cl_id] = distance_w2v(model, cl_id, fact_ids, 50000)


In [None]:
# Report MRR, MAP and HAS_POSITIVE averaged over the evaluated claims, for several cut-offs KK.
for KK in [1,5,20,30000]:
    i = 0  # number of claims that contributed to the averages
    MAP, MR, hasP = 0,0,0

    for claim in claim_facts:
        # Claims without any gold fact cannot be scored.
        if claim_ids[claim] not in ground_truth or len(ground_truth[claim_ids[claim]]) == 0: continue

        i+=1
        # claim_facts[claim] holds (fact node name, value) pairs; keep the first KK, mapped
        # through fact_ids.
        preds = [fact_ids[f] for (f,j) in claim_facts[claim]][0:KK]
        golds = [f for f in ground_truth[claim_ids[claim]]]

        MAP += MAP_K(golds,preds)
        MR += MRR(golds,preds)
        hasP += HAS_POSITIVE(golds,preds)


    print('\n#################### ' + str(KK) + ' ###########################\n')
    if i == 0:
        # No claim was evaluated: averaging would divide by zero.
        print('No claims with ground-truth facts were evaluated.')
        continue
    print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)