In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import itertools
import csv
pd.options.display.max_colwidth=500
from tqdm import tqdm

In [None]:
import sys
sys.path.append('../')
from utils import *
from graphUtils import *

# Create Corona Dataset

In [None]:
import os


def load_table_rows(tables_dir):
    """Read every CSV file under ``tables_dir`` into a flat list of rows.

    Each returned row is ``[table, country, v1, ..., v6]`` where

    * ``table`` is the file name up to its first '.', underscores replaced
      by spaces,
    * ``country`` is the lower-cased first CSV column,
    * ``v1..v6`` are the last six CSV columns, parsed as floats and
      truncated to integer strings.

    The first line of each file is treated as a header and skipped.
    Raises ValueError if one of the last six columns is not numeric.
    """
    rows = []
    for root, _dirs, files in os.walk(tables_dir):
        for file in files:
            table_name = file.split('.')[0].replace('_', ' ')
            # os.walk also yields files from sub-directories, so the path must
            # be built from `root`, not from the top-level directory.
            with open(os.path.join(root, file), newline='') as csvfile:
                reader = csv.reader(csvfile, delimiter=',')
                next(reader, None)  # skip the header; tolerate an empty file
                for row in reader:
                    if not row:  # csv.reader yields [] for blank lines
                        continue
                    temp = [table_name, row[0].lower()]
                    temp.extend(str(int(float(r))) for r in row[-6:])
                    rows.append(temp)
    return rows


directory = '../../data/Corona/tables/'
all_tables = load_table_rows(directory)


In [None]:
# Maps the 1-based position of a value inside a table row to its column name.
table_columns = dict(enumerate(
    ['table', 'country', 'january', 'february', 'march', 'april', 'may', 'june'],
    start=1,
))

In [None]:
# `pickle` is imported here explicitly: no import cell in this notebook binds
# it by name, so the cell otherwise depends on another import leaking it.
import pickle


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# SECURITY: pickle.load can execute arbitrary code -- only load trusted files.
# NOTE: `all_tables` loaded here replaces the list built from the CSV tables.
# NOTE(review): these paths use 'corona' while the CSV directory uses 'Corona';
# confirm which spelling exists on case-sensitive file systems.
all_claims = _load_pickle('../../data/corona/corona_allClaims')
user_claims = _load_pickle('../../data/corona/user_claims.pkl')
all_tables = _load_pickle('../../data/corona/corona_tables')

# Create Graph

In [None]:
import networkx as nx
from nltk.corpus import stopwords

# English stop words; the claim-linking cell filters claim text with them.
stop_words = stopwords.words('english')

G = nx.Graph()
K = 3            # second argument passed to find_all_n_grams
f, k = 0, 0

i = 0
nodes_labels = {}
row_ids = {}     # row node name -> space-joined row text
id_rows = {}     # space-joined row text -> row node name

# One 'Row' node per table row, one 'Column' node per column name and one
# 'Token' node per n-gram; every token is linked to both its row and column.
for i, row in enumerate(tqdm(all_tables), start=1):
    row_name = 'RW' + str(i)
    row_text = ' '.join(row)
    G.add_node(row_name, label=row_name, type='Row')
    row_ids[row_name] = row_text
    id_rows[row_text] = row_name

    for j, cl in enumerate(row, start=1):
        col_name = table_columns[j]
        if not G.has_node(col_name):
            G.add_node(col_name, label=col_name, type='Column')

        n_grams = [gr.replace(' ', '_') for gr in find_all_n_grams(str(cl), K)]
        for tg in n_grams:
            if not G.has_node(tg):
                G.add_node(tg, label=tg, type='Token')
            G.add_edge(row_name, tg)
            G.add_edge(col_name, tg)
            
            

In [None]:
i = 0
claim_ids = {}   # claim node name -> claim text
id_claim = {}    # claim text -> claim node name
all_claims.update(user_claims)
new_nodes = 0

# Add one 'Claim' node per claim and connect it to the token nodes that are
# already present in the graph.
for i, claim in enumerate(tqdm(all_claims), start=1):
    kept_words = [w for w in normalize_text(claim).split() if w not in stop_words]
    text = ' '.join(kept_words)
    claim_name = 'Claim' + str(i)
    G.add_node(claim_name, label=claim_name, type='Claim')
    claim_ids[claim_name] = claim
    id_claim[claim] = claim_name

    # Longest n-grams first.
    n_grams = sorted(
        [gr.replace(' ', '_') for gr in find_all_n_grams(text, K)],
        key=len,
        reverse=True,
    )

    for token in n_grams:
        # Only link to tokens that already exist; no new nodes are created here.
        if G.has_node(token) and not G.has_edge(claim_name, token):
            G.add_edge(claim_name, token)


In [None]:
# ground_truth maps each claim node name to the row node names whose
# space-joined text contains the claim's key string as a substring.
ground_truth = {}

for cl, claim_text in claim_ids.items():
    matches = []
    if claim_text in user_claims:
        # User claims: each entry is reversed and space-joined to form the key.
        for r in user_claims[claim_text]:
            needle = ' '.join(reversed(r))  # built once per entry, not once per row
            matches.extend(name for text, name in id_rows.items() if needle in text)
    else:
        # Other claims: the key is the first two elements of the claim's value.
        needle = ' '.join(all_claims[claim_text][0:2])
        matches.extend(name for text, name in id_rows.items() if needle in text)
    ground_truth[cl] = matches
                


In [None]:
# (number of nodes, number of edges) of the graph built so far
(len(G.nodes), len(G.edges))

# Expansion with ConceptNet

In [None]:
# Open the local ConceptNet database queried by the expansion cell below.
# The path is relative to the notebook's working directory.
# NOTE(review): conceptnet_lite.connect may try to download/build the database
# when this file is missing -- confirm before running offline.
import conceptnet_lite
conceptnet_lite.connect("../MatchingText/conceptnet.db")

In [None]:
from conceptnet_lite import Label, edges_for
from tqdm import tqdm

new_nodes = []  # tokens introduced by the expansion, in insertion order

# Expand every 'Token' node with the words of its ConceptNet neighbours.
# Iterate over a snapshot: the loop adds nodes to G while it runs.
for node in tqdm(list(G.nodes()), position=0):
    if G.nodes[node]['type'] != 'Token': continue

    try:
        concepts = Label.get(text=G.nodes[node]['label'].replace('_', ' '), language='en').concepts
    except Label.DoesNotExist:
        # The token has no English ConceptNet label: nothing to expand.
        # Only this lookup failure is skipped; any other error now surfaces
        # instead of being silently swallowed.
        continue

    for e in edges_for(concepts, same_language=True):
        # Take the endpoint of the edge that is not the current token.
        new_node = e.end.text if e.start.text == node else e.start.text
        rel = e.relation.name

        # normalize_text is called unqualified, as in the claim-linking cell;
        # the `utils` module name itself is never imported in this notebook.
        for n in normalize_text(new_node).split():
            if not G.has_node(n):
                new_nodes.append(n)
                G.add_node(n, label=n, type='Token')
            # Link every word of the neighbour, not only the last one:
            # previously the other words were added as isolated nodes and
            # then removed again by the pruning step below.
            G.add_edge(node, n, type=rel)

# Drop nodes with fewer than two incident edges. Degrees are read live, so
# removing a node can push a later neighbour below the threshold as well.
for n in list(G.nodes()):
    if G.degree(n) < 2:
        G.remove_node(n)

In [None]:
# (number of nodes, number of edges) after expansion and pruning
(len(G.nodes), len(G.edges))

In [None]:
#nx.write_graphml(G,'../MatchingText/data/Corona/corona_expanded.gml')

# Compression

## SSumM

In [None]:
# Consecutive integer ids (starting at 0) for the nodes, in G.nodes order,
# plus the reverse lookup from id back to node name.
node_ids = {name: idx for idx, name in enumerate(G.nodes)}
i = len(node_ids)
inv_nodes = dict(zip(node_ids.values(), node_ids.keys()))

In [None]:
# Write one "<source id>\t<target id>" line per edge of G.
# The `with` block closes (and therefore flushes) the file, so the edge list
# is complete on disk before any external tool reads it.
with open('corona_edgelist', 'w') as file:
    file.writelines(
        str(node_ids[u]) + '\t' + str(node_ids[v]) + '\n' for u, v in G.edges()
    )

In [None]:
# Load the summary file as a list of lines with surrounding whitespace
# (including the trailing newline) stripped.
with open('../SSumM/output/summary_corona_edgelist.txt') as summary_file:
    sum_grapph = [line.strip() for line in summary_file]

In [None]:
super_nodes, super_edges = {}, []
edge_weights = {}

# Index of the marker line separating supernode lines from superedge lines.
superedge_marker = sum_grapph.index('<Superedge info>')

# Lines 1 .. marker-1 (line 0 is skipped): "<supernode id>\t<node id>\t<node id>..."
# Member ids are translated back to the original node names.
for line in sum_grapph[1:superedge_marker]:
    idd, *member_ids = line.split('\t')
    super_nodes[idd] = [inv_nodes[int(n)] for n in member_ids]

# Lines after the marker: "<supernode id>\t<supernode id>\t<weight>".
# Weights are stored symmetrically and kept as the strings read from the file.
for line in sum_grapph[superedge_marker + 1:]:
    e = line.split('\t')
    src_adj = edge_weights.setdefault(e[0], {})
    dst_adj = edge_weights.setdefault(e[1], {})

    src_adj[e[1]] = e[2]
    dst_adj[e[0]] = e[2]

    super_edges.append((e[0], e[1]))



In [None]:
SG = nx.Graph()

# Supernodes whose joined member names start with one of these prefixes keep
# ALL member names in their label; every other supernode is labelled with its
# first member only.
# 'Claim' is included because claim nodes are named 'Claim<N>' when the graph
# is built; the 'CL' prefix alone never matched them (startswith is
# case-sensitive).
MULTI_MEMBER_PREFIXES = ('RW', 'Review', 'CL', 'Claim')

for node, members in super_nodes.items():
    joined = ' '.join(members)
    name = joined if joined.startswith(MULTI_MEMBER_PREFIXES) else members[0]
    # NOTE(review): every supernode gets type 'node', while the MSP cell
    # selects nodes of type 'Row' / 'Claim' -- confirm the two compression
    # steps are not meant to be chained.
    SG.add_node(node, label=name, type='node')

SG.add_edges_from(super_edges)

In [None]:
# From here on G refers to the summary graph.
G = SG
(len(G.nodes), len(G.edges))

# MSP

In [None]:
import random

# Build G1 as the union of all shortest paths between L randomly drawn
# (row, claim) pairs of G, where L is a quarter of G's node count.
G1 = nx.Graph()

MSP_SEED = 0                      # fixed seed so the sampled pairs are reproducible
rng = random.Random(MSP_SEED)

L = int(len(G.nodes())/4)

# Candidate endpoints do not change during sampling, so collect them once
# instead of rebuilding both lists on every draw.
row_nodes = [n for n in G.nodes() if G.nodes[n]['type'] == 'Row']
claim_nodes = [n for n in G.nodes() if G.nodes[n]['type'] == 'Claim']
if L > 0 and not (row_nodes and claim_nodes):
    raise ValueError("MSP sampling needs at least one 'Row' and one 'Claim' node in G")

for _ in tqdm(range(L), position=0):
    first = rng.choice(row_nodes)
    second = rng.choice(claim_nodes)
    try:
        paths = list(nx.all_shortest_paths(G, first, second, weight=None))
    except nx.NetworkXNoPath:
        # The pair lies in different components; the draw still counts.
        continue
    for p in paths:
        # Carry the node attributes (type, label) over to G1 so later cells
        # can still filter on them.
        G1.add_nodes_from((n, G.nodes[n]) for n in p)
        nx.add_path(G1, p)
    

In [None]:
# From here on G refers to the shortest-path graph.
G = G1
(len(G.nodes), len(G.edges))

# RandomWalk

In [None]:
# Collect the random walks over G as the training documents.
# NOTE(review): 100 and l=25 are presumably the number of walks and the walk
# length -- confirm against generate_random_walks.
random_paths = generate_random_walks(G, 100, l=25)
docs = list(random_paths)

# Word Embeddings

In [None]:
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [None]:
from gensim.parsing.preprocessing import remove_stopwords
from tqdm import tqdm

# Tokenise every walk document; each entry of tagged_data is a list of tokens.
tagged_data = [word_tokenize(d) for d in tqdm(docs, position=0)]

In [None]:
%env PYTHONHASHSEED=0
max_epochs = 10
vec_size = 300

model = Word2Vec(size=vec_size, min_count=10, window=3, sg=1, seed=0, workers = 4)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

print("Model is Ready")

In [None]:
# For every claim that has an embedding in the model, store the result of
# distance_w2v against all rows, keyed by the claim node name.
movie_reviews = {
    claim: distance_w2v(model, claim, row_ids, len(row_ids))
    for claim in tqdm(ground_truth, position=0)
    if claim in model.wv
}

In [None]:
# Evaluate the rankings of all claims that are NOT user claims at several
# cut-offs KK, printing the mean MRR / MAP / has-positive score per cut-off.

# Node names of the user claims, built once: rebuilding this list for every
# claim made the loop quadratic.
user_claim_ids = {id_claim[c] for c in user_claims}

for KK in [1,5,20,len(row_ids)]:

    i = 0                      # number of claims evaluated at this cut-off
    MAP, MR, hasP = 0,0,0

    for claim_id, ranked in movie_reviews.items():
        golds = ground_truth.get(claim_id)
        if not golds: continue                     # no gold rows for this claim
        if claim_id in user_claim_ids: continue    # user claims are scored separately

        i += 1
        preds = [row_id for (row_id, _) in ranked][0:KK]
        golds = list(golds)

        # Metric helpers are called unqualified, as in the user-claim
        # evaluation cell; the `utils` module name itself is never imported.
        MAP += MAP_K(golds,preds)
        MR += MRR(golds,preds)
        hasP += HAS_POSITIVE(golds,preds)

    print('\n#################### ' + str(KK) + ' ###########################\n')
    if i == 0:
        # Explicit guard instead of a bare `except:` around the division.
        print('No claims to evaluate')
    else:
        print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)

In [None]:
# Evaluate the rankings of the user claims only, at several cut-offs KK,
# printing the mean MRR / MAP / has-positive score per cut-off.

# Node names of the user claims, built once: rebuilding this list for every
# claim made the loop quadratic.
user_claim_ids = {id_claim[c] for c in user_claims}

for KK in [1,5,20,500]:

    i = 0                      # number of claims evaluated at this cut-off
    MAP, MR, hasP = 0,0,0

    for claim_id, ranked in movie_reviews.items():
        golds = ground_truth.get(claim_id)
        if not golds: continue                         # no gold rows for this claim
        if claim_id not in user_claim_ids: continue    # keep user claims only

        i += 1
        preds = [row_id for (row_id, _) in ranked][0:KK]
        golds = list(golds)

        MAP += MAP_K(golds,preds)
        MR += MRR(golds,preds)
        hasP += HAS_POSITIVE(golds,preds)

    print('\n#################### ' + str(KK) + ' ###########################\n')
    if i == 0:
        # Explicit guard instead of a bare `except:` around the division.
        print('No claims to evaluate')
    else:
        print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)