In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from graph_construction import LoadData

In [5]:
from edges import EdgeWeights
from Hyperbolic import HyperbolicEmbedding

In [6]:
from evaluate import Evaluate

In [7]:
config = {
    'dataset': 'marujo',
    'data_dir': 'data',
    }

marujo_data = LoadData(config)

2021-01-25 02:19:20 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |
| depparse  | ewt     |

2021-01-25 02:19:20 INFO: Use device: gpu
2021-01-25 02:19:20 INFO: Loading: tokenize
2021-01-25 02:19:22 INFO: Loading: pos
2021-01-25 02:19:23 INFO: Loading: lemma
2021-01-25 02:19:23 INFO: Loading: depparse
2021-01-25 02:19:24 INFO: Done loading processors!


***

In [8]:
import string
import numpy as np
import networkx as nx

In [10]:
from tqdm.notebook import tqdm, trange

In [None]:
stopwords = []
with open(r"stopwords.txt",'r', encoding="utf8") as File:
    for line in File.readlines():
        stopwords.append(str(line)[:-1])

In [15]:
def marujo():
    metrics = {'precision': [],
           'recall': [],
           'f1': []}
    
    for item in trange(len(marujo_data.data[:50])):
        data = marujo_data.data[item]['text']
        graph = marujo_data.construct_graph(data)

        distance_matrix = nx.to_numpy_matrix(graph)

        he = HyperbolicEmbedding(distance_matrix, graph)
        new_graph = he.get_graph()

        index = sorted(new_graph.degree, key=lambda x: x[1], reverse=True)
        candidate_key = []
        for i in index:
            if isinstance(i[0], str):
                candidate_key.append(i[0].split('.',1)[0])

        for i in candidate_key:
            if i in set(stopwords) or i in string.punctuation:
                candidate_key.remove(i)

        unique_key = []
        for key in candidate_key:
            if key not in unique_key:
                unique_key.append(key)


        keywords = marujo_data.data[item]['keywords'].split('\n')

        ev = Evaluate('data', 'marujo')
        precision, recall, f1 = ev.evaluate_from_keyword(unique_key[:10], keywords)

        metrics['precision'].append(precision)
        metrics['recall'].append(recall)
        metrics['f1'].append(f1)

    return np.asarray(metrics['precision']).mean(), np.asarray(metrics['recall']).mean(), np.asarray(metrics['f1']).mean()

In [27]:
marujo()

  0%|          | 0/50 [00:00<?, ?it/s]

(0.41600000000000004, 0.10219412510430109, 0.15763271408791493)

In [21]:
ev = Evaluate('data', 'marujo')

In [None]:
ev.evaluate_from_keyword()

In [30]:
ev.evaluate('topicrank', top=10)

100%|██████████| 450/450 [00:00<00:00, 979.39it/s]


(0.3911463844797178, 0.10040809246446378, 0.15276521531077256)

In [None]:
#Weighted Distance Matrix
word_vec_map = marujo_data.vectorization(data)
edges = EdgeWeights(graph, word_vec_map)
distance_matrix = edges.generate_distance()