In [None]:
!pip install nltk
!pip install gensim
!pip install networkx
!pip install googletrans

In [None]:
import nltk

# NLTK resources required by the preprocessing cell below:
#   wordnet                    -> WordNet corpus / WordNetLemmatizer
#   stopwords                  -> stopwords.words('english')
#   averaged_perceptron_tagger -> pos_tag
#   punkt                      -> word_tokenize
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
# Extra resources, left disabled: nothing in the cells visible here uses them.
#nltk.download('universal_tagset')
#nltk.download('treebank')
#nltk.download('maxent_ne_chunker')
#nltk.download('words')

# Last expression of the cell: shown as the cell's output when it is run.
'Very well'

In [17]:
import string

from googletrans import Translator

from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag

def preprocess(text):
    """Turn a raw text into a list of lowercase noun/adjective lemmas.

    The language of ``text`` is detected with googletrans; Portuguese,
    Spanish and French texts are translated (googletrans' default destination
    language, presumably English - confirm). The text is then tokenized and
    POS-tagged with NLTK. English stopwords and punctuation tokens are
    dropped, and only nouns and adjectives are lemmatized and kept.

    Parameters
    ----------
    text : str
        Raw text (e.g. an abstract).

    Returns
    -------
    list of str
        Lowercased lemmas longer than one character, in order of appearance.
        Empty when language detection or translation fails.
    """
    result=[]
    pos = {"J": wn.ADJ, "N": wn.NOUN, "V": wn.VERB, "R": wn.ADV}

    # Best effort: any failure of the detection/translation service discards
    # the text (the original author notes that very long abstracts are
    # ignored this way). Detection is a remote call, so it is done only once
    # and the same Translator instance is reused for the translation.
    try:
        translator = Translator()
        language = translator.detect(text).lang
        if language in ('pt', 'es', 'fr'):
            text = translator.translate(text, src=language).text
    except Exception:
        text = ''

    # Loop-invariant helpers, built once per call instead of once per token.
    # A set makes the stopword membership test constant-time.
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # We run through the tokenized words joint with their respective pos tag
    for word, treebank_tag in pos_tag(word_tokenize(text)):

        # Skip stopwords and punctuation characters
        if word.lower() in english_stopwords or word in string.punctuation:
            continue

        # Map the first letter of the Treebank tag to a WordNet pos (or None)
        tag = pos.get(treebank_tag[0], None)

        # We'll work only with nouns and adjectives
        if tag != wn.NOUN and tag != wn.ADJ:
            continue

        # We lemmatize the token based on its pos tag.
        # NOTE(review): the token is lemmatized in its original case and only
        # lowercased afterwards; if WordNet lookups are case-sensitive,
        # capitalized plurals may not be reduced - confirm.
        lemma = lemmatizer.lemmatize(word, tag)
        if len(lemma) > 1:
            # Finally, we add the lemmatized token into the list to be returned
            result.append(lemma.lower())

    return result

In [3]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel

def main_topic_terms(tokens,nwords=5,random_state=0):
    """Return the most relevant terms of a single document according to LDA.

    A one-topic LDA model is trained on the document alone and the top terms
    of the topic assigned to that document are returned.

    Parameters
    ----------
    tokens : list of str
        Preprocessed tokens of one document.
    nwords : int, optional
        Maximum number of terms to return (default 5).
    random_state : int or None, optional
        Seed forwarded to ``LdaModel`` so repeated runs give the same terms.
        Pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    list of str
        Up to ``nwords`` terms; an empty list when ``tokens`` is empty.
    """
    # Return an empty list if there are no tokens
    if len(tokens) == 0:
        return []

    # First we make a list with the list of tokens (a one-document corpus)
    texts = [tokens]

    # We use the class Dictionary to map normalized words to their ids
    texts_dictionary = Dictionary(texts)

    # Convert the document into the bag-of-words format
    corpus = [texts_dictionary.doc2bow(text) for text in texts]

    # And now, we build the LDA model (seeded for reproducibility)
    lda_model = LdaModel(corpus, num_topics=1, id2word=texts_dictionary, passes=10,
                         alpha='auto', random_state=random_state)

    # Id of the first topic listed for the document
    main_topic_id = lda_model[corpus[0]][0][0]

    # get_topic_terms yields (word_id, probability) pairs; map ids back to words
    return [texts_dictionary[word_id]
            for word_id, _probability in lda_model.get_topic_terms(main_topic_id, topn=nwords)]

In [4]:
from itertools import combinations
import networkx as nx

def get_topics_graph(df):
    """Build an undirected graph linking the main-topic terms of every row.

    For each row of ``df`` the text is read from the ``abstract`` column, or
    from ``resume`` when ``abstract`` is missing. The text is preprocessed,
    its main topic terms are extracted, and every pair of those terms is
    added to the graph as an edge.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``abstract`` and ``resume``.

    Returns
    -------
    networkx.Graph
        Graph whose nodes are topic terms and whose edges join terms that
        were extracted from the same row.
    """
    # We create a new graph (undirected)
    graph = nx.Graph()

    # Boolean frame, True wherever a cell of df is missing
    missing = df.isna()

    for index, row_missing in missing.iterrows():
        # Fall back to the 'resume' column when the abstract is missing
        column = 'resume' if row_missing['abstract'] else 'abstract'

        # Neither column has text: nothing to extract, so skip the row
        # instead of sending a NaN through the preprocessing pipeline
        if row_missing[column]:
            continue

        terms = main_topic_terms(preprocess(df.loc[index, column]))

        # We connect every topic word with each other and add the edges to the graph
        graph.add_edges_from(combinations(terms, 2))

    # Finally, we return the graph
    return graph

In [5]:
import pandas as pd

def generate_graphml(input_dataset,output_graphml):
    """Read a CSV dataset, build its topics graph and save it as GraphML.

    Parameters
    ----------
    input_dataset : str
        Path of the CSV file, read with ``pd.read_csv``.
    output_graphml : str or file handle
        Destination passed to ``nx.write_graphml``.
    """
    import os

    df = pd.read_csv(input_dataset)

    # Create the destination folder before the expensive graph construction,
    # so a missing folder cannot make the final write fail after all the work.
    if isinstance(output_graphml, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(output_graphml))
        if parent:
            os.makedirs(parent, exist_ok=True)

    graph = get_topics_graph(df)
    nx.write_graphml(graph, output_graphml)

In [11]:
def print_degree_nodes(graph,n=5):
    """Print the ``n`` highest-degree nodes of ``graph``.

    One line is printed per node, as ``<degree> <node>``, from the highest
    degree to the lowest. Nodes with equal degree keep the order in which
    ``graph.degree`` yields them.

    Parameters
    ----------
    graph : object
        Anything exposing ``degree`` as an iterable of ``(node, degree)`` pairs.
    n : int, optional
        Number of nodes to print (default 5).
    """
    top_degree = sorted(graph.degree, key=lambda pair: pair[1], reverse=True)[:n]

    # Plain loop: printing is a side effect, so no throwaway list is built
    for node, degree in top_degree:
        print(degree, node)

In [None]:
# Build the topics graph of the ppgeec dataset and export it as GraphML
generate_graphml(input_dataset='datasets/ppgeec.csv', output_graphml='graph/ppgeec.graphml')

In [None]:
# Build the topics graph of the ppgcsa dataset and export it as GraphML
generate_graphml(input_dataset='datasets/ppgcsa.csv', output_graphml='graph/ppgcsa.graphml')

In [18]:
# Build the topics graph of the ppged dataset and export it as GraphML
generate_graphml(input_dataset='datasets/ppged.csv', output_graphml='graph/ppged.graphml')

In [12]:
# Reload the exported ppgeec graph, list its 10 highest-degree terms and its size
graph = nx.read_graphml('graph/ppgeec.graphml')
print_degree_nodes(graph, n=10)
print("{} nodes, {} edges".format(graph.number_of_nodes(), graph.number_of_edges()))

337 system
181 network
158 control
142 model
121 algorithm
116 data
114 antenna
112 method
101 structure
91 controller
1019 nodes, 5436 edges


In [13]:
# Reload the exported ppgcsa graph, list its 10 highest-degree terms and its size
graph = nx.read_graphml('graph/ppgcsa.graphml')
print_degree_nodes(graph, n=10)
print("{} nodes, {} edges".format(graph.number_of_nodes(), graph.number_of_edges()))

197 group
157 patient
133 health
128 study
85 elderly
83 woman
75 activity
63 physical
60 cell
57 quality
1047 nodes, 4215 edges


In [27]:
# Reload the exported ppged graph, list its 12 highest-degree terms and its size
graph = nx.read_graphml('graph/ppged.graphml')
print_degree_nodes(graph, n=12)
print("{} nodes, {} edges".format(graph.number_of_nodes(), graph.number_of_edges()))

395 education
338 school
293 teacher
189 student
174 research
169 knowledge
161 study
155 practice
139 social
136 process
118 child
111 educational
996 nodes, 5655 edges
