In [1]:
!pip install nltk langdetect spacy networkx gensim python-louvain scikit-learn fuzzywuzzy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993224 sha256=10210ee8b444f1b4f6b39771053f23ecbff7f3fca94be0f1fa93c35fb8cac9fb
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: fuzzywuzzy, langdetect
Successfully installed fuzzywuzzy-0.18.0 langdetect-1.0.9


In [3]:
# Download spaCy's medium Spanish pipeline; it is loaded later with
# spacy.load('es_core_news_md').
!python -m spacy download es_core_news_md

2023-05-21 21:57:07.415228: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting es-core-news-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.5.0/es_core_news_md-3.5.0-py3-none-any.whl (42.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: es-core-news-md
Successfully installed es-core-news-md-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_md')


In [4]:
# Download spaCy's medium English pipeline; it is loaded later with
# spacy.load('en_core_web_md').
!python -m spacy download en_core_web_md

2023-05-21 21:57:24.252512: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [24]:
import json
from itertools import combinations

import networkx as nx
import nltk
import pandas as pd
import spacy
from community import community_louvain
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nltk import pos_tag
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Load both spaCy pipelines once; the helper functions in this notebook read these
# module-level globals.
nlp_en = spacy.load('en_core_web_md')
nlp_es = spacy.load('es_core_news_md')
# Fetch the NLTK data packages: stop-word corpus, punkt tokenizer and POS tagger.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Stop-word lists, used both for token filtering and as TfidfVectorizer(stop_words=...).
stopwords_en = stopwords.words('english')
stopwords_es = stopwords.words('spanish')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [7]:
def load_file(name):
    """Read a JSON document from disk and return the decoded Python object.

    Args:
        name: Path of the file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, so non-ASCII content is read the same way on every machine.
    with open(name, 'r', encoding='utf-8') as file:
        return json.load(file)

In [35]:
def get_author_key(author_dict, name):
    """Return the first key of ``author_dict`` whose value contains ``name``.

    Entries are visited in the dictionary's iteration order and containment is
    evaluated with ``name in value``.

    Args:
        author_dict: Mapping whose values support the ``in`` operator.
        name: Author name to look up.

    Returns:
        The first matching key, or an empty string when nothing matches.
    """
    matching_keys = (key for key, variants in author_dict.items() if name in variants)
    return next(matching_keys, "")

In [13]:
def get_relevant_words(lang, word_list):
    """Collect the lemmas of every noun and verb found in ``word_list``.

    Each entry is parsed with the Spanish pipeline when ``lang`` is ``'es'`` and
    with the English pipeline for any other value.

    Args:
        lang: Language code selecting the spaCy pipeline.
        word_list: Iterable of strings to analyse.

    Returns:
        List of lemmas of the tokens tagged ``NOUN`` or ``VERB``, in input order.
    """
    pipeline = nlp_es if lang == 'es' else nlp_en
    return [
        token.lemma_
        for entry in word_list
        for token in pipeline(entry)
        if token.pos_ in ('NOUN', 'VERB')
    ]

In [14]:
def process_relevant_words(word_list):
    """Keep the entries of ``word_list`` that NLTK tags as nouns or verbs.

    Args:
        word_list: Sequence of tokens handed to ``nltk.pos_tag``.

    Returns:
        The entries whose tag starts with ``'N'`` or ``'V'``, in input order.
    """
    return [word for word, tag in pos_tag(word_list) if tag.startswith(('N', 'V'))]

In [15]:
# Build one concept per community from its most representative words
def generate_concepts(community_keywords):
    """Join each community's keywords into a single space-separated string.

    Args:
        community_keywords: Iterable of keyword lists, one per community.

    Returns:
        List with one concept string per input keyword list, in the same order.
    """
    return [' '.join(keywords) for keywords in community_keywords]

In [16]:
# Pick each community's most representative words with a TfidfVectorizer
def extract_representative_keywords(communities, lang, num_keywords=2):
    """Rank the terms of every community by mean TF-IDF and keep the top ones.

    A fresh vectorizer is fitted per community, using the Spanish stop words when
    ``lang`` is ``'es'`` and the English ones otherwise. Communities that are
    empty, or for which fitting raises ``ValueError``, are skipped, so the result
    may be shorter than ``communities``.

    Args:
        communities: Iterable of communities, each an iterable of nodes.
        lang: Language code selecting the stop-word list.
        num_keywords: Maximum number of terms kept per community.

    Returns:
        List of keyword lists, each sorted by descending mean TF-IDF score.
    """
    if lang == 'es':
        stop_words = stopwords_es
    else:
        stop_words = stopwords_en

    keywords_per_community = []
    for members in communities:
        texts = [''.join(member) for member in members]
        if len(texts) == 0:
            continue
        tfidf = TfidfVectorizer(stop_words=stop_words)
        try:
            scores = tfidf.fit_transform(texts)
        except ValueError:
            # Best effort: a community that cannot be vectorized yields no keywords.
            continue
        mean_scores = scores.mean(axis=0).tolist()[0]
        ranked = sorted(zip(mean_scores, tfidf.get_feature_names_out()), reverse=True)
        keywords_per_community.append([term for _, term in ranked[:num_keywords]])
    return keywords_per_community

In [17]:
# Detect communities with the Louvain algorithm
def identify_communities(word_network, random_state=None):
    """Partition ``word_network`` with Louvain and return the node groups.

    Args:
        word_network: Graph handed to ``community_louvain.best_partition``.
        random_state: Forwarded to ``best_partition``. Louvain is randomized, so
            pass a fixed integer to get the same partition on every run; the
            default ``None`` keeps the library's default behaviour.

    Returns:
        List of communities, each a list of nodes. Communities appear in the
        order their id is first seen in the partition.
    """
    partition = community_louvain.best_partition(word_network, random_state=random_state)
    communities = {}
    for node, community_id in partition.items():
        communities.setdefault(community_id, []).append(node)
    return list(communities.values())

In [18]:
# Build the word network with NetworkX
# Every distinct word is a node and is linked to every other distinct word
def create_word_network(keywords):
    """Build a complete undirected graph over the distinct ``keywords``.

    Repeated keywords are collapsed before pairing. Without that step,
    ``combinations`` would produce ``(word, word)`` pairs for every repeated word
    and add self-loops to the graph.

    Args:
        keywords: Iterable of hashable keywords (duplicates allowed).

    Returns:
        ``networkx.Graph`` with one node per distinct keyword and one edge per
        pair of distinct keywords.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique_keywords = list(dict.fromkeys(keywords))
    G = nx.Graph()
    for keyword in unique_keywords:
        G.add_node(keyword)
    for pair in combinations(unique_keywords, 2):
        G.add_edge(*pair)
    return G

In [19]:
# Extract keywords using the resources that match the language
def identify_keywords(text, lang):
    """Return the lower-cased lemmas of the alphabetic, non-stop-word tokens.

    The Spanish pipeline and stop words are used when ``lang`` is ``'es'``; any
    other value falls back to the English ones.

    Args:
        text: Input handed to the spaCy pipeline. In this notebook the caller
            passes the object returned by ``process_text``.
        lang: Language code selecting pipeline and stop-word list.

    Returns:
        List of lemmas in token order; repeated words are kept.
    """
    nlp, stop_words = (nlp_es, stopwords_es) if lang == 'es' else (nlp_en, stopwords_en)
    return [
        token.lemma_.lower()
        for token in nlp(text)
        if token.text.lower() not in stop_words and token.is_alpha
    ]

In [22]:
# Run the spaCy pipeline that matches the language over the text
def process_text(text, lang):
    """Parse ``text`` with the spaCy pipeline for ``lang``.

    The pipelines already loaded at module level (``nlp_en`` / ``nlp_es``) are
    reused. Calling ``spacy.load`` here would reload the model from disk on every
    call, which is very costly when this runs once per document.

    Args:
        text: Text to parse.
        lang: ``'en'`` or ``'es'``.

    Returns:
        The parsed document, or ``None`` when ``lang`` is neither ``'en'`` nor
        ``'es'``.
    """
    if lang == 'en':
        nlp = nlp_en
    elif lang == 'es':
        nlp = nlp_es
    else:
        return None
    return nlp(text)

In [72]:
def process_document_description(description):
    """Derive concept strings from a document description.

    Steps: detect the language, parse the text, extract keywords, build the word
    network, split it into communities, pick representative keywords per
    community and join them into concepts.

    Args:
        description: Free-text description (abstract) of a document.

    Returns:
        List of concept strings, or ``None`` when the language cannot be detected
        or ``process_text`` does not support it.
    """
    try:
        description_lang = detect(description)
    except LangDetectException:
        # langdetect raises when the text has no usable features (e.g. only
        # digits or punctuation); one such row must not abort a whole batch.
        return None

    processed_text = process_text(description, description_lang)
    if processed_text is None:
        return None

    keywords = identify_keywords(processed_text, description_lang)
    word_network = create_word_network(keywords)
    communities = identify_communities(word_network)
    community_keywords = extract_representative_keywords(communities, description_lang)
    # NOTE(review): this function used to also run process_relevant_words and
    # get_relevant_words over the concepts, but discarded both results and
    # returned the unfiltered concepts. The unused work was removed; confirm
    # whether the noun/verb-filtered list was meant to be returned instead.
    return generate_concepts(community_keywords)

In [23]:
# Author lookup used by process_data. get_author_key iterates it with .items() and
# tests `name in value`, so the file is expected to hold a JSON object.
# NOTE(review): presumably maps a canonical author key to that author's name
# variants — confirm against authors.txt.
authors = load_file('authors.txt')

In [78]:
def process_data(csv_path="papersPreprocessed (2).csv"):
    """Enrich the papers table with concepts and disambiguated author keys.

    For every row, the ``Abstract`` is turned into concepts with
    ``process_document_description`` and every ``;``-separated entry of
    ``Authors`` is mapped through ``get_author_key`` using the module-level
    ``authors`` lookup.

    Args:
        csv_path: CSV file to read. Defaults to the file this notebook was
            written for. It must contain ``Abstract`` and ``Authors`` columns.

    Returns:
        The loaded DataFrame with two added columns:
        ``communities`` holds the value returned by
        ``process_document_description`` (which may be ``None``), or ``[]`` when
        the abstract is missing. ``disambiguated_authors`` holds the
        ``;``-joined author keys (``""`` for unmatched names), or ``""`` when
        the authors cell is missing.
    """
    documents = pd.read_csv(csv_path)
    total_documents = len(documents)
    total_communities = []
    total_authors = []

    for document_number, (_, row) in enumerate(documents.iterrows(), start=1):
        print(f'Procesando documento {document_number}/{total_documents}')

        description = row['Abstract']
        if pd.isna(description):
            communities = []
        else:
            communities = process_document_description(description)

        # A missing Authors cell is read as NaN (a float), which has no .split();
        # treat it as "no authors" instead of crashing the whole run.
        raw_authors = row['Authors']
        document_authors = [] if pd.isna(raw_authors) else raw_authors.split(';')
        disambiguated_authors = [
            get_author_key(authors, author) for author in document_authors
        ]

        total_authors.append(';'.join(disambiguated_authors))
        total_communities.append(communities)

    documents['communities'] = total_communities
    documents['disambiguated_authors'] = total_authors

    return documents

In [None]:
# Run the whole pipeline over the input CSV (one progress line is printed per row).
documents_df = process_data()
# Persist the enriched table; index=False keeps the pandas row index out of the file.
documents_df.to_csv('papersPreprocessedv3.csv', index=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Procesando documento 3515/10194
Procesando documento 3516/10194
Procesando documento 3517/10194
Procesando documento 3518/10194
Procesando documento 3519/10194
Procesando documento 3520/10194
Procesando documento 3521/10194
Procesando documento 3522/10194
Procesando documento 3523/10194
Procesando documento 3524/10194
Procesando documento 3525/10194
Procesando documento 3526/10194
Procesando documento 3527/10194
Procesando documento 3528/10194
Procesando documento 3529/10194
Procesando documento 3530/10194
Procesando documento 3531/10194
Procesando documento 3532/10194
Procesando documento 3533/10194
Procesando documento 3534/10194
Procesando documento 3535/10194
Procesando documento 3536/10194
Procesando documento 3537/10194
Procesando documento 3538/10194
Procesando documento 3539/10194
Procesando documento 3540/10194
Procesando documento 3541/10194
Procesando documento 3542/10194
Procesando documento 3543/10194
Procesa