## Generate final keywords list

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

# Carregar arquivo xml
tree = ET.parse('../../data/raw/top-50-review.xml')
root = tree.getroot()

# Definir lista vazia para armazenamento de dados
top_50_list = []

# Iterar sobre os <record> elementos e extrair as informações
for record in root.findall('records/record'):
    
    # Extrair dados de cada campo
    database = record.find('database').text if record.find('database') is not None else None
    ref_type = record.find('ref-type').get('name') if record.find('ref-type') is not None else None
    authors = ", ".join([author.text for author in record.findall('contributors/authors/author')]) if record.findall('contributors/authors/author') is not None else None
    title = record.find('titles/title').text if record.find('titles/title') is not None else None
    secondary_title = record.find('titles/secondary-title').text if record.find('titles/secondary-title') is not None else None
    periodical = record.find('periodical/full-title').text if record.find('periodical/full-title') is not None else None
    pages = record.find('pages').text if record.find('pages') is not None else None
    volume = record.find('volume').text if record.find('volume') is not None else None
    issue = record.find('issue').text if record.find('issue') is not None else None
    keywords = ", ".join([keyword.text for keyword in record.findall('keywords/keyword')]) if record.findall('keywords/keyword') is not None else None
    year = record.find('dates/year').text if record.find('dates/year') is not None else None
    pdf_url = record.find('urls/pdf-urls/url').text if record.find('urls/pdf-urls/url') is not None else None
    web_url = record.find('urls/web-urls/url').text if record.find('urls/web-urls/url') is not None else None
    abstract = record.find('abstract').text if record.find('abstract') is not None else None
    
    # Adicionar os dados à lista
    top_50_list.append([database, ref_type, authors, title, secondary_title, periodical, pages, volume, issue, keywords, year, pdf_url, web_url, abstract])

# Criar o dataframe
top_50_df = pd.DataFrame(top_50_list, columns=['database', 'ref-type', 'authors', 'title', 'secondary-title', 'periodical', 'pages', 'volume', 'issue', 'keywords', 'year', 'pdf_url', 'web_url', 'abstract'])

In [3]:
# Checando o resultado
top_50_df.head(3)

Unnamed: 0,database,ref-type,authors,title,secondary-title,periodical,pages,volume,issue,keywords,year,pdf_url,web_url,abstract
0,top-50-review.enl,Journal Article,"Pedregosa, Fabian, Varoquaux, Gaeel, Gramfort,...",Scikit-learn: Machine Learning in Python,JOURNAL OF MACHINE LEARNING RESEARCH,JOURNAL OF MACHINE LEARNING RESEARCH,2825-2830,12,,Python; supervised learning; unsupervised lear...,2011,,,Scikit-learn is a Python module integrating a ...
1,top-50-review.enl,Journal Article,"CORTES, C, VAPNIK, V",SUPPORT-VECTOR NETWORKS,MACHINE LEARNING,MACHINE LEARNING,273-297,20,3.0,PATTERN RECOGNITION; EFFICIENT LEARNING ALGORI...,1995,,,The support-vector network is a new learning m...
2,top-50-review.enl,Journal Article,"Lecun, Y, Bottou, L, Bengio, Y, Haffner, P",Gradient-based learning applied to document re...,PROCEEDINGS OF THE IEEE,PROCEEDINGS OF THE IEEE,2278-2324,86,11.0,convolutional neural networks; document recogn...,1998,,,Multilayer neural networks trained with the ba...


In [5]:
# Colocar palavas-chave em minúsculas
top_50_df['keywords'] = top_50_df['keywords'].str.lower()

# Dividir as palavras-chave em listas
top_50_df['final_kw_list'] = top_50_df['keywords'].str.split(';')

# Explodir a lista em uma série plana, para que cada palavra-chave esteja em uma linha
final_kw_list = top_50_df['final_kw_list'].explode().str.strip()

from collections import Counter
# Contar as palavras-chave
final_kw_list_counts = final_kw_list.value_counts()
print(final_kw_list_counts)

final_kw_list
                         21
machine learning          6
neural networks           5
deep learning             5
unsupervised learning     3
                         ..
niche                     1
modeling                  1
distribution              1
maximum entropy           1
gans                      1
Name: count, Length: 130, dtype: int64


In [7]:
# Gerar arquivo excel
final_kw_list_counts.to_excel('../../data/processed/_3_final-kw-list.xlsx')