## Generate the initial keywords list

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

# Carregar arquivo xml
tree = ET.parse('../../data/raw/seed-articles.xml')
root = tree.getroot()

# Definir lista vazia para armazenamento de dados
seed_articles_list = []

# Iterate sobre os <record> elementos e extrair as informações
for record in root.findall('records/record'):
    
    # Extrair dados de cada campo
    database = record.find('database').text if record.find('database') is not None else None
    ref_type = record.find('ref-type').get('name') if record.find('ref-type') is not None else None
    authors = ", ".join([author.text for author in record.findall('contributors/authors/author')]) if record.findall('contributors/authors/author') is not None else None
    title = record.find('titles/title').text if record.find('titles/title') is not None else None
    secondary_title = record.find('titles/secondary-title').text if record.find('titles/secondary-title') is not None else None
    periodical = record.find('periodical/full-title').text if record.find('periodical/full-title') is not None else None
    pages = record.find('pages').text if record.find('pages') is not None else None
    volume = record.find('volume').text if record.find('volume') is not None else None
    issue = record.find('issue').text if record.find('issue') is not None else None
    keywords = ", ".join([keyword.text for keyword in record.findall('keywords/keyword')]) if record.findall('keywords/keyword') is not None else None
    year = record.find('dates/year').text if record.find('dates/year') is not None else None
    pdf_url = record.find('urls/pdf-urls/url').text if record.find('urls/pdf-urls/url') is not None else None
    web_url = record.find('urls/web-urls/url').text if record.find('urls/web-urls/url') is not None else None
    abstract = record.find('abstract').text if record.find('abstract') is not None else None
    
    # Adicionar os dados à lista
    seed_articles_list.append([database, ref_type, authors, title, secondary_title, periodical, pages, volume, issue, keywords, year, pdf_url, web_url, abstract])

# Criar o dataframe
seed_articles_df = pd.DataFrame(seed_articles_list, columns=['database', 'ref-type', 'authors', 'title', 'secondary-title', 'periodical', 'pages', 'volume', 'issue', 'keywords', 'year', 'pdf_url', 'web_url', 'abstract'])

In [3]:
# Checando o resultado
seed_articles_df.head(3)

Unnamed: 0,database,ref-type,authors,title,secondary-title,periodical,pages,volume,issue,keywords,year,pdf_url,web_url,abstract
0,Emerging-techs-articles.enl,Journal Article,"Sivarajah, Uthayasankar, Kamal, Muhammad Musta...",Critical analysis of Big Data challenges and a...,Journal of Business Research,Journal of Business Research,263-286,70.0,,"Big Data, Big Data Analytics, Challenges, Meth...",2017,internal-pdf://2017 - Sivarajah et al. - Journ...,http://dx.doi.org/10.1016/j.jbusres.2016.08.001,"Big Data (BD), with their potential to ascerta..."
1,Emerging-techs-articles.enl,Journal Article,"Peppard, Joe, Edwards, Chris, Lambert, Rob",Exploiting Big Data from mobile device sensor ...,MIS Quarterly,MIS Quarterly,115-117,10.0,2.0,,2011,"internal-pdf://2011 - Peppard, Edwards, Lamber...",,The role of chief information officer (CIO) is...
2,Emerging-techs-articles.enl,Journal Article,"Tambe, Prasanna, Hitt, Lorin, Rock, Daniel, Br...",NBER WORKING PAPER SERIES DIGITAL CAPITAL AND ...,,,,,,,2020,internal-pdf://2020 - Tambe et al. - Unknown.pdf,http://www.nber.org/papers/w28285,General purpose technologies like information ...


In [5]:
# Colocar palavas-chave em minúsculas
seed_articles_df['keywords'] = seed_articles_df['keywords'].str.lower()

# Dividir as palavras-chave em listas
seed_articles_df['initial_kw_list'] = seed_articles_df['keywords'].str.split(',')

# Explodir a lista em uma série plana, para que cada palavra-chave esteja em uma linha
initial_kw_list = seed_articles_df['initial_kw_list'].explode().str.strip()

from collections import Counter
# Contar as palavras-chave
initial_kw_list_counts = initial_kw_list.value_counts()
print(initial_kw_list_counts)

initial_kw_list
social media                 13
big data                     11
                              7
twitter                       5
market efficiency             4
                             ..
greenwashing                  1
customers                     1
earnings persistence          1
revenues                      1
analyst earnings forecast     1
Name: count, Length: 210, dtype: int64


In [7]:
# Gerar arquivo excel
initial_kw_list_counts.to_excel('../../data/processed/_1_initial-kw-list.xlsx')

A figura 2 apresenta a etapa de geração da lista final de palavras-chave. Conforme apresentado na figura, com base na lista inicial foi feita busca em Web of Science do resultado da busca os artigos foram organizados por número de citações e a lista com os 50 mais citados foi importada para o gerenciador de referências.

![image-2.png](static/images/_01-02-Final-KW-definition.png)

Em seguida essa lista de 50 artigos foi exportada em formato .xml para tratamento dos dados e geração de uma lista final de palavras-cahve, conforme scripts a seguir.

In [None]:
# Load the XML file
tree = ET.parse('data/top-50-review.xml')
root = tree.getroot()

# Define a empty list to store the data
top_50_list = []

# Iterate over the <record> elements and extract the information
for record in root.findall('records/record'):
    
    # Extract data from each field
    database = record.find('database').text if record.find('database') is not None else None
    ref_type = record.find('ref-type').get('name') if record.find('ref-type') is not None else None
    authors = ", ".join([author.text for author in record.findall('contributors/authors/author')]) if record.findall('contributors/authors/author') is not None else None
    title = record.find('titles/title').text if record.find('titles/title') is not None else None
    secondary_title = record.find('titles/secondary-title').text if record.find('titles/secondary-title') is not None else None
    periodical = record.find('periodical/full-title').text if record.find('periodical/full-title') is not None else None
    pages = record.find('pages').text if record.find('pages') is not None else None
    volume = record.find('volume').text if record.find('volume') is not None else None
    issue = record.find('issue').text if record.find('issue') is not None else None
    keywords = ", ".join([keyword.text for keyword in record.findall('keywords/keyword')]) if record.findall('keywords/keyword') is not None else None
    year = record.find('dates/year').text if record.find('dates/year') is not None else None
    pdf_url = record.find('urls/pdf-urls/url').text if record.find('urls/pdf-urls/url') is not None else None
    web_url = record.find('urls/web-urls/url').text if record.find('urls/web-urls/url') is not None else None
    abstract = record.find('abstract').text if record.find('abstract') is not None else None
    
    # Add the extracted data to the list
    top_50_list.append([database, ref_type, authors, title, secondary_title, periodical, pages, volume, issue, keywords, year, pdf_url, web_url, abstract])

# Create a DataFrame with the data
top_50_df = pd.DataFrame(top_50_list, columns=['database', 'ref-type', 'authors', 'title', 'secondary-title', 'periodical', 'pages', 'volume', 'issue', 'keywords', 'year', 'pdf_url', 'web_url', 'abstract'])

# Check the result
top_50_df.head()

In [None]:
# Colocar palavas-chave em minúsculas
top_50_df['keywords'] = top_50_df['keywords'].str.lower()

# Dividir as palavras-chave em listas
top_50_df['final_kw_list'] = top_50_df['keywords'].str.split(';')

# Explodir a lista em uma série plana, para que cada palavra-chave esteja em uma linha
final_kw_list = top_50_df['final_kw_list'].explode().str.strip()

from collections import Counter

# Contar as palavras-chave
final_kw_list_counts = final_kw_list.value_counts()
print(final_kw_list_counts)

In [None]:
# Gerar arquivo excel
final_kw_list_counts.to_excel('data/final-kw-list.xlsx')