# 05. Generate the exclusion verification file

In [1]:
# Initial imports
import xml.etree.ElementTree as ET
import pandas as pd

# Silence warning messages.
# NOTE(review): 'ignore' with no category or module filter hides *every* warning
# raised in this kernel session, including deprecation notices from pandas.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the XML file and grab its root element
tree = ET.parse('../../data/raw/JIF-Q1-Article-database-262.xml')
root = tree.getroot()


def _field_text(record, path):
    """Return the text of the first element matching ``path`` under ``record``.

    Returns None when no element matches (or when the matched element has no text).
    """
    element = record.find(path)
    return element.text if element is not None else None


def _joined_text(record, path, separator=", "):
    """Join the text of every element matching ``path`` under ``record``.

    Returns None when there is nothing to join, so that missing multi-valued
    fields are represented the same way as missing single-valued fields.
    """
    # findall() always returns a list (never None), so emptiness has to be tested
    # explicitly. Elements without text are skipped because str.join() raises
    # TypeError on None.
    texts = [element.text for element in record.findall(path) if element.text]
    return separator.join(texts) if texts else None


# One row per <record>; values are appended in the same order as the
# DataFrame columns declared below.
JIF_Q1_db_262_list = []

for record in root.findall('records/record'):
    # ref-type is read from its "name" attribute rather than from the element text
    ref_type_element = record.find('ref-type')
    ref_type = ref_type_element.get('name') if ref_type_element is not None else None

    JIF_Q1_db_262_list.append([
        _field_text(record, 'database'),
        ref_type,
        _joined_text(record, 'contributors/authors/author'),
        _field_text(record, 'titles/title'),
        _field_text(record, 'titles/secondary-title'),
        _field_text(record, 'periodical/full-title'),
        _field_text(record, 'pages'),
        _field_text(record, 'volume'),
        _field_text(record, 'issue'),
        _joined_text(record, 'keywords/keyword'),
        _field_text(record, 'dates/year'),
        _field_text(record, 'electronic-resource-num'),
        _field_text(record, 'urls/pdf-urls/url'),
        _field_text(record, 'urls/web-urls/url'),
        _field_text(record, 'abstract'),
    ])

# Build the dataframe
df = pd.DataFrame(JIF_Q1_db_262_list, columns=['database', 'ref-type', 'authors', 'title', 'secondary-title', 'periodical', 'pages', 'volume', 'issue', 'keywords', 'year', 'DOI', 'pdf_url', 'web_url', 'abstract'])

In [5]:
# Check the result: preview the first three parsed records
df.head(3)

Unnamed: 0,database,ref-type,authors,title,secondary-title,periodical,pages,volume,issue,keywords,year,DOI,pdf_url,web_url,abstract
0,JIF-Q1-Article-database-262.enl,Journal Article,"Scutella, Maryanne, Plewa, Carolin, Reaiche, C...",Virtual agents in the public service: examinin...,PUBLIC MANAGEMENT REVIEW,PUBLIC MANAGEMENT REVIEW,73-88,26,1.0,virtual agent; value-in-use; value co-creation...,2024,10.1080/14719037.2022.2044504,,,The importance of today's public sector delive...
1,JIF-Q1-Article-database-262.enl,Journal Article,"Wang, Dongkun, Peng, Jieyang, Tao, Xiaoming, D...",Boosting urban prediction tasks with domain-sh...,INFORMATION FUSION,INFORMATION FUSION,,107,,Data mining; Traffic prediction; Meta learning...,2024,10.1016/j.inffus.2024.102324,,,Urban prediction tasks refer to predicting urb...
2,JIF-Q1-Article-database-262.enl,Journal Article,"Gallego, Jorge, Rivero, Gonzalo, Martinez, Juan",Preventing rather than punishing: An early war...,INTERNATIONAL JOURNAL OF FORECASTING,INTERNATIONAL JOURNAL OF FORECASTING,360-377,37,1.0,Public procurement; Corruption; Inefficiency; ...,2021,10.1016/j.ijforecast.2020.06.006,,,Is it possible to predict malfeasance in publi...


In [7]:
# Inspect the column dtypes; every value was read from XML text or attributes,
# so all columns show up as object (including year, volume and issue).
df.dtypes

database           object
ref-type           object
authors            object
title              object
secondary-title    object
periodical         object
pages              object
volume             object
issue              object
keywords           object
year               object
DOI                object
pdf_url            object
web_url            object
abstract           object
dtype: object

In [9]:
# Columns left out of the exported verification spreadsheet
columns_to_drop = [
    'database',
    'ref-type',
    'secondary-title',
    'pages',
    'volume',
    'issue',
    'keywords',
    'pdf_url',
    'web_url',
]

# drop() returns a new frame, so df itself is left untouched
df_para_filtragem = df.drop(columns=columns_to_drop)

In [11]:
# Preview the reduced frame that will be exported
df_para_filtragem.head(3)

Unnamed: 0,authors,title,periodical,year,DOI,abstract
0,"Scutella, Maryanne, Plewa, Carolin, Reaiche, C...",Virtual agents in the public service: examinin...,PUBLIC MANAGEMENT REVIEW,2024,10.1080/14719037.2022.2044504,The importance of today's public sector delive...
1,"Wang, Dongkun, Peng, Jieyang, Tao, Xiaoming, D...",Boosting urban prediction tasks with domain-sh...,INFORMATION FUSION,2024,10.1016/j.inffus.2024.102324,Urban prediction tasks refer to predicting urb...
2,"Gallego, Jorge, Rivero, Gonzalo, Martinez, Juan",Preventing rather than punishing: An early war...,INTERNATIONAL JOURNAL OF FORECASTING,2021,10.1016/j.ijforecast.2020.06.006,Is it possible to predict malfeasance in publi...


In [15]:
# Write the reduced frame to an Excel workbook, without the index column
output_path = '../../data/processed/_7_exclude-verification-file.xlsx'
df_para_filtragem.to_excel(output_path, index=False)

---