# 03 - Generate journals list

### Identificar journals

In [1]:
# Initial imports
import xml.etree.ElementTree as ET
import pandas as pd

# Suppress all warning messages (installs a global 'ignore' filter)
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the XML file
tree = ET.parse('../../data/raw/consolidated-sample-784.xml')
root = tree.getroot()


def _text(record, path):
    """Return the text of the first element matching *path*, or None if absent."""
    element = record.find(path)
    return element.text if element is not None else None


def _attribute(record, path, name):
    """Return attribute *name* of the first element matching *path*, or None if absent."""
    element = record.find(path)
    return element.get(name) if element is not None else None


def _joined_text(record, path, sep=", "):
    """Join the text of every element matching *path* with *sep*.

    Returns None when no matching element carries text. `findall` always
    returns a list (never None), so the emptiness test has to be on the
    list content; elements whose text is None are skipped so that
    `str.join` cannot raise TypeError.
    """
    texts = [element.text for element in record.findall(path) if element.text is not None]
    return sep.join(texts) if texts else None


# Output columns, in the order the row values are appended below
columns = ['database', 'ref-type', 'authors', 'title', 'secondary-title', 'periodical', 'pages', 'volume', 'issue', 'keywords', 'year', 'DOI', 'pdf_url', 'web_url', 'abstract']

# Empty list that accumulates one row per <record>
portf_784_list = []

# Iterate over the <record> elements and extract the fields of interest
for record in root.findall('records/record'):
    portf_784_list.append([
        _text(record, 'database'),
        _attribute(record, 'ref-type', 'name'),
        _joined_text(record, 'contributors/authors/author'),
        _text(record, 'titles/title'),
        _text(record, 'titles/secondary-title'),
        _text(record, 'periodical/full-title'),
        _text(record, 'pages'),
        _text(record, 'volume'),
        _text(record, 'issue'),
        _joined_text(record, 'keywords/keyword'),
        _text(record, 'dates/year'),
        _text(record, 'electronic-resource-num'),
        _text(record, 'urls/pdf-urls/url'),
        _text(record, 'urls/web-urls/url'),
        _text(record, 'abstract'),
    ])

# Build the dataframe
df = pd.DataFrame(portf_784_list, columns=columns)

In [5]:
# Check the result: preview the first three rows
df.head(3)

Unnamed: 0,database,ref-type,authors,title,secondary-title,periodical,pages,volume,issue,keywords,year,DOI,pdf_url,web_url,abstract
0,,Journal Article,"\n Abbas, Syed Wasim\n , \...",Official Statistics and Big Data Processing wi...,,SYSTEMS,,11,8,artificial intelligence; big data; convex logi...,2023,10.3390/systems11080424,,,Efficient monitoring and achievement of the Su...
1,,Journal Article,"\n Abbas, Syed Wasim\n , \...",Unreported data sources in public sector organ...,,Statistical Journal of the IAOS,359 – 370,35,3,,2019,10.3233/SJI-180466,,,Almost every public sector department produces...
2,,Journal Article,"\n Abdeldayem, Marwan Mohamed\n ...",Trends and opportunities of artificial intelli...,,International Journal of Scientific and Techno...,3867 – 3871,9,1,,2020,,,,The purpose of the study is to draw an underst...


In [7]:
# Inspect the dtype of each column
df.dtypes

database           object
ref-type           object
authors            object
title              object
secondary-title    object
periodical         object
pages              object
volume             object
issue              object
keywords           object
year               object
DOI                object
pdf_url            object
web_url            object
abstract           object
dtype: object

In [9]:
# Count how many records fall under each 'ref-type' value
types_list = df['ref-type'].value_counts()
print(types_list)

ref-type
Journal Article    784
Name: count, dtype: int64


In [11]:
# Count how many records fall under each 'year' value
df['year'].value_counts()

year
2023    171
2022    150
2024    133
2021    104
2020     84
2019     72
2018     39
2017     31
Name: count, dtype: int64

In [13]:
# Frequency of each 'periodical' value (a Series sorted by descending count)
# NOTE(review): value_counts is case-sensitive and the sample rows show mixed
# casing ('SYSTEMS' vs 'Statistical Journal of the IAOS'), so one journal may be
# split across several rows - confirm whether names need normalizing first.
df_journals = df['periodical'].value_counts()
df_journals.head(3)

periodical
GOVERNMENT INFORMATION QUARTERLY    23
SUSTAINABILITY                      13
AI & SOCIETY                        12
Name: count, dtype: int64

In [15]:
# Write the journal frequency table to an Excel file
df_journals.to_excel('../../data/processed/_5_journal-freq.xlsx')

---