In [1]:
#!conda install -n impacta_env ipykernel --update-deps --force-reinstall
#!pip install bibtexparser
#!pip install pyyaml

In [2]:
import bibtexparser
import pandas as pd
import numpy as np
import os
import glob
import yaml

import warnings
# Hide FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter('ignore', FutureWarning)

# pandas display limits used when DataFrames are rendered in this notebook.
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', 100),
    ('display.width', 300),
):
    pd.set_option(option_name, option_value)



# Function definitions

In [3]:
#def1
def read_bib(file_path: str) -> pd.DataFrame:
    '''
    Function to read and parse a bib file into a dataframe object.

    file_path: bib file path

    Returns a dataframe built from the entries parsed by bibtexparser.
    '''
    # Explicit UTF-8: without it open() uses the platform's locale encoding,
    # which can fail on (or garble) non-ASCII characters present in the file.
    with open(file_path, encoding='utf-8') as bibtex_file:
        bib_file = bibtexparser.load(bibtex_file)

    return pd.DataFrame(bib_file.entries)
#########################################################################

#def2
def load_bib(folder_path: str) -> pd.DataFrame:
    '''
    Function to:
    1) read and parse every bib file located directly inside a directory (folder);
    2) concatenate the resulting dataframes into a single one

    folder_path: path of the directory where the bib files are located

    Returns the concatenated dataframe.
    Raises ValueError when the directory contains no .bib file.
    '''

    # listing the bib files of the directory; sorted so that the load order
    # (and therefore the row order) does not depend on the file system
    list_files = sorted(glob.glob(f'{folder_path}/*.bib'))
    if not list_files:
        raise ValueError(f'No .bib files found in {folder_path}')

    # loading each bib file listed
    list_df = []
    for c, file in enumerate(list_files, start=1):
        list_df.append(read_bib(file)) #def
        print(f'{c} de {len(list_files)}: {file}')

    # concatenating all files in a unique dataframe object
    df = pd.concat(list_df)
    # the label comes from this call's own argument, never from a
    # notebook-level variable
    print(f'Shape df_{folder_path}: ', df.shape)

    return df
#########################################################################

#def3
def write_yaml(data, file_name):
    '''
    Function to export a dataframe to ../03_OutputFiles/<file_name>.yaml.

    data: dataframe to export; its to_dict('records') result is handed to
          yaml.dump_all
    file_name: name of the output file, without extension
    '''
    target_path = f'../03_OutputFiles/{file_name}.yaml'
    with open(target_path, 'w') as yaml_stream:
        records = data.to_dict('records')
        yaml.dump_all(records, yaml_stream, default_flow_style=False)
        print(f"{file_name}.yaml successfully written")
#########################################################################

#def4
def write_json(data, file_name):
    '''
    Function to export a dataframe to ../03_OutputFiles/<file_name>.json,
    using the 'records' orientation and a 4-space indentation.

    data: dataframe to export
    file_name: name of the output file, without extension
    '''
    target_path = f'../03_OutputFiles/{file_name}.json'
    data.to_json(target_path, orient='records', indent=4)
    print(f"{file_name}.json successfully written")
#########################################################################

#def5
def write_csv(data, file_name):
    '''
    Function to export a dataframe to ../03_OutputFiles/<file_name>.csv,
    using ';' as separator and without the index column.

    data: dataframe to export
    file_name: name of the output file, without extension
    '''
    target_path = f'../03_OutputFiles/{file_name}.csv'
    data.to_csv(target_path, sep=';', index=False)
    print(f"{file_name}.csv successfully written")
#########################################################################

#def6
def read_yaml(file_name):
    '''
    Function to load a yaml file and return its parsed content.

    file_name: path of the yaml file to read

    NOTE(review): the file is parsed with yaml.FullLoader; if this function
    is ever used on files from an untrusted source, consider yaml.safe_load.
    '''
    with open(file_name, "r") as yaml_stream:
        parsed_content = yaml.load(yaml_stream, Loader=yaml.FullLoader)
        print(f"{file_name} read successfully")
    return parsed_content

# Reading, loading and concatenating all bibtex files

In [4]:
# Each sub-directory of ../01_Datasets/bibtex (e.g. acm, ieee) is loaded as one
# group of bib files. Entries that are not directories are skipped, and the
# listing is sorted so the load order is reproducible.
list_folders = []
for folder in sorted(glob.glob('../01_Datasets/bibtex/*')):
    if os.path.isdir(folder):
        list_folders.append(folder)

# One dataframe per folder, then a single dataframe with everything
list_df = []
for f in list_folders:
    print('\n',f)
    df_temp = load_bib(f) #def
    list_df.append(df_temp)
df_all_raw = pd.concat(list_df)
print('\nShape df_all_raw: ',df_all_raw.shape)


 ../01_Datasets/bibtex/acm
1 de 15: ../01_Datasets/bibtex/acm/acm (1).bib
2 de 15: ../01_Datasets/bibtex/acm/acm (10).bib
3 de 15: ../01_Datasets/bibtex/acm/acm (11).bib
4 de 15: ../01_Datasets/bibtex/acm/acm (12).bib
5 de 15: ../01_Datasets/bibtex/acm/acm (13).bib
6 de 15: ../01_Datasets/bibtex/acm/acm (14).bib
7 de 15: ../01_Datasets/bibtex/acm/acm (2).bib
8 de 15: ../01_Datasets/bibtex/acm/acm (3).bib
9 de 15: ../01_Datasets/bibtex/acm/acm (4).bib
10 de 15: ../01_Datasets/bibtex/acm/acm (5).bib
11 de 15: ../01_Datasets/bibtex/acm/acm (6).bib
12 de 15: ../01_Datasets/bibtex/acm/acm (7).bib
13 de 15: ../01_Datasets/bibtex/acm/acm (8).bib
14 de 15: ../01_Datasets/bibtex/acm/acm (9).bib
15 de 15: ../01_Datasets/bibtex/acm/acm.bib
Shape df_../01_Datasets/bibtex/sciencedirect:  (1451, 27)

 ../01_Datasets/bibtex/ieee
1 de 5: ../01_Datasets/bibtex/ieee/ieee01.bib
2 de 5: ../01_Datasets/bibtex/ieee/ieee02.bib
3 de 5: ../01_Datasets/bibtex/ieee/ieee03.bib
4 de 5: ../01_Datasets/bibtex/ieee/

In [5]:
# Preview the first rows of the concatenated raw dataframe.
df_all_raw.head()

Unnamed: 0,series,location,keywords,numpages,articleno,booktitle,abstract,doi,url,address,publisher,isbn,year,title,author,ENTRYTYPE,ID,pages,month,journal,issn,number,volume,issue_date,note,edition,editor
0,CSAE 2019,"Sanya, China","Crop germplasm resources, Data analysis, Big d...",7,27.0,Proceedings of the 3rd International Conferenc...,Based on understanding the application of big ...,10.1145/3331453.3361308,https://doi.org/10.1145/3331453.3361308,"New York, NY, USA",Association for Computing Machinery,9781450362948,2019,Construction and Implementation of Big Data Fr...,"Jing, Furong and Cao, Yongsheng and Fang, Wei ...",inproceedings,10.1145/3331453.3361308,,,,,,,,,,
1,ICSE '22,"Pittsburgh, Pennsylvania",,12,,Proceedings of the 44th International Conferen...,Massive data from software repositories and co...,10.1145/3510003.3510619,https://doi.org/10.1145/3510003.3510619,"New York, NY, USA",Association for Computing Machinery,9781450392211,2022,Big Data = Big Insights? Operationalising Broo...,"Gote, Christoph and Mavrodiev, Pavlin and Schw...",inproceedings,10.1145/3510003.3510619,262–273,,,,,,,,,
2,ICBIM 2017,"Bei Jing, China","Database, Business Intelligence, Institutional...",5,,Proceedings of the International Conference on...,The applications on business intelligence and ...,10.1145/3134271.3134296,https://doi.org/10.1145/3134271.3134296,"New York, NY, USA",Association for Computing Machinery,9781450352765,2017,Establishment of Business Intelligence and Big...,"Peng, Michael Yao-Ping and Tuan, Sheng-Hwa and...",inproceedings,10.1145/3134271.3134296,121–125,,,,,,,,,
3,AICS 2019,"Wuhan, Hubei, China","big data, agro-meteorological disasters, early...",5,,Proceedings of the 2019 International Conferen...,"Agricultural meteorological disasters, includi...",10.1145/3349341.3349371,https://doi.org/10.1145/3349341.3349371,"New York, NY, USA",Association for Computing Machinery,9781450371506,2019,Quality Control Framework of Big Data for Earl...,"Li, Jiale and Liao, Shunbao",inproceedings,10.1145/3349341.3349371,74–78,,,,,,,,,
4,iiWAS2019,"Munich, Germany","Intelligent smart environments, Big data analy...",3,,Proceedings of the 21st International Conferen...,This paper focuses on big data management and ...,10.1145/3366030.3366044,https://doi.org/10.1145/3366030.3366044,"New York, NY, USA",Association for Computing Machinery,9781450371797,2019,Big Data Management and Analytics in Intellige...,"Cuzzocrea, Alfredo",inproceedings,10.1145/3366030.3366044,5–7,,,,,,,,,


# Features renaming / selection

In [6]:
# Columns kept for the rest of the notebook ('type_publication' is the new
# name given to 'ENTRYTYPE' below).
cols_to_keep = ['author', 'title', 'keywords', 'abstract', 'year', 'type_publication', 'doi', 'issn', 'isbn','publisher']

# rename() returns a new dataframe, so df_all_raw itself is left unchanged.
df_all = df_all_raw.rename(columns={'ENTRYTYPE': 'type_publication'})[cols_to_keep]

df_all.head()

Unnamed: 0,author,title,keywords,abstract,year,type_publication,doi,issn,isbn,publisher
0,"Jing, Furong and Cao, Yongsheng and Fang, Wei ...",Construction and Implementation of Big Data Fr...,"Crop germplasm resources, Data analysis, Big d...",Based on understanding the application of big ...,2019,inproceedings,10.1145/3331453.3361308,,9781450362948,Association for Computing Machinery
1,"Gote, Christoph and Mavrodiev, Pavlin and Schw...",Big Data = Big Insights? Operationalising Broo...,,Massive data from software repositories and co...,2022,inproceedings,10.1145/3510003.3510619,,9781450392211,Association for Computing Machinery
2,"Peng, Michael Yao-Ping and Tuan, Sheng-Hwa and...",Establishment of Business Intelligence and Big...,"Database, Business Intelligence, Institutional...",The applications on business intelligence and ...,2017,inproceedings,10.1145/3134271.3134296,,9781450352765,Association for Computing Machinery
3,"Li, Jiale and Liao, Shunbao",Quality Control Framework of Big Data for Earl...,"big data, agro-meteorological disasters, early...","Agricultural meteorological disasters, includi...",2019,inproceedings,10.1145/3349341.3349371,,9781450371506,Association for Computing Machinery
4,"Cuzzocrea, Alfredo",Big Data Management and Analytics in Intellige...,"Intelligent smart environments, Big data analy...",This paper focuses on big data management and ...,2019,inproceedings,10.1145/3366030.3366044,,9781450371797,Association for Computing Machinery


# Dtypes

In [7]:
# Column dtypes of the selected features.
df_all.dtypes


author              object
title               object
keywords            object
abstract            object
year                object
type_publication    object
doi                 object
issn                object
isbn                object
publisher           object
dtype: object

In [8]:
# "year": convert the column to 64-bit integers
df_all['year'] = df_all['year'].astype('int64')
print('dtypes:\n', df_all.dtypes, '\n')


def _doi_without_resolver(raw_doi):
    '''
    Return the text found right after the first "doi.org/" of a DOI value
    (up to a second occurrence, if any). Values that do not contain
    "doi.org/" are returned unchanged and NaN stays NaN.
    '''
    if raw_doi != raw_doi:  # NaN is the only value that differs from itself
        return np.nan
    pieces = raw_doi.split('doi.org/')
    if len(pieces) > 1:
        return pieces[1]
    return raw_doi


# "doi": keep only the identifier when the value is written as a doi.org URL
df_all['doi'] = df_all['doi'].map(_doi_without_resolver)

# Rows ordered by "year"
df_all = df_all.sort_values(by='year')
display(df_all.head())

dtypes:
 author              object
title               object
keywords            object
abstract            object
year                 int64
type_publication    object
doi                 object
issn                object
isbn                object
publisher           object
dtype: object 



Unnamed: 0,author,title,keywords,abstract,year,type_publication,doi,issn,isbn,publisher
7,A. Hubaux,A new geological tool-the data,,Today data processing technology offers new an...,1973,article,10.1016/0012-8252(73)90089-5,0012-8252,,
9,K. Trenberth and J. Angell and R. Barry and R....,Working Group 1: Observations,,,1991,incollection,10.1016/B978-0-444-88351-3.50043-X,0167-5117,,Elsevier
83,C.K Yeo and S.C Hui and I.Y Soon and B.S Lee,An adaptive protocol for real-time fax communi...,"Internet faxing, Adaptive fax communication, R...",Internet faxing allows users from different lo...,2001,article,10.1016/S0140-3664(00)00342-X,0140-3664,,
7,G. Holmes and T.C. Smith,7 - Data mining,,,2001,incollection,10.1533/9781855736375.2.137,,978-1-85573-565-1,Woodhead Publishing
25,Joaquin Dopazo and Edward Zanders and Ilaria D...,Methods and approaches in the analysis of gene...,"Immunological research, Data analysis, Human g...",The application of high-density DNA array tech...,2001,article,10.1016/S0022-1759(01)00307-6,0022-1759,,


# NaN analysis

In [9]:
# Dataframe size, followed by the number of missing values per column
nan_per_column = df_all.isna().sum()
print('Shape: ', df_all.shape, '\n')
print(f'Nan Analysis: \n{nan_per_column}\n')


Shape:  (6775, 10) 

Nan Analysis: 
author               225
title                 13
keywords             878
abstract             489
year                   0
type_publication       0
doi                  109
issn                1779
isbn                5035
publisher           4562
dtype: int64



# Export

## Creating 03_OutputFiles if it does not exist

In [10]:
# exist_ok=True turns the call into a no-op when the folder is already there,
# so no separate existence check is needed.
os.makedirs('../03_OutputFiles/', exist_ok=True)

## Creating the yaml configuration file if it does not exist

In [11]:
# A default configuration is written only when the file is missing, so an
# existing configuration.yaml is left untouched.
if not os.path.exists("configuration.yaml"):
    configuration_dict = {
        'output_extensions': ['csv', 'json', 'yaml']
    }
    with open('configuration.yaml', 'w') as yamlfile:
        # the result is not stored: the dump is written directly to yamlfile
        yaml.dump(configuration_dict, yamlfile)

configuration_file = read_yaml("configuration.yaml")

print("Output extensions options: ", configuration_file['output_extensions'])

configuration.yaml read successfully
Output extensions options:  ['csv', 'json', 'yaml']


## Reading yaml file and exporting

In [12]:
# Writer function used for each supported output extension.
exporters = {'csv': write_csv, 'json': write_json, 'yaml': write_yaml}

for extension_name in configuration_file['output_extensions']:
    # Compared case-insensitively and without surrounding spaces, so that
    # 'CSV' or ' csv' in the configuration file are accepted as well.
    exporter = exporters.get(str(extension_name).strip().lower())
    if exporter is None:
        # Report unsupported entries instead of skipping them silently
        print(f"Unknown output extension '{extension_name}' ignored (supported: {', '.join(exporters)})")
        continue
    exporter(df_all, 'df_all')

df_all.csv successfully written
df_all.json successfully written
df_all.yaml successfully written


In [13]:
#Optional (alternative to the export loop above)
#if 'csv' in configuration_file['output_extensions']:
#    write_csv(df_all, 'df_all')
#if 'json' in configuration_file['output_extensions']:
#    write_json(df_all, 'df_all')
#if 'yaml' in configuration_file['output_extensions']:
#    write_yaml(df_all, 'df_all')