In [41]:
#!conda install -n impacta_env ipykernel --update-deps --force-reinstall
#!pip install bibtexparser

In [42]:
import glob
import os
import warnings

import bibtexparser
import numpy as np
import pandas as pd
import yaml

warnings.simplefilter(action = 'ignore', category = FutureWarning)

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 300)

# Function definitions

In [43]:
#def1
def read_bib(file_path: str):
    '''
    Parse a single bib file and return its entries as a dataframe.

    file_path: path of the bib file to read

    Returns a dataframe built from the ``entries`` attribute of the
    object returned by ``bibtexparser.load``.
    '''
    with open(file_path) as handle:
        database = bibtexparser.load(handle)

    return pd.DataFrame(database.entries)
#########################################################################

#def2
def load_bib(folder_path: str):
    '''
    Load every ``*.bib`` file found directly inside one folder into a
    single dataframe.

    Steps:
    1) list the ``*.bib`` files in ``folder_path`` (sorted, so the load
       order is reproducible);
    2) parse each file with ``read_bib`` and print a progress line;
    3) concatenate the per-file dataframes into one.

    folder_path: path of the directory where the bib files are located

    Returns the concatenated dataframe. ``pd.concat`` is called without
    ``ignore_index``, so each file's own row labels are kept. When the
    folder contains no bib file an empty dataframe is returned.
    '''

    # glob gives no ordering guarantee; sorting keeps runs reproducible
    list_files = sorted(glob.glob(f'{folder_path}/*.bib'))

    # loading each bib file listed
    list_df = []
    for c, file in enumerate(list_files, start=1):
        list_df.append(read_bib(file)) #def
        print(f'{c} de {len(list_files)}: {file}')

    # pd.concat raises ValueError on an empty list, so guard that case
    df = pd.concat(list_df) if list_df else pd.DataFrame()

    # label the shape with the folder actually loaded (the argument),
    # not with whatever a module-level variable happens to hold
    print(f'Shape df_{folder_path}: ', df.shape)

    return df
#########################################################################

#def3
def write_yaml(data, file_name, output_dir='../03_OutputFiles'):
    '''
    Write a dataframe to ``<output_dir>/<file_name>.yaml``.

    The rows are passed to ``yaml.dump_all`` as a list of record dicts
    (``data.to_dict('records')``), in block style.

    data: dataframe to export
    file_name: output file name, without extension
    output_dir: destination directory, created when missing; the default
                is the directory this function previously hard-coded
    '''
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f'{file_name}.yaml')

    with open(output_path, 'w') as output_file:
        yaml.dump_all(data.to_dict('records'), output_file, default_flow_style=False)

    # reported only after the file has been closed
    print(f"{file_name}.yaml successfully written")
#########################################################################

#def4
def write_json(data, file_name, output_dir='../03_OutputFiles'):
    '''
    Write a dataframe to ``<output_dir>/<file_name>.json``.

    The file holds a JSON array with one object per row
    (``orient='records'``), indented by 4 spaces.

    data: dataframe to export
    file_name: output file name, without extension
    output_dir: destination directory, created when missing; the default
                is the directory this function previously hard-coded
    '''
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f'{file_name}.json')

    data.to_json(output_path, orient='records', indent=4)
    print(f"{file_name}.json successfully written")
#########################################################################

#def5
def write_csv(data, file_name, output_dir='../03_OutputFiles'):
    '''
    Write a dataframe to ``<output_dir>/<file_name>.csv``.

    Fields are separated by ``;`` and the dataframe index is not written.

    data: dataframe to export
    file_name: output file name, without extension
    output_dir: destination directory, created when missing; the default
                is the directory this function previously hard-coded
    '''
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f'{file_name}.csv')

    data.to_csv(output_path, sep=';', index=False)
    print(f"{file_name}.csv successfully written")
#########################################################################

#def6
def read_yaml (file_name):
    '''
    Load a YAML file and return its parsed content.

    file_name: path of the YAML file to read

    The file is parsed with ``yaml.load`` using ``yaml.FullLoader`` and a
    confirmation line is printed afterwards.

    NOTE(review): FullLoader is more permissive than ``yaml.safe_load``;
    consider safe_load if this is ever pointed at files the notebook did
    not write itself.
    '''
    with open(file_name, "r") as stream:
        parsed = yaml.load(stream, Loader=yaml.FullLoader)
        print(f"{file_name} read successfully")

    return parsed

# Reading, loading and concatenating all bibtex files

In [46]:
# One sub-directory per source under 01_Datasets. Plain files are skipped
# (they can hold no bib files) and the list is sorted because glob gives
# no ordering guarantee.
list_folders = sorted(
    path for path in glob.glob('../01_Datasets/*') if os.path.isdir(path)
)

# Load each source folder, then stack the per-folder dataframes.
list_df = []
for folder in list_folders:
    print('\n', folder)
    list_df.append(load_bib(folder)) #def
df_all_raw = pd.concat(list_df)
print('\nShape df_all_raw: ', df_all_raw.shape)


 ../01_Datasets/acm
1 de 15: ../01_Datasets/acm/acm (1).bib
2 de 15: ../01_Datasets/acm/acm (10).bib
3 de 15: ../01_Datasets/acm/acm (11).bib
4 de 15: ../01_Datasets/acm/acm (12).bib
5 de 15: ../01_Datasets/acm/acm (13).bib
6 de 15: ../01_Datasets/acm/acm (14).bib
7 de 15: ../01_Datasets/acm/acm (2).bib
8 de 15: ../01_Datasets/acm/acm (3).bib
9 de 15: ../01_Datasets/acm/acm (4).bib
10 de 15: ../01_Datasets/acm/acm (5).bib
11 de 15: ../01_Datasets/acm/acm (6).bib
12 de 15: ../01_Datasets/acm/acm (7).bib
13 de 15: ../01_Datasets/acm/acm (8).bib
14 de 15: ../01_Datasets/acm/acm (9).bib
15 de 15: ../01_Datasets/acm/acm.bib
Shape df_../01_Datasets/sciencedirect:  (1451, 27)

 ../01_Datasets/ieee
1 de 5: ../01_Datasets/ieee/ieee01.bib
2 de 5: ../01_Datasets/ieee/ieee02.bib
3 de 5: ../01_Datasets/ieee/ieee03.bib
4 de 5: ../01_Datasets/ieee/ieee04.bib
5 de 5: ../01_Datasets/ieee/ieee05.bib
Shape df_../01_Datasets/sciencedirect:  (466, 18)

 ../01_Datasets/sciencedirect
1 de 51: ../01_Datasets

# Feature renaming / selection

In [47]:
# Columns carried forward to the cleaning steps.
cols_to_keep = ['author', 'title', 'keywords', 'abstract', 'year', 'type_publication', 'doi']

# rename() returns a new frame, so df_all_raw itself is left untouched.
df_all = df_all_raw.rename(columns={'ENTRYTYPE': 'type_publication'})[cols_to_keep]

df_all.head()

Unnamed: 0,author,title,keywords,abstract,year,type_publication,doi
7,A. Hubaux,A new geological tool-the data,,Today data processing technology offers new an...,1973,article,https://doi.org/10.1016/0012-8252(73)90089-5
9,K. Trenberth and J. Angell and R. Barry and R....,Working Group 1: Observations,,,1991,incollection,https://doi.org/10.1016/B978-0-444-88351-3.500...
83,C.K Yeo and S.C Hui and I.Y Soon and B.S Lee,An adaptive protocol for real-time fax communi...,"Internet faxing, Adaptive fax communication, R...",Internet faxing allows users from different lo...,2001,article,https://doi.org/10.1016/S0140-3664(00)00342-X
7,G. Holmes and T.C. Smith,7 - Data mining,,,2001,incollection,https://doi.org/10.1533/9781855736375.2.137
25,Joaquin Dopazo and Edward Zanders and Ilaria D...,Methods and approaches in the analysis of gene...,"Immunological research, Data analysis, Human g...",The application of high-density DNA array tech...,2001,article,https://doi.org/10.1016/S0022-1759(01)00307-6


# Dtypes

In [48]:
# Show the dtype of each selected column.
df_all.dtypes


author              object
title               object
keywords            object
abstract            object
year                 int64
type_publication    object
doi                 object
dtype: object

In [49]:
# Cast "year" to int64 and order the rows by it, in one chained step.
df_all = (
    df_all
    .assign(year=lambda frame: frame['year'].astype('int64'))
    .sort_values('year')
)

print('dtypes:\n', df_all.dtypes, '\n')
display(df_all.head())

dtypes:
 author              object
title               object
keywords            object
abstract            object
year                 int64
type_publication    object
doi                 object
dtype: object 



Unnamed: 0,author,title,keywords,abstract,year,type_publication,doi
7,A. Hubaux,A new geological tool-the data,,Today data processing technology offers new an...,1973,article,https://doi.org/10.1016/0012-8252(73)90089-5
9,K. Trenberth and J. Angell and R. Barry and R....,Working Group 1: Observations,,,1991,incollection,https://doi.org/10.1016/B978-0-444-88351-3.500...
83,C.K Yeo and S.C Hui and I.Y Soon and B.S Lee,An adaptive protocol for real-time fax communi...,"Internet faxing, Adaptive fax communication, R...",Internet faxing allows users from different lo...,2001,article,https://doi.org/10.1016/S0140-3664(00)00342-X
7,G. Holmes and T.C. Smith,7 - Data mining,,,2001,incollection,https://doi.org/10.1533/9781855736375.2.137
25,Joaquin Dopazo and Edward Zanders and Ilaria D...,Methods and approaches in the analysis of gene...,"Immunological research, Data analysis, Human g...",The application of high-density DNA array tech...,2001,article,https://doi.org/10.1016/S0022-1759(01)00307-6


# NaN analysis

In [50]:
# Frame size, then the number of missing values in each column.
nan_per_column = df_all.isna().sum()

print('Shape: ', df_all.shape, '\n')
print(f'Nan Analysis: \n{nan_per_column}\n')


Shape:  (6467, 7) 

Nan Analysis: 
author              222
title                13
keywords            816
abstract            475
year                  0
type_publication      0
doi                  93
dtype: int64



In [51]:
# Remove every row that has a missing value in any column.
print('Shape before dropna: ', df_all.shape)
df_all = df_all.dropna(axis='index', how='any')
print('Shape after dropna: ', df_all.shape)

Shape before dropna:  (6467, 7)
Shape after dropna:  (5577, 7)


# Drop duplicates

In [52]:
# Remove rows that repeat an earlier row across all columns.
print('Shape before drop_duplicates: ', df_all.shape)
df_all = df_all.drop_duplicates(keep='first')
print('Shape after drop_duplicates: ', df_all.shape)

Shape before drop_duplicates:  (5577, 7)
Shape after drop_duplicates:  (5577, 7)


# Export

## Creating the 03_OutputFiles directory if it does not exist

In [None]:
# exist_ok=True makes this a no-op when the directory is already there,
# without a separate existence check that could race with its creation.
os.makedirs('../03_OutputFiles/', exist_ok=True)

## Creating the YAML configuration file if it does not exist

In [None]:
# Write a default configuration on the first run only; an existing
# configuration.yaml is left untouched so manual edits are kept.
if not os.path.exists("configuration.yaml"):
    configuration_dict = {
        'output_extensions': ['csv', 'json', 'yaml']
    }
    with open('configuration.yaml', 'w') as yamlfile:
        # yaml.dump returns None when it is given a stream, so its
        # result is not kept
        yaml.dump(configuration_dict, yamlfile)

configuration_file = read_yaml("configuration.yaml")

print("Output extensions options: ", configuration_file['output_extensions'])

## Reading yaml file and exporting

In [None]:
# Writer function for each supported output extension.
writers = {'csv': write_csv, 'json': write_json, 'yaml': write_yaml}

for extension_name in configuration_file['output_extensions']:
    # normalise so that values such as 'CSV' or ' json ' are accepted too
    writer = writers.get(str(extension_name).strip().lower())
    if writer is None:
        # report a value nothing can handle instead of skipping it silently
        print(f"Unsupported output extension ignored: {extension_name!r}")
        continue
    writer(df_all, 'df_all')