In [1]:
#!conda install -n impacta_env ipykernel --update-deps --force-reinstall
#!pip install bibtexparser
#!pip install pyyaml

# Import

In [1]:
import bibtexparser
import pandas as pd
import numpy as np
import os
import glob
import sqlite3

import yaml

import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)

from functions import read_bib
from functions import load_bib
from functions import write_yaml
from functions import write_json
from functions import write_csv
from functions import read_yaml

# Notebook-wide pandas display limits, applied in one pass:
# up to 100 rows / 100 columns per rendered frame, 300 characters of text width.
display_settings = (
    ('display.max_rows', 100),
    ('display.max_columns', 100),
    ('display.width', 300),
)
for option_name, option_value in display_settings:
    pd.set_option(option_name, option_value)

# Reading, loading and concatenating all bibtex files

In [3]:
# Collect every sub-folder of the bibtex directory.
# glob.glob returns entries in arbitrary, filesystem-dependent order, so the listing is
# sorted to make the row order of the concatenated frame reproducible across machines.
# Non-directory entries are skipped so that only folders are handed to load_bib.
list_folders = []
for folder in sorted(glob.glob('../01_Datasets/bibtex/*')):
  if not os.path.isdir(folder):
    continue
  print(folder)
  list_folders.append(folder)

# Fail early with a clear message instead of the generic error pd.concat raises on an empty list.
if not list_folders:
  raise FileNotFoundError('No bibtex folders found under ../01_Datasets/bibtex/')

# Load each folder through the project helper load_bib (imported from functions) and
# stack the per-folder results into a single raw frame.
# NOTE(review): presumably load_bib returns one DataFrame per folder - confirm in functions.py.
list_df = []
for f in list_folders:
  print('\n',f)
  df_temp = load_bib(f) #def load_bib
  list_df.append(df_temp)
df_all_raw = pd.concat(list_df)
print('\nShape df_all_raw: ',df_all_raw.shape)

../01_Datasets/bibtex/acm
../01_Datasets/bibtex/ieee
../01_Datasets/bibtex/sciencedirect

 ../01_Datasets/bibtex/acm
1 de 15: ../01_Datasets/bibtex/acm/acm (1).bib
2 de 15: ../01_Datasets/bibtex/acm/acm (10).bib
3 de 15: ../01_Datasets/bibtex/acm/acm (11).bib
4 de 15: ../01_Datasets/bibtex/acm/acm (12).bib
5 de 15: ../01_Datasets/bibtex/acm/acm (13).bib
6 de 15: ../01_Datasets/bibtex/acm/acm (14).bib
7 de 15: ../01_Datasets/bibtex/acm/acm (2).bib
8 de 15: ../01_Datasets/bibtex/acm/acm (3).bib
9 de 15: ../01_Datasets/bibtex/acm/acm (4).bib
10 de 15: ../01_Datasets/bibtex/acm/acm (5).bib
11 de 15: ../01_Datasets/bibtex/acm/acm (6).bib
12 de 15: ../01_Datasets/bibtex/acm/acm (7).bib
13 de 15: ../01_Datasets/bibtex/acm/acm (8).bib
14 de 15: ../01_Datasets/bibtex/acm/acm (9).bib
15 de 15: ../01_Datasets/bibtex/acm/acm.bib
Shape df_acm:  (1451, 27)

 ../01_Datasets/bibtex/ieee
1 de 5: ../01_Datasets/bibtex/ieee/ieee01.bib
2 de 5: ../01_Datasets/bibtex/ieee/ieee02.bib
3 de 5: ../01_Datasets/b

# Features renaming / selection

In [4]:
# Columns carried forward into the cleaned dataset.
cols_to_keep = ['author', 'title', 'keywords', 'abstract', 'year', 'type_publication', 'doi', 'issn', 'isbn','publisher']

# Build the working frame from the raw one without mutating df_all_raw:
# rename the 'ENTRYTYPE' column, keep only the selected columns, and take an explicit copy
# so that later column assignments on df_all do not act on a slice of another frame
# (which is what triggers pandas' SettingWithCopyWarning).
df_all = (
    df_all_raw
    .rename(columns={'ENTRYTYPE': 'type_publication'})
    .loc[:, cols_to_keep]
    .copy()
)

df_all.head()

Unnamed: 0,author,title,keywords,abstract,year,type_publication,doi,issn,isbn,publisher
0,"Jing, Furong and Cao, Yongsheng and Fang, Wei ...",Construction and Implementation of Big Data Fr...,"Crop germplasm resources, Data analysis, Big d...",Based on understanding the application of big ...,2019,inproceedings,10.1145/3331453.3361308,,9781450362948,Association for Computing Machinery
1,"Gote, Christoph and Mavrodiev, Pavlin and Schw...",Big Data = Big Insights? Operationalising Broo...,,Massive data from software repositories and co...,2022,inproceedings,10.1145/3510003.3510619,,9781450392211,Association for Computing Machinery
2,"Peng, Michael Yao-Ping and Tuan, Sheng-Hwa and...",Establishment of Business Intelligence and Big...,"Database, Business Intelligence, Institutional...",The applications on business intelligence and ...,2017,inproceedings,10.1145/3134271.3134296,,9781450352765,Association for Computing Machinery
3,"Li, Jiale and Liao, Shunbao",Quality Control Framework of Big Data for Earl...,"big data, agro-meteorological disasters, early...","Agricultural meteorological disasters, includi...",2019,inproceedings,10.1145/3349341.3349371,,9781450371506,Association for Computing Machinery
4,"Cuzzocrea, Alfredo",Big Data Management and Analytics in Intellige...,"Intelligent smart environments, Big data analy...",This paper focuses on big data management and ...,2019,inproceedings,10.1145/3366030.3366044,,9781450371797,Association for Computing Machinery


# Dtypes

In [5]:
def _strip_doi_prefix(raw_doi):
  """Return the DOI with any leading '...doi.org/' resolver part removed.

  Missing values (NaN) are returned as NaN; values that do not contain
  'doi.org/' are returned unchanged.
  """
  # NaN is the only value that is not equal to itself
  if raw_doi != raw_doi:
    return np.nan
  pieces = raw_doi.split('doi.org/')
  if len(pieces) > 1:
    return pieces[1]
  return raw_doi


# Cast the "year" feature to a 64-bit integer
df_all['year'] = df_all['year'].astype('int64')

# Normalise the "doi" feature so it holds the bare identifier rather than a URL
df_all['doi'] = df_all['doi'].map(_strip_doi_prefix)

# Order the records chronologically by "year"
df_all = df_all.sort_values(by='year')
display(df_all.head())

Unnamed: 0,author,title,keywords,abstract,year,type_publication,doi,issn,isbn,publisher
7,A. Hubaux,A new geological tool-the data,,Today data processing technology offers new an...,1973,article,10.1016/0012-8252(73)90089-5,0012-8252,,
9,K. Trenberth and J. Angell and R. Barry and R....,Working Group 1: Observations,,,1991,incollection,10.1016/B978-0-444-88351-3.50043-X,0167-5117,,Elsevier
83,C.K Yeo and S.C Hui and I.Y Soon and B.S Lee,An adaptive protocol for real-time fax communi...,"Internet faxing, Adaptive fax communication, R...",Internet faxing allows users from different lo...,2001,article,10.1016/S0140-3664(00)00342-X,0140-3664,,
7,G. Holmes and T.C. Smith,7 - Data mining,,,2001,incollection,10.1533/9781855736375.2.137,,978-1-85573-565-1,Woodhead Publishing
25,Joaquin Dopazo and Edward Zanders and Ilaria D...,Methods and approaches in the analysis of gene...,"Immunological research, Data analysis, Human g...",The application of high-density DNA array tech...,2001,article,10.1016/S0022-1759(01)00307-6,0022-1759,,


# NaN analysis

In [6]:
# Discard every record without a DOI, printing the frame shape before and after
print('Shape before dropna: ', df_all.shape)
df_all = df_all.dropna(subset=['doi'])
print('Shape after dropna: ', df_all.shape)


Shape before dropna:  (6775, 10)
Shape after dropna:  (6666, 10)


# Exporting to SQLite database

In [7]:
# Write the database into the same relative output folder used by the other export cells
# instead of a machine-specific absolute path, so the notebook runs on any checkout.
# NOTE(review): assumes '../03_OutputFiles/' resolves to the folder the absolute path pointed at - confirm.
dbfile = '../03_OutputFiles/doi.db'
tabela = 'bib'

# sqlite3.connect does not create missing parent directories, so make sure the folder exists
# before connecting (the notebook otherwise only creates it in a later cell).
os.makedirs(os.path.dirname(dbfile), exist_ok=True)

# Map each column's numpy dtype kind to an SQLite column type:
# signed integers -> INTEGER, floats -> REAL, everything else -> TEXT.
sqlDataTypes = {}
for c in df_all.columns:
    kind = df_all[c].dtype.kind
    if kind == 'i':
        sqlDataTypes[c] = 'INTEGER'
    elif kind == 'f':
        sqlDataTypes[c] = 'REAL'
    else:
        sqlDataTypes[c] = 'TEXT'

# Replace the table with the current frame; the connection is closed even if the write fails.
db = sqlite3.connect(dbfile)
try:
    df_all.to_sql(tabela, index=False, if_exists='replace', dtype=sqlDataTypes, con=db)
    db.commit()
finally:
    db.close()

In [8]:
#Check db content

# Read back the total row count and a small sample of the table written to dbfile.
# The connection is closed as soon as both queries have run, even if one of them fails.
tabela = 'bib'
db = sqlite3.connect(dbfile)
try:
    query_bib = pd.read_sql_query(f'select * from {tabela} LIMIT 10', db)
    row_count = pd.read_sql_query(f'select count(*) from {tabela}', db)
finally:
    db.close()

display(row_count)
query_bib.head()


Unnamed: 0,count(*)
0,6666


Unnamed: 0,author,title,keywords,abstract,year,type_publication,doi,issn,isbn,publisher
0,A. Hubaux,A new geological tool-the data,,Today data processing technology offers new an...,1973,article,10.1016/0012-8252(73)90089-5,0012-8252,,
1,K. Trenberth and J. Angell and R. Barry and R....,Working Group 1: Observations,,,1991,incollection,10.1016/B978-0-444-88351-3.50043-X,0167-5117,,Elsevier
2,C.K Yeo and S.C Hui and I.Y Soon and B.S Lee,An adaptive protocol for real-time fax communi...,"Internet faxing, Adaptive fax communication, R...",Internet faxing allows users from different lo...,2001,article,10.1016/S0140-3664(00)00342-X,0140-3664,,
3,G. Holmes and T.C. Smith,7 - Data mining,,,2001,incollection,10.1533/9781855736375.2.137,,978-1-85573-565-1,Woodhead Publishing
4,Joaquin Dopazo and Edward Zanders and Ilaria D...,Methods and approaches in the analysis of gene...,"Immunological research, Data analysis, Human g...",The application of high-density DNA array tech...,2001,article,10.1016/S0022-1759(01)00307-6,0022-1759,,


# Export

## Creating 03_OutputFiles if not exists

In [9]:
# Create the output folder if it is missing; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
os.makedirs('../03_OutputFiles/', exist_ok=True)

## Creating yaml file if not exists

In [4]:
# Single source of truth for the configuration location. The original checked and read
# '../05_Config/...' but wrote the default file to '05_Config/...', so the default was
# never created where it was subsequently read from.
config_path = "../05_Config/configuration.yaml"

# Write a default configuration (all three output formats enabled) when none exists yet.
if not os.path.exists(config_path):
  configuration_dict = {
        'output_extensions': ['csv', 'json', 'yaml']
    }
  # open() does not create missing parent folders
  os.makedirs(os.path.dirname(config_path), exist_ok=True)
  with open(config_path, 'w') as yamlfile:
    yaml.dump(configuration_dict, yamlfile)

# Load the configuration through the project helper read_yaml (imported from functions).
configuration_file = read_yaml(config_path)


print("Output extensions options: ", configuration_file['output_extensions'])

../05_Config/configuration.yaml read successfully
Output extensions options:  ['json', 'csv']


## Reading yaml file and exporting

In [11]:
# Dispatch table: configured extension name -> project writer helper (imported from functions).
writers_by_extension = {
  'csv': write_csv,
  'json': write_json,
  'yaml': write_yaml,
}

# Export df_all once per configured extension; names without a writer are skipped.
for extension_name in configuration_file['output_extensions']:
  writer = writers_by_extension.get(extension_name)
  if writer is not None:
    writer(df_all, 'df_all')

df_all.csv successfully written
df_all.json successfully written
df_all.yaml successfully written


In [12]:
#Optional: equivalent export using independent membership checks (kept for reference)
#if 'csv' in configuration_file['output_extensions']:
#    write_csv(df_all, 'df_all')
#if 'json' in configuration_file['output_extensions']:
#    write_json(df_all, 'df_all')
#if 'yaml' in configuration_file['output_extensions']:
#    write_yaml(df_all, 'df_all')