In [41]:
#!conda install -n pysparkpi plenv ipykernel --update-deps --force-reinstall
#!pip install bibtexparser
#!pip install pyyaml
#!pip install pyspark findspark
#!pip install PyArrow

# Import

In [42]:
import bibtexparser
#import pandas as pd
import pandas as pd
import pyspark.pandas as ps
import numpy as np
import os
import glob
import sqlite3

import yaml

import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)

from functions import read_bib
from functions import load_bib
from functions import write_yaml
from functions import write_json
from functions import write_csv
from functions import read_yaml

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 300)

# Creating SparkSession and sparkcontext

In [43]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder\
                    .master("local")\
                    .appName('Firstprogram')\
                    .getOrCreate()
sc=spark.sparkContext

# Reading, loading and concatenating all bibtex files

In [44]:
#pd
list_folders = []
for folder in glob.glob(f'../01_Datasets/bibtex/*'):
  print(folder)
  list_folders.append(folder)

list_df = []
for f in list_folders:
  print('\n',f)
  df_temp = load_bib(f) #def load_bib
  list_df.append(df_temp)
df_all_raw = pd.concat(list_df)
df_all_raw.to_csv('../01_Datasets/output/df_all_raw.csv', index = False)
print('\nShape df_all_raw: ',df_all_raw.shape)

../01_Datasets/bibtex\acm
../01_Datasets/bibtex\ieee
../01_Datasets/bibtex\sciencedirect

 ../01_Datasets/bibtex\acm
1 de 15: ../01_Datasets/bibtex\acm\acm (1).bib
2 de 15: ../01_Datasets/bibtex\acm\acm (10).bib
3 de 15: ../01_Datasets/bibtex\acm\acm (11).bib
4 de 15: ../01_Datasets/bibtex\acm\acm (12).bib
5 de 15: ../01_Datasets/bibtex\acm\acm (13).bib
6 de 15: ../01_Datasets/bibtex\acm\acm (14).bib
7 de 15: ../01_Datasets/bibtex\acm\acm (2).bib
8 de 15: ../01_Datasets/bibtex\acm\acm (3).bib
9 de 15: ../01_Datasets/bibtex\acm\acm (4).bib
10 de 15: ../01_Datasets/bibtex\acm\acm (5).bib
11 de 15: ../01_Datasets/bibtex\acm\acm (6).bib
12 de 15: ../01_Datasets/bibtex\acm\acm (7).bib
13 de 15: ../01_Datasets/bibtex\acm\acm (8).bib
14 de 15: ../01_Datasets/bibtex\acm\acm (9).bib
15 de 15: ../01_Datasets/bibtex\acm\acm.bib
Shape df_bibtex\acm:  (1451, 27)

 ../01_Datasets/bibtex\ieee
1 de 5: ../01_Datasets/bibtex\ieee\ieee01.bib
2 de 5: ../01_Datasets/bibtex\ieee\ieee02.bib
3 de 5: ../01_Dat

# Features renaming / selection

In [109]:
#pyspark.pandas (ps)
df_all_raw = ps.read_csv('../01_Datasets/output/df_all_raw.csv')
df_all = df_all_raw.copy()
df_all.rename(columns={'ENTRYTYPE': 'type_publication'}, inplace = True)
cols_to_keep = ['author', 'title', 'keywords', 'abstract', 'year', 'type_publication', 'doi', 'issn', 'isbn','publisher']
df_all = df_all[cols_to_keep]
df_all.head()
print(type(df_all))
print(df_all.dtypes)

<class 'pyspark.pandas.frame.DataFrame'>
author              object
title               object
keywords            object
abstract            object
year                object
type_publication    object
doi                 object
issn                object
isbn                object
publisher           object
dtype: object


# Dtypes

In [110]:
#Converting from ps to pd
df_all = df_all.to_pandas()
print(type(df_all))

#Adjusting "year" feature dtype
df_all['year'] = pd.to_numeric(df_all['year'], errors = 'coerce')
df_all['year'] = df_all['year'].fillna(0).astype('int64', errors = 'raise')
df_all['year'] = df_all['year'].map(lambda x: np.nan if x > 2022 else np.nan if x < 1900 else x)

# #Adjusting "doi" feature
df_all['doi'] = df_all['doi'].apply(lambda x: np.nan if x != x else x.split('doi.org/')[-1] if x is not None else x)

#Sorting values by "year"
df_all = df_all.sort_values('title')

#Converting from pd to ps
df_all = ps.from_pandas(df_all)
print(type(df_all))
print(df_all.dtypes)



<class 'pandas.core.frame.DataFrame'>
<class 'pyspark.pandas.frame.DataFrame'>
author               object
title                object
keywords             object
abstract             object
year                float64
type_publication     object
doi                  object
issn                 object
isbn                 object
publisher            object
dtype: object


# NaN analysis

In [111]:
#ps
#Dropna
print('Shape before dropna: ', df_all.shape)
df_all.dropna(subset = ['doi'], axis = 0, inplace = True)
print('Shape after dropna: ', df_all.shape)


Shape before dropna:  (8904, 10)
Shape after dropna:  (6769, 10)


# Exporting to SQlite database

In [114]:
dbfile = '../03_OutputFiles/doi.db'
tabela = 'bib'
db = sqlite3.connect(dbfile)
sqlDataTypes={}
for c in df_all.columns:
    if df_all[c].dtype.kind == 'i':  
        sqlDataTypes[c]='INTEGER'
    elif df_all[c].dtype.kind == 'f':
        sqlDataTypes[c]='REAL'
    else:
        sqlDataTypes[c]='TEXT'

#Converting from ps to pd
df_all = df_all.to_pandas()
print(type(df_all))

df_all.to_sql(tabela, index=False, if_exists='replace', dtype=sqlDataTypes, con=db)   
db.commit()
db.close()     



<class 'pandas.core.frame.DataFrame'>


In [116]:
#pd
#Check db content
db = sqlite3.connect(dbfile)

tabela = 'bib'
#imoveisCaros = pd.read_sql_query(f'select * from "{tabela}" where preco>1000000', db)
query_bib = pd.read_sql_query(f'select * from {tabela} LIMIT 10', db)
display(pd.read_sql_query(f'select count(*) from {tabela}', db))
query_bib.head()


Unnamed: 0,count(*)
0,6769


Unnamed: 0,author,title,keywords,abstract,year,type_publication,doi,issn,isbn,publisher
0,vendors,"""""intelligent"""" curriculum). In spite of the ...","data integration, collaboration, learning anal...","""Learning analytics are rapidly being implemen...",,and practitioners,in contrast,https://doi.org/10.1145/2330601.2330605,predictive models,sentiments
1,607; 95% CI,$30,971 cases and 3,respectively. Sepsis conferred the greatest e...,,$28,$38,$28,508),597 to $43
2,"Bhargav, Samarth and Sidiropoulos, Georgios an...",'It's on the Tip of My Tongue': A New Dataset...,"known item retrieval, tip of the tongue known ...",The tip of the tongue known-item retrieval (TO...,2022.0,inproceedings,10.1145/3488560.3498421,,9781450391320,Association for Computing Machinery
3,"Feger, Sebastian S. and Wozniak, Pawe\l{} W. a...","'Yes, I Comply!': Motivations and Practices a...","reuse, research data management, human data in...",As science becomes increasingly data-intensive...,2020.0,article,10.1145/3415212,,,Association for Computing Machinery
4,"Feger, Sebastian S. and Wozniak, Pawe\l{} W. a...","'Yes, I Comply!': Motivations and Practices a...","reuse, research data management, human data in...",As science becomes increasingly data-intensive...,2020.0,article,10.1145/3415212,,,Association for Computing Machinery


# Export

## Creating 03_OutputFiles if not exists

In [9]:
if not os.path.exists('../03_OutputFiles/'):
  os.makedirs('../03_OutputFiles/')

## Creating yaml file if not exists

In [4]:
if not os.path.exists("../05_Config/configuration.yaml"):
  configuration_dict = {
        'output_extensions': ['csv', 'json', 'yaml']
    }
  with open('05_Config/configuration.yaml', 'w') as yamlfile:
    data = yaml.dump(configuration_dict, yamlfile)

configuration_file = read_yaml("../05_Config/configuration.yaml")


print("Output extensions options: ", configuration_file['output_extensions'])

../05_Config/configuration.yaml read successfully
Output extensions options:  ['json', 'csv']


## Reading yaml file and exporting

In [11]:
for extension_name in configuration_file['output_extensions']:
  if extension_name == 'csv':
    write_csv(df_all, 'df_all')
  elif extension_name == 'json':
    write_json(df_all, 'df_all')
  elif extension_name == 'yaml':
    write_yaml(df_all, 'df_all')

df_all.csv successfully written
df_all.json successfully written
df_all.yaml successfully written
