In [8]:
import bibtexparser
import pandas as pd
import json
import typing
import requests

# initial data import and processing
# source_path needs to be .bib
# store_path needs to be .csv
def make_dataset(source_path, store_path):
    """Build the feature dataset from a bibtex export and store it as CSV.

    The records are loaded from ``source_path``, reduced to labelled records
    published in the basket-of-eight journals (without complementary material
    and editorials), stripped of the columns that are not used as features,
    and extended with a ``references`` column holding the DOIs each record
    cites according to the OpenCitations COCI API. The ``doi`` column itself
    is removed before the result is written to ``store_path``.

    Parameters
    ----------
    source_path : str or path-like
        Location of the ``.bib`` file to read (decoded as UTF-8).
    store_path : str or path-like
        Location of the ``.csv`` file to write (written without the index).

    Returns
    -------
    None
        The result is only written to ``store_path``.

    Raises
    ------
    KeyError
        If no record provides a ``literature_review``, ``journal``, ``title``
        or ``doi`` field (the corresponding column does not exist then).
    ValueError
        If a label is neither ``"yes"``/``"no"`` nor convertible to ``int``.
    """
    # import data as pandas dataframe; decode explicitly as UTF-8 instead of
    # relying on the platform default encoding
    with open(source_path, encoding="utf-8") as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)
    records_df = pd.DataFrame(bib_database.entries)

    # drop rows without target label
    labelled_df = records_df.dropna(subset=["literature_review"])

    # drop rows not published in basket of eight
    top_8 = ["European Journal of Information Systems", "Information Systems Journal", "Information Systems Research", "Journal of AIS", "Journal of Information Technology", "Journal of MIS", "Journal of Strategic Information Systems", "MIS Quarterly"]
    df = labelled_df.loc[labelled_df["journal"].isin(top_8)]

    # drop specific rows with insufficient information
    # (the exclusion column only exists if at least one record carries that field)
    if "prescreen_exclusion" in df.columns:
        df = df[df["prescreen_exclusion"] != "complementary material"]
    df = df[df["title"] != "Editorial"]

    # drop columns not needed for the features; errors="ignore" skips the
    # columns that do not occur in this particular export
    unused_columns = ['colrev.dblp.dblp_key', 'colrev_pdf_id', 'colrev_data_provenance', 'colrev_masterdata_provenance', 'colrev_status', 'colrev_origin', 'colrev.semantic_scholar.id',
                      'pdf_processed', 'prescreen_exclusion', 'note', 'fulltext', 'link', 'file', 'cited_by', 'screening_criteria', 'ID', 'crossmark-restriction', 'man_prep_hints',
                      'keywords', 'language', 'url', 'pages', 'number', 'volume', 'year', 'journal', 'author', 'ENTRYTYPE']
    # reset indexes as well; this also yields a fresh frame that can be
    # modified below without writing into a slice of records_df
    df = df.drop(columns=unused_columns, errors="ignore").reset_index(drop=True)

    # convert "yes" and "no" labels to 0/1 and store the result as int column
    # (any other value is handed to the int conversion unchanged)
    label_encoding = {"yes": 1, "no": 0}
    df["literature_review"] = (
        df["literature_review"]
        .map(lambda label: label_encoding.get(label, label))
        .astype(int)
    )

    # add references column by accessing the opencitations api;
    # records without DOI or without retrievable references get None
    api_url = "https://opencitations.net/index/coci/api/v1/references/"
    references_column = [
        None if pd.isna(doi) else (_fetch_references(doi, api_url) or None)
        for doi in df["doi"]
    ]
    df.insert(loc=len(df.columns), column="references", value=references_column)

    # drop doi column
    df = df.drop(columns=["doi"])

    # save resulting dataframe to csv file
    df.to_csv(store_path, index=False)


def _fetch_references(doi, api_url, timeout=300):
    """Return the DOIs cited by ``doi`` as reported by the OpenCitations API.

    Retrieval is best effort: transport errors, non-JSON responses and
    unexpected payloads are reported on stdout and yield an empty list, so a
    single failing DOI does not abort the whole dataset build.
    """
    # headers = {"authorization": "YOUR-OPENCITATIONS-ACCESS-TOKEN"}
    headers: typing.Dict[str, str] = {}
    try:
        ret = requests.get(f"{api_url}{doi}", headers=headers, timeout=timeout)
        items = json.loads(ret.text)
    except (requests.RequestException, json.decoder.JSONDecodeError):
        print(f"Error retrieving citations from OpenCitations for DOI: {doi}")
        return []

    # a successful answer is a JSON list of citation objects; anything else
    # (e.g. an error object) carries no references
    if not isinstance(items, list):
        print(f"Error retrieving citations from OpenCitations for DOI: {doi}")
        return []
    return [item["cited"] for item in items if isinstance(item, dict) and "cited" in item]

In [9]:
# build both interim datasets: the original records first, then the extended ones
dataset_paths = (
    ('../../data/external/records_original.bib', '../../data/interim/data_original.csv'),
    ('../../data/external/records_extended.bib', '../../data/interim/data_extended.csv'),
)
for bib_path, csv_path in dataset_paths:
    make_dataset(bib_path, csv_path)