In [133]:
import numpy as np
import pandas as pd
import json
import os
from tqdm.notebook import tqdm

# read in data from .conferencecorpus:

In [16]:
path = "..\\..\\datasets\\.conferencecorpus\\crossref\\"
json_files = os.listdir(path)[:-1]

In [58]:
json_files[0:5]

['crossref-1.json',
 'crossref-10.json',
 'crossref-11.json',
 'crossref-12.json',
 'crossref-13.json']

# Structure of the files:

file:
&emsp;status: str
&emsp;message-type: str
&emsp;message-version: str
&emsp;message:
&emsp;&emsp;facets: {empty dict}
&emsp;&emsp;next-cursor: str
&emsp;&emsp;total-results: int
&emsp;&emsp;items:
&emsp;&emsp;&emsp;Every entry is part of a list!
&emsp;&emsp;&emsp;&emsp;event
&emsp;&emsp;&emsp;&emsp;&emsp;Here are also different entries depending on the information content of the event
&emsp;&emsp;&emsp;&emsp;title: list
&emsp;&emsp;&emsp;&emsp;DOI: str
&emsp;&emsp;items-per-page: int
&emsp;&emsp;query:
&emsp;&emsp;&emsp;start-index: int
&emsp;&emsp;&emsp;search-terms: None

We are interested in the entries of items!

# Check all docs for their information content:

In [91]:
all_event_info = []
all_title_info = []
all_DOI_info = []

for file in json_files:
    f = open(path+file)
    doc = json.load(f)

    for i,entry in enumerate(doc['message']['items']):
        try:
            all_event_info.extend(list(entry['event'].keys()))
        except KeyError:
            print(f'entry {i} in {file} lead to an Keyerror for \'event\'')
        try:
            all_title_info.append(len(entry['title']))
        except KeyError:
            print(f'entry {i} in {file} lead to an Keyerror for \'title\'')
        try:
            all_DOI_info.append(type(entry['DOI']))
        except KeyError:
            print(f'entry {i} in {file} lead to an Keyerror for \'DOI\'')

    # shorten the length of the lists after each json-file
    all_event_info = list(set(all_event_info))
    all_title_info = list(set(all_title_info))
    all_DOI_info = list(set(all_DOI_info))

    print(f'{file}...done')

crossref-1.json...done
crossref-10.json...done
crossref-11.json...done
crossref-12.json...done
entry 963 in crossref-13.json lead to an Keyerror for 'title'
crossref-13.json...done
crossref-14.json...done
crossref-15.json...done
crossref-16.json...done
crossref-17.json...done
crossref-18.json...done
crossref-19.json...done
entry 436 in crossref-2.json lead to an Keyerror for 'title'
crossref-2.json...done
crossref-20.json...done
crossref-21.json...done
crossref-22.json...done
crossref-23.json...done
crossref-24.json...done
crossref-25.json...done
crossref-26.json...done
crossref-27.json...done
crossref-28.json...done
crossref-29.json...done
crossref-3.json...done
entry 419 in crossref-30.json lead to an Keyerror for 'title'
crossref-30.json...done
crossref-31.json...done
crossref-32.json...done
entry 318 in crossref-33.json lead to an Keyerror for 'title'
crossref-33.json...done
crossref-34.json...done
crossref-35.json...done
crossref-36.json...done
crossref-37.json...done
crossref-38.

In [115]:
# let's check the first keyerror as an example

f = open(path+json_files[4]) # in json-file 13
doc = json.load(f)
print(doc['message']['items'][2]) # at entry 963

# see that there is really no information about the title. However, we still get a name and a DOI for reference which is good

{'event': {'name': '2019 IEEE International Conference on Computational Electromagnetics (ICCEM)', 'start': {'date-parts': [[2019, 3, 20]]}, 'location': 'Shanghai, China', 'end': {'date-parts': [[2019, 3, 22]]}}, 'title': ['2019 IEEE International Conference on Computational Electromagnetics (ICCEM)'], 'DOI': '10.1109/iccem45113.2019'}


In [94]:
# read out all information about the items

print(all_event_info)
print(all_title_info)
print(all_DOI_info)

# event in total has up to 8 entries...
# the title name is packed into a list leading to length of 1 (except for deviating cases from above)...
# DOI is always a string...

['name', 'theme', 'acronym', 'start', 'sponsor', 'end', 'location', 'number']
[1]
[<class 'str'>]


# Let's try to read in the information of the json as a dataframe:

In [176]:
df = pd.DataFrame(columns=['name', 'theme', 'acronym', 'start', 'end', 'sponsor', 'location', 'number', 'title', 'DOI'])
df['sponsor'] = df['sponsor'].astype('object')
index = 0

for file in tqdm(json_files):
    f = open(path+file)
    doc = json.load(f)

    for entry in doc['message']['items']:
        for key in entry.keys():
            if key == 'event':
                for event_key in list(entry['event'].keys()):
                    if event_key in ['start', 'end']:
                        df.loc[index, event_key] = entry['event'][event_key].values()
                    elif event_key == 'sponsor':
                        df.loc[index, 'sponsor'] = ''
                        df.at[index, 'sponsor'] = entry['event']['sponsor']
                    else:
                        df.loc[index, event_key] = entry['event'][event_key]
            elif key == 'DOI':
                df.loc[index, key] = entry['DOI']
            elif key == 'title':
                df.loc[index, key] = entry['title'][0]
        index += 1

# I would have to check if I am performing the setting for 'start', 'end' and 'sponsors correctly...'

  0%|          | 0/67 [00:00<?, ?it/s]

In [177]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66016 entries, 0 to 66015
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      66016 non-null  object
 1   theme     1705 non-null   object
 2   acronym   22365 non-null  object
 3   start     40145 non-null  object
 4   end       39908 non-null  object
 5   sponsor   11222 non-null  object
 6   location  53655 non-null  object
 7   number    5984 non-null   object
 8   title     66010 non-null  object
 9   DOI       66016 non-null  object
dtypes: object(10)
memory usage: 5.0+ MB
