# Zenodo metadata analysis

#### Initialization

Import required libraries use for data analysis as well as plotting graphs. Define some few useful functions.

In [2]:
%matplotlib inline

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn3

# Make the graphs a bit prettier, and bigger
plt.style.use('ggplot')

pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)

plt.rcParams['figure.figsize'] = (15, 5)

# # Import data
# df = pd.read_csv('citations.csv', dtype={
#     'source_domain': str,
#     'target_domain': str,
#     'target_publisher': str,
# })

def iter_file(path):
    with open(path, 'r') as fp:
        for line in fp:
            yield json.loads(line)
            
def open_csv(filename, header):
    fp = open(filename, 'w')
    writer = csv.writer(fp)
    writer.writerow(header)
    return fp, writer
    

# Process metadata dump 

The following code will extract specific properties (e.g. authors, record title, keywords, and subjects) from each metadata record and write several CSV files with each of these properties.

In [3]:
import json
import csv

# Name of the file to import
filename = 'records-2019-02-01.json'

# Open the CSV files, and write the first header row
authors_fp, authors = open_csv('authors.csv', ['id', 'name', 'affiliation', 'orcid', 'gnd'])
records_fp, records = open_csv('records.csv', ['id', 'title', 'publication_date'])
keywords_fp, keywords = open_csv('keywords.csv', ['id', 'keyword'])
subjects_fp, subjects = open_csv('subjects.csv', ['id', 'scheme', 'uri', 'term'])

# Iterate over the records metadata JSON dump
for r in iter_file(filename):
    m = r['metadata']
    # Creators (number of fields must match header row)
    for creator in m['creators']:
        authors.writerow([
            r['id'], 
            creator.get('name', ''), 
            creator.get('affiliation', ''),
            creator.get('orcid', ''),
            creator.get('gnd', ''),
        ])
    # Keywords
    for kw in m.get('keywords', []):
        keywords.writerow([
            r['id'], 
            kw, 
        ])
    # Subjects
    for subj in m.get('subjects', []):
        subjects.writerow([
            r['id'], 
            subj.get('scheme', ''),
            subj.get('identifier', ''),
            subj.get('term', ''),
        ])
    # Records
    records.writerow([
        r['id'], 
        m['title'], 
        m['publication_date'],
    ])
        
# Close the CSV files 
authors_fp.close()
records_fp.close()
keywords_fp.close()
subjects_fp.close()

# Import processed data

Read one of the CSV files into a Pandas data frame:

In [5]:
# Import data
df = pd.read_csv('keywords.csv')

Do something useful with the data frame:

In [6]:
for k in df['keyword']:
    if ';' in k:
        print(k)

Biodiversity; marine caves; taxonomy; checklist; diagnostic keys; Dendroceratida; Dictyoceratida; Halisarcida
Biodiversity informatics; network; data; indicators
Biodiversity; marine caves; taxonomy; checklist; diagnostic keys; Dendroceratida; Dictyoceratida; Halisarcida
Biodiversity; marine caves; taxonomy; checklist; diagnostic keys; Dendroceratida; Dictyoceratida; Halisarcida
Azores; terrestrial arthropods; BALA project; laurissilva forest; Linnean
Biodiversity informatics; network; data; indicators
Biting midges; new species; Scandinavia; morphology; molecular barcoding
asterids; Caribbean; herbarium specimens; morphology; taxonomy
Malagasy bioregion; taxonomy; ants; Amblyoponinae; Madagascar; Seychelles
Biting midges; new species; Scandinavia; morphology; molecular barcoding
Malagasy bioregion; taxonomy; ants; Amblyoponinae; Madagascar; Seychelles
Bateman; fish
Vihāragala; Sabha; Saba; Gajabāhu Gāmaṇī Abhaya; Uppaladoṇika
ADHD;metylphenidate;non-randomised studies
ADHD;metylphenid

TypeError: argument of type 'float' is not iterable