## Zenodo Metadata Extraction

In [177]:
import requests

ACCESS_TOKEN = '5lRvVDSnCTTXgdFWLuCN7HLAK2UWKjUbJwPCEiWJxirzVT3VfLsAeHnhflmt'
search_query = 'creators.affiliation:(+university +alabama +birmingham)'

response = requests.get('https://zenodo.org/api/records/',
                        params={'q': search_query,
                                'access_token': ACCESS_TOKEN,
                                'size': 200, # should be 128, add headroom
                                'type' : 'dataset'
                                })

records = response.json()


Generate the necessary dataframe

In [178]:
import pandas as pd

### MAKE THE DATAFRAME
df = pd.json_normalize(records['hits']['hits'][0])
for i in range(1, len(records['hits']['hits'])):
    df_row = pd.json_normalize(records['hits']['hits'][i])
    # df_row = df_row.drop(columns = ['stats'])

    df = pd.concat([df, df_row])

### DROP UNWANTED COLUMNS

unwanted_cols = ['conceptrecid', 'recid', 'revision', 'files', 'owners', 'status', 'state', 'submitted', 'metadata.title', 'metadata.resource_type.title', 
                 'metadata.resource_type.type', 'metadata.communities', 'metadata.relations.version', 'links.self', 'links.doi', 'links.self_doi', 'links.self_doi_html', 
                 'links.parent', 'links.self_iiif_manifest', 'links.self_iiif_sequence', 'links.files', 'links.media_files', 'links.archive', 'links.archive_media', 
                 'links.latest', 'links.latest_html', 'links.versions', 'links.draft', 'links.reserve_doi', 'links.access_links', 'links.access_grants', 'links.access_users',
                 'links.access_request', 'links.access', 'links.communities', 'links.communities-suggestions', 'links.requests', 'stats.downloads', 'stats.unique_downloads',
                 'stats.views', 'stats.unique_views', 'stats.version_downloads', 'stats.version_unique_downloads', 'stats.version_unique_views', 'stats.version_views'
                ]

df = df.drop(columns=unwanted_cols)

df_input = df
# df_input = expand_creators(df)
# df_input = expand_columns(df, col_name = 'metadata.creators', fields = ['name', 'affiliation', 'orcid'])
#df_input = expand_columns(df_input, col_name = 'metadata.related_identifiers', fields = ['identifier', 'relation'])
#df_input = expand_columns(df_input, col_name = 'metadata.grants', fields = ['code', 'title', 'funder'])

df_input.to_csv('zenodo_expanded_raw.csv', index=False, encoding='utf-8-sig')

In [223]:
# Helper functions 
import csv

def csv_to_dict(file_path):
    result_dict = {}
    with open(file_path, mode='r', newline='', encoding='utf-8') as csvfile:
        csvreader = csv.reader(csvfile)
        for row in csvreader:
            key = row[0]  # First column as key
            value = row[1]  # Second column as value
            result_dict[key] = value
    return result_dict

def expand_columns(df1, col_name, fields, df2):
    df1[col_name] = df1[col_name].apply(lambda x: x if isinstance(x, list) else [])

    # Find the maximum number of dictionaries in any row
    max_dicts = df1[col_name].apply(len).max()

    # Iterate over the possible dictionary positions (1 to max_dicts)
    for i in range(max_dicts):
        for field in fields:
            # Create new column names dynamically
            new_col = f'{field}_{i+1}'
            
            # Extract the i-th dictionary's field data if it exists, otherwise set None
            df2[new_col] = df1[col_name].apply(lambda x: x[i].get(field) if i < len(x) else None)
    return df2

def add_col(df1, col1name, df2, col2name):
    # takes col1 out of df1 and adds it onto df2 with new name 
    extracted_col = df1[col1name]
    df2 = pd.concat([df2, extracted_col.rename(col2name)], axis=1)
    return df2

def list_to_string(lst):
    if isinstance(lst, list):  # Check if the value is a list
        return ', '.join(lst)
    else:
        return ""  # Convert non-list values to string or handle as needed

def url_to_html(url, link_text=None):
    if not url:
        return ''  # Return empty string if no URL is provided
    link_text = link_text or url  # Use the URL as the link text if no text is provided
    return f'<a href="{url}">{link_text}</a>'

license_dict = {"mit-license" : ["https://opensource.org/license/mit", "<p>This data is available under the MIT License</p>"],
                "cc-zero" : ["https://creativecommons.org/public-domain/cc0/", "<p>This data is public domain under the CC-0.0 License</p>"],
                "cc-by-4.0" : ["http://creativecommons.org/licenses/by/4.0/", "<p>This data is available under the CC-BY 4.0 License</p>"],
                "cc-by-2.0" : ["http://creativecommons.org/licenses/by/2.0/", "<p>This data is available under the CC-BY 2.0 License</p>"],
                "cc-by" : ["https://creativecommons.org/licenses/by/1.0/", "<p>This data is available under the CC-BY License</p>"]
}

def add_license(df1, df2):
    licenses = []
    access = []
    for license in df1['metadata.license.id']:
        if pd.notnull(license):
            licenses.append(license_dict[license][0])
            access.append(license_dict[license][1])
        else:
            licenses.append('')
            access.append('<p>Access to this data is restricted.</p>')
    df2['distribution_license'] = licenses
    df2['access_link'] = access
    return df2

def separate_name(name):
    # Initialize middle as an empty string
    middle = ""
    
    # Check if the name contains a comma (indicating "last, first" or "last, first m." format)
    if ',' in name:
        parts = name.split(", ")
        last = parts[0]
        
        # Check if there's a second element after the comma
        if len(parts) > 1:
            first_and_middle = parts[1].split()
            first = first_and_middle[0]
            
            # Assign middle if available
            if len(first_and_middle) > 1:
                middle = first_and_middle[1]
        else:
            # Set first to an empty string if there's nothing after the comma
            first = ""
    else:
        # Assume "first last" or "first m. last" format
        parts = name.split()
        
        first = parts[0]
        last = parts[-1]
        
        # Check if there is a middle name/initial
        if len(parts) > 2:
            middle = parts[1]

    return first, middle, last
    
def reformat_name(name):
    # Check if the name contains a comma
    if ', ' in name:
        # Split the string by comma and strip any extra whitespace
        last, first = name.split(", ")
        # Return the string in "first last" format
        return f"{first} {last}"
    else:
        # Return the name unchanged if there's no comma
        return name
    
def add_orcid(df1, df2):
    orcid_pairs = []
    for index, row in df1.iterrows():
        pairs = []
        for author in row['metadata.creators']:
            if 'orcid' in author:  # Check if ORCID is present
                first_last_name = reformat_name(author['name'])
                pairs.append('<p>' + first_last_name + '<a href="https://orcid.org/' + author['orcid'] + '">' +author['orcid']+ '</a></p>')
        orcid_pairs.append("".join(pairs))  # Join multiple name-ORCID pairs with a comma
    # Add this list as a new column in df2
    df2['orcid'] = orcid_pairs
    return df2

def to_html(string):
    if string[0] != "<":
        string = "<p>" + string + "</p>"
    return(string)

def add_funders(df1, df2):
    funders = []
    count = 0
    for index, row in df1.iterrows():
        funder = []
        # Iterate through the grants if they are present
        if isinstance(row['metadata.grants'], list):
            for grant in row['metadata.grants']:
                if pd.notnull(grant):  # Check if grant is not null
                    funder.append('<p>Funder: ' + grant['funder']['name'])
                    if 'doi' in grant['funder']:
                        funder.append('<br>Funder DOI: <a href="https://doi.org/' + grant['funder']['doi'] + '">' +grant['funder']['doi']+ '</a>')
                    if 'title' in grant:
                        funder.append('<br>' + grant['title'])
                    if 'code' in grant:
                        funder.append('<br>' + grant['code'])
                    funder.append('</p>')
        else:
            # If it's not a list put an empty cell
            if pd.notnull(row['metadata.grants']):
                funder.append('')
        # Append the joined funder information to the list
        funders.append("".join(funder))
    # Add the new 'fundref' column to df2
    df2['fundref'] = funders
    return df2

def add_related_items(df1, df2):
    items = []
    count = 0
    for index, row in df1.iterrows():
        item = []
        # Iterate through the grants if they are present
        if isinstance(row['metadata.related_identifiers'], list):
            for id in row['metadata.related_identifiers']:
                if pd.notnull(id):  
                    item.append('<p>' + id['identifier'])

                    if id['relation']:
                        item.append('<br>' + id['relation'])
                    item.append('</p>')
        else:
            # If it's not a list put an empty cell
            if pd.notnull(row['metadata.related_identifiers']):
                item.append('')
        # Append the joined funder information to the list
        items.append("".join(item))
    # Add the new 'fundref' column to df2
    df2['related_data'] = items
    return df2

def add_creators(df1, df2):
    # Find the maximum number of authors across all rows in 'metadata.creators'
    max_authors = df1['metadata.creators'].apply(len).max()

    # Create a list to hold all rows of data for the new DataFrame
    expanded_data = []
    
    # Process each row in df1
    for _, row in df1.iterrows():
        row_data = {}
        creators = row['metadata.creators']
        
        # Populate the row_data dictionary with each author's name and affiliation
        for i, creator in enumerate(creators):
            author_index = i + 1
            name = creator.get('name', "")
            institution = creator.get('affiliation', "")  # Use 'institution' instead of 'affiliation'
            
            # Use the separate_name function to split names
            first_name, middle_name, last_name = separate_name(name)
            
            # Assign names and institution to the row_data dictionary
            row_data[f'author{author_index}_fname'] = first_name
            row_data[f'author{author_index}_mname'] = middle_name
            row_data[f'author{author_index}_lname'] = last_name
            row_data[f'author{author_index}_institution'] = institution  # Change 'affl' to 'institution'
        
        # Append row_data dictionary to expanded_data list
        expanded_data.append(row_data)

    # Convert expanded_data list of dictionaries into a new DataFrame
    df3 = pd.DataFrame(expanded_data)
    
    # Fill missing values with empty strings for any columns where data is missing
    df3 = df3.fillna("")
    
    df2_reset = df2.reset_index(drop=True)
    df3_reset = df3.reset_index(drop=True)
    
    # Concatenate the two DataFrames along the columns
    df_out = pd.concat([df2_reset, df3_reset], axis=1)
    
    return df_out




Build output dataframe 

In [225]:
### BUILD OUTPUT DATAFRAME

# title
df_output = df[['title']]

# orcid
df_output = add_orcid(df1=df_input, df2=df_output)

# publication_date
df_output = add_col(df_input, "metadata.publication_date", df_output, "publication_date")

# abstract
df_output = add_col(df_input, "metadata.description", df_output, "abstract")
df_output['abstract'] = df_output['abstract'].apply(to_html)

# keywords
df_output = add_col(df_input, "metadata.keywords", df_output, "keywords")
df_output["keywords"] = df_output["keywords"].apply(list_to_string)

# disciplines
df_output["disciplines"] = "" #make blank column, we will need to fill in the values

#source_publication
df_output["source_publication"] = "" #make blank column, we will need to fill in the values


# related_data
df_output = add_related_items(df1=df_input, df2=df_output)



# source_fulltext_url
df_output = add_col(df_input, "doi_url", df_output, "source_fulltext_url")

# external_rep
df_output['external_rep'] = "<p>Zenodo</p>" 

# distribution_license and access_link
df_output = add_license(df1=df_input, df2=df_output)

# funder_info
df_output = add_funders(df1=df_input, df2=df_output)

# author info
df_output = add_creators(df1=df_input, df2=df_output)



display(df_output)

df_output.to_excel('zenodo_batch_upload.xlsx', index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['orcid'] = orcid_pairs


Unnamed: 0,title,orcid,publication_date,abstract,keywords,disciplines,source_publication,related_data,source_fulltext_url,external_rep,...,author47_lname,author47_institution,author48_fname,author48_mname,author48_lname,author48_institution,author49_fname,author49_mname,author49_lname,author49_institution
0,Alzheimer's disease risk gene BIN1 induces Tau...,"<p>Yuliya Voskobiynyk<a href=""https://orcid.or...",2020-08-19,<p>Genome-wide association studies identified ...,,,,<p>10.7554/eLife.57354</p>,https://doi.org/10.5061/dryad.rbnzs7h8z,<p>Zenodo</p>,...,,,,,,,,,,
1,Data from: The effect of Speed of Processing t...,,2015-08-04,<p>Older adults experience cognitive deficits ...,"peripheral, Useful Field of View, cognitive in...",,,<p>10.1371/journal.pone.0107808</p>,https://doi.org/10.5061/dryad.4fn70,<p>Zenodo</p>,...,,,,,,,,,,
2,Data for Cell-type-specific alternative splici...,"<p>Emma F. Jones<a href=""https://orcid.org/000...",2024-06-25,<p><span><strong>data.tar.gz </strong>contains...,,,,<p>10.5281/zenodo.12548384</p><p>10.5281/zenod...,https://doi.org/10.5281/zenodo.12535061,<p>Zenodo</p>,...,,,,,,,,,,
3,Data for Long-read RNA sequencing identifies r...,"<p>Emma F. Jones<a href=""https://orcid.org/000...",2023-12-14,<p><span>data_minus_bam.tar.gz contains all fi...,"long-read RNA sequencing, brain, sex, alternat...",,,<p>https://github.com/lasseignelab/230227_EJ_M...,https://doi.org/10.5281/zenodo.10381745,<p>Zenodo</p>,...,,,,,,,,,,
4,Data for Altered Glia-Neuron Communication in ...,"<p>Tabea Soelter<a href=""https://orcid.org/000...",2023-11-28,<p><strong>data.tar.gz contains all files from...,"Alzheimer's disease, neurodegeneration, cell-c...",,,<p>10.5281/zenodo.10211623</p>,https://doi.org/10.5281/zenodo.10214497,<p>Zenodo</p>,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,"Dataset for ""What is Gab? A Bastion of Free Sp...",,2018-09-13,<p>This dataset was used for this project: &qu...,,,,,https://doi.org/10.5281/zenodo.3460400,<p>Zenodo</p>,...,,,,,,,,,,
125,"Public Dataset for ""Large Scale Crowdsourcing ...",,2020-02-21,<p>Dataset for the &quot;Large Scale Crowdsour...,,,,,https://doi.org/10.5281/zenodo.3678559,<p>Zenodo</p>,...,,,,,,,,,,
126,Transposon DNA sequences facilitate the tissue...,,2023-05-24,<p>The uploaded files are in the&nbsp;fasta fo...,"Horizontal gene transfer, circulating tumor DN...",,,,https://doi.org/10.5281/zenodo.7958520,<p>Zenodo</p>,...,,,,,,,,,,
127,Functional connectivity in the face of congeni...,"<p>Pinar Demirayak<a href=""https://orcid.org/0...",2019-09-06,<p>Results of diffusion tensor imaging analysi...,,,,,https://doi.org/10.5281/zenodo.3401600,<p>Zenodo</p>,...,,,,,,,,,,


In [123]:
### SAVE TO .CSV

# df.to_csv('zenodo_raw.csv', index=False)
# print(df_input['title'])

# df_input.to_csv('zenodo_expanded_raw.csv', index=False, encoding='utf-8-sig')

df_output.to_excel('zenodo_batch_upload.xlsx', index=False)

In [124]:
type(records)
records.keys()
print(len(records['hits']['hits']))
records['hits']['hits'][0]['metadata'].keys()

print(records['hits']['hits'][0]['metadata'].keys())
print(records['hits']['hits'][1]['metadata'].keys())
print(records['hits']['hits'][2]['metadata'].keys())

129
dict_keys(['title', 'doi', 'publication_date', 'description', 'access_right', 'creators', 'related_identifiers', 'resource_type', 'license', 'communities', 'relations', 'notes'])
dict_keys(['title', 'doi', 'publication_date', 'description', 'access_right', 'creators', 'contributors', 'related_identifiers', 'custom', 'resource_type', 'license', 'grants', 'relations'])
dict_keys(['title', 'doi', 'publication_date', 'description', 'access_right', 'creators', 'keywords', 'related_identifiers', 'locations', 'resource_type', 'license', 'communities', 'relations', 'notes'])


In [125]:
print(records['hits']['hits'][1]['metadata']['grants'][0]['title'])

UAB Pilot Center for Precision Animal Modeling (C-PAM)


In [126]:
for i in range(len(records['hits']['hits'])):
    if 'grants' in records['hits']['hits'][i]['metadata']:
        print( i, records['hits']['hits'][i]['metadata'].keys())


1 dict_keys(['title', 'doi', 'publication_date', 'description', 'access_right', 'creators', 'contributors', 'related_identifiers', 'custom', 'resource_type', 'license', 'grants', 'relations'])
3 dict_keys(['title', 'doi', 'publication_date', 'description', 'access_right', 'creators', 'contributors', 'keywords', 'related_identifiers', 'resource_type', 'license', 'grants', 'relations'])
8 dict_keys(['title', 'doi', 'publication_date', 'description', 'access_right', 'creators', 'dates', 'language', 'custom', 'resource_type', 'license', 'grants', 'relations'])
13 dict_keys(['title', 'doi', 'publication_date', 'description', 'access_right', 'creators', 'keywords', 'language', 'resource_type', 'license', 'grants', 'relations', 'notes'])
44 dict_keys(['title', 'doi', 'publication_date', 'description', 'access_right', 'creators', 'contributors', 'dates', 'language', 'resource_type', 'journal', 'alternate_identifiers', 'license', 'grants', 'relations'])
60 dict_keys(['title', 'doi', 'publicatio

In [127]:
results = []
record_ids = []

for i in range(len(records['hits']['hits'])):
    c = []
    contributors = ''
    
    record_id = records['hits']['hits'][i]['id']
    record_ids.append(record_id)
    if 'resource_type' in records['hits']['hits'][i]['metadata']:
        resource_type = records['hits']['hits'][i]['metadata']['resource_type']['title']
    else:
        resource_type = "none"
    if 'title' in records['hits']['hits'][i]['metadata']:
        title = records['hits']['hits'][i]['metadata']['title']
    else:
        title = "none"
    if 'contributors' in records['hits']['hits'][i]['metadata']:
        for j in range(len(records['hits']['hits'][i]['metadata']['contributors'])):
            c.append(records['hits']['hits'][i]['metadata']['contributors'][j]['type']) 
        
        contributor_counts = Counter(c)
        
        count = 0
        for contrib in contributor_counts.keys():
            if (count > 0):
                contributors += ','
            contributors += contrib + "=" + str(contributor_counts[contrib])
            count += 1
    else:    
        contributors = 'none'
    
    results.append([record_id,title,resource_type,contributors])

print(record_ids)

NameError: name 'Counter' is not defined