In [1]:

from google.colab import drive
import sys
# Mount Google Drive at /content/drive so files stored there can be opened by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
import os
import json
import re
import pandas as pd

In [4]:
# Path of the JSON file to explore, located under the mounted Drive folder.
structure_file = '/content/drive/My Drive/AbelBioToken-main/Proteomics/proteomexchange_datasets.json'

# Report whether the path refers to an existing regular file.
print(
    "File exists at the specified path."
    if os.path.isfile(structure_file)
    else "File does not exist at the specified path."
)

File exists at the specified path.


In [5]:
# Open and load the JSON file.
# The encoding is given explicitly so decoding does not depend on the
# platform's default text encoding (JSON is interchanged as UTF-8).
with open(structure_file, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [6]:
# Show which Python type the parsed JSON document was loaded into.
print(f"Type of data: {type(data)}")


Type of data: <class 'dict'>


In [7]:
# Only a JSON object (dict) has keys to list; any other type is skipped silently.
if isinstance(data, dict):
    print("Top-level keys in the JSON data:")
    # Iterating a dict yields its keys in insertion order.
    for key in data:
        print(key)


Top-level keys in the JSON data:
datasets
facets
query
result_set
status


In [8]:
from pprint import pprint

# Pretty-print the JSON data
# NOTE(review): this prints the entire payload (every dataset record included),
# so the cell output can be very long; consider previewing a slice instead.
pprint(data)


{'datasets': [['PXD036477',
               'The human milk peptidome: exploring network insights',
               'PRIDE',
               'Homo sapiens',
               'Q Exactive HF-X',
               '<a href="https://dx.doi.org/10.1038/s41598-024-58127-2" '
               'target="_blank">10.1038/s41598-024-58127-2</a>; <a '
               'href="https://www.ncbi.nlm.nih.gov/pubmed/38555284" '
               'target="_blank">Dekker et al. (2024)</a>',
               'Sjef Boeren',
               '2024-05-23',
               'Breast milk, Homo sapiens, associations, network inference, '
               'peptides, peptidomics'],
              ['RPXD049956',
               'Large-scale Discovery of Substrates of the Human Kinome (EGFR) '
               '[Reanalysis: JPST000508]',
               'jPOST',
               'Homo sapiens',
               'LTQ Orbitrap',
               'no publication',
               'Yasushi Ishihama',
               '2024-02-21',
               'HeLa, Homo

In [12]:

# Function to convert datasets to list of dictionaries
def get_datasets_as_dicts(data):
    """Turn the positional dataset rows into one dictionary per row.

    Args:
        data: Mapping with a ``'datasets'`` entry (an iterable of rows) and a
            ``'result_set'`` entry whose ``'datasets_title_list'`` holds the
            field names, in the same order as the values in each row.

    Returns:
        A list with one ``{field name: value}`` dict per row. Names and values
        are paired by position; pairing stops at the shorter of the two.

    Raises:
        KeyError: If ``'datasets'``, ``'result_set'`` or
            ``'datasets_title_list'`` is missing.
    """
    rows = data['datasets']
    column_names = data['result_set']['datasets_title_list']
    return [dict(zip(column_names, row)) for row in rows]

# Load datasets as dictionaries: one dict per record of the loaded JSON
# payload `data`, keyed by the field names stored in that payload.
datasets = get_datasets_as_dicts(data)

# Function to strip HTML tags
def strip_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    A span runs from a ``<`` to the next ``>`` and may contain line breaks,
    so a tag whose attributes wrap onto another line is removed as well.
    Text between tags is kept unchanged. Any ``<...>`` span is removed, even
    one that is not real markup (for example ``"a < b > c"``).

    Args:
        text: The string to clean, or ``None`` for a missing value.

    Returns:
        The cleaned string, or ``None`` when ``text`` is ``None``.
    """
    # A missing value (e.g. JSON null) has nothing to strip; pass it through
    # rather than letting re.sub raise TypeError.
    if text is None:
        return text
    # [^>] also matches newlines, unlike '.', so multi-line tags are covered.
    return re.sub(r'<[^>]*>', '', text)

# Clean the 'publication' field: replace it in place with its tag-free text.
for dataset in datasets:
    # Records without a 'publication' key are left untouched.
    if 'publication' not in dataset:
        continue
    dataset['publication'] = strip_html_tags(dataset['publication'])

# Function to print dataset details
def print_dataset_details(dataset):
    """Print each field of ``dataset`` as ``key: value``, then a separator.

    Args:
        dataset: Mapping of field name to value; fields are printed in the
            mapping's iteration order, one per line.

    The fields are followed by a blank line, a rule of 50 dashes and two
    line breaks. Nothing is returned.
    """
    lines = [f"{name}: {content}" for name, content in dataset.items()]
    # The separator carries its own surrounding newlines; print() adds the last one.
    lines.append("\n" + "-" * 50 + "\n")
    print("\n".join(lines))

# Sanity check: show the first record now that its 'publication' field is cleaned.
print("First dataset after cleaning 'publication' field:")
print_dataset_details(datasets[0])

# Tabulate the records (one row per dataset dict) and preview the first rows.
df = pd.DataFrame(datasets)
print("Datasets DataFrame:", df.head(), sep="\n")

# Write the table to a CSV file in the same Drive folder as the input JSON;
# index=False leaves the DataFrame's row index out of the file.
output_file = '/content/drive/My Drive/AbelBioToken-main/Proteomics/proteomexchange_datasets.csv'
df.to_csv(path_or_buf=output_file, index=False)
print(f"DataFrame saved to {output_file}")

First dataset after cleaning 'publication' field:
dataset identifier: PXD036477
title: The human milk peptidome: exploring network insights
repository: PRIDE
species: Homo sapiens
instrument: Q Exactive HF-X
publication: 10.1038/s41598-024-58127-2; Dekker et al. (2024)
lab head: Sjef Boeren
announce date: 2024-05-23
keywords: Breast milk, Homo sapiens, associations, network inference, peptides, peptidomics

--------------------------------------------------

Datasets DataFrame:
  dataset identifier                                              title  \
0          PXD036477  The human milk peptidome: exploring network in...   
1         RPXD049956  Large-scale Discovery of Substrates of the Hum...   
2         RPXD049955  Large-scale Discovery of Substrates of the Hum...   
3         RPXD049954  Large-scale Discovery of Substrates of the Hum...   
4         RPXD049953  Large-scale Discovery of Substrates of the Hum...   

  repository       species       instrument  \
0      PRIDE  Homo 

In [13]:
# Pull the 'facets' section out of the loaded JSON payload.
facets = data['facets']

# List the facet names, one per line, in the order they appear in the payload.
print("Available facets:")
for facet_name in facets:
    print(facet_name)


Available facets:
instrument
keywords
repository
species
year
