<h1 align="center">upload data version2</h1> 

This Jupyter notebook demonstrates how to upload data to [ToxDataCommons](fairtox.com).

__Created by: Shuangyu Zhap, Michigan State University__

-------

In [1]:
import pandas as pd
import numpy as np
import subprocess
import sys
import gen3
import json
from gen3.submission import Gen3Submission
from gen3.auth import Gen3Auth
from gen3.index import Gen3Index
from gen3.query import Gen3Query
from gen3.metadata import Gen3Metadata
from gen3.file import Gen3File
import os

# download and import some custom Python scripts from https://github.com/cgmeyer/gen3sdk-python
# os.system("wget https://raw.githubusercontent.com/cgmeyer/gen3sdk-python/master/expansion/expansion.py")
from expansion import Gen3Expansion

--2023-06-16 14:36:24--  https://raw.githubusercontent.com/cgmeyer/gen3sdk-python/master/expansion/expansion.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 224796 (220K) [text/plain]
Saving to: 'expansion.py'

     0K .......... .......... .......... .......... .......... 22% 10.6M 0s
    50K .......... .......... .......... .......... .......... 45% 9.65M 0s
   100K .......... .......... .......... .......... .......... 68% 11.1M 0s
   150K .......... .......... .......... .......... .......... 91% 13.2M 0s
   200K .......... .........                                  100% 19.3M=0.02s

2023-06-16 14:36:24 (11.4 MB/s) - 'expansion.py' saved [224796/224796]



In [2]:
api = 'https://fairtox.com/'
cred = '/Users/apple/Desktop/test_gen3/credentials23.json'
auth = Gen3Auth(api, refresh_file=cred)
sub = Gen3Submission(api, auth)
query = Gen3Query(auth)
index = Gen3Index(auth)
file = Gen3File(auth)
metadata = Gen3Metadata(auth)
exp = Gen3Expansion(api,auth,sub)

## create program

In [3]:
prog = 'MyFirstProgram'

prog_txt = """{
    "dbgap_accession_number": "%s",
    "type": "program",
    "name": "%s"
}""" % (prog,prog)

prog_json = json.loads(prog_txt)
data = sub.create_program(json=prog_json)

## create project

In [4]:
proj_txt = """{
    "availability_type": "Open",
    "code": "MyFirstProject",
    "dbgap_accession_number": "MyFirstProject",
    "type": "project",
    "contact_name": "Timonthy Zacharewski",
    "institution": "MSU",
    "description": "test",
    "email_address": "xxxxx@fdas.sdfs",
    "telephone_number": "ssd-asdf-asdf"
    }"""
proj_json = json.loads(proj_txt)
data = sub.create_project(program="MyFirstProgram",json=proj_json) 

## upload other nodes

In [7]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/DH_ExportedFiles/study.tsv")
df.head()

Unnamed: 0,type,submitter_id,projects.code,study_title,study_description,study_design,study_type,experimental_setting,organism,provenance
0,study,Prj171,MyFirstProject,Prj171_Mm_TCDD_RDDR-28D_Male,Male mice were treated with TCDD every 4 days ...,dose response design,Toxicogenomics,in vivo,Mus musculus,


In [11]:
# study
df = pd.read_table('/Users/apple/Desktop/test_gen3/DH_ExportedFiles/study.tsv')
df["projects.code"].fillna("MyFirstProject", inplace=True)
df.to_csv('/Users/apple/Desktop/test_gen3/DH_ExportedFiles/study.tsv', sep='\t', index=False)
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/DH_ExportedFiles/study.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/DH_ExportedFiles/study.tsv with 1 records.
Chunk 1 (chunk size: 30, submitted: 0 of 1)
	 Succeeded: 1 entities.
Finished data submission.
Successful records: 1
Failed invalid records: 0


In [8]:
df = pd.read_table('/Users/apple/Desktop/test_gen3/DH_ExportedFiles/contact.tsv')
df.head()

Unnamed: 0,type,submitter_id,studies.submitter_id,last_name,first_name,middle_name,contact_orcid,contact_email,contact_telephone,contact_department,contact_institution,location,provenance
0,contact,Prj171:Zacharewski:Timothy,Prj171,Zacharewski,Timothy,,0000-0002-3662-7919,tzachare@msu.edu,517-884-2054,Biochemistry and Molecular Biology,Michigan State University,48824:East Lansing,


In [12]:
# contact
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/DH_ExportedFiles/contact.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/DH_ExportedFiles/contact.tsv with 1 records.
Chunk 1 (chunk size: 30, submitted: 0 of 1)
	 Succeeded: 1 entities.
Finished data submission.
Successful records: 1
Failed invalid records: 0


In [13]:
df = pd.read_table('/Users/apple/Desktop/test_gen3/DH_ExportedFiles/funding.tsv')
df.head()

Unnamed: 0,type,submitter_id,studies.submitter_id,support_id,support_source,provenance
0,funding,Prj171:R01ES029541:P42ES004911,Prj171,R01ES029541:P42ES004911,NIEHS; Superfund Basic Research Program,


In [14]:
# funding
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/DH_ExportedFiles/funding.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/DH_ExportedFiles/funding.tsv with 1 records.
Chunk 1 (chunk size: 30, submitted: 0 of 1)
	 Succeeded: 1 entities.
Finished data submission.
Successful records: 1
Failed invalid records: 0


In [15]:
df = pd.read_table('/Users/apple/Desktop/test_gen3/DH_ExportedFiles/publication.tsv')
df.head()

Unnamed: 0,type,submitter_id,studies.submitter_id,PMC_id,DOI,PMID,provenance
0,publication,Prj171:9418907,Prj171,9418907,10.1016/j.jbc.2022.102301,35931118,


In [16]:
# publication
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/DH_ExportedFiles/publication.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/DH_ExportedFiles/publication.tsv with 1 records.
Chunk 1 (chunk size: 30, submitted: 0 of 1)
	 Succeeded: 1 entities.
Finished data submission.
Successful records: 1
Failed invalid records: 0


In [17]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/DH_ExportedFiles/subject.tsv")
df.head()

Unnamed: 0,type,submitter_id,studies.submitter_id,start_date,start_date_age,experiment_start_date,experiment_start_zt,sex,strain,strain_source,euthanasia_date,euthanasia_zt,euthanasia_method,provenance
0,subject,Prj171:M97,Prj171,2022-03-09,23,2022-03-14,0,male,C57BL/6NCrl,Charles Rivers Laboratories,2022-04-11,0,Carbon dioxide asphyxiation,
1,subject,Prj171:M94,Prj171,2022-03-09,23,2022-03-14,0,male,C57BL/6NCrl,Charles Rivers Laboratories,2022-04-11,0,Carbon dioxide asphyxiation,
2,subject,Prj171:M92,Prj171,2022-03-09,23,2022-03-14,0,male,C57BL/6NCrl,Charles Rivers Laboratories,2022-04-11,0,Carbon dioxide asphyxiation,
3,subject,Prj171:M91,Prj171,2022-03-09,23,2022-03-14,0,male,C57BL/6NCrl,Charles Rivers Laboratories,2022-04-11,0,Carbon dioxide asphyxiation,
4,subject,Prj171:M89,Prj171,2022-03-09,23,2022-03-14,0,male,C57BL/6NCrl,Charles Rivers Laboratories,2022-04-11,0,Carbon dioxide asphyxiation,


In [18]:
# subject
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/DH_ExportedFiles/subject.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/DH_ExportedFiles/subject.tsv with 144 records.
Chunk 1 (chunk size: 30, submitted: 0 of 144)
	 Succeeded: 30 entities.
Chunk 2 (chunk size: 30, submitted: 30 of 144)
	 Succeeded: 30 entities.
Chunk 3 (chunk size: 30, submitted: 60 of 144)
	 Succeeded: 30 entities.
Chunk 4 (chunk size: 30, submitted: 90 of 144)
	 Succeeded: 30 entities.
Chunk 5 (chunk size: 30, submitted: 120 of 144)
	 Succeeded: 24 entities.
Finished data submission.
Successful records: 144
Failed invalid records: 0


In [19]:

# treatment 
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/DH_ExportedFiles/treatment.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/DH_ExportedFiles/treatment.tsv with 56 records.
Chunk 1 (chunk size: 30, submitted: 0 of 56)
	 Succeeded: 30 entities.
Chunk 2 (chunk size: 30, submitted: 30 of 56)
	 Succeeded: 26 entities.
Finished data submission.
Successful records: 56
Failed invalid records: 0


In [20]:
# housing
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/DH_ExportedFiles/housing.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/DH_ExportedFiles/housing.tsv with 48 records.
Chunk 1 (chunk size: 30, submitted: 0 of 48)
	 Succeeded: 30 entities.
Chunk 2 (chunk size: 30, submitted: 30 of 48)
	 Succeeded: 18 entities.
Finished data submission.
Successful records: 48
Failed invalid records: 0


In [21]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/DH_ExportedFiles/diet.tsv")
df.head()

Unnamed: 0,type,submitter_id,housings.submitter_id,date,feed_catalog_number,feed_description,feed_name,feed_vendor,water_type,feed_paradigm,provenance
0,diet,Prj171:8940:2022-03-09,"Prj171:Cage033:2022-03-09,Prj171:Cage032:2022-...",2022-03-09,8940,Harlan Teklad 8940,Harlan Teklad 8940,Harlan Teklad,Innovive,ad libitum,


In [22]:
# diet 
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/DH_ExportedFiles/diet.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/DH_ExportedFiles/diet.tsv with 1 records.
Chunk 1 (chunk size: 30, submitted: 0 of 1)
	 Succeeded: 1 entities.
Finished data submission.
Successful records: 1
Failed invalid records: 0


In [23]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/DH_ExportedFiles/sample.tsv")
df.head()

Unnamed: 0,type,submitter_id,subjects.submitter_id,date,biospecimen_anatomic_site,method_of_sample_procurement,preservation_method,weight,volume,storage_vessel,collection_protocol,provenance
0,sample,Prj171:L1,Prj171:M1,2022-04-11,Liver,,Snap Frozen,,,2 mL screwcap tube,Tissue was carefully excised and immediately f...,"DataHarmonizer v1.4.10, Sample v4.3.1"
1,sample,Prj171:L2,Prj171:M2,2022-04-11,Liver,,Snap Frozen,,,2 mL screwcap tube,Tissue was carefully excised and immediately f...,"DataHarmonizer v1.4.10, Sample v4.3.1"
2,sample,Prj171:L3,Prj171:M3,2022-04-11,Liver,,Snap Frozen,,,2 mL screwcap tube,Tissue was carefully excised and immediately f...,"DataHarmonizer v1.4.10, Sample v4.3.1"
3,sample,Prj171:L4,Prj171:M4,2022-04-11,Liver,,Snap Frozen,,,2 mL screwcap tube,Tissue was carefully excised and immediately f...,"DataHarmonizer v1.4.10, Sample v4.3.1"
4,sample,Prj171:L5,Prj171:M5,2022-04-11,Liver,,Snap Frozen,,,2 mL screwcap tube,Tissue was carefully excised and immediately f...,"DataHarmonizer v1.4.10, Sample v4.3.1"


In [24]:

data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/DH_ExportedFiles/sample.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/DH_ExportedFiles/sample.tsv with 268 records.
Chunk 1 (chunk size: 30, submitted: 0 of 268)
	 Succeeded: 30 entities.
Chunk 2 (chunk size: 30, submitted: 30 of 268)
	 Succeeded: 30 entities.
Chunk 3 (chunk size: 30, submitted: 60 of 268)
	 Succeeded: 30 entities.
Chunk 4 (chunk size: 30, submitted: 90 of 268)
	 Succeeded: 30 entities.
Chunk 5 (chunk size: 30, submitted: 120 of 268)
	 Succeeded: 30 entities.
Chunk 6 (chunk size: 30, submitted: 150 of 268)
	 Succeeded: 30 entities.
Chunk 7 (chunk size: 30, submitted: 180 of 268)
	 Succeeded: 30 entities.
Chunk 8 (chunk size: 30, submitted: 210 of 268)
	 Succeeded: 30 entities.
Chunk 9 (chunk size: 30, submitted: 240 of 268)
	 Succeeded: 28 entities.
Finished data submission.
Successful records: 268
Failed invalid records: 0


In [25]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/DH_ExportedFiles/aliquot.tsv")
df.head()

Unnamed: 0,type,submitter_id,samples.submitter_id,analyte_type,derivitization,extract_preservation_method,analyte_protocol,provenance
0,aliquot,Prj171:metabolite_extract_L75,Prj171:L75,Aqueous,,-80C,Flash frozen liver samples (about 25 mg) were ...,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
1,aliquot,Prj171:metabolite_extract_L121,Prj171:L121,Aqueous,,-80C,Flash frozen liver samples (about 25 mg) were ...,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
2,aliquot,Prj171:metabolite_extract_L27,Prj171:L27,Aqueous,,-80C,Flash frozen liver samples (about 25 mg) were ...,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
3,aliquot,Prj171:metabolite_extract_L122,Prj171:L122,Aqueous,,-80C,Flash frozen liver samples (about 25 mg) were ...,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
4,aliquot,Prj171:metabolite_extract_L123,Prj171:L123,Aqueous,,-80C,Flash frozen liver samples (about 25 mg) were ...,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"


In [27]:
# ms_aliquot
import requests
COMMONS = "https://fairtox.com/"
API_KEY_FILEPATH = '/Users/apple/Desktop/test_gen3/credentials23.json'

projectname = 'MyFirstProject'
programname = 'MyFirstProgram'
api_url = "{}/api/v0/submission/{}/{}".format(COMMONS,programname,projectname)
df = pd.read_table('/Users/apple/Desktop/test_gen3/DH_ExportedFiles/aliquot.tsv')
col_name = df.columns.tolist()
# this is external link
col_name.remove("samples.submitter_id")

for _, row in df.iterrows():
    jsondata = []
    # this is for the external link
    dic = {
        "samples": [
            {
                "submitter_id": row["samples.submitter_id"]
            }
        ]
    }

    for i in col_name:
        value = row[i]
        if isinstance(value, float) and (value == float('inf') or value == float('-inf') or pd.isna(value)):
            dic[i] = str(value)
        else:
            dic[i] = value

    jsondata.append(dic)
    print(jsondata)
    authn = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH)
    output = requests.put(api_url, auth=authn, json=jsondata)
    output.json()

[{'samples': [{'submitter_id': 'Prj171:L75'}], 'type': 'aliquot', 'submitter_id': 'Prj171:metabolite_extract_L75', 'analyte_type': 'Aqueous', 'derivitization': 'nan', 'extract_preservation_method': '-80C', 'analyte_protocol': 'Flash frozen liver samples (about 25 mg) were extracted using HPLC-grade methanol and water (5:3 ratio) containing 13C-,15N-labeled amino acid (Sigma; 767964) internal standards. HPLC-grade chloroform (methanol:water:chloroform ratio 5:3:5) was added, vortexed, shaken for 15 min at 4 Celsius, and centrifuged at maximum speed (3000 x g) to achieve phase separation. The methanol:water phase containing the polar metabolites was transferred, dried under nitrogen gas at room temperature. Untargeted extractions were reconstituted with 400 ul of 10 mM tributylamine and 15 mM acetic acid in 97:3 water:methanol for analysis.', 'provenance': 'DataHarmonizer v1.4.10, mass_spec_assay v1.0.0'}]
[{'samples': [{'submitter_id': 'Prj171:L121'}], 'type': 'aliquot', 'submitter_id':

In [31]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/DH_ExportedFiles/mass_spec_assay.tsv")
df["chromatography_instrument"] = "Waters TQS"
df.to_csv('/Users/apple/Desktop/test_gen3/DH_ExportedFiles/mass_spec_assay.tsv', sep='\t', index=False)
df.head()

Unnamed: 0,type,submitter_id,aliquots.submitter_id,sample_dilution,calibration_standard,chromatography_type,chromatography_column,chromatography_protocol,chromatography_instrument,chromatography_method_filename,...,solventA,solventB,solventC,carrier_gas,oven_temperature_program,ms_type,ms_protocol,ms_method_filename,ion_mode,provenance
0,mass_spec_assay,Prj171:TQSm_042122_010,Prj171:metabolite_extract_L75,1:1,"13C,15N-methionine",Reversed phase,"Waters ACQUITY UPLC HSS T3 (100 x 2.1mm,1.8um)",The mobile phases were 10mM PFHA in water (mob...,Waters TQS,,...,100% water; ; 10mM perfluoroheptanoic acid,100% acetonitrile; ;,,,,ESI,Multiple reaction monitoring in positive ion mode,,POSITIVE,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
1,mass_spec_assay,Prj171:TQSm_042122_011,Prj171:metabolite_extract_L121,1:1,"13C,15N-methionine",Reversed phase,"Waters ACQUITY UPLC HSS T3 (100 x 2.1mm,1.8um)",The mobile phases were 10mM PFHA in water (mob...,Waters TQS,,...,100% water; ; 10mM perfluoroheptanoic acid,100% acetonitrile; ;,,,,ESI,Multiple reaction monitoring in positive ion mode,,POSITIVE,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
2,mass_spec_assay,Prj171:TQSm_042122_012,Prj171:metabolite_extract_L27,1:1,"13C,15N-methionine",Reversed phase,"Waters ACQUITY UPLC HSS T3 (100 x 2.1mm,1.8um)",The mobile phases were 10mM PFHA in water (mob...,Waters TQS,,...,100% water; ; 10mM perfluoroheptanoic acid,100% acetonitrile; ;,,,,ESI,Multiple reaction monitoring in positive ion mode,,POSITIVE,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
3,mass_spec_assay,Prj171:TQSm_042122_013,Prj171:metabolite_extract_L122,1:1,"13C,15N-methionine",Reversed phase,"Waters ACQUITY UPLC HSS T3 (100 x 2.1mm,1.8um)",The mobile phases were 10mM PFHA in water (mob...,Waters TQS,,...,100% water; ; 10mM perfluoroheptanoic acid,100% acetonitrile; ;,,,,ESI,Multiple reaction monitoring in positive ion mode,,POSITIVE,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
4,mass_spec_assay,Prj171:TQSm_042122_014,Prj171:metabolite_extract_L123,1:1,"13C,15N-methionine",Reversed phase,"Waters ACQUITY UPLC HSS T3 (100 x 2.1mm,1.8um)",The mobile phases were 10mM PFHA in water (mob...,Waters TQS,,...,100% water; ; 10mM perfluoroheptanoic acid,100% acetonitrile; ;,,,,ESI,Multiple reaction monitoring in positive ion mode,,POSITIVE,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"


In [32]:
# mass_spec_assay
import requests
COMMONS = "https://fairtox.com/"
API_KEY_FILEPATH = '/Users/apple/Desktop/test_gen3/credentials23.json'

projectname = 'MyFirstProject'
programname = 'MyFirstProgram'
api_url = "{}/api/v0/submission/{}/{}".format(COMMONS,programname,projectname)
df = pd.read_table('/Users/apple/Desktop/test_gen3/DH_ExportedFiles/mass_spec_assay.tsv')
col_name = df.columns.tolist()
# this is external link
col_name.remove("aliquots.submitter_id")

for _, row in df.iterrows():
    jsondata = []
    # this is for the external link
    dic = {
        "aliquots": [
            {
                "submitter_id": row["aliquots.submitter_id"]
            }
        ]
    }

    for i in col_name:
        value = row[i]
        if isinstance(value, float) and (value == float('inf') or value == float('-inf') or pd.isna(value)):
            dic[i] = str(value)
        else:
            dic[i] = value

    jsondata.append(dic)
    print(jsondata)
    authn = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH)
    output = requests.put(api_url, auth=authn, json=jsondata)
    output.json()

[{'aliquots': [{'submitter_id': 'Prj171:metabolite_extract_L75'}], 'type': 'mass_spec_assay', 'submitter_id': 'Prj171:TQSm_042122_010', 'sample_dilution': '1:1', 'calibration_standard': '13C,15N-methionine', 'chromatography_type': 'Reversed phase', 'chromatography_column': 'Waters ACQUITY UPLC HSS T3 (100 x 2.1mm,1.8um)', 'chromatography_protocol': 'The mobile phases were 10mM PFHA in water (mobile phase A) and acetonitrile (mobile phase B) using the following gradient: 0 min - 100% A, 1.0 min - 100% A, 6.0 min - 35% A, 6.01 min - 10% A, 7.0 min - 10% A, 7.01 min - 100% A, 9.0 min - 100% A', 'chromatography_instrument': 'Waters TQS', 'chromatography_method_filename': 'nan', 'elution_program': '0 min - 100% A, 1.0 min - 100% A, 6.0 min - 35% A, 6.01 min - 10% A, 7.0 min - 10% A, 7.01 min - 100% A, 9.0 min - 100% A', 'flow_rate': '0.3 ml/minute', 'solventA': '100% water; ; 10mM perfluoroheptanoic acid', 'solventB': '100% acetonitrile; ;', 'solventC': 'nan', 'carrier_gas': 'nan', 'oven_te

In [37]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/DH_ExportedFiles/ms_raw_data.tsv")
file_names = df["file_name"].to_list()
for file_name in file_names:
    path = '/Users/apple/Desktop/test_gen3/DH_ExportedFiles/ms_raw_data/' + file_name
    with open(path, 'w') as file:
        file.write(path)
        

In [38]:
# automatically insert md5sum and file_size
import pandas as pd
import hashlib
import os
def calculate_md5(file_path):
    """Calculate the MD5 checksum for a file."""
    md5_hash = hashlib.md5()
    with open(file_path, 'rb') as file:
        for chunk in iter(lambda: file.read(4096), b''):
            md5_hash.update(chunk)
    return md5_hash.hexdigest()

# Provide the file name list and DataFrame
df = pd.read_table("/Users/apple/Desktop/test_gen3/DH_ExportedFiles/ms_raw_data.tsv")
file_names = df["file_name"].to_list()

# Provide the path to the folder containing the files
folder_path = '/Users/apple/Desktop/test_gen3/DH_ExportedFiles/ms_raw_data/'

# Update the MD5 checksum for each file name in the DataFrame
for file_name in file_names:
    file_path = folder_path + file_name
    md5_sum = calculate_md5(file_path)
    file_size = os.path.getsize(file_path)
    df.loc[df['file_name'] == file_name, 'md5sum'] = md5_sum
    df.loc[df['file_name'] == file_name, 'file_size'] = file_size
df['file_size'] = df['file_size'].astype(int)
df.to_csv("/Users/apple/Desktop/test_gen3/DH_ExportedFiles/ms_raw_data_revised.tsv", sep='\t', index=False)
df.head()

Unnamed: 0,type,submitter_id,mass_spec_assays.submitter_id,file_name,file_size,md5sum,object_id,data_category,data_format,data_type,file_source_repository,repository_accession_id,repository_download_ftp,provenance
0,ms_raw_data,Prj171:TQSm_042122_010.mzML,Prj171:TQSm_042122_010,TQSm_042122_010.mzML,80,0edb4486c3bda16534226663e0efc929,,targeted metabolomics,.mzML,chromatograms,,,,
1,ms_raw_data,Prj171:TQSm_042122_011.mzML,Prj171:TQSm_042122_011,TQSm_042122_011.mzML,80,575e7adfd53ada1f08f55175e17ead6b,,targeted metabolomics,.mzML,chromatograms,,,,
2,ms_raw_data,Prj171:TQSm_042122_012.mzML,Prj171:TQSm_042122_012,TQSm_042122_012.mzML,80,9d27dee74274d28c76149450710bf659,,targeted metabolomics,.mzML,chromatograms,,,,
3,ms_raw_data,Prj171:TQSm_042122_013.mzML,Prj171:TQSm_042122_013,TQSm_042122_013.mzML,80,65ca95b578bd68c254fab02c651c2a42,,targeted metabolomics,.mzML,chromatograms,,,,
4,ms_raw_data,Prj171:TQSm_042122_014.mzML,Prj171:TQSm_042122_014,TQSm_042122_014.mzML,80,84af47183d95a3384ace7e15d4c7be2e,,targeted metabolomics,.mzML,chromatograms,,,,


In [40]:
# ms_raw_Data
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/DH_ExportedFiles/ms_raw_data_revised.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/DH_ExportedFiles/ms_raw_data_revised.tsv with 30 records.
Chunk 1 (chunk size: 30, submitted: 0 of 30)
	 Succeeded: 30 entities.
Chunk 2 (chunk size: 30, submitted: 30 of 30)
	Chunk Failed (status code 400): 0 entities.
	Invalid records in this chunk: 0
Finished data submission.
Successful records: 30
Failed invalid records: 0


In [41]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/DH_ExportedFiles/ms_analysis.tsv")

df.head()

Unnamed: 0,type,submitter_id,ms_raw_datas.submitter_id,normalization,transformation,transformation_purpose,transformation_description,unit,analysis_protocols,provenance
0,ms_analysis,Prj171:TQSm_042122_010,Prj171:TQSm_042122_010.mzML,normalized to protein,,,,normalized peak area,,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
1,ms_analysis,Prj171:TQSm_042122_011,Prj171:TQSm_042122_011.mzML,normalized to protein,,,,normalized peak area,,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
2,ms_analysis,Prj171:TQSm_042122_012,Prj171:TQSm_042122_012.mzML,normalized to protein,,,,normalized peak area,,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
3,ms_analysis,Prj171:TQSm_042122_013,Prj171:TQSm_042122_013.mzML,normalized to protein,,,,normalized peak area,,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
4,ms_analysis,Prj171:TQSm_042122_014,Prj171:TQSm_042122_014.mzML,normalized to protein,,,,normalized peak area,,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"


In [42]:
# ms_analysis
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/DH_ExportedFiles/ms_analysis.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/DH_ExportedFiles/ms_analysis.tsv with 30 records.
Chunk 1 (chunk size: 30, submitted: 0 of 30)
	 Succeeded: 30 entities.
Chunk 2 (chunk size: 30, submitted: 30 of 30)
	Chunk Failed (status code 400): 0 entities.
	Invalid records in this chunk: 0
Finished data submission.
Successful records: 30
Failed invalid records: 0


In [43]:
# create a folder of files
df = pd.read_table("/Users/apple/Desktop/test_gen3/DH_ExportedFiles/ms_analysed_data.tsv")
file_names = df["file_name"].to_list()
for file_name in file_names:
    path = '/Users/apple/Desktop/test_gen3/DH_ExportedFiles/processeddata_ms/' + file_name
    with open(path, 'w') as file:
        file.write(path)

In [44]:
import pandas as pd
import hashlib
import os
def calculate_md5(file_path):
    """Calculate the MD5 checksum for a file."""
    md5_hash = hashlib.md5()
    with open(file_path, 'rb') as file:
        for chunk in iter(lambda: file.read(4096), b''):
            md5_hash.update(chunk)
    return md5_hash.hexdigest()

# Provide the file name list and DataFrame
df = pd.read_table("/Users/apple/Desktop/test_gen3/DH_ExportedFiles/ms_analysed_data.tsv")
file_names = df["file_name"].to_list()

folder_path = '/Users/apple/Desktop/test_gen3/DH_ExportedFiles/processeddata_ms/'

# Update the MD5 checksum for each file name in the DataFrame
for file_name in file_names:
    file_path = folder_path + file_name
    md5_sum = calculate_md5(file_path)
    file_size = os.path.getsize(file_path)
    df.loc[df['file_name'] == file_name, 'md5sum'] = md5_sum
    df.loc[df['file_name'] == file_name, 'file_size'] = file_size
df['file_size'] = df['file_size'].astype(int)
df.to_csv("/Users/apple/Desktop/test_gen3/DH_ExportedFiles/ms_analysed_data_updated.tsv", sep='\t', index=False)
df.head()

Unnamed: 0,type,submitter_id,ms_analyses.submitter_id,file_name,file_size,md5sum,object_id,data_category,data_format,data_type,file_source_repository,repository_accession_id,repository_download_ftp,provenance
0,ms_analysed_data,Prj171:CEC_normalized_peak_area.txt,"Prj171:TQSm_042122_010,Prj171:TQSm_042122_011,...",CEC_normalized_peak_area.txt,93,914f22bde76c63cb776f88855317f8e6,,targeted metabolomics,.mzML,chromatograms,,,,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"


In [45]:
# ms_analysed_data
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/DH_ExportedFiles/ms_analysed_data_updated.tsv", project_id="MyFirstProgram-MyFirstProject")



Submitting /Users/apple/Desktop/test_gen3/DH_ExportedFiles/ms_analysed_data_updated.tsv with 1 records.
Chunk 1 (chunk size: 30, submitted: 0 of 1)
	 Succeeded: 1 entities.
Finished data submission.
Successful records: 1
Failed invalid records: 0


In [48]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/DH_ExportedFiles/metabolite_id.tsv")
df.head()

Unnamed: 0,type,submitter_id,ms_analysed_datas.submitter_id,metabolite_name,refmet_name,dtxsid,inchikey,mass,elution_time,provenance
0,metabolite_id,Prj171:S-(2-carboxyethyl)-L-cysteine,Prj171:CEC_normalized_peak_area.txt,S-(2-carboxyethyl)-L-cysteine,,DTXSID40193329,"1S/C6H11NO4S/c7-4(6(10)11)3-12-2-1-5(8)9/h4H,1...",193.22,2.6,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
1,metabolite_id,"Prj171:13C,15N-methionine",Prj171:CEC_normalized_peak_area.txt,"13C,15N-methionine",methionine,,"1S/C5H11NO2S/c1-9-3-2-4(6)5(7)8/h4H,2-3,6H2,1H...",149.0511,5.12,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"


In [49]:
# metabolite_id
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/DH_ExportedFiles/metabolite_id.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/DH_ExportedFiles/metabolite_id.tsv with 2 records.
Chunk 1 (chunk size: 30, submitted: 0 of 2)
	 Succeeded: 2 entities.
Finished data submission.
Successful records: 2
Failed invalid records: 0
