In [387]:
import pandas as pd
import os 
import json
import requests
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW, NCBIXML

### All phenotypes

In [8]:
### -- get all phenotypes  --
url = 'https://gmrepo.humangut.info/api/get_all_phenotypes'
pheno_01 = requests.post(url, data={})
pheno_01_cont = pheno_01.json().get('phenotypes')

## -- a DateFrame --
phenotypes = pd.DataFrame(pheno_01_cont)

phenotypes

Unnamed: 0,disease,0,all_samples,1,processed_runs,2,valid_runs,3,failed_runs,4,nr_species,5,nr_genus,6,term,7,note,8
0,C537163,C537163,30,30,30,30,0,0,30,30,0,0,0,0,Pediatric Autoimmune Neuropsychiatric Disorder...,Pediatric Autoimmune Neuropsychiatric Disorder...,OBSESSIVE-COMPULSIVE DISORDER and TIC DISORDER...,OBSESSIVE-COMPULSIVE DISORDER and TIC DISORDER...
1,D000066891,D000066891,81,81,0,0,0,0,0,0,0,0,0,0,Critical Care Outcomes,Critical Care Outcomes,A measure of the mortality and morbidity rates...,A measure of the mortality and morbidity rates...
2,D000067011,D000067011,46,46,46,46,46,46,0,0,1717,1717,663,663,Severe Acute Malnutrition,Severe Acute Malnutrition,Acute form of MALNUTRITION which usually affec...,Acute form of MALNUTRITION which usually affec...
3,D000067877,D000067877,1217,1217,710,710,606,606,104,104,2325,2325,796,796,Autism Spectrum Disorder,Autism Spectrum Disorder,Wide continuum of associated cognitive and neu...,Wide continuum of associated cognitive and neu...
4,D000069279,D000069279,28,28,0,0,0,0,0,0,0,0,0,0,Drug Resistant Epilepsy,Drug Resistant Epilepsy,Epileptic condition in which adequate trials o...,Epileptic condition in which adequate trials o...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,D056865,D056865,24,24,0,0,0,0,0,0,0,0,0,0,Ideal Body Weight,Ideal Body Weight,Expected weight of a healthy normal individual...,Expected weight of a healthy normal individual...
128,D058246,D058246,56,56,56,56,56,56,0,0,211,211,92,92,Prehypertension,Prehypertension,Blood pressure levels that are between normote...,Blood pressure levels that are between normote...
129,D060825,D060825,62,62,62,62,62,62,0,0,0,0,77,77,Cognitive Dysfunction,Cognitive Dysfunction,Diminished or impaired mental and/or intellect...,Diminished or impaired mental and/or intellect...
130,D065290,D065290,290,290,0,0,0,0,0,0,0,0,0,0,Acute-On-Chronic Liver Failure,Acute-On-Chronic Liver Failure,Sudden liver failure in the presence of underl...,Sudden liver failure in the presence of underl...


### Statistics on particular phenotype

In [10]:
## -- get summary information by mesh_id
pheno_02_query = {'mesh_id':'D006262'}  ## -- to get statistics on MeSH ID D006262
url = 'https://gmrepo.humangut.info/api/getStatisticsByProjectsByMeshID'
pheno_02 = requests.post(url, data=json.dumps(pheno_02_query))
pheno_02_cont = pheno_02.json()

## --get DataFrame
phenotyp_stats = pd.DataFrame(pheno_02.json())

phenotyp_stats

Unnamed: 0,metadata,metadata_obtained,stats
uid,D006262,True,
term,Health,True,
note,The state of the organism when it functions op...,True,
mappingTo,,True,
nr_total_samples,,True,34019.0
nr_loaded_samples,,True,23907.0
nr_valid_samples,,True,16282.0
nr_species,,True,6206.0
nr_genus,,True,1676.0


## **Associated species of a phenotype

In [12]:
pheno_03_query = {'mesh_id':'D006262'}  ## -- to get statistics on MeSH ID D006262
url = 'https://gmrepo.humangut.info/api/getAssociatedSpeciesByMeshID'
pheno_03 = requests.post(url, data=json.dumps(pheno_03_query))
pheno_03_cont = pheno_03.json()

## --get DataFrame
phenotyp_assoc_species = pd.DataFrame(pheno_03.json())

## --show data header of the resulting DataFrame
phenotyp_assoc_species

Unnamed: 0,disease,taxon_rank_level,ncbi_taxon_id,samples,abus_mean,abus_median,abus_sd,scientific_name
0,D006262,species,820,12002,4.608746,2.110325,6.386639,Bacteroides uniformis
1,D006262,species,821,11417,4.090026,1.254770,6.860151,Bacteroides vulgatus
2,D006262,species,28116,11406,2.305965,0.739456,4.349414,Bacteroides ovatus
3,D006262,species,40520,11034,0.780763,0.348089,1.346551,Blautia obeum
4,D006262,species,39491,10798,2.758767,0.183908,5.977531,[Eubacterium] rectale
...,...,...,...,...,...,...,...,...
455,D006262,species,67296,2,0.015425,0.015425,0.006480,Streptomyces finlayi
456,D006262,species,1263550,2,0.010863,0.010863,0.005858,Edwardsiella piscicida
457,D006262,species,69218,2,0.096905,0.096905,0.117344,Enterobacter cancerogenus
458,D006262,species,378211,2,0.012495,0.012495,0.016327,Methyloversatilis universalis


### Get relative species/genus abundances in samples/runs associated with a phenotype


In [14]:
data_query = {'mesh_id':'D003093',"ncbi_taxon_id" : "40520"}  ## -- to get statistics on MeSH ID D006262
url = 'https://gmrepo.humangut.info/api/getMicrobeAbundancesByPhenotypeMeshIDAndNCBITaxonID'
data = requests.post(url, data=json.dumps(data_query))

## --get DataFrames
hist_data_for_phenotype = pd.DataFrame(data.json().get('hist_data_for_phenotype'))
hist_data_for_phenotype

Unnamed: 0,x,y,counts,cumsum,cumpct
0,1,89.780469,1186,1186,89.780469
1,3,7.115821,94,1280,96.896291
2,5,1.816805,24,1304,98.713096
3,7,0.832702,11,1315,99.545799
4,9,0.454201,6,1321,100.0


In [38]:
## -- count associated runs --
pheno_06_query = {'mesh_id':'D006262'}  
url = 'https://gmrepo.humangut.info/api/countAssociatedRunsByPhenotypeMeshID'
pheno_06 = requests.post(url, data=json.dumps(pheno_06_query))
pheno_06_cont = pheno_06.json()

## -- the resulting variable is a vector --
phenotyp_nr_assoc_runs = pd.DataFrame(pheno_06.json())
print(phenotyp_nr_assoc_runs)

   nr_assoc_runs
0          34019


In [366]:
## -- get all associted runs --
## use skip = 0, limit = 100 to retrieve the first 100 runs, then
##     skip = 100, limit = 100 to retrieve the next 100 runs ....

pheno_07_query = {'mesh_id':'D006262',"skip":0, "limit":100}  
url = 'https://gmrepo.humangut.info/api/getAssociatedRunsByPhenotypeMeshIDLimit'
pheno_07 = requests.post(url, data=json.dumps(pheno_07_query))
pheno_07_cont = pheno_07.json()

## -- the resulting variable is a vector --
phenotyp_a_page_of_assoc_runs = pd.DataFrame(pheno_07.json())
phenotyp_a_page_of_assoc_runs.columns

Index(['checking', 'project_id', 'our_project_id', 'sample_name',
       'original_sample_description', 'curated_sample_description', 'run_id',
       'sample_id', 'second_sample_id', 'experiment_type',
       'nr_reads_sequenced', 'instrument_model', 'disease', 'phenotype',
       'is_disease_stage_available', 'disease_stage', 'more', 'more_info',
       'country', 'collection_date', 'sex', 'host_age', 'diet', 'longitude',
       'latitude', 'BMI', 'Recent.Antibiotics.Use', 'antibiotics_used',
       'Antibiotics.Dose', 'Days.Without.Antibiotics.Use', 'accession_id',
       'QCStatus'],
      dtype='object')

### Associated genus of a phenotype

In [47]:
pheno_04_query = {'mesh_id':'D006262'}  ## -- to get statistics on MeSH ID D006262
url = 'https://gmrepo.humangut.info/api/getAssociatedGeneraByMeshID'
pheno_04 = requests.post(url, data=json.dumps(pheno_04_query))
pheno_04_cont = pheno_04.json()

## --get DataFrame
phenotyp_assoc_genera = pd.DataFrame(pheno_04.json())

## --show data header of the resulting DataFrame
phenotyp_assoc_genera

Unnamed: 0,disease,taxon_rank_level,ncbi_taxon_id,samples,abus_mean,abus_median,abus_sd,scientific_name
0,D006262,genus,816,15443,24.018719,17.775100,21.844879,Bacteroides
1,D006262,genus,216851,13731,6.127041,3.857800,7.454345,Faecalibacterium
2,D006262,genus,572511,13690,2.587185,1.266020,4.440271,Blautia
3,D006262,genus,1678,13523,8.015112,1.343310,17.006023,Bifidobacterium
4,D006262,genus,841,13258,2.248774,0.943384,4.266204,Roseburia
...,...,...,...,...,...,...,...,...
149,D006262,genus,991903,2,0.057874,0.057874,0.046693,Polymorphum
150,D006262,genus,44258,2,0.054114,0.054114,0.015598,Caloramator
151,D006262,genus,92793,2,0.272387,0.272387,0.372208,Aquabacterium
152,D006262,genus,475087,2,0.011037,0.011037,0.000033,Methanosphaerula


## Data Collection

### Fastq

In [19]:
ids = []
sequences = []

fastq_file = "SRR10911129.fastq"

with open(fastq_file, "r") as handle:
    for i, record in enumerate(SeqIO.parse(handle, "fastq")):
        ids.append(record.id)
        sequences.append(record.seq)

df = pd.DataFrame({'ID': ids, 'Sequence': sequences})

df.head()

Unnamed: 0,ID,Sequence
0,SRR10911129.1,"(N, T, G, C, C, A, G, C, C, G, C, C, G, C, G, ..."
1,SRR10911129.1,"(G, G, A, C, T, A, C, T, A, G, G, G, T, T, T, ..."
2,SRR10911129.2,"(N, T, G, C, C, A, G, C, C, G, C, C, G, C, G, ..."
3,SRR10911129.2,"(G, G, A, C, T, A, C, C, A, G, G, G, T, T, T, ..."
4,SRR10911129.3,"(N, T, G, C, C, A, G, C, A, G, C, C, G, C, G, ..."


In [22]:
genomic_sequence = df['Sequence'][0]

result_handle = NCBIWWW.qblast("blastn", "nt", genomic_sequence)

blast_records = NCBIXML.parse(result_handle)

for blast_record in blast_records:
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < 0.05:
                print("Bacteria Name:", alignment.title)

Bacteria Name: gi|380292720|gb|JQ048306.1| Uncultured bacterium clone TA_91 16S ribosomal RNA gene, partial sequence
Bacteria Name: gi|939472076|gb|KT294860.1| Uncultured Klebsiella sp. clone M01598_122_000000000-ADV8A_1_1101_17779_24276 16S ribosomal RNA gene, partial sequence
Bacteria Name: gi|195542990|gb|EU879536.1| Uncultured bacterium clone DE00600H03 16S ribosomal RNA gene, partial sequence
Bacteria Name: gi|195543047|gb|EU879593.1| Uncultured bacterium clone DE00600C01 16S ribosomal RNA gene, partial sequence
Bacteria Name: gi|643012907|gb|KF115476.1| Uncultured Klebsiella sp. clone GVYBRHI01DK8B9 16S ribosomal RNA gene, partial sequence
Bacteria Name: gi|195543029|gb|EU879575.1| Uncultured bacterium clone DE00601H04 16S ribosomal RNA gene, partial sequence
Bacteria Name: gi|195543020|gb|EU879566.1| Uncultured bacterium clone DE00600E12 16S ribosomal RNA gene, partial sequence
Bacteria Name: gi|814556964|gb|KP873177.1| Uncultured bacterium clone OTU_30 16S ribosomal RNA gene, p

### API - Parkinsons Male

In [210]:
runid = "ERR365949"

query = {"run_id":runid}  
url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
data = requests.post(url, data=json.dumps(query)).json()

## --get run List
run = data.get("run")

species = pd.DataFrame(data.get("species"))
genus = pd.DataFrame(data.get("genus"))

#species.drop(['ncbi_taxon_id', 'taxon_rank_level'], axis=1, inplace=True)
genusdrop = genus.drop(['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level'], axis=1)
genusrename = genusdrop.rename(columns={'relative_abundance': runid})
genusunique = genusrename.drop_duplicates(subset='scientific_name', keep='first')
genusunique.head()

Unnamed: 0,ERR365949,scientific_name
0,14.1255,Bacteroides
1,10.1877,Prevotella
2,9.02015,Ruminococcus
3,8.71108,Faecalibacterium
4,5.24267,Oscillospira


In [74]:
data

sex = data['run']['sex']
phenotypes = data['phenotypes']
bmi = data['run']['BMI']
age = data['run']['host_age']
country = data['run']['country']
mesh_id = phenotypes[0]['disease']

health = pd.DataFrame({'Sex': [sex], 'BMI': [bmi], 'Age': [age], 'Country': [country], 'Mesh ID': [mesh_id]})

health

##use this and the cell above for run id, phenotypes, gender, bmi, weight, country, and taxonomic profile 

Unnamed: 0,Sex,BMI,Age,Country,Mesh ID
0,Female,21.22,38,United States of America,D008171


In [109]:
run_id_file = pd.read_csv('Project data files/user_selected_unique_run_ids_TWk0lZG1sI.txt', 
                          header=None, index_col=None)
ERR_run_ids = run_id_file[run_id_file[0].str.startswith('ERR')][0].tolist()
ERR_run_idsdf = pd.DataFrame(ERR_run_ids)

In [198]:
parkinsons_male_species = []
parkinsons_male_health = []

for ids in ERR_run_ids: 
    query = {"run_id":ids}  
    url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
    data = requests.post(url, data=json.dumps(query)).json()
    run = data.get("run")
    genus = pd.DataFrame(data.get("genus"))
    genusdrop = genus.drop(['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level'], axis=1)
    genusrename = genusdrop.rename(columns={'relative_abundance': ids})
    genusunique = genusrename.drop_duplicates(subset='scientific_name', keep='first')
    sex = data['run']['sex']
    phenotypes = data['phenotypes']
    bmi = data['run']['BMI']
    age = data['run']['host_age']
    country = data['run']['country']
    mesh_id = phenotypes[0]['disease']
    health = pd.DataFrame({'Run ID': ids, 'Sex': [sex], 'BMI': [bmi], 'Age': [age], 'Country': [country], 
                           'Mesh ID': [mesh_id]})
    parkinsons_male_species.append(genusunique)
    parkinsons_male_health.append(health)

In [207]:
combined_health = parkinsons_male_health[0]

for df in parkinsons_male_health[1:]:
    combined_health = pd.concat([combined_health,df], ignore_index=True)

combined_health.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID
0,ERR365909,Male,,61,Finland,D010300
1,ERR365910,Male,,53,Finland,D010300
2,ERR365912,Male,,66,Finland,D010300
3,ERR365914,Male,,63,Finland,D010300
4,ERR365915,Male,,72,Finland,D010300


In [192]:
combined_df = parkinsons_male_species[0] 

for df in parkinsons_male_species[1:]:
    combined_df = pd.merge(combined_df, df, on='scientific_name', how='outer')
    combined_df.fillna(0, inplace=True)

columns_order = ['scientific_name'] + [col for col in combined_df.columns if col != 'scientific_name']
combined_df = combined_df[columns_order]

transposed_df = combined_df.T 


dfreset = transposed_df.reset_index()
dfreset.columns = dfreset.iloc[0]

dffinal = dfreset.drop(0).reset_index(drop=True)


dffinal.rename(columns={'scientific_name': 'Run ID'}, inplace=True)

dffinal.head()

Unnamed: 0,Run ID,Acidaminococcus,Adlercreutzia,Aggregatibacter,Akkermansia,Alistipes,Anaerostipes,Anaerotruncus,Bacteroides,Barnesiella,...,Shewanella,Streptococcus,Subdoligranulum,Succinivibrio,Sutterella,Turicibacter,Unknown,Veillonella,Vibrio,Victivallis
0,ERR365909,0.0,0.0,0.0,0.0,0.42913,0.138836,0.0,7.95153,0.896125,...,0.0,0.0,6.1088,0.0,0.0,0.0,90.3698,0.0,0.0,0.0
1,ERR365910,0.0,0.0,0.0,0.0,0.489687,0.059356,0.0,11.3667,0.0,...,0.0,0.0,4.34783,0.0,0.830984,0.14839,109.304,0.192907,0.0,0.0
2,ERR365912,3.34422,0.0,0.0,0.0,1.34553,0.0,0.0,30.6336,0.0,...,0.0,0.052253,10.5291,0.0,2.25996,0.07838,21.5545,0.0,0.0,0.0
3,ERR365914,0.0,0.0,0.0,0.0,0.191301,0.0,0.0,13.6629,0.0,...,0.0,0.0,4.19855,0.0,2.48691,0.0,111.72,0.0,0.0,0.0
4,ERR365915,3.72852,0.0,0.0,0.0,0.718898,0.0,0.0,43.9381,0.0,...,0.0,0.146217,1.63275,0.0,1.65712,0.0,49.4212,0.085293,0.0,0.0


In [372]:
run_ids = dffinal['Run ID'].unique()

dfmerged = []

for run_id in run_ids:
    merge = pd.merge(combined_health[combined_health['Run ID'] == run_id], dffinal[dffinal['Run ID'] == run_id], 
                    on='Run ID', how='inner')
    dfmerged.append(merge)
    
final_p_male_merge = pd.concat(dfmerged, ignore_index=True)

final_p_male_merge.to_csv('parkinsons_male.csv', index=False)

final_p_male_merge.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Acidaminococcus,Adlercreutzia,Aggregatibacter,Akkermansia,...,Shewanella,Streptococcus,Subdoligranulum,Succinivibrio,Sutterella,Turicibacter,Unknown,Veillonella,Vibrio,Victivallis
0,ERR365909,Male,,61,Finland,D010300,0.0,0.0,0.0,0.0,...,0.0,0.0,6.1088,0.0,0.0,0.0,90.3698,0.0,0.0,0.0
1,ERR365910,Male,,53,Finland,D010300,0.0,0.0,0.0,0.0,...,0.0,0.0,4.34783,0.0,0.830984,0.14839,109.304,0.192907,0.0,0.0
2,ERR365912,Male,,66,Finland,D010300,3.34422,0.0,0.0,0.0,...,0.0,0.052253,10.5291,0.0,2.25996,0.07838,21.5545,0.0,0.0,0.0
3,ERR365914,Male,,63,Finland,D010300,0.0,0.0,0.0,0.0,...,0.0,0.0,4.19855,0.0,2.48691,0.0,111.72,0.0,0.0,0.0
4,ERR365915,Male,,72,Finland,D010300,3.72852,0.0,0.0,0.0,...,0.0,0.146217,1.63275,0.0,1.65712,0.0,49.4212,0.085293,0.0,0.0


### API - Parkinsons Female

In [373]:
run_id_file2 = pd.read_csv('Project data files/user_selected_unique_run_ids_eOkL3H8EJi.txt', 
                          header=None, index_col=None)
ERR_p_female_run_ids = run_id_file2[run_id_file2[0].str.startswith('ERR')][0].tolist()
ERR_p_female_run_idsdf = pd.DataFrame(ERR_p_female_run_ids)

parkinsons_female_species = []
parkinsons_female_health = []

for ids in ERR_p_female_run_ids: 
    query = {"run_id":ids}  
    url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
    data = requests.post(url, data=json.dumps(query)).json()
    run = data.get("run")
    genus = pd.DataFrame(data.get("genus"))
    genusdrop = genus.drop(['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level'], axis=1)
    genusrename = genusdrop.rename(columns={'relative_abundance': ids})
    genusunique = genusrename.drop_duplicates(subset='scientific_name', keep='first')
    sex = data['run']['sex']
    phenotypes = data['phenotypes']
    bmi = data['run']['BMI']
    age = data['run']['host_age']
    country = data['run']['country']
    mesh_id = phenotypes[0]['disease']
    health = pd.DataFrame({'Run ID': ids, 'Sex': [sex], 'BMI': [bmi], 'Age': [age], 'Country': [country], 
                           'Mesh ID': [mesh_id]})
    parkinsons_female_species.append(genusunique)
    parkinsons_female_health.append(health)

combined_pfemale_health = parkinsons_female_health[0]

for df in parkinsons_female_health[1:]:
    combined_pfemale_health = pd.concat([combined_pfemale_health,df], ignore_index=True)


combined_pfemale_species = parkinsons_female_species[0] 

for df in parkinsons_female_species[1:]:
    combined_pfemale_species = pd.merge(combined_pfemale_species, df, on='scientific_name', how='outer')
    combined_pfemale_species.fillna(0, inplace=True)

columns_order_pfemale = ['scientific_name'] + [col for col in combined_pfemale_species.columns if col != 'scientific_name']
combined_pfemale_species = combined_pfemale_species[columns_order_pfemale]

transposed_pfemale_df = combined_pfemale_species.T 


dfreset_pfemale = transposed_pfemale_df.reset_index()
dfreset_pfemale.columns = dfreset_pfemale.iloc[0]

dffinal_pfemale = dfreset_pfemale.drop(0).reset_index(drop=True)


dffinal_pfemale.rename(columns={'scientific_name': 'Run ID'}, inplace=True)

run_ids = dffinal_pfemale['Run ID'].unique()

dfmerged_pfemale = []

for run_id in run_ids:
    merge = pd.merge(combined_pfemale_health[combined_pfemale_health['Run ID'] == run_id], 
                     dffinal_pfemale[dffinal_pfemale['Run ID'] == run_id], on='Run ID', how='inner')
    dfmerged_pfemale.append(merge)
    
final_p_female_merge = pd.concat(dfmerged_pfemale, ignore_index=True)

final_p_female_merge.to_csv('parkinsons_female.csv', index=False)

final_p_female_merge.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Acidaminococcus,Acidovorax,Actinomyces,Adlercreutzia,...,Streptococcus,Subdoligranulum,Succinatimonas,Succiniclasticum,Sutterella,Turicibacter,Unknown,Varibaculum,Veillonella,Victivallis
0,ERR365911,Female,,73,Finland,D010300,0.0,0.0,0.0,0.0,...,0.065172,1.61627,0.0,0.0,2.86757,0.0,65.0156,0.0,0.755996,0.208551
1,ERR365913,Female,,64,Finland,D010300,0.0,0.0,0.0,0.0,...,0.167344,4.09993,0.0,0.0,0.0,0.0,95.6969,0.0,0.0,0.0
2,ERR365918,Female,,73,Finland,D010300,0.0,0.0,0.0,0.0,...,0.091408,3.4082,0.0,0.0,1.37112,0.287281,72.6822,0.0,0.0,0.0
3,ERR365919,Female,,64,Finland,D010300,0.0,0.0,0.0,0.031803,...,0.87989,8.89431,0.0,0.0,0.0,0.0,55.8465,0.0,0.0,0.0
4,ERR365921,Female,,66,Finland,D010300,0.0,0.0,0.0,0.0,...,0.87146,5.73711,0.0,0.0,0.43573,0.0,39.5062,0.0,0.363108,0.0


### API - Alzheimers Male

In [374]:
run_id_file3 = pd.read_csv('Project data files/user_selected_run_list_eRepc1nyt6.txt', header=None, sep='\t')

ERR_alz_male_run_ids = run_id_file3[0]
ERR_alz_male_run_ids_drop = ERR_alz_male_run_ids.drop(index=[0, 2, 3, 4])

alz_male_species = []
alz_male_health = []

for ids in ERR_alz_male_run_ids_drop: 
    query = {"run_id":ids}  
    url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
    data = requests.post(url, data=json.dumps(query)).json()
    run = data.get("run")
    genus = pd.DataFrame(data.get("genus"))
    genusdrop = genus.drop(['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level'], axis=1)
    genusrename = genusdrop.rename(columns={'relative_abundance': ids})
    genusunique = genusrename.drop_duplicates(subset='scientific_name', keep='first')
    sex = data['run']['sex']
    phenotypes = data['phenotypes']
    diseases = [item['disease'] for item in phenotypes]
    bmi = data['run']['BMI']
    age = data['run']['host_age']
    country = data['run']['country']
    health = pd.DataFrame([{'Run ID': ids, 'Sex': sex, 'BMI': bmi, 'Age': age, 'Country': country, 'Mesh ID': diseases}])
    alz_male_species.append(genusunique)
    alz_male_health.append(health)

combined_malz_health = alz_male_health[0]

for df in alz_male_health[1:]:
    combined_malz_health = pd.concat([combined_malz_health, df], ignore_index=True)


combined_malz_species = alz_male_species[0] 

for df1 in alz_male_species[1:]:
    combined_malz_species = pd.merge(combined_malz_species, df1, on='scientific_name', how='outer')
    combined_malz_species.fillna(0, inplace=True)

columns_order_malz = ['scientific_name'] + [col for col in combined_malz_species.columns if col != 'scientific_name']
combined_malz_species = combined_malz_species[columns_order_malz]

transposed_malz_df = combined_malz_species.T 


dfreset_malz = transposed_malz_df.reset_index()
dfreset_malz.columns = dfreset_malz.iloc[0]

dffinal_malz = dfreset_malz.drop(0).reset_index(drop=True)


dffinal_malz.rename(columns={'scientific_name': 'Run ID'}, inplace=True)

run_ids_malz = dffinal_malz['Run ID'].unique()

dfmerged_malz = []

for run_id in run_ids_malz:
    merge = pd.merge(combined_malz_health[combined_malz_health['Run ID'] == run_id], 
                     dffinal_malz[dffinal_malz['Run ID'] == run_id], on='Run ID', how='inner')
    dfmerged_malz.append(merge)
    
final_malz_merge = pd.concat(dfmerged_malz, ignore_index=True)

final_malz_merge.to_csv('alzheimers_male.csv', index=False)

final_malz_merge.head()



  combined_malz_health = pd.concat([combined_malz_health, df], ignore_index=True)


Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acholeplasma,Acidaminococcus,...,Thermoanaerobacter,Truepera,Tyzzerella,Uliginosibacterium,Unknown,Veillonella,Victivallis,Virgibacillus,Weissella,Yersinia
0,ERR1090510,Male,30.11,31,United States of America,[D000544],0.0,0.008079,0.0,0.937222,...,0.0,0.0,0.339339,0.032318,18.9626,0.0,0.0,0.0,0.0,0.0
1,ERR1843463,Male,24.41,0,United States of America,"[D000544, D001327]",0.0,0.0,0.0,0.036957,...,0.0,0.0,0.0,0.0,42.9544,0.026398,0.042236,0.015839,0.0,0.0
2,ERR1844528,Male,24.41,0,United States of America,"[D000544, D001327]",0.0,0.0,0.0,0.027337,...,0.0,0.005467,0.0,0.0,42.1542,0.049207,0.027337,0.0,0.0,0.005467
3,ERR2091834,Male,23.72,56,United Kingdom,[D000544],0.0,0.0,0.078935,0.036836,...,0.010525,0.0,0.005262,0.0,11.756,0.0,0.0,0.0,0.0,0.0
4,SRR9671458,Male,,64,China,[D000544],0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,183.001,0.0,0.0,0.0,0.0,0.0


### API - Alzheimers Female

In [375]:
run_id_file4 = pd.read_csv('Project data files/user_selected_run_list_y0G7FDBUha.txt', header=None, sep='\t')

ERR_alz_female_run_ids = run_id_file4[0]
ERR_alz_female_run_ids_drop = ERR_alz_female_run_ids.drop(index=[0, 1, 3, 4, 5])

alz_female_species = []
alz_female_health = []

for ids in ERR_alz_female_run_ids_drop: 
    query = {"run_id":ids}  
    url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
    data = requests.post(url, data=json.dumps(query)).json()
    run = data.get("run")
    genus = pd.DataFrame(data.get("genus"))
    genusdrop = genus.drop(['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level'], axis=1)
    genusrename = genusdrop.rename(columns={'relative_abundance': ids})
    genusunique = genusrename.drop_duplicates(subset='scientific_name', keep='first')
    sex = data['run']['sex']
    phenotypes = data['phenotypes']
    diseases = [item['disease'] for item in phenotypes]
    bmi = data['run']['BMI']
    age = data['run']['host_age']
    country = data['run']['country']
    health = pd.DataFrame([{'Run ID': ids, 'Sex': sex, 'BMI': bmi, 'Age': age, 'Country': country, 'Mesh ID': diseases}])
    alz_female_species.append(genusunique)
    alz_female_health.append(health)

combined_femalz_health = alz_female_health[0]

for df in alz_female_health[1:]:
    combined_femalz_health = pd.concat([combined_femalz_health, df], ignore_index=True)


combined_femalz_species = alz_female_species[0] 

for df1 in alz_female_species[1:]:
    combined_femalz_species = pd.merge(combined_femalz_species, df1, on='scientific_name', how='outer')
    combined_femalz_species.fillna(0, inplace=True)

columns_order_femalz = ['scientific_name'] + [col for col in combined_femalz_species.columns if col != 'scientific_name']
combined_femalz_species = combined_femalz_species[columns_order_femalz]

transposed_femalz_df = combined_femalz_species.T 


dfreset_femalz = transposed_femalz_df.reset_index()
dfreset_femalz.columns = dfreset_femalz.iloc[0]

dffinal_femalz = dfreset_femalz.drop(0).reset_index(drop=True)


dffinal_femalz.rename(columns={'scientific_name': 'Run ID'}, inplace=True)

run_ids_femalz = dffinal_femalz['Run ID'].unique()

dfmerged_femalz = []

for run_id in run_ids_femalz:
    merge = pd.merge(combined_femalz_health[combined_femalz_health['Run ID'] == run_id], 
                     dffinal_femalz[dffinal_femalz['Run ID'] == run_id], on='Run ID', how='inner')
    dfmerged_femalz.append(merge)
    
final_femalz_merge = pd.concat(dfmerged_femalz, ignore_index=True)

final_femalz_merge.to_csv('alzheimers_female.csv', index=False)

final_femalz_merge.head()



  combined_femalz_health = pd.concat([combined_femalz_health, df], ignore_index=True)


Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Acetobacterium,Acholeplasma,Acidaminococcus,Acidihalobacter,...,Treponema,Tyzzerella,Unknown,Varibaculum,Veillonella,Verrucomicrobium,Vibrio,Victivallis,Viridibacillus,Weissella
0,ERR1160800,Female,26.09,53,United States of America,"[D000544, D008881]",0.001165,0.00932,0.33086,0.001165,...,0.0,0.001165,40.5105,0.00932,1.00889,0.005825,0.08854,0.00233,0.001165,0.00233
1,ERR2091942,Female,25.07,53,United Kingdom,"[D000544, D007410]",0.0,0.946936,0.0,0.0,...,0.0,0.003573,35.6012,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,SRR9671487,Female,,67,China,[D000544],0.0,0.0,0.0,0.0,...,0.0,0.0,170.064,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SRR9671488,Female,,80,China,[D000544],0.0,0.0,0.0,0.0,...,0.0,0.0,114.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,SRR9671489,Female,,57,China,[D000544],0.0,0.0,0.0,0.0,...,0.0,0.0,91.363,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### API - Schizophrenia Male

In [376]:
run_id_file5 = pd.read_csv('Project data files/user_selected_run_list_kcYhadsFw3.txt', header=None, sep='\t')

ERR_s_male_run_ids = run_id_file5[0]
ERR_s_male_run_ids_drop = ERR_s_male_run_ids.drop(index=0)

sch_male_species = []
sch_male_health = []

for ids in ERR_s_male_run_ids_drop: 
    query = {"run_id":ids}  
    url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
    data = requests.post(url, data=json.dumps(query)).json()
    run = data.get("run")
    genus = pd.DataFrame(data.get("genus"))
    if genus.empty:
        continue
    genusdrop = genus.drop(['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level'], axis=1)
    genusrename = genusdrop.rename(columns={'relative_abundance': ids})
    genusunique = genusrename.drop_duplicates(subset='scientific_name', keep='first')
    sex = data['run']['sex']
    phenotypes = data['phenotypes']
    diseases = [item['disease'] for item in phenotypes]
    bmi = data['run']['BMI']
    age = data['run']['host_age']
    country = data['run']['country']
    health = pd.DataFrame([{'Run ID': ids, 'Sex': sex, 'BMI': bmi, 'Age': age, 'Country': country, 
                            'Mesh ID': diseases}])
    sch_male_species.append(genusunique)
    sch_male_health.append(health)

combined_msch_health = sch_male_health[0]

for df in sch_male_health[1:]:
    combined_msch_health = pd.concat([combined_msch_health, df], ignore_index=True)


combined_msch_species = sch_male_species[0] 

for df1 in sch_male_species[1:]:
    combined_msch_species = pd.merge(combined_msch_species, df1, on='scientific_name', how='outer')
    combined_msch_species.fillna(0, inplace=True)

columns_order_msch = ['scientific_name'] + [col for col in combined_msch_species.columns if col != 'scientific_name']
combined_msch_species = combined_msch_species[columns_order_msch]

transposed_msch_df = combined_msch_species.T 


dfreset_msch = transposed_msch_df.reset_index()
dfreset_msch.columns = dfreset_msch.iloc[0]

dffinal_msch = dfreset_msch.drop(0).reset_index(drop=True)


dffinal_msch.rename(columns={'scientific_name': 'Run ID'}, inplace=True)

run_ids_msch = dffinal_msch['Run ID'].unique()

dfmerged_msch = []

for run_id in run_ids_msch:
    merge = pd.merge(combined_msch_health[combined_msch_health['Run ID'] == run_id], 
                     dffinal_msch[dffinal_msch['Run ID'] == run_id], on='Run ID', how='inner')
    dfmerged_msch.append(merge)
    
final_msch_merge = pd.concat(dfmerged_msch, ignore_index=True)

final_msch_merge.to_csv('schizophrenia_male.csv', index=False)

final_msch_merge.head()



Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetoanaerobium,Acetobacter,...,Weissella,Wenyingzhuangia,Wenzhouxiangella,Wolbachia,Xanthomonas,Xenorhabdus,Yersinia,Youngiibacter,Zoogloea,Zymomonas
0,ERR1072629,Male,17.67,64,United States of America,"[D001714, D003863, D007410, D008171, D012559, ...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00484,0.0
1,ERR1072937,Male,26.51,53,United States of America,"[D001714, D002318, D003863, D012559]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1073491,Male,28.08,49,United States of America,"[D001714, D003863, D012559]",0.003157,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ERR1075554,Male,30.74,62,United States of America,"[D001289, D001714, D003863, D008881, D012559]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR1075686,Male,45.84,56,United States of America,"[D001714, D003863, D008171, D012559]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### API - Schizophrenia Female

In [377]:
run_id_file6 = pd.read_csv('Project data files/user_selected_run_list_31dJWq43Oe.txt', header=None, sep='\t')

ERR_s_female_run_ids = run_id_file6[0]
ERR_s_female_run_ids_drop = ERR_s_female_run_ids.drop(index=0)

sch_female_species = []
sch_female_health = []

for ids in ERR_s_female_run_ids_drop: 
    query = {"run_id":ids}  
    url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
    data = requests.post(url, data=json.dumps(query)).json()
    run = data.get("run")
    genus = pd.DataFrame(data.get("genus"))
    if genus.empty:
        continue
    genusdrop = genus.drop(['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level'], axis=1)
    genusrename = genusdrop.rename(columns={'relative_abundance': ids})
    genusunique = genusrename.drop_duplicates(subset='scientific_name', keep='first')
    sex = data['run']['sex']
    phenotypes = data['phenotypes']
    diseases = [item['disease'] for item in phenotypes]
    bmi = data['run']['BMI']
    age = data['run']['host_age']
    country = data['run']['country']
    health = pd.DataFrame([{'Run ID': ids, 'Sex': sex, 'BMI': bmi, 'Age': age, 'Country': country, 
                            'Mesh ID': diseases}])
    sch_female_species.append(genusunique)
    sch_female_health.append(health)
    

combined_femsch_health = sch_female_health[0]

for df in sch_female_health[1:]:
    combined_femsch_health = pd.concat([combined_femsch_health, df], ignore_index=True)
    
combined_femsch_species = sch_female_species[0] 

for df1 in sch_female_species[1:]:
    combined_femsch_species = pd.merge(combined_femsch_species, df1, on='scientific_name', how='outer')
    combined_femsch_species.fillna(0, inplace=True)

columns_order_femsch = ['scientific_name'] + [col for col in combined_femsch_species.columns if col != 'scientific_name']
combined_femsch_species = combined_femsch_species[columns_order_femsch]

transposed_femsch_df = combined_femsch_species.T 


dfreset_femsch = transposed_femsch_df.reset_index()
dfreset_femsch.columns = dfreset_femsch.iloc[0]

dffinal_femsch = dfreset_femsch.drop(0).reset_index(drop=True)


dffinal_femsch.rename(columns={'scientific_name': 'Run ID'}, inplace=True)

run_ids_femsch = dffinal_femsch['Run ID'].unique()

dfmerged_femsch = []

for run_id in run_ids_femsch:
    merge = pd.merge(combined_femsch_health[combined_femsch_health['Run ID'] == run_id], 
                     dffinal_femsch[dffinal_femsch['Run ID'] == run_id], on='Run ID', how='inner')
    dfmerged_femsch.append(merge)
    
final_femsch_merge = pd.concat(dfmerged_femsch, ignore_index=True)

final_femsch_merge.to_csv('schizophrenia_female.csv', index=False)

final_femsch_merge.head()


Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetivibrio,Acetoanaerobium,...,Williamsia,Wolbachia,Xanthomonas,Xenococcus,Xenorhabdus,Yersinia,Yokenella,Youngiibacter,Zoogloea,Zymomonas
0,ERR1073023,Female,34.72,35,United States of America,"[D001714, D003863, D012559]",0.0,0.005583,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ERR1073394,Female,22.41,27,United States of America,"[D001327, D001714, D003863, D003920, D003967, ...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1073395,Female,22.41,27,United States of America,"[D001327, D001714, D003863, D003920, D003967, ...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00247,0.0
3,ERR1073490,Female,25.73,37,United States of America,"[D001714, D003863, D012559]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR1073812,Female,29.71,55,United States of America,"[D001289, D001714, D003015, D003863, D003920, ...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### API - Bipolar Male

In [378]:
run_id_file7 = pd.read_csv('Project data files/user_selected_run_list_M73s5vbyby.txt', header=None, sep='\t')

ERR_bi_male_run_ids = run_id_file7[0]
ERR_bi_male_run_ids_drop = ERR_bi_male_run_ids.drop(index=0)

bi_male_species = []
bi_male_health = []

for ids in ERR_bi_male_run_ids_drop: 
    query = {"run_id":ids}  
    url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
    data = requests.post(url, data=json.dumps(query)).json()
    run = data.get("run")
    genus = pd.DataFrame(data.get("genus"))
    if genus.empty:
        continue
    genusdrop = genus.drop(['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level'], axis=1)
    genusrename = genusdrop.rename(columns={'relative_abundance': ids})
    genusunique = genusrename.drop_duplicates(subset='scientific_name', keep='first')
    sex = data['run']['sex']
    phenotypes = data['phenotypes']
    diseases = [item['disease'] for item in phenotypes]
    bmi = data['run']['BMI']
    age = data['run']['host_age']
    country = data['run']['country']
    health = pd.DataFrame([{'Run ID': ids, 'Sex': sex, 'BMI': bmi, 'Age': age, 'Country': country, 
                            'Mesh ID': diseases}])
    bi_male_species.append(genusunique)
    bi_male_health.append(health)
    

combined_mbi_health = bi_male_health[0]

for df in bi_male_health[1:]:
    combined_mbi_health = pd.concat([combined_mbi_health, df], ignore_index=True)
    
combined_mbi_species = bi_male_species[0] 

for df1 in bi_male_species[1:]:
    combined_mbi_species = pd.merge(combined_mbi_species, df1, on='scientific_name', how='outer')
    combined_mbi_species.fillna(0, inplace=True)

columns_order_mbi = ['scientific_name'] + [col for col in combined_mbi_species.columns if col != 'scientific_name']
combined_mbi_species = combined_mbi_species[columns_order_mbi]

transposed_mbi_df = combined_mbi_species.T 


dfreset_mbi = transposed_mbi_df.reset_index()
dfreset_mbi.columns = dfreset_mbi.iloc[0]

dffinal_mbi = dfreset_mbi.drop(0).reset_index(drop=True)


dffinal_mbi.rename(columns={'scientific_name': 'Run ID'}, inplace=True)

run_ids_mbi = dffinal_mbi['Run ID'].unique()

dfmerged_mbi = []

for run_id in run_ids_mbi:
    merge = pd.merge(combined_mbi_health[combined_mbi_health['Run ID'] == run_id], 
                     dffinal_mbi[dffinal_mbi['Run ID'] == run_id], on='Run ID', how='inner')
    dfmerged_mbi.append(merge)
    
final_mbi_merge = pd.concat(dfmerged_mbi, ignore_index=True)

final_mbi_merge.to_csv('bipolar_male.csv', index=False)

final_mbi_merge.head()


Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetivibrio,Acetoanaerobium,...,Weissella,Wenyingzhuangia,Wenzhouxiangella,Wolbachia,Xanthomonas,Xenorhabdus,Yersinia,Youngiibacter,Zoogloea,Zymomonas
0,ERR1072629,Male,17.67,64,United States of America,"[D001714, D003863, D007410, D008171, D012559, ...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00484,0.0
1,ERR1072937,Male,26.51,53,United States of America,"[D001714, D002318, D003863, D012559]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1073491,Male,28.08,49,United States of America,"[D001714, D003863, D012559]",0.003157,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ERR1075554,Male,30.74,62,United States of America,"[D001289, D001714, D003863, D008881, D012559]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR1075686,Male,45.84,56,United States of America,"[D001714, D003863, D008171, D012559]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### API - Bipolar Female

In [379]:
run_id_file8 = pd.read_csv('Project data files/user_selected_run_list_A2hJFf9rzG.txt', header=None, sep='\t')

ERR_bi_female_run_ids = run_id_file8[0]
ERR_bi_female_run_ids_drop = ERR_bi_female_run_ids.drop(index=0)

bi_female_species = []
bi_female_health = []

for ids in ERR_bi_female_run_ids_drop: 
    query = {"run_id":ids}  
    url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
    data = requests.post(url, data=json.dumps(query)).json()
    run = data.get("run")
    genus = pd.DataFrame(data.get("genus"))
    if genus.empty:
        continue
    genusdrop = genus.drop(['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level'], axis=1)
    genusrename = genusdrop.rename(columns={'relative_abundance': ids})
    genusunique = genusrename.drop_duplicates(subset='scientific_name', keep='first')
    sex = data['run']['sex']
    phenotypes = data['phenotypes']
    diseases = [item['disease'] for item in phenotypes]
    bmi = data['run']['BMI']
    age = data['run']['host_age']
    country = data['run']['country']
    health = pd.DataFrame([{'Run ID': ids, 'Sex': sex, 'BMI': bmi, 'Age': age, 'Country': country, 
                            'Mesh ID': diseases}])
    bi_female_species.append(genusunique)
    bi_female_health.append(health)
    

combined_fembi_health = bi_female_health[0]

for df in bi_female_health[1:]:
    combined_fembi_health = pd.concat([combined_fembi_health, df], ignore_index=True)
    
combined_fembi_species = bi_female_species[0] 

for df1 in bi_female_species[1:]:
    combined_fembi_species = pd.merge(combined_fembi_species, df1, on='scientific_name', how='outer')
    combined_fembi_species.fillna(0, inplace=True)

columns_order_fembi = ['scientific_name'] + [col for col in combined_fembi_species.columns if col != 'scientific_name']
combined_fembi_species = combined_fembi_species[columns_order_fembi]

transposed_fembi_df = combined_fembi_species.T 


dfreset_fembi = transposed_fembi_df.reset_index()
dfreset_fembi.columns = dfreset_fembi.iloc[0]

dffinal_fembi = dfreset_fembi.drop(0).reset_index(drop=True)


dffinal_fembi.rename(columns={'scientific_name': 'Run ID'}, inplace=True)

run_ids_fembi = dffinal_fembi['Run ID'].unique()

dfmerged_fembi = []

for run_id in run_ids_fembi:
    merge = pd.merge(combined_fembi_health[combined_fembi_health['Run ID'] == run_id], 
                     dffinal_fembi[dffinal_fembi['Run ID'] == run_id], on='Run ID', how='inner')
    dfmerged_fembi.append(merge)
    
final_fembi_merge = pd.concat(dfmerged_fembi, ignore_index=True)

final_fembi_merge.to_csv('bipolar_female.csv', index=False)

final_fembi_merge.head()


Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetivibrio,Acetoanaerobium,...,Williamsia,Wolbachia,Xanthomonas,Xenococcus,Xenorhabdus,Yersinia,Yokenella,Youngiibacter,Zoogloea,Zymomonas
0,ERR1073023,Female,34.72,35,United States of America,"[D001714, D003863, D012559]",0.0,0.005583,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ERR1073394,Female,22.41,27,United States of America,"[D001327, D001714, D003863, D003920, D003967, ...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1073395,Female,22.41,27,United States of America,"[D001327, D001714, D003863, D003920, D003967, ...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00247,0.0
3,ERR1073490,Female,25.73,37,United States of America,"[D001714, D003863, D012559]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR1073812,Female,29.71,55,United States of America,"[D001289, D001714, D003015, D003863, D003920, ...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### API - Epilepsy Male

In [380]:
run_id_file9 = pd.read_csv('Project data files/user_selected_run_list_CYqElYhtzf.txt', header=None, sep='\t')

ERR_ep_male_run_ids = run_id_file9[0]
ERR_ep_male_run_ids_drop = ERR_ep_male_run_ids.drop(index=0)

ep_male_species = []
ep_male_health = []

for ids in ERR_ep_male_run_ids_drop: 
    query = {"run_id":ids}  
    url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
    data = requests.post(url, data=json.dumps(query)).json()
    run = data.get("run")
    genus = pd.DataFrame(data.get("genus"))
    if genus.empty:
        continue
    genusdrop = genus.drop(['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level'], axis=1)
    genusrename = genusdrop.rename(columns={'relative_abundance': ids})
    genusunique = genusrename.drop_duplicates(subset='scientific_name', keep='first')
    sex = data['run']['sex']
    phenotypes = data['phenotypes']
    diseases = [item['disease'] for item in phenotypes]
    bmi = data['run']['BMI']
    age = data['run']['host_age']
    country = data['run']['country']
    health = pd.DataFrame([{'Run ID': ids, 'Sex': sex, 'BMI': bmi, 'Age': age, 'Country': country, 
                            'Mesh ID': diseases}])
    ep_male_species.append(genusunique)
    ep_male_health.append(health)
    

combined_mep_health = ep_male_health[0]

for df in ep_male_health[1:]:
    combined_mep_health = pd.concat([combined_mep_health, df], ignore_index=True)
    
combined_mep_species = ep_male_species[0] 

for df1 in ep_male_species[1:]:
    combined_mep_species = pd.merge(combined_mep_species, df1, on='scientific_name', how='outer')
    combined_mep_species.fillna(0, inplace=True)

columns_order_mep = ['scientific_name'] + [col for col in combined_mep_species.columns if col != 'scientific_name']
combined_mep_species = combined_mep_species[columns_order_mep]

transposed_mep_df = combined_mep_species.T 


dfreset_mep = transposed_mep_df.reset_index()
dfreset_mep.columns = dfreset_mep.iloc[0]

dffinal_mep = dfreset_mep.drop(0).reset_index(drop=True)


dffinal_mep.rename(columns={'scientific_name': 'Run ID'}, inplace=True)

run_ids_mep = dffinal_mep['Run ID'].unique()

dfmerged_mep = []

for run_id in run_ids_mep:
    merge = pd.merge(combined_mep_health[combined_mep_health['Run ID'] == run_id], 
                     dffinal_mep[dffinal_mep['Run ID'] == run_id], on='Run ID', how='inner')
    dfmerged_mep.append(merge)
    
final_mep_merge = pd.concat(dfmerged_mep, ignore_index=True)

final_mep_merge.to_csv('epilepsy_male.csv', index=False)

final_mep_merge.head()


Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetoanaerobium,Acholeplasma,...,Vibrio,Victivallis,Virgibacillus,Vulgatibacter,Weissella,Xanthomonas,Xenorhabdus,Yersinia,Zoogloea,Zymomonas
0,ERR1072832,Male,24.93,58,United States of America,[D004827],0.0,0.0,0.00357,0.0,...,0.0,0.00357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ERR1076844,Male,21.02,36,United States of America,"[D003248, D004827, D008171, D013959]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1089831,Male,22.91,43,United States of America,[D004827],0.0,0.0,0.0,0.0,...,0.0,0.005888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ERR1089867,Male,15.62,6,Canada,[D004827],0.0,0.0,0.0,0.0,...,0.0,0.005153,0.0,0.0,0.010306,0.0,0.0,0.0,0.0,0.0
4,ERR1089868,Male,15.62,6,Canada,[D004827],0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### API - Epilepsy Female

In [381]:
run_id_file10 = pd.read_csv('Project data files/user_selected_run_list_WZhhuTry3w.txt', header=None, sep='\t')

ERR_ep_female_run_ids = run_id_file10[0]
ERR_ep_female_run_ids_drop = ERR_ep_female_run_ids.drop(index=0)

ep_female_species = []
ep_female_health = []

for ids in ERR_ep_female_run_ids_drop: 
    query = {"run_id":ids}  
    url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
    data = requests.post(url, data=json.dumps(query)).json()
    run = data.get("run")
    genus = pd.DataFrame(data.get("genus"))
    if genus.empty:
        continue
    genusdrop = genus.drop(['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level'], axis=1)
    genusrename = genusdrop.rename(columns={'relative_abundance': ids})
    genusunique = genusrename.drop_duplicates(subset='scientific_name', keep='first')
    sex = data['run']['sex']
    phenotypes = data['phenotypes']
    diseases = [item['disease'] for item in phenotypes]
    bmi = data['run']['BMI']
    age = data['run']['host_age']
    country = data['run']['country']
    health = pd.DataFrame([{'Run ID': ids, 'Sex': sex, 'BMI': bmi, 'Age': age, 'Country': country, 
                            'Mesh ID': diseases}])
    ep_female_species.append(genusunique)
    ep_female_health.append(health)
    

combined_femep_health = ep_female_health[0]

for df in ep_female_health[1:]:
    combined_femep_health = pd.concat([combined_femep_health, df], ignore_index=True)
    
combined_femep_species = ep_female_species[0] 

for df1 in ep_female_species[1:]:
    combined_femep_species = pd.merge(combined_femep_species, df1, on='scientific_name', how='outer')
    combined_femep_species.fillna(0, inplace=True)

columns_order_femep = ['scientific_name'] + [col for col in combined_femep_species.columns if col != 'scientific_name']
combined_femep_species = combined_femep_species[columns_order_femep]

transposed_femep_df = combined_femep_species.T 


dfreset_femep = transposed_femep_df.reset_index()
dfreset_femep.columns = dfreset_femep.iloc[0]

dffinal_femep = dfreset_femep.drop(0).reset_index(drop=True)


dffinal_femep.rename(columns={'scientific_name': 'Run ID'}, inplace=True)

run_ids_femep = dffinal_femep['Run ID'].unique()

dfmerged_femep = []

for run_id in run_ids_femep:
    merge = pd.merge(combined_femep_health[combined_femep_health['Run ID'] == run_id], 
                     dffinal_femep[dffinal_femep['Run ID'] == run_id], on='Run ID', how='inner')
    dfmerged_femep.append(merge)
    
final_femep_merge = pd.concat(dfmerged_femep, ignore_index=True)

final_femep_merge.to_csv('epilepsy_female.csv', index=False)

final_femep_merge.head()


Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Acanthopleuribacter,Acetivibrio,Acetoanaerobium,Acetobacterium,...,Wandonia,Weeksella,Weissella,Wenzhouxiangella,Wolbachia,Xanthomonas,Xenorhabdus,Yersinia,Youngiibacter,Zoogloea
0,ERR1073812,Female,29.71,55,United States of America,"[D001289, D001714, D003015, D003863, D003920, ...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ERR1090471,Female,38.01,47,United States of America,"[D001327, D001714, D003248, D003863, D004827, ...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.004069,0.0,0.0,0.0
2,ERR1091746,Female,31.09,59,United States of America,"[D001327, D003920, D004827, D013959]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001851,0.005553,0.0,0.0
3,ERR1091758,Female,21.79,62,United States of America,"[D001289, D001327, D001714, D002318, D003248, ...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002317,0.0,0.0
4,ERR1160392,Female,35.19,50,United States of America,"[D001714, D003248, D003863, D004827, D008171, ...",0.0,0.0,0.001275,0.001275,...,0.0,0.0,0.00255,0.001275,0.001275,0.0,0.001275,0.005101,0.0,0.0


### API - Health Male

In [388]:
run_id_file11 = pd.read_csv('Project data files/user_selected_run_list_DW8kIIrxag.txt', header=None, sep='\t')

ERR_he_male_run_ids = run_id_file11[0]
ERR_he_male_run_ids_drop = ERR_he_male_run_ids.drop(index=0)
ERR_he_male_run_ids_sampled = ERR_he_male_run_ids_drop.sample(n=300, random_state=42)

he_male_species = []
he_male_health = []

for ids in ERR_he_male_run_ids_sampled: 
    query = {"run_id":ids}  
    url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
    data = requests.post(url, data=json.dumps(query)).json()
    run = data.get("run")
    genus = pd.DataFrame(data.get("genus"))
    if genus.empty:
        continue
    genusdrop = genus.drop(['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level'], axis=1)
    genusrename = genusdrop.rename(columns={'relative_abundance': ids})
    genusunique = genusrename.drop_duplicates(subset='scientific_name', keep='first')
    sex = data['run']['sex']
    phenotypes = data['phenotypes']
    diseases = [item['disease'] for item in phenotypes]
    bmi = data['run']['BMI']
    age = data['run']['host_age']
    country = data['run']['country']
    health = pd.DataFrame([{'Run ID': ids, 'Sex': sex, 'BMI': bmi, 'Age': age, 'Country': country, 
                            'Mesh ID': diseases}])
    he_male_species.append(genusunique)
    he_male_health.append(health)
    

combined_mhe_health = he_male_health[0]

for df in he_male_health[1:]:
    combined_mhe_health = pd.concat([combined_mhe_health, df], ignore_index=True)
    
combined_mhe_species = he_male_species[0] 

for df1 in he_male_species[1:]:
    combined_mhe_species = pd.merge(combined_mhe_species, df1, on='scientific_name', how='outer')
    combined_mhe_species.fillna(0, inplace=True)

columns_order_mhe = ['scientific_name'] + [col for col in combined_mhe_species.columns if col != 'scientific_name']
combined_mhe_species = combined_mhe_species[columns_order_mhe]

transposed_mhe_df = combined_mhe_species.T 


dfreset_mhe = transposed_mhe_df.reset_index()
dfreset_mhe.columns = dfreset_mhe.iloc[0]

dffinal_mhe = dfreset_mhe.drop(0).reset_index(drop=True)


dffinal_mhe.rename(columns={'scientific_name': 'Run ID'}, inplace=True)

run_ids_mhe = dffinal_mhe['Run ID'].unique()

dfmerged_mhe = []

for run_id in run_ids_mhe:
    merge = pd.merge(combined_mhe_health[combined_mhe_health['Run ID'] == run_id], 
                     dffinal_mhe[dffinal_mhe['Run ID'] == run_id], on='Run ID', how='inner')
    dfmerged_mhe.append(merge)
    
final_mhe_merge = pd.concat(dfmerged_mhe, ignore_index=True)

final_mhe_merge.to_csv('health_male300.csv', index=False)

final_mhe_merge.head()


  combined_mhe_health = pd.concat([combined_mhe_health, df], ignore_index=True)


Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acetivibrio,Acetoanaerobium,Acetobacter,...,unclassified Clostridiales (miscellaneous),unclassified Clostridiales Family XIII. Incertae Sedis,unclassified Coriobacteriaceae,unclassified Erysipelotrichaceae (miscellaneous),unclassified Lachnospiraceae,unclassified Peptostreptococcaceae,unclassified Peptostreptococcaceae (miscellaneous),unclassified Propionibacteriaceae,unclassified Sutterellaceae,unclassified Tissierellia
0,ERR011245,Male,,54.0,Denmark,[D006262],0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ERR1075960,Male,21.22,0.0,United States of America,[D006262],0.0,0.003877,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,SRR5078646,Male,,0.156164,Ireland,[D006262],0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ERR1854747,Male,28.7,62.0,United Kingdom,[D006262],0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR688529,Male,30.9,63.0,Austria,[D006262],0.0,0.0,0.0,0.0,...,0.00926,0.0,0.0,0.02919,2.3641,0.0,0.0,0.0,0.0144,0.0


### API - Health Female

In [389]:
run_id_file12 = pd.read_csv('Project data files/user_selected_run_list_tYBBaBSnGX.txt', header=None, sep='\t')

ERR_he_female_run_ids = run_id_file12[0]
ERR_he_female_run_ids_drop = ERR_he_female_run_ids.drop(index=0)
ERR_he_female_run_ids_sampled = ERR_he_female_run_ids_drop.sample(n=300, random_state=42)

he_female_species = []
he_female_health = []

for ids in ERR_he_female_run_ids_sampled: 
    query = {"run_id":ids}  
    url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
    data = requests.post(url, data=json.dumps(query)).json()
    run = data.get("run")
    genus = pd.DataFrame(data.get("genus"))
    if genus.empty:
        continue
    genusdrop = genus.drop(['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level'], axis=1)
    genusrename = genusdrop.rename(columns={'relative_abundance': ids})
    genusunique = genusrename.drop_duplicates(subset='scientific_name', keep='first')
    sex = data['run']['sex']
    phenotypes = data['phenotypes']
    diseases = [item['disease'] for item in phenotypes]
    bmi = data['run']['BMI']
    age = data['run']['host_age']
    country = data['run']['country']
    health = pd.DataFrame([{'Run ID': ids, 'Sex': sex, 'BMI': bmi, 'Age': age, 'Country': country, 
                            'Mesh ID': diseases}])
    he_female_species.append(genusunique)
    he_female_health.append(health)
    

combined_femhe_health = he_female_health[0]

for df in he_female_health[1:]:
    combined_femhe_health = pd.concat([combined_femhe_health, df], ignore_index=True)
    
combined_femhe_species = he_female_species[0] 

for df1 in he_female_species[1:]:
    combined_femhe_species = pd.merge(combined_femhe_species, df1, on='scientific_name', how='outer')
    combined_femhe_species.fillna(0, inplace=True)

columns_order_femhe = ['scientific_name'] + [col for col in combined_femhe_species.columns if col != 'scientific_name']
combined_femhe_species = combined_femhe_species[columns_order_femhe]

transposed_femhe_df = combined_femhe_species.T 


dfreset_femhe = transposed_femhe_df.reset_index()
dfreset_femhe.columns = dfreset_femhe.iloc[0]

dffinal_femhe = dfreset_femhe.drop(0).reset_index(drop=True)


dffinal_femhe.rename(columns={'scientific_name': 'Run ID'}, inplace=True)

run_ids_femhe = dffinal_femhe['Run ID'].unique()

dfmerged_femhe = []

for run_id in run_ids_femhe:
    merge = pd.merge(combined_femhe_health[combined_femhe_health['Run ID'] == run_id], 
                     dffinal_femhe[dffinal_femhe['Run ID'] == run_id], on='Run ID', how='inner')
    dfmerged_femhe.append(merge)
    
final_femhe_merge = pd.concat(dfmerged_femhe, ignore_index=True)

final_femhe_merge.to_csv('health_female300.csv', index=False)

final_femhe_merge.head()


  combined_femhe_health = pd.concat([combined_femhe_health, df], ignore_index=True)


Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acaryochloris,Acetoanaerobium,...,unclassified Clostridiales Family XIII. Incertae Sedis,unclassified Dermatophilaceae,unclassified Erysipelotrichaceae,unclassified Erysipelotrichaceae (miscellaneous),unclassified Lachnospiraceae,unclassified Peptostreptococcaceae,unclassified Peptostreptococcaceae (miscellaneous),unclassified Propionibacteriaceae,unclassified Ruminococcaceae,unclassified Sutterellaceae
0,ERR011117,Female,,42.0,Denmark,[D006262],0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ERR1842901,Female,21.03,63.0,United Kingdom,[D006262],0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR719035,Female,19.0311,26.0,Canada,[D006262],0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SRR5648781,Female,20.0,21.0,United States of America,[D006262],0.001134,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR2032350,Female,24.13,24.0,United States of America,[D006262],0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### API - Depression Male

In [384]:
run_id_file13 = pd.read_csv('Project data files/user_selected_run_list_KTzPlgVEYB.txt', header=None, sep='\t')

ERR_de_male_run_ids = run_id_file13[0]
ERR_de_male_run_ids_drop = ERR_de_male_run_ids.drop(index=0)

de_male_species = []
de_male_health = []

for ids in ERR_de_male_run_ids_drop: 
    query = {"run_id":ids}  
    url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
    data = requests.post(url, data=json.dumps(query)).json()
    run = data.get("run")
    genus = pd.DataFrame(data.get("genus"))
    if genus.empty:
        continue
    genusdrop = genus.drop(['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level'], axis=1)
    genusrename = genusdrop.rename(columns={'relative_abundance': ids})
    genusunique = genusrename.drop_duplicates(subset='scientific_name', keep='first')
    sex = data['run']['sex']
    phenotypes = data['phenotypes']
    diseases = [item['disease'] for item in phenotypes]
    bmi = data['run']['BMI']
    age = data['run']['host_age']
    country = data['run']['country']
    health = pd.DataFrame([{'Run ID': ids, 'Sex': sex, 'BMI': bmi, 'Age': age, 'Country': country, 
                            'Mesh ID': diseases}])
    de_male_species.append(genusunique)
    de_male_health.append(health)
    

combined_mde_health = de_male_health[0]

for df in de_male_health[1:]:
    combined_mde_health = pd.concat([combined_mde_health, df], ignore_index=True)
    
combined_mde_species = de_male_species[0] 

for df1 in de_male_species[1:]:
    combined_mde_species = pd.merge(combined_mde_species, df1, on='scientific_name', how='outer')
    combined_mde_species.fillna(0, inplace=True)

columns_order_mde = ['scientific_name'] + [col for col in combined_mde_species.columns if col != 'scientific_name']
combined_mde_species = combined_mde_species[columns_order_mde]

transposed_mde_df = combined_mde_species.T 


dfreset_mde = transposed_mde_df.reset_index()
dfreset_mde.columns = dfreset_mde.iloc[0]

dffinal_mde = dfreset_mde.drop(0).reset_index(drop=True)


dffinal_mde.rename(columns={'scientific_name': 'Run ID'}, inplace=True)

run_ids_mde = dffinal_mde['Run ID'].unique()

dfmerged_mde = []

for run_id in run_ids_mde:
    merge = pd.merge(combined_mde_health[combined_mde_health['Run ID'] == run_id], 
                     dffinal_mde[dffinal_mde['Run ID'] == run_id], on='Run ID', how='inner')
    dfmerged_mde.append(merge)
    
final_mde_merge = pd.concat(dfmerged_mde, ignore_index=True)

final_mde_merge.to_csv('depression_male.csv', index=False)

final_mde_merge.head()


Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetoanaerobium,Acetobacter,...,Weissella,Wenyingzhuangia,Wenzhouxiangella,Wolbachia,Xanthomonas,Xenorhabdus,Yersinia,Youngiibacter,Zoogloea,Zymomonas
0,ERR1072629,Male,17.67,64,United States of America,"[D001714, D003863, D007410, D008171, D012559, ...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00484,0.0
1,ERR1072937,Male,26.51,53,United States of America,"[D001714, D002318, D003863, D012559]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1073491,Male,28.08,49,United States of America,"[D001714, D003863, D012559]",0.003157,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ERR1075554,Male,30.74,62,United States of America,"[D001289, D001714, D003863, D008881, D012559]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR1075686,Male,45.84,56,United States of America,"[D001714, D003863, D008171, D012559]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### API - Depression Female

In [386]:
run_id_file14 = pd.read_csv('Project data files/user_selected_run_list_XL7BBx9USk.txt', header=None, sep='\t')

ERR_de_female_run_ids = run_id_file14[0]
ERR_de_female_run_ids_drop = ERR_de_female_run_ids.drop(index=0)

de_female_species = []
de_female_health = []

for ids in ERR_de_female_run_ids_drop: 
    query = {"run_id":ids}  
    url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
    data = requests.post(url, data=json.dumps(query)).json()
    run = data.get("run")
    genus = pd.DataFrame(data.get("genus"))
    if genus.empty:
        continue
    genusdrop = genus.drop(['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level'], axis=1)
    genusrename = genusdrop.rename(columns={'relative_abundance': ids})
    genusunique = genusrename.drop_duplicates(subset='scientific_name', keep='first')
    sex = data['run']['sex']
    phenotypes = data['phenotypes']
    diseases = [item['disease'] for item in phenotypes]
    bmi = data['run']['BMI']
    age = data['run']['host_age']
    country = data['run']['country']
    health = pd.DataFrame([{'Run ID': ids, 'Sex': sex, 'BMI': bmi, 'Age': age, 'Country': country, 
                            'Mesh ID': diseases}])
    de_female_species.append(genusunique)
    de_female_health.append(health)
    

combined_femde_health = de_female_health[0]

for df in de_female_health[1:]:
    combined_femde_health = pd.concat([combined_femde_health, df], ignore_index=True)
    
combined_femde_species = de_female_species[0] 

for df1 in de_female_species[1:]:
    combined_femde_species = pd.merge(combined_femde_species, df1, on='scientific_name', how='outer')
    combined_femde_species.fillna(0, inplace=True)

columns_order_femde = ['scientific_name'] + [col for col in combined_femde_species.columns if col != 'scientific_name']
combined_femde_species = combined_femde_species[columns_order_femde]

transposed_femde_df = combined_femde_species.T 


dfreset_femde = transposed_femde_df.reset_index()
dfreset_femde.columns = dfreset_femde.iloc[0]

dffinal_femde = dfreset_femde.drop(0).reset_index(drop=True)


dffinal_femde.rename(columns={'scientific_name': 'Run ID'}, inplace=True)

run_ids_femde = dffinal_femde['Run ID'].unique()

dfmerged_femde = []

for run_id in run_ids_femde:
    merge = pd.merge(combined_femde_health[combined_femde_health['Run ID'] == run_id], 
                     dffinal_femde[dffinal_femde['Run ID'] == run_id], on='Run ID', how='inner')
    dfmerged_femde.append(merge)
    
final_femde_merge = pd.concat(dfmerged_femde, ignore_index=True)

final_femde_merge.to_csv('depression_female.csv', index=False)

final_femde_merge.head()


Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetivibrio,Acetoanaerobium,...,Williamsia,Wolbachia,Xanthomonas,Xenococcus,Xenorhabdus,Yersinia,Yokenella,Youngiibacter,Zoogloea,Zymomonas
0,ERR1073023,Female,34.72,35,United States of America,"[D001714, D003863, D012559]",0.0,0.005583,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ERR1073394,Female,22.41,27,United States of America,"[D001327, D001714, D003863, D003920, D003967, ...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1073395,Female,22.41,27,United States of America,"[D001327, D001714, D003863, D003920, D003967, ...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00247,0.0
3,ERR1073490,Female,25.73,37,United States of America,"[D001714, D003863, D012559]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR1073812,Female,29.71,55,United States of America,"[D001289, D001714, D003015, D003863, D003920, ...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
