In [1]:
import pandas as pd
import os 
import json
import requests
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW, NCBIXML

### API - Schizophrenia Male

In [5]:
# Read the selected-run list as a headerless, tab-separated table so that
# the columns are labelled 0, 1, 2, ...
run_id_file5 = pd.read_csv(
    'Project data files/user_selected_run_list_kcYhadsFw3.txt',
    sep='\t',
    header=None,
)

# Column 0 carries the run accessions. Row 0 is discarded before use
# (presumably the file's own header line -- TODO confirm against the file).
ERR_s_male_run_ids = run_id_file5.loc[:, 0]
ERR_s_male_run_ids_drop = ERR_s_male_run_ids.drop(labels=[0])

ERR_s_male_run_ids_drop

1      ERR1072629
2      ERR1072937
3      ERR1073491
4      ERR1074896
5      ERR1074954
          ...    
154    ERR1844566
155    ERR1845846
156    ERR1845902
157    ERR1845903
158    ERR2032502
Name: 0, Length: 158, dtype: object

In [12]:
# Fetch the full taxonomic profile of every selected run from the GMrepo API
# and keep, per run:
#   * sch_male_species: genus name + relative abundance (abundance column is
#     named after the run ID, one row per genus)
#   * sch_male_health:  a one-row table with the run's host metadata
# Runs whose response has no genus-level rows are skipped entirely, so the
# two lists always stay aligned (same runs, same order).

# Endpoint is loop-invariant, so it is defined once up front.
GMREPO_PROFILE_URL = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
# requests has no default timeout; without one a single stalled connection
# would block this cell forever. Value is in seconds.
GMREPO_TIMEOUT_S = 60

sch_male_species = []
sch_male_health = []

for ids in ERR_s_male_run_ids_drop:
    query = {"run_id": ids}
    data = requests.post(
        GMREPO_PROFILE_URL,
        data=json.dumps(query),
        timeout=GMREPO_TIMEOUT_S,
    ).json()

    # pd.DataFrame(None) is an empty frame, so a missing "genus" key is
    # handled by the same check as an empty list.
    genus = pd.DataFrame(data.get("genus"))
    if genus.empty:
        continue

    # Keep only genus name + abundance, label the abundance column with the
    # run ID, and keep the first row when a genus name is repeated.
    genusunique = (
        genus
        .drop(columns=['loaded_uid', 'ncbi_taxon_id', 'taxon_rank_level'])
        .rename(columns={'relative_abundance': ids})
        .drop_duplicates(subset='scientific_name', keep='first')
    )

    # Look the run record up once instead of re-indexing data['run'] per field.
    run = data['run']
    diseases = [item['disease'] for item in data['phenotypes']]
    health = pd.DataFrame([{
        'Run ID': ids,
        'Sex': run['sex'],
        'BMI': run['BMI'],
        'Age': run['host_age'],
        'Country': run['country'],
        'Mesh ID': diseases,  # list of the 'disease' values of all phenotypes
    }])

    sch_male_species.append(genusunique)
    sch_male_health.append(health)



In [15]:
# Stack the one-row per-run health tables into a single frame with a fresh
# 0..n-1 index. A single concat over the whole list replaces the previous
# concat-inside-a-loop, which re-copied the growing frame on every iteration
# (quadratic in the number of runs) for the same result.
combined_msch_health = pd.concat(sch_male_health, ignore_index=True)

combined_msch_health

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID
0,ERR1072629,Male,17.67,64,United States of America,"[D001714, D003863, D007410, D008171, D012559, ..."
1,ERR1072937,Male,26.51,53,United States of America,"[D001714, D002318, D003863, D012559]"
2,ERR1073491,Male,28.08,49,United States of America,"[D001714, D003863, D012559]"
3,ERR1075554,Male,30.74,62,United States of America,"[D001289, D001714, D003863, D008881, D012559]"
4,ERR1075686,Male,45.84,56,United States of America,"[D001714, D003863, D008171, D012559]"
...,...,...,...,...,...,...
95,ERR1545728,Male,21.95,54,United States of America,"[D001714, D003863, D012559]"
96,ERR1843471,Male,20.97,41,United Kingdom,"[D001714, D003863, D008881, D012559]"
97,ERR1843550,Male,25.82,58,United States of America,"[D001714, D003863, D012559]"
98,ERR1844562,Male,25.82,58,United States of America,"[D001714, D003863, D012559]"


In [17]:
# Build one genus x run abundance table: start from the first run's profile
# and outer-join each remaining profile on the genus name. A genus absent
# from a run has no value after the join, which is recorded as 0.
combined_msch_species = sch_male_species[0]

for run_profile in sch_male_species[1:]:
    combined_msch_species = (
        combined_msch_species
        .merge(run_profile, how='outer', on='scientific_name')
        .fillna(0)
    )

# Move the genus name to the front; the run columns keep their relative order.
abundance_columns_msch = [
    name for name in combined_msch_species.columns if name != 'scientific_name'
]
columns_order_msch = ['scientific_name'] + abundance_columns_msch
combined_msch_species = combined_msch_species[columns_order_msch]

combined_msch_species

Unnamed: 0,scientific_name,ERR1072629,ERR1072937,ERR1073491,ERR1075554,ERR1075686,ERR1075876,ERR1076013,ERR1089705,ERR1089730,...,ERR1250609,ERR1250614,ERR1250635,ERR1250646,ERR1250647,ERR1545728,ERR1843471,ERR1843550,ERR1844562,ERR1845903
0,Abiotrophia,0.00000,0.0,0.003157,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0
1,Acanthopleuribacter,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.00000,0.0,0.0,0.005106,0.0,0.0,0.0
2,Acetoanaerobium,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0
3,Acetobacter,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0
4,Acetobacterium,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,Xenorhabdus,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.007755,0.0,0.003862,...,0.0,0.0,0.005415,0.00000,0.0,0.0,0.005106,0.0,0.0,0.0
763,Yersinia,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0
764,Youngiibacter,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.00485,0.0,0.0,0.000000,0.0,0.0,0.0
765,Zoogloea,0.00484,0.0,0.000000,0.0,0.0,0.003297,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0


In [20]:
# Flip the genus x run table into a run x genus table (one row per run).
#
# The genus names are moved into the index *before* transposing. Transposing
# while 'scientific_name' is still an ordinary string column mixes strings
# and floats, which makes pandas return every column as `object` dtype and
# forces the header to be rebuilt by hand from the first row. With the names
# in the index, the abundances stay numeric and the genus names become the
# column labels directly.
transposed_msch_df = combined_msch_species.set_index('scientific_name').T

# The former column labels (run IDs) are now the index: name it 'Run ID',
# clear the leftover 'scientific_name' label on the columns axis, and turn
# the index back into a regular first column with a 0..n-1 row index.
dfreset_msch = (
    transposed_msch_df
    .rename_axis(index='Run ID', columns=None)
    .reset_index()
)

dffinal_msch = dfreset_msch

dffinal_msch

Unnamed: 0,Run ID,Abiotrophia,Acanthopleuribacter,Acetoanaerobium,Acetobacter,Acetobacterium,Acetohalobium,Acetonema,Acholeplasma,Achromobacter,...,Weissella,Wenyingzhuangia,Wenzhouxiangella,Wolbachia,Xanthomonas,Xenorhabdus,Yersinia,Youngiibacter,Zoogloea,Zymomonas
0,ERR1072629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00484,0.0
1,ERR1072937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1073491,0.003157,0.0,0.0,0.0,0.0,0.0,0.0,0.003157,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ERR1075554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR1075686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,ERR1545728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003035,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,ERR1843471,0.0,0.005106,0.0,0.0,0.0,0.0,0.0,0.485065,0.0,...,0.0,0.0,0.0,0.0,0.0,0.005106,0.0,0.0,0.0,0.0
97,ERR1843550,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006485,0.0,...,0.006485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,ERR1844562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.006894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Distinct run IDs present in the abundance table (not needed for the merge
# below; retained in case a later cell refers to it).
run_ids_msch = dffinal_msch['Run ID'].unique()

# Attach the host metadata to each run's genus abundances. One inner join on
# 'Run ID' replaces the previous per-run loop, which boolean-filtered both
# full tables for every run ID (quadratic) and then re-concatenated the
# one-row results. Health columns come first, followed by the genus columns.
# Rows follow the order of combined_msch_health; both tables are filled in
# the same run order upstream, so this matches the abundance-table order.
final_msch_merge = pd.merge(
    combined_msch_health,
    dffinal_msch,
    on='Run ID',
    how='inner',
)

# Persist the merged cohort table (no index column in the file).
final_msch_merge.to_csv('schizophrenia_male.csv', index=False)

final_msch_merge.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetoanaerobium,Acetobacter,...,Weissella,Wenyingzhuangia,Wenzhouxiangella,Wolbachia,Xanthomonas,Xenorhabdus,Yersinia,Youngiibacter,Zoogloea,Zymomonas
0,ERR1072629,Male,17.67,64,United States of America,"[D001714, D003863, D007410, D008171, D012559, ...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00484,0.0
1,ERR1072937,Male,26.51,53,United States of America,"[D001714, D002318, D003863, D012559]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1073491,Male,28.08,49,United States of America,"[D001714, D003863, D012559]",0.003157,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ERR1075554,Male,30.74,62,United States of America,"[D001289, D001714, D003863, D008881, D012559]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR1075686,Male,45.84,56,United States of America,"[D001714, D003863, D008171, D012559]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
