In [36]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import ast
import numpy as np

In [37]:
disease_dict = {
    'D010300': 'Parkinsons',
    'D000544': 'Alzheimers',
    'D001327': 'Autoimmune Diseases',
    'D008881': 'Migrane Disorders',
    'D007410': 'Intestinal Diseases',
    'D001714': 'Bipolar Disorder',
    'D003863': 'Depression', 
    'D008171': 'Lung Diseases', 
    'D012559': 'Schizophrenia', 
    'D013959': 'Thyroid Diseases',
    'D043183': 'Irritable Bowel Syndrome', 
    'D002318': 'Cardiovascular Diseases', 
    'D001289': 'Attention Deficit Disorder with Hyperactivity',  
    'D003248': 'Constipation', 
    'D002446': 'Celiac Disease', 
    'D003920': 'Diabetes Mellitus', 
    'D003967': 'Diarrhea', 
    'D008107': 'Liver Diseases', 
    'D015212': 'Inflammatory Bowel Diseases', 
    'D007674': 'Kidney Diseases',
    'D003015': 'Clostridium Infections', 
    'D000067877': 'Autism Spectrum Disorder', 
    'D004827': 'Epilepsy', 
    'D010661': 'Phenylketonurias',
    'D006262': 'Health'
    
}

### Parkinsons Male ###

In [38]:
park_male_df = pd.read_csv('parkinsons_male.csv')
park_male_df.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Acidaminococcus,Adlercreutzia,Aggregatibacter,Akkermansia,...,Shewanella,Streptococcus,Subdoligranulum,Succinivibrio,Sutterella,Turicibacter,Unknown,Veillonella,Vibrio,Victivallis
0,ERR365909,Male,,61,Finland,D010300,0.0,0.0,0.0,0.0,...,0.0,0.0,6.1088,0.0,0.0,0.0,90.3698,0.0,0.0,0.0
1,ERR365910,Male,,53,Finland,D010300,0.0,0.0,0.0,0.0,...,0.0,0.0,4.34783,0.0,0.830984,0.14839,109.304,0.192907,0.0,0.0
2,ERR365912,Male,,66,Finland,D010300,3.34422,0.0,0.0,0.0,...,0.0,0.052253,10.5291,0.0,2.25996,0.07838,21.5545,0.0,0.0,0.0
3,ERR365914,Male,,63,Finland,D010300,0.0,0.0,0.0,0.0,...,0.0,0.0,4.19855,0.0,2.48691,0.0,111.72,0.0,0.0,0.0
4,ERR365915,Male,,72,Finland,D010300,3.72852,0.0,0.0,0.0,...,0.0,0.146217,1.63275,0.0,1.65712,0.0,49.4212,0.085293,0.0,0.0


In [39]:
park_male_df.isnull().sum()

Run ID           0
Sex              0
BMI             37
Age              0
Country          0
                ..
Turicibacter     0
Unknown          0
Veillonella      0
Vibrio           0
Victivallis      0
Length: 75, dtype: int64

In [40]:
len(park_male_df)

37

In [41]:
park_male_df.drop('BMI', axis=1, inplace=True)

In [42]:
park_male_df.head()

Unnamed: 0,Run ID,Sex,Age,Country,Mesh ID,Acidaminococcus,Adlercreutzia,Aggregatibacter,Akkermansia,Alistipes,...,Shewanella,Streptococcus,Subdoligranulum,Succinivibrio,Sutterella,Turicibacter,Unknown,Veillonella,Vibrio,Victivallis
0,ERR365909,Male,61,Finland,D010300,0.0,0.0,0.0,0.0,0.42913,...,0.0,0.0,6.1088,0.0,0.0,0.0,90.3698,0.0,0.0,0.0
1,ERR365910,Male,53,Finland,D010300,0.0,0.0,0.0,0.0,0.489687,...,0.0,0.0,4.34783,0.0,0.830984,0.14839,109.304,0.192907,0.0,0.0
2,ERR365912,Male,66,Finland,D010300,3.34422,0.0,0.0,0.0,1.34553,...,0.0,0.052253,10.5291,0.0,2.25996,0.07838,21.5545,0.0,0.0,0.0
3,ERR365914,Male,63,Finland,D010300,0.0,0.0,0.0,0.0,0.191301,...,0.0,0.0,4.19855,0.0,2.48691,0.0,111.72,0.0,0.0,0.0
4,ERR365915,Male,72,Finland,D010300,3.72852,0.0,0.0,0.0,0.718898,...,0.0,0.146217,1.63275,0.0,1.65712,0.0,49.4212,0.085293,0.0,0.0


In [43]:
park_male_df['Age'].unique()

array([61, 53, 66, 63, 72, 57, 69, 73, 67, 62, 60, 58, 71, 75, 65, 70, 74,
       68, 78, 64])

In [44]:
park_male_df['Country'].unique()

array(['Finland'], dtype=object)

In [45]:
bacteria_park_male_df = park_male_df.drop(['Run ID', 'Sex', 'Age', 'Country', 'Mesh ID'], axis=1)
att_park_male_columns = ['Run ID', 'Sex', 'Age', 'Country', 'Mesh ID']
att_park_male_df = park_male_df[att_park_male_columns]

zero_counts_park_male = (bacteria_park_male_df == 0.0).sum()

threshold_park_male = 0.2 * len(bacteria_park_male_df)

filtered_park_male = zero_counts_park_male[zero_counts_park_male <= threshold_park_male]

filtered_park_male_df = bacteria_park_male_df[filtered_park_male.index]

filtered_park_male = pd.concat([att_park_male_df, filtered_park_male_df], axis=1)

filtered_park_male.to_csv('clean_parkinsons_male.csv')

filtered_park_male.shape

(37, 19)

In [46]:
filtered_park_male.head()

Unnamed: 0,Run ID,Sex,Age,Country,Mesh ID,Alistipes,Bacteroides,Blautia,Clostridium,Coprococcus,Dorea,Faecalibacterium,Lachnospira,Oscillospira,Parabacteroides,Ruminococcus,Subdoligranulum,Sutterella,Unknown
0,ERR365909,Male,61,Finland,D010300,0.42913,7.95153,0.441752,1.16118,0.416509,0.403887,5.65442,1.43885,4.65733,0.946611,5.2884,6.1088,0.0,90.3698
1,ERR365910,Male,53,Finland,D010300,0.489687,11.3667,0.0,2.00326,1.29099,0.638077,5.08978,3.502,3.93233,0.415492,4.76332,4.34783,0.830984,109.304
2,ERR365912,Male,66,Finland,D010300,1.34553,30.6336,2.42978,0.966688,0.091443,2.07707,9.62769,1.68517,5.55193,0.431091,5.29066,10.5291,2.25996,21.5545
3,ERR365914,Male,63,Finland,D010300,0.191301,13.6629,0.402739,1.71164,0.0,0.332259,4.45026,6.14176,3.03061,1.00685,3.27225,4.19855,2.48691,111.72
4,ERR365915,Male,72,Finland,D010300,0.718898,43.9381,0.121847,4.36213,1.29158,0.255879,0.962593,0.718898,1.76678,3.20458,4.30121,1.63275,1.65712,49.4212


In [47]:
mesh_ids_str = filtered_park_male['Mesh ID']

def replace_mesh_ids_with_diseases(mesh_ids_str):
    mesh_ids = mesh_ids_str.split(', ')
    disease_names = [disease_dict.get(mesh_id, "Unknown Disease") for mesh_id in mesh_ids]
    return ', '.join(disease_names)

filtered_park_male['Mesh ID'] = filtered_park_male['Mesh ID'].apply(replace_mesh_ids_with_diseases)

filtered_park_male.rename(columns={'Mesh ID': 'Condition'}, inplace=True)

filtered_park_male.to_csv('clean_parkinsons_male_condition.csv')

filtered_park_male.head()

Unnamed: 0,Run ID,Sex,Age,Country,Condition,Alistipes,Bacteroides,Blautia,Clostridium,Coprococcus,Dorea,Faecalibacterium,Lachnospira,Oscillospira,Parabacteroides,Ruminococcus,Subdoligranulum,Sutterella,Unknown
0,ERR365909,Male,61,Finland,Parkinsons,0.42913,7.95153,0.441752,1.16118,0.416509,0.403887,5.65442,1.43885,4.65733,0.946611,5.2884,6.1088,0.0,90.3698
1,ERR365910,Male,53,Finland,Parkinsons,0.489687,11.3667,0.0,2.00326,1.29099,0.638077,5.08978,3.502,3.93233,0.415492,4.76332,4.34783,0.830984,109.304
2,ERR365912,Male,66,Finland,Parkinsons,1.34553,30.6336,2.42978,0.966688,0.091443,2.07707,9.62769,1.68517,5.55193,0.431091,5.29066,10.5291,2.25996,21.5545
3,ERR365914,Male,63,Finland,Parkinsons,0.191301,13.6629,0.402739,1.71164,0.0,0.332259,4.45026,6.14176,3.03061,1.00685,3.27225,4.19855,2.48691,111.72
4,ERR365915,Male,72,Finland,Parkinsons,0.718898,43.9381,0.121847,4.36213,1.29158,0.255879,0.962593,0.718898,1.76678,3.20458,4.30121,1.63275,1.65712,49.4212


### Parkinsons Female ###

In [48]:
park_female_df = pd.read_csv('parkinsons_female.csv')
park_female_df.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Acidaminococcus,Acidovorax,Actinomyces,Adlercreutzia,...,Streptococcus,Subdoligranulum,Succinatimonas,Succiniclasticum,Sutterella,Turicibacter,Unknown,Varibaculum,Veillonella,Victivallis
0,ERR365911,Female,,73,Finland,D010300,0.0,0.0,0.0,0.0,...,0.065172,1.61627,0.0,0.0,2.86757,0.0,65.0156,0.0,0.755996,0.208551
1,ERR365913,Female,,64,Finland,D010300,0.0,0.0,0.0,0.0,...,0.167344,4.09993,0.0,0.0,0.0,0.0,95.6969,0.0,0.0,0.0
2,ERR365918,Female,,73,Finland,D010300,0.0,0.0,0.0,0.0,...,0.091408,3.4082,0.0,0.0,1.37112,0.287281,72.6822,0.0,0.0,0.0
3,ERR365919,Female,,64,Finland,D010300,0.0,0.0,0.0,0.031803,...,0.87989,8.89431,0.0,0.0,0.0,0.0,55.8465,0.0,0.0,0.0
4,ERR365921,Female,,66,Finland,D010300,0.0,0.0,0.0,0.0,...,0.87146,5.73711,0.0,0.0,0.43573,0.0,39.5062,0.0,0.363108,0.0


In [49]:
park_female_df.isnull().sum()

Run ID           0
Sex              0
BMI             37
Age              0
Country          0
                ..
Turicibacter     0
Unknown          0
Varibaculum      0
Veillonella      0
Victivallis      0
Length: 72, dtype: int64

In [50]:
len(park_female_df)

37

In [51]:
park_female_df.drop('BMI', axis=1, inplace=True)

In [52]:
park_female_df.head()

Unnamed: 0,Run ID,Sex,Age,Country,Mesh ID,Acidaminococcus,Acidovorax,Actinomyces,Adlercreutzia,Akkermansia,...,Streptococcus,Subdoligranulum,Succinatimonas,Succiniclasticum,Sutterella,Turicibacter,Unknown,Varibaculum,Veillonella,Victivallis
0,ERR365911,Female,73,Finland,D010300,0.0,0.0,0.0,0.0,0.0,...,0.065172,1.61627,0.0,0.0,2.86757,0.0,65.0156,0.0,0.755996,0.208551
1,ERR365913,Female,64,Finland,D010300,0.0,0.0,0.0,0.0,0.310782,...,0.167344,4.09993,0.0,0.0,0.0,0.0,95.6969,0.0,0.0,0.0
2,ERR365918,Female,73,Finland,D010300,0.0,0.0,0.0,0.0,0.0,...,0.091408,3.4082,0.0,0.0,1.37112,0.287281,72.6822,0.0,0.0,0.0
3,ERR365919,Female,64,Finland,D010300,0.0,0.0,0.0,0.031803,0.349836,...,0.87989,8.89431,0.0,0.0,0.0,0.0,55.8465,0.0,0.0,0.0
4,ERR365921,Female,66,Finland,D010300,0.0,0.0,0.0,0.0,0.058097,...,0.87146,5.73711,0.0,0.0,0.43573,0.0,39.5062,0.0,0.363108,0.0


In [53]:
park_female_df['Age'].unique()

array([73, 64, 66, 67, 69, 56, 65, 60, 75, 61, 72, 68, 55, 62, 54])

In [54]:
park_female_df['Country'].unique()

array(['Finland'], dtype=object)

In [55]:
bacteria_park_female_df = park_female_df.drop(['Run ID', 'Sex', 'Age', 'Country', 'Mesh ID'], axis=1)
att_park_female_columns = ['Run ID', 'Sex', 'Age', 'Country', 'Mesh ID']
att_park_female_df = park_female_df[att_park_female_columns]

zero_counts_park_female = (bacteria_park_female_df == 0.0).sum()

threshold_park_female = 0.2 * len(bacteria_park_female_df)

filtered_park_female = zero_counts_park_female[zero_counts_park_female <= threshold_park_female]

filtered_park_female_df = bacteria_park_female_df[filtered_park_female.index]

filtered_park_female = pd.concat([att_park_female_df, filtered_park_female_df], axis=1)

filtered_park_female.to_csv('clean_parkinsons_female.csv')

filtered_park_female.shape

(37, 20)

In [56]:
filtered_park_female.head()

Unnamed: 0,Run ID,Sex,Age,Country,Mesh ID,Alistipes,Anaerostipes,Bacteroides,Blautia,Clostridium,Coprococcus,Dorea,Faecalibacterium,Lachnospira,Oscillospira,Parabacteroides,Ruminococcus,Subdoligranulum,Sutterella,Unknown
0,ERR365911,Female,73,Finland,D010300,3.34984,0.625652,28.3629,0.091241,0.0,0.130344,0.364964,1.02972,0.091241,6.24348,4.0537,8.66788,1.61627,2.86757,65.0156
1,ERR365913,Female,64,Finland,D010300,0.095625,0.083672,7.75759,1.01602,3.50227,0.0,0.0,0.394454,0.382501,5.67774,15.7901,5.6897,4.09993,0.0,95.6969
2,ERR365918,Female,73,Finland,D010300,0.208932,0.352572,13.2802,2.41577,1.94568,0.117524,3.04257,6.68582,3.78689,6.26796,0.378689,8.33116,3.4082,1.37112,72.6822
3,ERR365919,Female,64,Finland,D010300,4.60087,0.29683,13.9086,1.06011,1.06011,1.69617,0.614863,3.79519,3.72098,10.2406,3.03191,13.8026,8.89431,0.0,55.8465
4,ERR365921,Female,66,Finland,D010300,0.726216,0.319535,28.3515,3.31155,4.11038,0.610022,0.610022,15.8025,3.00654,2.16412,4.03776,2.64343,5.73711,0.43573,39.5062


In [57]:
mesh_ids_str = filtered_park_female['Mesh ID']

def replace_mesh_ids_with_diseases(mesh_ids_str):
    mesh_ids = mesh_ids_str.split(', ')
    disease_names = [disease_dict.get(mesh_id, "Unknown Disease") for mesh_id in mesh_ids]
    return ', '.join(disease_names)

filtered_park_female['Mesh ID'] = filtered_park_female['Mesh ID'].apply(replace_mesh_ids_with_diseases)

filtered_park_female.rename(columns={'Mesh ID': 'Condition'}, inplace=True)

filtered_park_female.to_csv('clean_parkinsons_female_condition.csv')

filtered_park_female.head()

Unnamed: 0,Run ID,Sex,Age,Country,Condition,Alistipes,Anaerostipes,Bacteroides,Blautia,Clostridium,Coprococcus,Dorea,Faecalibacterium,Lachnospira,Oscillospira,Parabacteroides,Ruminococcus,Subdoligranulum,Sutterella,Unknown
0,ERR365911,Female,73,Finland,Parkinsons,3.34984,0.625652,28.3629,0.091241,0.0,0.130344,0.364964,1.02972,0.091241,6.24348,4.0537,8.66788,1.61627,2.86757,65.0156
1,ERR365913,Female,64,Finland,Parkinsons,0.095625,0.083672,7.75759,1.01602,3.50227,0.0,0.0,0.394454,0.382501,5.67774,15.7901,5.6897,4.09993,0.0,95.6969
2,ERR365918,Female,73,Finland,Parkinsons,0.208932,0.352572,13.2802,2.41577,1.94568,0.117524,3.04257,6.68582,3.78689,6.26796,0.378689,8.33116,3.4082,1.37112,72.6822
3,ERR365919,Female,64,Finland,Parkinsons,4.60087,0.29683,13.9086,1.06011,1.06011,1.69617,0.614863,3.79519,3.72098,10.2406,3.03191,13.8026,8.89431,0.0,55.8465
4,ERR365921,Female,66,Finland,Parkinsons,0.726216,0.319535,28.3515,3.31155,4.11038,0.610022,0.610022,15.8025,3.00654,2.16412,4.03776,2.64343,5.73711,0.43573,39.5062


### Alzheimers Male

In [85]:
alz_male_df = pd.read_csv('alzheimers_male.csv')
alz_male_df.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acholeplasma,Acidaminococcus,...,Thermoanaerobacter,Truepera,Tyzzerella,Uliginosibacterium,Unknown,Veillonella,Victivallis,Virgibacillus,Weissella,Yersinia
0,ERR1090510,Male,30.11,31,United States of America,['D000544'],0.0,0.008079,0.0,0.937222,...,0.0,0.0,0.339339,0.032318,18.9626,0.0,0.0,0.0,0.0,0.0
1,ERR1843463,Male,24.41,0,United States of America,"['D000544', 'D001327']",0.0,0.0,0.0,0.036957,...,0.0,0.0,0.0,0.0,42.9544,0.026398,0.042236,0.015839,0.0,0.0
2,ERR1844528,Male,24.41,0,United States of America,"['D000544', 'D001327']",0.0,0.0,0.0,0.027337,...,0.0,0.005467,0.0,0.0,42.1542,0.049207,0.027337,0.0,0.0,0.005467
3,ERR2091834,Male,23.72,56,United Kingdom,['D000544'],0.0,0.0,0.078935,0.036836,...,0.010525,0.0,0.005262,0.0,11.756,0.0,0.0,0.0,0.0,0.0
4,SRR9671458,Male,,64,China,['D000544'],0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,183.001,0.0,0.0,0.0,0.0,0.0


In [86]:
alz_male_df.isnull().sum()

Run ID            0
Sex               0
BMI              23
Age               0
Country           0
                 ..
Veillonella       0
Victivallis       0
Virgibacillus     0
Weissella         0
Yersinia          0
Length: 170, dtype: int64

In [87]:
alz_male_df.shape

(27, 170)

In [88]:
# dropping the BMI column because there are only 4 entries 
alz_male_df.drop('BMI', axis=1, inplace=True)

In [89]:
alz_male_df.head()

Unnamed: 0,Run ID,Sex,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acholeplasma,Acidaminococcus,Acinetobacter,...,Thermoanaerobacter,Truepera,Tyzzerella,Uliginosibacterium,Unknown,Veillonella,Victivallis,Virgibacillus,Weissella,Yersinia
0,ERR1090510,Male,31,United States of America,['D000544'],0.0,0.008079,0.0,0.937222,0.016159,...,0.0,0.0,0.339339,0.032318,18.9626,0.0,0.0,0.0,0.0,0.0
1,ERR1843463,Male,0,United States of America,"['D000544', 'D001327']",0.0,0.0,0.0,0.036957,0.0,...,0.0,0.0,0.0,0.0,42.9544,0.026398,0.042236,0.015839,0.0,0.0
2,ERR1844528,Male,0,United States of America,"['D000544', 'D001327']",0.0,0.0,0.0,0.027337,0.005467,...,0.0,0.005467,0.0,0.0,42.1542,0.049207,0.027337,0.0,0.0,0.005467
3,ERR2091834,Male,56,United Kingdom,['D000544'],0.0,0.0,0.078935,0.036836,0.0,...,0.010525,0.0,0.005262,0.0,11.756,0.0,0.0,0.0,0.0,0.0
4,SRR9671458,Male,64,China,['D000544'],0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,183.001,0.0,0.0,0.0,0.0,0.0


In [90]:
alz_male_df['Age'].unique()

array([31,  0, 56, 64, 82, 81, 72, 62, 75, 61, 60, 78, 69, 85, 71, 59, 63,
       65])

In [91]:
# 31 year old with alzheimers ?? could be a mistake or early-onset 
# going to replace 0 with the median of the ages, not including the 0 or 31 

alz_male_age_filter = alz_male_df[alz_male_df['Age'] > 35]
alz_male_age_median = alz_male_age_filter['Age'].median()
alz_male_df['Age'] = alz_male_df['Age'].replace(0, alz_male_age_median)
# not replacing the 31 at this point

alz_male_df.head()

Unnamed: 0,Run ID,Sex,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acholeplasma,Acidaminococcus,Acinetobacter,...,Thermoanaerobacter,Truepera,Tyzzerella,Uliginosibacterium,Unknown,Veillonella,Victivallis,Virgibacillus,Weissella,Yersinia
0,ERR1090510,Male,31,United States of America,['D000544'],0.0,0.008079,0.0,0.937222,0.016159,...,0.0,0.0,0.339339,0.032318,18.9626,0.0,0.0,0.0,0.0,0.0
1,ERR1843463,Male,70,United States of America,"['D000544', 'D001327']",0.0,0.0,0.0,0.036957,0.0,...,0.0,0.0,0.0,0.0,42.9544,0.026398,0.042236,0.015839,0.0,0.0
2,ERR1844528,Male,70,United States of America,"['D000544', 'D001327']",0.0,0.0,0.0,0.027337,0.005467,...,0.0,0.005467,0.0,0.0,42.1542,0.049207,0.027337,0.0,0.0,0.005467
3,ERR2091834,Male,56,United Kingdom,['D000544'],0.0,0.0,0.078935,0.036836,0.0,...,0.010525,0.0,0.005262,0.0,11.756,0.0,0.0,0.0,0.0,0.0
4,SRR9671458,Male,64,China,['D000544'],0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,183.001,0.0,0.0,0.0,0.0,0.0


In [92]:
alz_male_df['Country'].unique()

array(['United States of America', 'United Kingdom', 'China'],
      dtype=object)

In [93]:
mesh_id_alz_male_df = alz_male_df['Mesh ID'].unique()
mesh_id_alz_male_df
# adding these to the dictionary up top

array(["['D000544']", "['D000544', 'D001327']"], dtype=object)

In [94]:
mapped_diseases_alz_male = [
    [disease_dict[code.strip("[]' ")] for code in disease_list.split(',')]
    for disease_list in mesh_id_alz_male_df
]

print(mapped_diseases_alz_male)

[['Alzheimers'], ['Alzheimers', 'Autoimmune Diseases']]


In [95]:
bacteria_alz_male_df = alz_male_df.drop(['Run ID', 'Sex', 'Age', 'Country', 'Mesh ID'], axis=1)
att_alz_male_columns = ['Run ID', 'Sex', 'Age', 'Country', 'Mesh ID']
att_alz_male_df = alz_male_df[att_alz_male_columns]

zero_counts_alz_male = (bacteria_alz_male_df == 0.0).sum()

threshold_alz_male = 0.2 * len(bacteria_alz_male_df)

filtered_alz_male = zero_counts_alz_male[zero_counts_alz_male <= threshold_alz_male]

filtered_alz_male_df = bacteria_alz_male_df[filtered_alz_male.index]

filtered_alz_male = pd.concat([att_alz_male_df, filtered_alz_male_df], axis=1)

filtered_alz_male.to_csv('clean_alzheimers_male.csv')

filtered_alz_male.shape

(27, 13)

In [96]:
filtered_alz_male.head()

Unnamed: 0,Run ID,Sex,Age,Country,Mesh ID,Bacteroides,Clostridium,Dorea,Faecalibacterium,Roseburia,Ruminococcus,Streptococcus,Unknown
0,ERR1090510,Male,31,United States of America,['D000544'],33.4895,0.105034,0.0,0.008079,0.048477,0.242385,0.016159,18.9626
1,ERR1843463,Male,70,United States of America,"['D000544', 'D001327']",32.8124,0.00528,0.021118,4.07053,0.221741,0.105591,0.337891,42.9544
2,ERR1844528,Male,70,United States of America,"['D000544', 'D001327']",33.7015,0.010935,0.010935,3.94751,0.235101,0.114817,0.355385,42.1542
3,ERR2091834,Male,56,United Kingdom,['D000544'],0.373625,0.236805,0.036836,3.19423,0.031574,0.273641,0.010525,11.756
4,SRR9671458,Male,64,China,['D000544'],0.027799,0.125096,0.0,0.896518,0.979915,0.576829,0.500382,183.001


In [97]:
for mesh_id_str in filtered_alz_male['Mesh ID']:
    mesh_id_list = ast.literal_eval(mesh_id_str)
    
    mesh_id_string = ', '.join(mesh_id_list)
    
def format_mesh_ids(mesh_id_str):
    mesh_id_list = ast.literal_eval(mesh_id_str)
    return ', '.join(mesh_id_list)

filtered_alz_male['Mesh ID'] = filtered_alz_male['Mesh ID'].apply(format_mesh_ids)

def replace_mesh_ids_with_diseases(mesh_ids_str):
    mesh_ids = mesh_ids_str.split(', ')
    disease_names = [disease_dict.get(mesh_id, "Unknown Disease") for mesh_id in mesh_ids]
    return ', '.join(disease_names)

filtered_alz_male['Mesh ID'] = filtered_alz_male['Mesh ID'].apply(replace_mesh_ids_with_diseases)

filtered_alz_male.to_csv('clean_alzheimers_male_condition.csv')

filtered_alz_male.head()

Unnamed: 0,Run ID,Sex,Age,Country,Mesh ID,Bacteroides,Clostridium,Dorea,Faecalibacterium,Roseburia,Ruminococcus,Streptococcus,Unknown
0,ERR1090510,Male,31,United States of America,Alzheimers,33.4895,0.105034,0.0,0.008079,0.048477,0.242385,0.016159,18.9626
1,ERR1843463,Male,70,United States of America,"Alzheimers, Autoimmune Diseases",32.8124,0.00528,0.021118,4.07053,0.221741,0.105591,0.337891,42.9544
2,ERR1844528,Male,70,United States of America,"Alzheimers, Autoimmune Diseases",33.7015,0.010935,0.010935,3.94751,0.235101,0.114817,0.355385,42.1542
3,ERR2091834,Male,56,United Kingdom,Alzheimers,0.373625,0.236805,0.036836,3.19423,0.031574,0.273641,0.010525,11.756
4,SRR9671458,Male,64,China,Alzheimers,0.027799,0.125096,0.0,0.896518,0.979915,0.576829,0.500382,183.001


### Alzheimers Female

In [98]:
alz_female_df = pd.read_csv('alzheimers_female.csv')
alz_female_df.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Acetobacterium,Acholeplasma,Acidaminococcus,Acidihalobacter,...,Treponema,Tyzzerella,Unknown,Varibaculum,Veillonella,Verrucomicrobium,Vibrio,Victivallis,Viridibacillus,Weissella
0,ERR1160800,Female,26.09,53,United States of America,"['D000544', 'D008881']",0.001165,0.00932,0.33086,0.001165,...,0.0,0.001165,40.5105,0.00932,1.00889,0.005825,0.08854,0.00233,0.001165,0.00233
1,ERR2091942,Female,25.07,53,United Kingdom,"['D000544', 'D007410']",0.0,0.946936,0.0,0.0,...,0.0,0.003573,35.6012,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,SRR9671487,Female,,67,China,['D000544'],0.0,0.0,0.0,0.0,...,0.0,0.0,170.064,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SRR9671488,Female,,80,China,['D000544'],0.0,0.0,0.0,0.0,...,0.0,0.0,114.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,SRR9671489,Female,,57,China,['D000544'],0.0,0.0,0.0,0.0,...,0.0,0.0,91.363,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [99]:
alz_female_df.isnull().sum()

Run ID               0
Sex                  0
BMI                 20
Age                  0
Country              0
                    ..
Verrucomicrobium     0
Vibrio               0
Victivallis          0
Viridibacillus       0
Weissella            0
Length: 278, dtype: int64

In [100]:
len(alz_female_df)

22

In [101]:
# dropping the BMI column because only two samples actually have values 
alz_female_df.drop('BMI', axis=1, inplace=True)

In [102]:
alz_female_df.head()

Unnamed: 0,Run ID,Sex,Age,Country,Mesh ID,Acetobacterium,Acholeplasma,Acidaminococcus,Acidihalobacter,Acidithiobacillus,...,Treponema,Tyzzerella,Unknown,Varibaculum,Veillonella,Verrucomicrobium,Vibrio,Victivallis,Viridibacillus,Weissella
0,ERR1160800,Female,53,United States of America,"['D000544', 'D008881']",0.001165,0.00932,0.33086,0.001165,0.003495,...,0.0,0.001165,40.5105,0.00932,1.00889,0.005825,0.08854,0.00233,0.001165,0.00233
1,ERR2091942,Female,53,United Kingdom,"['D000544', 'D007410']",0.0,0.946936,0.0,0.0,0.0,...,0.0,0.003573,35.6012,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,SRR9671487,Female,67,China,['D000544'],0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,170.064,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SRR9671488,Female,80,China,['D000544'],0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,114.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,SRR9671489,Female,57,China,['D000544'],0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,91.363,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [103]:
alz_female_df['Age'].unique()

array([53, 67, 80, 57, 76, 71, 72, 61, 59, 85, 82, 78, 79, 83])

In [104]:
alz_female_df['Country'].unique()

array(['United States of America', 'United Kingdom', 'China'],
      dtype=object)

In [105]:
mesh_id_alz_female_df = alz_female_df['Mesh ID'].unique()
mesh_id_alz_female_df

array(["['D000544', 'D008881']", "['D000544', 'D007410']", "['D000544']"],
      dtype=object)

In [106]:
mapped_diseases_alz_female = [
    [disease_dict[code.strip("[]' ")] for code in disease_list.split(',')]
    for disease_list in mesh_id_alz_female_df
]

print(mapped_diseases_alz_female)

[['Alzheimers', 'Migrane Disorders'], ['Alzheimers', 'Intestinal Diseases'], ['Alzheimers']]


In [107]:
bacteria_alz_female_df = alz_female_df.drop(['Run ID', 'Sex', 'Age', 'Country', 'Mesh ID'], axis=1)
att_alz_female_columns = ['Run ID', 'Sex', 'Age', 'Country', 'Mesh ID']
att_alz_female_df = alz_female_df[att_alz_female_columns]

zero_counts_alz_female = (bacteria_alz_female_df == 0.0).sum()

threshold_alz_female = 0.2 * len(bacteria_alz_female_df)

filtered_alz_female = zero_counts_alz_female[zero_counts_alz_female <= threshold_alz_female]

filtered_alz_female_df = bacteria_alz_female_df[filtered_alz_female.index]

filtered_alz_female = pd.concat([att_alz_female_df, filtered_alz_female_df], axis=1)

filtered_alz_female.to_csv('clean_alzheimers_female.csv')

filtered_alz_female.shape

(22, 13)

In [108]:
filtered_alz_female.head()

Unnamed: 0,Run ID,Sex,Age,Country,Mesh ID,Bacteroides,Clostridium,Dorea,Faecalibacterium,Roseburia,Ruminococcus,Streptococcus,Unknown
0,ERR1160800,Female,53,United States of America,"['D000544', 'D008881']",0.855109,0.09786,0.040775,9.80114,0.02563,0.44503,4.65767,40.5105
1,ERR2091942,Female,53,United Kingdom,"['D000544', 'D007410']",12.9212,0.321601,0.017867,15.1724,0.078614,0.360908,0.007147,35.6012
2,SRR9671487,Female,67,China,['D000544'],8.11452,1.05326,0.0,0.0,0.0,0.0,1.64664,170.064
3,SRR9671488,Female,80,China,['D000544'],9.34541,2.0094,9.73367,0.422315,0.463184,7.96948,3.09924,114.27
4,SRR9671489,Female,57,China,['D000544'],2.97554,11.2332,0.0,0.0,0.0,0.0,0.21967,91.363


In [109]:
for mesh_id_str in filtered_alz_female['Mesh ID']:
    mesh_id_list = ast.literal_eval(mesh_id_str)
    
    mesh_id_string = ', '.join(mesh_id_list)
    
def format_mesh_ids(mesh_id_str):
    mesh_id_list = ast.literal_eval(mesh_id_str)
    return ', '.join(mesh_id_list)

filtered_alz_female['Mesh ID'] = filtered_alz_female['Mesh ID'].apply(format_mesh_ids)

def replace_mesh_ids_with_diseases(mesh_ids_str):
    mesh_ids = mesh_ids_str.split(', ')
    disease_names = [disease_dict.get(mesh_id, "Unknown Disease") for mesh_id in mesh_ids]
    return ', '.join(disease_names)

filtered_alz_female['Mesh ID'] = filtered_alz_female['Mesh ID'].apply(replace_mesh_ids_with_diseases)

filtered_alz_female.to_csv('clean_alzheimers_female_condition.csv')

filtered_alz_female.head()

Unnamed: 0,Run ID,Sex,Age,Country,Mesh ID,Bacteroides,Clostridium,Dorea,Faecalibacterium,Roseburia,Ruminococcus,Streptococcus,Unknown
0,ERR1160800,Female,53,United States of America,"Alzheimers, Migrane Disorders",0.855109,0.09786,0.040775,9.80114,0.02563,0.44503,4.65767,40.5105
1,ERR2091942,Female,53,United Kingdom,"Alzheimers, Intestinal Diseases",12.9212,0.321601,0.017867,15.1724,0.078614,0.360908,0.007147,35.6012
2,SRR9671487,Female,67,China,Alzheimers,8.11452,1.05326,0.0,0.0,0.0,0.0,1.64664,170.064
3,SRR9671488,Female,80,China,Alzheimers,9.34541,2.0094,9.73367,0.422315,0.463184,7.96948,3.09924,114.27
4,SRR9671489,Female,57,China,Alzheimers,2.97554,11.2332,0.0,0.0,0.0,0.0,0.21967,91.363


### Schizophrenia Male

In [110]:
sch_male_df = pd.read_csv('schizophrenia_male.csv')
sch_male_df.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetoanaerobium,Acetobacter,...,Weissella,Wenyingzhuangia,Wenzhouxiangella,Wolbachia,Xanthomonas,Xenorhabdus,Yersinia,Youngiibacter,Zoogloea,Zymomonas
0,ERR1072629,Male,17.67,64,United States of America,"['D001714', 'D003863', 'D007410', 'D008171', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00484,0.0
1,ERR1072937,Male,26.51,53,United States of America,"['D001714', 'D002318', 'D003863', 'D012559']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1073491,Male,28.08,49,United States of America,"['D001714', 'D003863', 'D012559']",0.003157,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ERR1075554,Male,30.74,62,United States of America,"['D001289', 'D001714', 'D003863', 'D008881', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR1075686,Male,45.84,56,United States of America,"['D001714', 'D003863', 'D008171', 'D012559']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
sch_male_df.isnull().sum()

Run ID           0
Sex              0
BMI              0
Age              0
Country          0
                ..
Xenorhabdus      0
Yersinia         0
Youngiibacter    0
Zoogloea         0
Zymomonas        0
Length: 773, dtype: int64

In [112]:
sch_male_df['BMI'].unique()

array([  17.67,   26.51,   28.08,   30.74,   45.84,   20.28,   22.72,
         20.62,   31.47,   27.37,    0.  ,   26.54,   23.49,   21.52,
         22.24,   33.12,   23.18,   25.1 ,   23.99,   22.84,   20.06,
         40.32,   16.98,   31.97,   24.49,   23.75,   22.15,   19.14,
         25.33,   24.41,   25.38,   24.33,   32.14, 4101.24,   21.29,
         19.61,   22.87,   29.99,   25.09,   30.27,   23.59,   26.26,
         29.68,   21.91,   28.09,   24.99,   23.71,   32.92,   20.45,
         24.68,   28.76,   24.48,   20.89,   27.32,   25.21,   20.52,
         33.72,   27.69,   28.35,   27.98,   24.37,   28.75,   22.28,
         23.24,   24.58,   23.06,   45.19,   29.57,   22.05,   33.47,
         21.62,   27.02,   20.8 ,   24.39,   33.91,   28.97,   32.22,
         29.84,   28.12,   25.4 ,   24.8 ,   21.95,   20.97,   25.82,
         29.41])

In [113]:
# replace the BMI's 0 and 4100 with the medians of the rest 

sch_male_bmi_filter = sch_male_df[(sch_male_df['BMI'] > 1) & (sch_male_df['BMI'] < 50)]
sch_male_bmi_median = sch_male_bmi_filter['BMI'].median()
sch_male_df['BMI'] = sch_male_df['BMI'].replace([0, 4101.24], sch_male_bmi_median)

sch_male_df.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetoanaerobium,Acetobacter,...,Weissella,Wenyingzhuangia,Wenzhouxiangella,Wolbachia,Xanthomonas,Xenorhabdus,Yersinia,Youngiibacter,Zoogloea,Zymomonas
0,ERR1072629,Male,17.67,64,United States of America,"['D001714', 'D003863', 'D007410', 'D008171', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00484,0.0
1,ERR1072937,Male,26.51,53,United States of America,"['D001714', 'D002318', 'D003863', 'D012559']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1073491,Male,28.08,49,United States of America,"['D001714', 'D003863', 'D012559']",0.003157,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ERR1075554,Male,30.74,62,United States of America,"['D001289', 'D001714', 'D003863', 'D008881', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR1075686,Male,45.84,56,United States of America,"['D001714', 'D003863', 'D008171', 'D012559']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
sch_male_df['BMI'].unique()

array([17.67, 26.51, 28.08, 30.74, 45.84, 20.28, 22.72, 20.62, 31.47,
       27.37, 24.68, 26.54, 23.49, 21.52, 22.24, 33.12, 23.18, 25.1 ,
       23.99, 22.84, 20.06, 40.32, 16.98, 31.97, 24.49, 23.75, 22.15,
       19.14, 25.33, 24.41, 25.38, 24.33, 32.14, 21.29, 19.61, 22.87,
       29.99, 25.09, 30.27, 23.59, 26.26, 29.68, 21.91, 28.09, 24.99,
       23.71, 32.92, 20.45, 28.76, 24.48, 20.89, 27.32, 25.21, 20.52,
       33.72, 27.69, 28.35, 27.98, 24.37, 28.75, 22.28, 23.24, 24.58,
       23.06, 45.19, 29.57, 22.05, 33.47, 21.62, 27.02, 20.8 , 24.39,
       33.91, 28.97, 32.22, 29.84, 28.12, 25.4 , 24.8 , 21.95, 20.97,
       25.82, 29.41])

In [115]:
sch_male_df['Age'].unique()

array([64, 53, 49, 62, 56, 39, 67, 73,  0, 71, 54, 23, 40, 35, 38, 42, 43,
       21, 61, 34, 72, 27, 32, 25, 24, 65, 47, 51, 63, 46, 52, 45, 68, 48,
       57, 70, 29, 58, 37, 36, 41, 28, 50, 17, 31, 55, 30, 12, 76])

In [116]:
# replace 0 with the median 

sch_male_age_filter = sch_male_df[sch_male_df['Age'] > 1]
sch_male_age_median = sch_male_age_filter['Age'].median()
sch_male_df['Age'] = sch_male_df['Age'].replace(0, sch_male_age_median)

sch_male_df.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetoanaerobium,Acetobacter,...,Weissella,Wenyingzhuangia,Wenzhouxiangella,Wolbachia,Xanthomonas,Xenorhabdus,Yersinia,Youngiibacter,Zoogloea,Zymomonas
0,ERR1072629,Male,17.67,64,United States of America,"['D001714', 'D003863', 'D007410', 'D008171', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00484,0.0
1,ERR1072937,Male,26.51,53,United States of America,"['D001714', 'D002318', 'D003863', 'D012559']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1073491,Male,28.08,49,United States of America,"['D001714', 'D003863', 'D012559']",0.003157,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ERR1075554,Male,30.74,62,United States of America,"['D001289', 'D001714', 'D003863', 'D008881', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR1075686,Male,45.84,56,United States of America,"['D001714', 'D003863', 'D008171', 'D012559']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [117]:
sch_male_df['Age'].unique()

array([64, 53, 49, 62, 56, 39, 67, 73, 48, 71, 54, 23, 40, 35, 38, 42, 43,
       21, 61, 34, 72, 27, 32, 25, 24, 65, 47, 51, 63, 46, 52, 45, 68, 57,
       70, 29, 58, 37, 36, 41, 28, 50, 17, 31, 55, 30, 12, 76])

In [118]:
sch_male_df['Country'].unique()

array(['United States of America', 'United Kingdom', 'Australia',
       'Sweden', 'Canada', 'New Zealand'], dtype=object)

In [119]:
mesh_id_sch_male_df = sch_male_df['Mesh ID'].unique()

In [120]:
mesh_ids_series_sch_male = pd.Series(sch_male_df['Mesh ID'])
flattened_mesh_ids_sch_male = [item for sublist in mesh_ids_series_sch_male.apply(ast.literal_eval) for item in sublist]
unique_mesh_ids_sch_male = pd.Series(flattened_mesh_ids_sch_male).unique()
unique_mesh_ids_sch_male


array(['D001714', 'D003863', 'D007410', 'D008171', 'D012559', 'D013959',
       'D043183', 'D002318', 'D001289', 'D008881', 'D003248', 'D002446',
       'D003920', 'D003967', 'D008107', 'D001327', 'D015212', 'D007674',
       'D003015', 'D000067877', 'D004827', 'D010661'], dtype=object)

In [121]:
mapped_diseases_sch_male = [
    [disease_dict[code.strip("[]' ")] for code in disease_list.split(',')]
    for disease_list in unique_mesh_ids_sch_male
]

print(mapped_diseases_sch_male)

[['Bipolar Disorder'], ['Depression'], ['Intestinal Diseases'], ['Lung Diseases'], ['Schizophrenia'], ['Thyroid Diseases'], ['Irritable Bowel Syndrome'], ['Cardiovascular Diseases'], ['Attention Deficit Disorder with Hyperactivity'], ['Migrane Disorders'], ['Constipation'], ['Celiac Disease'], ['Diabetes Mellitus'], ['Diarrhea'], ['Liver Diseases'], ['Autoimmune Diseases'], ['Inflammatory Bowel Diseases'], ['Kidney Diseases'], ['Clostridium Infections'], ['Autism Spectrum Disorder'], ['Epilepsy'], ['Phenylketonurias']]


In [122]:
bacteria_sch_male_df = sch_male_df.drop(['Run ID', 'Sex', 'BMI', 'Age', 'Country', 'Mesh ID'], axis=1)
att_sch_male_columns = ['Run ID', 'Sex', 'BMI', 'Age', 'Country', 'Mesh ID']
att_sch_male_df = sch_male_df[att_sch_male_columns]

zero_counts_sch_male = (bacteria_sch_male_df == 0.0).sum()

threshold_sch_male = 0.2 * len(bacteria_sch_male_df)

filtered_sch_male = zero_counts_sch_male[zero_counts_sch_male <= threshold_sch_male]

filtered_sch_male_df = bacteria_sch_male_df[filtered_sch_male.index]

filtered_sch_male = pd.concat([att_sch_male_df, filtered_sch_male_df], axis=1)

filtered_sch_male.to_csv('clean_schizophrenia_male.csv')

filtered_sch_male.shape

(100, 37)

In [123]:
filtered_sch_male.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Akkermansia,Alistipes,Anaerofilum,Anaerotruncus,...,Odoribacter,Oscillibacter,Parabacteroides,Prevotella,Pseudomonas,Roseburia,Ruminococcus,Sporobacter,Streptococcus,Unknown
0,ERR1072629,Male,17.67,64,United States of America,"['D001714', 'D003863', 'D007410', 'D008171', '...",2.15845,17.5047,0.033877,0.159706,...,1.40831,1.97454,11.494,0.029037,0.024198,0.503315,1.45671,0.198422,0.00484,21.9813
1,ERR1072937,Male,26.51,53,United States of America,"['D001714', 'D002318', 'D003863', 'D012559']",0.031791,1.01733,0.005299,0.105971,...,0.243734,0.927251,0.22254,0.052986,41.1699,0.090076,0.021194,0.11127,0.598739,10.1733
2,ERR1073491,Male,28.08,49,United States of America,"['D001714', 'D003863', 'D012559']",0.176768,5.6029,0.025253,0.126263,...,2.07702,0.656566,3.93624,0.022096,0.018939,0.407197,6.1774,0.085227,0.050505,11.0574
3,ERR1075554,Male,30.74,62,United States of America,"['D001289', 'D001714', 'D003863', 'D008881', '...",0.360811,3.15526,0.044181,0.191451,...,0.250359,2.03233,0.025772,0.14727,0.003682,1.39538,0.20986,0.224587,0.206178,25.9674
4,ERR1075686,Male,45.84,56,United States of America,"['D001714', 'D003863', 'D008171', 'D012559']",0.037344,4.91069,0.012448,1.16388,...,0.485467,0.55393,1.35059,0.024896,1.67424,0.149374,1.27591,0.908695,0.037344,55.3059


In [124]:
for mesh_id_str in filtered_sch_male['Mesh ID']:
    mesh_id_list = ast.literal_eval(mesh_id_str)
    
    mesh_id_string = ', '.join(mesh_id_list)
    
def format_mesh_ids(mesh_id_str):
    mesh_id_list = ast.literal_eval(mesh_id_str)
    return ', '.join(mesh_id_list)

filtered_sch_male['Mesh ID'] = filtered_sch_male['Mesh ID'].apply(format_mesh_ids)

def replace_mesh_ids_with_diseases(mesh_ids_str):
    mesh_ids = mesh_ids_str.split(', ')
    disease_names = [disease_dict.get(mesh_id, "Unknown Disease") for mesh_id in mesh_ids]
    return ', '.join(disease_names)

filtered_sch_male['Mesh ID'] = filtered_sch_male['Mesh ID'].apply(replace_mesh_ids_with_diseases)

filtered_sch_male.to_csv('clean_schizophrenia_male_condition.csv')

filtered_sch_male.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Akkermansia,Alistipes,Anaerofilum,Anaerotruncus,...,Odoribacter,Oscillibacter,Parabacteroides,Prevotella,Pseudomonas,Roseburia,Ruminococcus,Sporobacter,Streptococcus,Unknown
0,ERR1072629,Male,17.67,64,United States of America,"Bipolar Disorder, Depression, Intestinal Disea...",2.15845,17.5047,0.033877,0.159706,...,1.40831,1.97454,11.494,0.029037,0.024198,0.503315,1.45671,0.198422,0.00484,21.9813
1,ERR1072937,Male,26.51,53,United States of America,"Bipolar Disorder, Cardiovascular Diseases, Dep...",0.031791,1.01733,0.005299,0.105971,...,0.243734,0.927251,0.22254,0.052986,41.1699,0.090076,0.021194,0.11127,0.598739,10.1733
2,ERR1073491,Male,28.08,49,United States of America,"Bipolar Disorder, Depression, Schizophrenia",0.176768,5.6029,0.025253,0.126263,...,2.07702,0.656566,3.93624,0.022096,0.018939,0.407197,6.1774,0.085227,0.050505,11.0574
3,ERR1075554,Male,30.74,62,United States of America,"Attention Deficit Disorder with Hyperactivity,...",0.360811,3.15526,0.044181,0.191451,...,0.250359,2.03233,0.025772,0.14727,0.003682,1.39538,0.20986,0.224587,0.206178,25.9674
4,ERR1075686,Male,45.84,56,United States of America,"Bipolar Disorder, Depression, Lung Diseases, S...",0.037344,4.91069,0.012448,1.16388,...,0.485467,0.55393,1.35059,0.024896,1.67424,0.149374,1.27591,0.908695,0.037344,55.3059


### Schizophrenia Female

In [125]:
sch_female_df = pd.read_csv('schizophrenia_female.csv')
sch_female_df.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetivibrio,Acetoanaerobium,...,Williamsia,Wolbachia,Xanthomonas,Xenococcus,Xenorhabdus,Yersinia,Yokenella,Youngiibacter,Zoogloea,Zymomonas
0,ERR1073023,Female,34.72,35,United States of America,"['D001714', 'D003863', 'D012559']",0.0,0.005583,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ERR1073394,Female,22.41,27,United States of America,"['D001327', 'D001714', 'D003863', 'D003920', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1073395,Female,22.41,27,United States of America,"['D001327', 'D001714', 'D003863', 'D003920', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00247,0.0
3,ERR1073490,Female,25.73,37,United States of America,"['D001714', 'D003863', 'D012559']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR1073812,Female,29.71,55,United States of America,"['D001289', 'D001714', 'D003015', 'D003863', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [126]:
sch_female_df.isnull().sum()

Run ID           0
Sex              0
BMI              0
Age              0
Country          1
                ..
Yersinia         0
Yokenella        0
Youngiibacter    0
Zoogloea         0
Zymomonas        0
Length: 930, dtype: int64

In [127]:
sch_female_df['Country'].unique()

array(['United States of America', 'Australia', 'United Kingdom',
       'Canada', 'New Zealand', 'Switzerland', nan, 'Norway', 'Ireland'],
      dtype=object)

In [128]:
sch_female_df[sch_female_df['Country'].isna()]

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetivibrio,Acetoanaerobium,...,Williamsia,Wolbachia,Xanthomonas,Xenococcus,Xenorhabdus,Yersinia,Yokenella,Youngiibacter,Zoogloea,Zymomonas
176,ERR1160486,Female,39.2,40,,"['D001714', 'D003863', 'D012559', 'D043183']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000916,0.0,0.0,0.0,0.0


In [129]:
sch_female_df['Age'].describe()
# looks like a normal range

count    249.000000
mean      46.839357
std       13.547047
min       13.000000
25%       36.000000
50%       47.000000
75%       58.000000
max       72.000000
Name: Age, dtype: float64

In [130]:
# sch_female_df['BMI'].unique() produces a long list of results with a couple numbers that don't make sense, 
# including 2000 and 4 
# since the ages range from 13-72, a normal bmi should be above 15
# so, i will replace the BMI's outside of this range with the median of the rest of the group

sch_female_df['BMI'].describe()

count     249.000000
mean       36.558072
std       173.190305
min         1.540000
25%        21.790000
50%        24.300000
75%        28.360000
max      2756.040000
Name: BMI, dtype: float64

In [131]:
sch_female_bmi_filter = sch_female_df[(sch_female_df['BMI'] > 15) & (sch_female_df['BMI'] < 50)]
sch_female_bmi_median = sch_female_bmi_filter['BMI'].median()
sch_female_df['BMI'] = sch_female_df['BMI'].apply(lambda x: sch_female_bmi_median if x <= 15 or x >= 50 else x)

sch_female_df['BMI'].describe()

count    249.000000
mean      25.551466
std        5.652129
min       15.180000
25%       21.890000
50%       24.315000
75%       28.320000
max       48.420000
Name: BMI, dtype: float64

In [132]:
# i could drop this row from the dataframe 
# or i can find the average bmi's of women from each country and assign this person the country that fits 
# the problem with that is that it may not be accurate 
# but by deleting the column, i'm losing part of my sample size 
# i'll try replacing it first and see if i need to change it later on 

sch_female_bmi_us = sch_female_df[sch_female_df['Country'] == 'United States of America']['BMI'].describe()
sch_female_bmi_aus = sch_female_df[sch_female_df['Country'] == 'Australia']['BMI'].describe()
sch_female_bmi_uk = sch_female_df[sch_female_df['Country'] == 'United Kingdom']['BMI'].describe()
sch_female_bmi_can = sch_female_df[sch_female_df['Country'] == 'Canada']['BMI'].describe()
sch_female_bmi_nz = sch_female_df[sch_female_df['Country'] == 'New Zealand']['BMI'].describe()
sch_female_bmi_sw = sch_female_df[sch_female_df['Country'] == 'Switzerland']['BMI'].describe()
sch_female_bmi_nor = sch_female_df[sch_female_df['Country'] == 'Norway']['BMI'].describe()
sch_female_bmi_ir = sch_female_df[sch_female_df['Country'] == 'Ireland']['BMI'].describe()

#could have used a for loop

#print(sch_female_bmi_us, sch_female_bmi_aus, sch_female_bmi_uk, sch_female_bmi_can, sch_female_bmi_nz, 
#     sch_female_bmi_sw, sch_female_bmi_nor, sch_female_bmi_ir)

# this method indicates that the woman without a country designation could realistically be apart of the US
# or the UK 
# i'm deciding to drop this row since it's hard to tell

In [133]:
sch_female_df.dropna(subset=['Country'], inplace=True)

In [134]:
sch_female_df.shape
# confirmed one less row than the original sch_female_df

(248, 930)

In [135]:
mesh_ids_series_sch_female = pd.Series(sch_female_df['Mesh ID'])
flattened_mesh_ids_sch_female = [item for sublist in mesh_ids_series_sch_female.apply(ast.literal_eval) for item in sublist]
unique_mesh_ids_sch_female = pd.Series(flattened_mesh_ids_sch_female).unique()
unique_mesh_ids_sch_female


array(['D001714', 'D003863', 'D012559', 'D001327', 'D003920', 'D003967',
       'D013959', 'D001289', 'D003015', 'D004827', 'D007410', 'D008171',
       'D003248', 'D015212', 'D043183', 'D008107', 'D008881', 'D002318',
       'D007674', 'D002446', 'D000067877'], dtype=object)

In [136]:
mapped_diseases_sch_female = [
    [disease_dict[code.strip("[]' ")] for code in disease_list.split(',')]
    for disease_list in unique_mesh_ids_sch_female
]

print(mapped_diseases_sch_female)

[['Bipolar Disorder'], ['Depression'], ['Schizophrenia'], ['Autoimmune Diseases'], ['Diabetes Mellitus'], ['Diarrhea'], ['Thyroid Diseases'], ['Attention Deficit Disorder with Hyperactivity'], ['Clostridium Infections'], ['Epilepsy'], ['Intestinal Diseases'], ['Lung Diseases'], ['Constipation'], ['Inflammatory Bowel Diseases'], ['Irritable Bowel Syndrome'], ['Liver Diseases'], ['Migrane Disorders'], ['Cardiovascular Diseases'], ['Kidney Diseases'], ['Celiac Disease'], ['Autism Spectrum Disorder']]


In [137]:
bacteria_sch_female_df = sch_female_df.drop(['Run ID', 'Sex', 'BMI', 'Age', 'Country', 'Mesh ID'], axis=1)
att_sch_female_columns = ['Run ID', 'Sex', 'BMI', 'Age', 'Country', 'Mesh ID']
att_sch_female_df = sch_female_df[att_sch_female_columns]

zero_counts_sch_female = (bacteria_sch_female_df == 0.0).sum()

threshold_sch_female = 0.2 * len(bacteria_sch_female_df)

filtered_sch_female = zero_counts_sch_female[zero_counts_sch_female <= threshold_sch_female]

filtered_sch_female_df = bacteria_sch_female_df[filtered_sch_female.index]

filtered_sch_female = pd.concat([att_sch_female_df, filtered_sch_female_df], axis=1)

filtered_sch_female.to_csv('clean_schizophrenia_female.csv')

filtered_sch_female.shape

(248, 36)

In [138]:
filtered_sch_female.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Akkermansia,Alistipes,Anaerotruncus,Bacillus,...,Odoribacter,Oscillibacter,Parabacteroides,Prevotella,Pseudomonas,Roseburia,Ruminococcus,Sporobacter,Streptococcus,Unknown
0,ERR1073023,Female,34.72,35,United States of America,"['D001714', 'D003863', 'D012559']",0.005583,3.4167,0.904422,3.6121,...,0.022331,0.614113,3.81867,0.027914,0.005583,1.74743,1.57436,0.217731,0.206565,28.2101
1,ERR1073394,Female,22.41,27,United States of America,"['D001327', 'D001714', 'D003863', 'D003920', '...",0.005104,6.11698,0.053591,0.030623,...,0.604808,1.253,1.81953,0.117389,0.005104,1.43674,0.247537,0.010208,0.010208,3.81514
2,ERR1073395,Female,22.41,27,United States of America,"['D001327', 'D001714', 'D003863', 'D003920', '...",0.00247,6.74687,0.069173,0.027175,...,0.140817,1.339,1.64781,0.027175,0.012352,2.03073,0.303869,0.00247,0.019764,4.92366
3,ERR1073490,Female,25.73,37,United States of America,"['D001714', 'D003863', 'D012559']",0.331675,3.57188,0.290216,0.041459,...,0.424161,2.61194,2.0921,0.031892,0.328486,0.711188,0.373134,0.105243,0.054216,36.5066
4,ERR1073812,Female,29.71,55,United States of America,"['D001289', 'D001714', 'D003015', 'D003863', '...",0.00528,3.12022,0.068634,0.021118,...,0.211182,0.628267,0.950319,2.41804,0.047516,0.158387,0.021118,0.036957,0.047516,59.0993


In [139]:
for mesh_id_str in filtered_sch_female['Mesh ID']:
    mesh_id_list = ast.literal_eval(mesh_id_str)
    
    mesh_id_string = ', '.join(mesh_id_list)
    
def format_mesh_ids(mesh_id_str):
    mesh_id_list = ast.literal_eval(mesh_id_str)
    return ', '.join(mesh_id_list)

filtered_sch_female['Mesh ID'] = filtered_sch_female['Mesh ID'].apply(format_mesh_ids)

def replace_mesh_ids_with_diseases(mesh_ids_str):
    mesh_ids = mesh_ids_str.split(', ')
    disease_names = [disease_dict.get(mesh_id, "Unknown Disease") for mesh_id in mesh_ids]
    return ', '.join(disease_names)

filtered_sch_female['Mesh ID'] = filtered_sch_female['Mesh ID'].apply(replace_mesh_ids_with_diseases)

filtered_sch_female.to_csv('clean_schizophrenia_female_condition.csv')

filtered_sch_female.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Akkermansia,Alistipes,Anaerotruncus,Bacillus,...,Odoribacter,Oscillibacter,Parabacteroides,Prevotella,Pseudomonas,Roseburia,Ruminococcus,Sporobacter,Streptococcus,Unknown
0,ERR1073023,Female,34.72,35,United States of America,"Bipolar Disorder, Depression, Schizophrenia",0.005583,3.4167,0.904422,3.6121,...,0.022331,0.614113,3.81867,0.027914,0.005583,1.74743,1.57436,0.217731,0.206565,28.2101
1,ERR1073394,Female,22.41,27,United States of America,"Autoimmune Diseases, Bipolar Disorder, Depress...",0.005104,6.11698,0.053591,0.030623,...,0.604808,1.253,1.81953,0.117389,0.005104,1.43674,0.247537,0.010208,0.010208,3.81514
2,ERR1073395,Female,22.41,27,United States of America,"Autoimmune Diseases, Bipolar Disorder, Depress...",0.00247,6.74687,0.069173,0.027175,...,0.140817,1.339,1.64781,0.027175,0.012352,2.03073,0.303869,0.00247,0.019764,4.92366
3,ERR1073490,Female,25.73,37,United States of America,"Bipolar Disorder, Depression, Schizophrenia",0.331675,3.57188,0.290216,0.041459,...,0.424161,2.61194,2.0921,0.031892,0.328486,0.711188,0.373134,0.105243,0.054216,36.5066
4,ERR1073812,Female,29.71,55,United States of America,"Attention Deficit Disorder with Hyperactivity,...",0.00528,3.12022,0.068634,0.021118,...,0.211182,0.628267,0.950319,2.41804,0.047516,0.158387,0.021118,0.036957,0.047516,59.0993


### Bipolar Male

In [140]:
bi_male_df = pd.read_csv('bipolar_male.csv')
bi_male_df.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetivibrio,Acetoanaerobium,...,Weissella,Wenyingzhuangia,Wenzhouxiangella,Wolbachia,Xanthomonas,Xenorhabdus,Yersinia,Youngiibacter,Zoogloea,Zymomonas
0,ERR1072629,Male,17.67,64,United States of America,"['D001714', 'D003863', 'D007410', 'D008171', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00484,0.0
1,ERR1072937,Male,26.51,53,United States of America,"['D001714', 'D002318', 'D003863', 'D012559']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1073491,Male,28.08,49,United States of America,"['D001714', 'D003863', 'D012559']",0.003157,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ERR1075554,Male,30.74,62,United States of America,"['D001289', 'D001714', 'D003863', 'D008881', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR1075686,Male,45.84,56,United States of America,"['D001714', 'D003863', 'D008171', 'D012559']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [141]:
bi_male_df.isnull().sum()

Run ID           0
Sex              0
BMI              0
Age              0
Country          0
                ..
Xenorhabdus      0
Yersinia         0
Youngiibacter    0
Zoogloea         0
Zymomonas        0
Length: 783, dtype: int64

In [142]:
bi_male_df['BMI'].unique()
# need to replace the 0 and 4101.24 with the median of the rest

array([  17.67,   26.51,   28.08,   30.74,   45.84,   20.28,   22.72,
         20.62,   31.47,   27.37,    0.  ,   26.54,   23.49,   21.52,
         22.24,   33.12,   23.18,   25.1 ,   23.99,   22.84,   20.06,
         40.32,   16.98,   31.97,   24.49,   23.75,   22.15,   19.14,
         25.33,   24.41,   25.38,   24.33,   32.14, 4101.24,   21.29,
         19.61,   22.87,   29.99,   25.09,   30.27,   23.59,   26.26,
         29.68,   21.91,   28.09,   24.99,   23.71,   32.92,   20.45,
         24.68,   28.76,   24.48,   20.89,   27.32,   25.21,   20.52,
         33.72,   27.69,   28.35,   27.98,   24.37,   28.75,   22.28,
         23.24,   24.58,   23.06,   45.19,   29.57,   22.05,   33.47,
         21.62,   27.02,   20.8 ,   24.39,   33.91,   28.97,   32.22,
         29.84,   28.12,   25.4 ,   24.8 ,   21.95,   20.97,   25.82,
         29.41,   27.95,   28.24,   25.06,   25.42,   28.3 ,   22.64,
         25.84,   44.72,   32.28,   31.32,   27.45,   24.65,   38.03,
         36.41,   21

In [143]:
bi_male_bmi_filter = bi_male_df[(bi_male_df['BMI'] > 1) & (bi_male_df['BMI'] < 50)]
bi_male_bmi_median = bi_male_bmi_filter['BMI'].median()
bi_male_df['BMI'] = bi_male_df['BMI'].replace([0, 4101.24], bi_male_bmi_median)

bi_male_df['BMI'].unique()

array([17.67, 26.51, 28.08, 30.74, 45.84, 20.28, 22.72, 20.62, 31.47,
       27.37, 25.27, 26.54, 23.49, 21.52, 22.24, 33.12, 23.18, 25.1 ,
       23.99, 22.84, 20.06, 40.32, 16.98, 31.97, 24.49, 23.75, 22.15,
       19.14, 25.33, 24.41, 25.38, 24.33, 32.14, 21.29, 19.61, 22.87,
       29.99, 25.09, 30.27, 23.59, 26.26, 29.68, 21.91, 28.09, 24.99,
       23.71, 32.92, 20.45, 24.68, 28.76, 24.48, 20.89, 27.32, 25.21,
       20.52, 33.72, 27.69, 28.35, 27.98, 24.37, 28.75, 22.28, 23.24,
       24.58, 23.06, 45.19, 29.57, 22.05, 33.47, 21.62, 27.02, 20.8 ,
       24.39, 33.91, 28.97, 32.22, 29.84, 28.12, 25.4 , 24.8 , 21.95,
       20.97, 25.82, 29.41, 27.95, 28.24, 25.06, 25.42, 28.3 , 22.64,
       25.84, 44.72, 32.28, 31.32, 27.45, 24.65, 38.03, 36.41, 21.22,
       28.5 , 25.8 , 32.48, 31.26, 24.  ])

In [144]:
bi_male_df['Age'].unique()
# replace 0 with the median of the rest 

array([64, 53, 49, 62, 56, 39, 67, 73,  0, 71, 54, 23, 40, 35, 38, 42, 43,
       21, 61, 34, 72, 27, 32, 25, 24, 65, 47, 51, 63, 46, 52, 45, 68, 48,
       57, 70, 29, 58, 37, 36, 41, 28, 50, 17, 31, 55, 30, 12, 76, 60, 59,
       87, 74])

In [145]:
bi_male_age_filter = bi_male_df[(bi_male_df['Age'] > 1)]
bi_male_age_median = bi_male_age_filter['Age'].median()
bi_male_df['Age'] = bi_male_df['Age'].replace(0, bi_male_age_median)

bi_male_df['Age'].nunique()

52

In [146]:
bi_male_df['Country'].unique()

array(['United States of America', 'United Kingdom', 'Australia',
       'Sweden', 'Canada', 'New Zealand'], dtype=object)

In [147]:
mesh_ids_series_bi_male = pd.Series(bi_male_df['Mesh ID'])
flattened_mesh_ids_bi_male = [item for sublist in mesh_ids_series_bi_male.apply(ast.literal_eval) for item in sublist]
unique_mesh_ids_bi_male = pd.Series(flattened_mesh_ids_bi_male).unique()
unique_mesh_ids_bi_male


array(['D001714', 'D003863', 'D007410', 'D008171', 'D012559', 'D013959',
       'D043183', 'D002318', 'D001289', 'D008881', 'D003248', 'D002446',
       'D003920', 'D003967', 'D008107', 'D001327', 'D015212', 'D007674',
       'D003015', 'D000067877', 'D004827', 'D010661'], dtype=object)

In [148]:
mapped_diseases_bi_male = [
    [disease_dict[code.strip("[]' ")] for code in disease_list.split(',')]
    for disease_list in unique_mesh_ids_bi_male
]

print(mapped_diseases_bi_male)

[['Bipolar Disorder'], ['Depression'], ['Intestinal Diseases'], ['Lung Diseases'], ['Schizophrenia'], ['Thyroid Diseases'], ['Irritable Bowel Syndrome'], ['Cardiovascular Diseases'], ['Attention Deficit Disorder with Hyperactivity'], ['Migrane Disorders'], ['Constipation'], ['Celiac Disease'], ['Diabetes Mellitus'], ['Diarrhea'], ['Liver Diseases'], ['Autoimmune Diseases'], ['Inflammatory Bowel Diseases'], ['Kidney Diseases'], ['Clostridium Infections'], ['Autism Spectrum Disorder'], ['Epilepsy'], ['Phenylketonurias']]


In [149]:
bacteria_bi_male_df = bi_male_df.drop(['Run ID', 'Sex', 'BMI', 'Age', 'Country', 'Mesh ID'], axis=1)
att_bi_male_columns = ['Run ID', 'Sex', 'BMI', 'Age', 'Country', 'Mesh ID']
att_bi_male_df = bi_male_df[att_bi_male_columns]

zero_counts_bi_male = (bacteria_bi_male_df == 0.0).sum()

threshold_bi_male = 0.2 * len(bacteria_bi_male_df)

filtered_bi_male = zero_counts_bi_male[zero_counts_bi_male <= threshold_bi_male]

filtered_bi_male_df = bacteria_bi_male_df[filtered_bi_male.index]

filtered_bi_male = pd.concat([att_bi_male_df, filtered_bi_male_df], axis=1)

filtered_bi_male.to_csv('clean_bipolar_male.csv')

filtered_bi_male.shape

(123, 32)

In [150]:
filtered_bi_male.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Akkermansia,Alistipes,Anaerofilum,Anaerotruncus,...,Lachnoclostridium,Odoribacter,Oscillibacter,Parabacteroides,Prevotella,Roseburia,Ruminococcus,Sporobacter,Streptococcus,Unknown
0,ERR1072629,Male,17.67,64,United States of America,"['D001714', 'D003863', 'D007410', 'D008171', '...",2.15845,17.5047,0.033877,0.159706,...,1.23409,1.40831,1.97454,11.494,0.029037,0.503315,1.45671,0.198422,0.00484,21.9813
1,ERR1072937,Male,26.51,53,United States of America,"['D001714', 'D002318', 'D003863', 'D012559']",0.031791,1.01733,0.005299,0.105971,...,0.328512,0.243734,0.927251,0.22254,0.052986,0.090076,0.021194,0.11127,0.598739,10.1733
2,ERR1073491,Male,28.08,49,United States of America,"['D001714', 'D003863', 'D012559']",0.176768,5.6029,0.025253,0.126263,...,2.60732,2.07702,0.656566,3.93624,0.022096,0.407197,6.1774,0.085227,0.050505,11.0574
3,ERR1075554,Male,30.74,62,United States of America,"['D001289', 'D001714', 'D003863', 'D008881', '...",0.360811,3.15526,0.044181,0.191451,...,3.41666,0.250359,2.03233,0.025772,0.14727,1.39538,0.20986,0.224587,0.206178,25.9674
4,ERR1075686,Male,45.84,56,United States of America,"['D001714', 'D003863', 'D008171', 'D012559']",0.037344,4.91069,0.012448,1.16388,...,0.304973,0.485467,0.55393,1.35059,0.024896,0.149374,1.27591,0.908695,0.037344,55.3059


In [151]:
for mesh_id_str in filtered_bi_male['Mesh ID']:
    mesh_id_list = ast.literal_eval(mesh_id_str)
    
    mesh_id_string = ', '.join(mesh_id_list)
    
def format_mesh_ids(mesh_id_str):
    mesh_id_list = ast.literal_eval(mesh_id_str)
    return ', '.join(mesh_id_list)

filtered_bi_male['Mesh ID'] = filtered_bi_male['Mesh ID'].apply(format_mesh_ids)

def replace_mesh_ids_with_diseases(mesh_ids_str):
    mesh_ids = mesh_ids_str.split(', ')
    disease_names = [disease_dict.get(mesh_id, "Unknown Disease") for mesh_id in mesh_ids]
    return ', '.join(disease_names)

filtered_bi_male['Mesh ID'] = filtered_bi_male['Mesh ID'].apply(replace_mesh_ids_with_diseases)

filtered_bi_male.to_csv('clean_bipolar_male_condition.csv')

filtered_bi_male.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Akkermansia,Alistipes,Anaerofilum,Anaerotruncus,...,Lachnoclostridium,Odoribacter,Oscillibacter,Parabacteroides,Prevotella,Roseburia,Ruminococcus,Sporobacter,Streptococcus,Unknown
0,ERR1072629,Male,17.67,64,United States of America,"Bipolar Disorder, Depression, Intestinal Disea...",2.15845,17.5047,0.033877,0.159706,...,1.23409,1.40831,1.97454,11.494,0.029037,0.503315,1.45671,0.198422,0.00484,21.9813
1,ERR1072937,Male,26.51,53,United States of America,"Bipolar Disorder, Cardiovascular Diseases, Dep...",0.031791,1.01733,0.005299,0.105971,...,0.328512,0.243734,0.927251,0.22254,0.052986,0.090076,0.021194,0.11127,0.598739,10.1733
2,ERR1073491,Male,28.08,49,United States of America,"Bipolar Disorder, Depression, Schizophrenia",0.176768,5.6029,0.025253,0.126263,...,2.60732,2.07702,0.656566,3.93624,0.022096,0.407197,6.1774,0.085227,0.050505,11.0574
3,ERR1075554,Male,30.74,62,United States of America,"Attention Deficit Disorder with Hyperactivity,...",0.360811,3.15526,0.044181,0.191451,...,3.41666,0.250359,2.03233,0.025772,0.14727,1.39538,0.20986,0.224587,0.206178,25.9674
4,ERR1075686,Male,45.84,56,United States of America,"Bipolar Disorder, Depression, Lung Diseases, S...",0.037344,4.91069,0.012448,1.16388,...,0.304973,0.485467,0.55393,1.35059,0.024896,0.149374,1.27591,0.908695,0.037344,55.3059


### Bipolar Female

In [152]:
bi_female_df = pd.read_csv('bipolar_female.csv')
bi_female_df.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetivibrio,Acetoanaerobium,...,Williamsia,Wolbachia,Xanthomonas,Xenococcus,Xenorhabdus,Yersinia,Yokenella,Youngiibacter,Zoogloea,Zymomonas
0,ERR1073023,Female,34.72,35,United States of America,"['D001714', 'D003863', 'D012559']",0.0,0.005583,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ERR1073394,Female,22.41,27,United States of America,"['D001327', 'D001714', 'D003863', 'D003920', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1073395,Female,22.41,27,United States of America,"['D001327', 'D001714', 'D003863', 'D003920', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00247,0.0
3,ERR1073490,Female,25.73,37,United States of America,"['D001714', 'D003863', 'D012559']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR1073812,Female,29.71,55,United States of America,"['D001289', 'D001714', 'D003015', 'D003863', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
bi_female_df.isnull().sum()

Run ID           0
Sex              0
BMI              0
Age              0
Country          1
                ..
Yersinia         0
Yokenella        0
Youngiibacter    0
Zoogloea         0
Zymomonas        0
Length: 956, dtype: int64

In [154]:
bi_female_df['Country'].unique()

array(['United States of America', 'Australia', 'United Kingdom',
       'Canada', 'New Zealand', 'Switzerland', nan, 'Norway', 'Ireland'],
      dtype=object)

In [155]:
bi_female_df[bi_female_df['Country'].isna()]
# same entry from the sch_female_df and since i already tried and couldn't replace it with accuracy,
# i'm going to drop this row right away

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetivibrio,Acetoanaerobium,...,Williamsia,Wolbachia,Xanthomonas,Xenococcus,Xenorhabdus,Yersinia,Yokenella,Youngiibacter,Zoogloea,Zymomonas
176,ERR1160486,Female,39.2,40,,"['D001714', 'D003863', 'D012559', 'D043183']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000916,0.0,0.0,0.0,0.0


In [156]:
bi_female_df.dropna(subset=['Country'], inplace=True)

In [157]:
bi_female_df['Country'].unique()

array(['United States of America', 'Australia', 'United Kingdom',
       'Canada', 'New Zealand', 'Switzerland', 'Norway', 'Ireland'],
      dtype=object)

In [158]:
bi_female_df['Age'].unique()
# doesn't look like anything needs to be changed or replaced

array([35, 27, 37, 55, 28, 47, 41, 57, 67, 36, 53, 56, 43, 48, 60, 44, 45,
       33, 30, 63, 66, 46, 61, 59, 38, 68, 51, 70, 62, 40, 42, 52, 54, 32,
       39, 13, 19, 26, 49, 22, 29, 65, 34, 64, 58, 69, 31, 50, 21, 24, 25,
       20, 72, 71])

In [159]:
bi_female_df['BMI'].describe()
# there are outliers here and since a normal bmi for adults (no one under the age of 18 in the df)
# again going to replace anything less than 15 and greater than 50 for a larger range 

count     316.000000
mean       35.063829
std       153.740943
min         1.540000
25%        21.927500
50%        24.815000
75%        29.665000
max      2756.040000
Name: BMI, dtype: float64

In [160]:
bi_female_bmi_filter = bi_female_df[(bi_female_df['BMI'] > 15) & (bi_female_df['BMI'] < 50)]
bi_female_bmi_median = bi_female_bmi_filter['BMI'].median()
bi_female_df['BMI'] = bi_female_df['BMI'].apply(lambda x: bi_female_bmi_median if x <= 15 or x >= 50 else x)

bi_female_df['BMI'].describe()

count    316.000000
mean      26.309367
std        6.183916
min       15.180000
25%       21.970000
50%       24.815000
75%       29.245000
max       48.420000
Name: BMI, dtype: float64

In [161]:
mesh_ids_series_bi_female = pd.Series(bi_female_df['Mesh ID'])
flattened_mesh_ids_bi_female = [item for sublist in mesh_ids_series_bi_female.apply(ast.literal_eval) for item in sublist]
unique_mesh_ids_bi_female = pd.Series(flattened_mesh_ids_bi_female).unique()
unique_mesh_ids_bi_female


array(['D001714', 'D003863', 'D012559', 'D001327', 'D003920', 'D003967',
       'D013959', 'D001289', 'D003015', 'D004827', 'D007410', 'D008171',
       'D003248', 'D015212', 'D043183', 'D008107', 'D008881', 'D002318',
       'D007674', 'D002446', 'D000067877'], dtype=object)

In [162]:
mapped_diseases_bi_female = [
    [disease_dict[code.strip("[]' ")] for code in disease_list.split(',')]
    for disease_list in unique_mesh_ids_bi_female
]

print(mapped_diseases_bi_female)

[['Bipolar Disorder'], ['Depression'], ['Schizophrenia'], ['Autoimmune Diseases'], ['Diabetes Mellitus'], ['Diarrhea'], ['Thyroid Diseases'], ['Attention Deficit Disorder with Hyperactivity'], ['Clostridium Infections'], ['Epilepsy'], ['Intestinal Diseases'], ['Lung Diseases'], ['Constipation'], ['Inflammatory Bowel Diseases'], ['Irritable Bowel Syndrome'], ['Liver Diseases'], ['Migrane Disorders'], ['Cardiovascular Diseases'], ['Kidney Diseases'], ['Celiac Disease'], ['Autism Spectrum Disorder']]


In [163]:
bacteria_bi_female_df = bi_female_df.drop(['Run ID', 'Sex', 'BMI', 'Age', 'Country', 'Mesh ID'], axis=1)
att_bi_female_columns = ['Run ID', 'Sex', 'BMI', 'Age', 'Country', 'Mesh ID']
att_bi_female_df = bi_female_df[att_bi_female_columns]

zero_counts_bi_female = (bacteria_bi_female_df == 0.0).sum()

threshold_bi_female = 0.2 * len(bacteria_bi_female_df)

filtered_bi_female = zero_counts_bi_female[zero_counts_bi_female <= threshold_bi_female]

filtered_bi_female_df = bacteria_bi_female_df[filtered_bi_female.index]

filtered_bi_female = pd.concat([att_bi_female_df, filtered_bi_female_df], axis=1)

filtered_bi_female.to_csv('clean_bipolar_female.csv')

filtered_bi_female.shape

(316, 33)

In [164]:
filtered_bi_female.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Akkermansia,Alistipes,Anaerotruncus,Bacillus,...,Odoribacter,Oscillibacter,Parabacteroides,Prevotella,Roseburia,Ruminiclostridium,Ruminococcus,Sporobacter,Streptococcus,Unknown
0,ERR1073023,Female,34.72,35,United States of America,"['D001714', 'D003863', 'D012559']",0.005583,3.4167,0.904422,3.6121,...,0.022331,0.614113,3.81867,0.027914,1.74743,1.11099,1.57436,0.217731,0.206565,28.2101
1,ERR1073394,Female,22.41,27,United States of America,"['D001327', 'D001714', 'D003863', 'D003920', '...",0.005104,6.11698,0.053591,0.030623,...,0.604808,1.253,1.81953,0.117389,1.43674,0.165876,0.247537,0.010208,0.010208,3.81514
2,ERR1073395,Female,22.41,27,United States of America,"['D001327', 'D001714', 'D003863', 'D003920', '...",0.00247,6.74687,0.069173,0.027175,...,0.140817,1.339,1.64781,0.027175,2.03073,0.185286,0.303869,0.00247,0.019764,4.92366
3,ERR1073490,Female,25.73,37,United States of America,"['D001714', 'D003863', 'D012559']",0.331675,3.57188,0.290216,0.041459,...,0.424161,2.61194,2.0921,0.031892,0.711188,0.369945,0.373134,0.105243,0.054216,36.5066
4,ERR1073812,Female,29.71,55,United States of America,"['D001289', 'D001714', 'D003015', 'D003863', '...",0.00528,3.12022,0.068634,0.021118,...,0.211182,0.628267,0.950319,2.41804,0.158387,0.23758,0.021118,0.036957,0.047516,59.0993


In [165]:
for mesh_id_str in filtered_bi_female['Mesh ID']:
    mesh_id_list = ast.literal_eval(mesh_id_str)
    
    mesh_id_string = ', '.join(mesh_id_list)
    
def format_mesh_ids(mesh_id_str):
    mesh_id_list = ast.literal_eval(mesh_id_str)
    return ', '.join(mesh_id_list)

filtered_bi_female['Mesh ID'] = filtered_bi_female['Mesh ID'].apply(format_mesh_ids)

def replace_mesh_ids_with_diseases(mesh_ids_str):
    mesh_ids = mesh_ids_str.split(', ')
    disease_names = [disease_dict.get(mesh_id, "Unknown Disease") for mesh_id in mesh_ids]
    return ', '.join(disease_names)

filtered_bi_female['Mesh ID'] = filtered_bi_female['Mesh ID'].apply(replace_mesh_ids_with_diseases)

filtered_bi_female.to_csv('clean_bipolar_female_condition.csv')

filtered_bi_female.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Akkermansia,Alistipes,Anaerotruncus,Bacillus,...,Odoribacter,Oscillibacter,Parabacteroides,Prevotella,Roseburia,Ruminiclostridium,Ruminococcus,Sporobacter,Streptococcus,Unknown
0,ERR1073023,Female,34.72,35,United States of America,"Bipolar Disorder, Depression, Schizophrenia",0.005583,3.4167,0.904422,3.6121,...,0.022331,0.614113,3.81867,0.027914,1.74743,1.11099,1.57436,0.217731,0.206565,28.2101
1,ERR1073394,Female,22.41,27,United States of America,"Autoimmune Diseases, Bipolar Disorder, Depress...",0.005104,6.11698,0.053591,0.030623,...,0.604808,1.253,1.81953,0.117389,1.43674,0.165876,0.247537,0.010208,0.010208,3.81514
2,ERR1073395,Female,22.41,27,United States of America,"Autoimmune Diseases, Bipolar Disorder, Depress...",0.00247,6.74687,0.069173,0.027175,...,0.140817,1.339,1.64781,0.027175,2.03073,0.185286,0.303869,0.00247,0.019764,4.92366
3,ERR1073490,Female,25.73,37,United States of America,"Bipolar Disorder, Depression, Schizophrenia",0.331675,3.57188,0.290216,0.041459,...,0.424161,2.61194,2.0921,0.031892,0.711188,0.369945,0.373134,0.105243,0.054216,36.5066
4,ERR1073812,Female,29.71,55,United States of America,"Attention Deficit Disorder with Hyperactivity,...",0.00528,3.12022,0.068634,0.021118,...,0.211182,0.628267,0.950319,2.41804,0.158387,0.23758,0.021118,0.036957,0.047516,59.0993


### Epilepsy Male

In [166]:
ep_male_df = pd.read_csv('epilepsy_male.csv')
ep_male_df.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetoanaerobium,Acholeplasma,...,Vibrio,Victivallis,Virgibacillus,Vulgatibacter,Weissella,Xanthomonas,Xenorhabdus,Yersinia,Zoogloea,Zymomonas
0,ERR1072832,Male,24.93,58,United States of America,['D004827'],0.0,0.0,0.00357,0.0,...,0.0,0.00357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ERR1076844,Male,21.02,36,United States of America,"['D003248', 'D004827', 'D008171', 'D013959']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1089831,Male,22.91,43,United States of America,['D004827'],0.0,0.0,0.0,0.0,...,0.0,0.005888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ERR1089867,Male,15.62,6,Canada,['D004827'],0.0,0.0,0.0,0.0,...,0.0,0.005153,0.0,0.0,0.010306,0.0,0.0,0.0,0.0,0.0
4,ERR1089868,Male,15.62,6,Canada,['D004827'],0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [167]:
ep_male_df.isnull().sum()

Run ID         0
Sex            0
BMI            0
Age            0
Country        0
              ..
Xanthomonas    0
Xenorhabdus    0
Yersinia       0
Zoogloea       0
Zymomonas      0
Length: 465, dtype: int64

In [168]:
ep_male_df['BMI'].unique()
# everything here looks to be in a normal range, expecting to see lower numbers since there are a couple 
# six year olds

array([24.93, 21.02, 22.91, 15.62, 23.87, 17.94, 27.46, 14.23, 21.79,
       27.51, 23.63, 30.52, 28.12, 24.68, 24.31, 25.11, 20.8 , 19.22,
       24.44, 21.74, 16.64])

In [169]:
ep_male_df['Age'].unique()

array([58, 36, 43,  6, 63, 70,  9,  8, 27, 77, 31, 54, 61, 34, 69, 17, 65,
        5])

In [170]:
ep_male_df['Country'].unique()

array(['United States of America', 'Canada', 'United Kingdom',
       'Czech Republic'], dtype=object)

In [171]:
mesh_ids_series_ep_male = pd.Series(ep_male_df['Mesh ID'])
flattened_mesh_ids_ep_male = [item for sublist in mesh_ids_series_ep_male.apply(ast.literal_eval) for item in sublist]
unique_mesh_ids_ep_male = pd.Series(flattened_mesh_ids_ep_male).unique()
unique_mesh_ids_ep_male


array(['D004827', 'D003248', 'D008171', 'D013959', 'D003967', 'D008881',
       'D001327', 'D002318', 'D007410', 'D000067877', 'D001289',
       'D008107', 'D007674', 'D001714', 'D003015', 'D003863', 'D012559',
       'D043183'], dtype=object)

In [172]:
mapped_diseases_ep_male = [
    [disease_dict[code.strip("[]' ")] for code in disease_list.split(',')]
    for disease_list in unique_mesh_ids_ep_male
]

print(mapped_diseases_ep_male)

[['Epilepsy'], ['Constipation'], ['Lung Diseases'], ['Thyroid Diseases'], ['Diarrhea'], ['Migrane Disorders'], ['Autoimmune Diseases'], ['Cardiovascular Diseases'], ['Intestinal Diseases'], ['Autism Spectrum Disorder'], ['Attention Deficit Disorder with Hyperactivity'], ['Liver Diseases'], ['Kidney Diseases'], ['Bipolar Disorder'], ['Clostridium Infections'], ['Depression'], ['Schizophrenia'], ['Irritable Bowel Syndrome']]


In [173]:
bacteria_ep_male_df = ep_male_df.drop(['Run ID', 'Sex', 'BMI', 'Age', 'Country', 'Mesh ID'], axis=1)
att_ep_male_columns = ['Run ID', 'Sex', 'BMI', 'Age', 'Country', 'Mesh ID']
att_ep_male_df = ep_male_df[att_ep_male_columns]

zero_counts_ep_male = (bacteria_ep_male_df == 0.0).sum()

threshold_ep_male = 0.2 * len(bacteria_ep_male_df)

filtered_ep_male = zero_counts_ep_male[zero_counts_ep_male <= threshold_ep_male]

filtered_ep_male_df = bacteria_ep_male_df[filtered_ep_male.index]

filtered_ep_male = pd.concat([att_ep_male_df, filtered_ep_male_df], axis=1)

filtered_ep_male.to_csv('clean_epilepsy_male.csv')

filtered_ep_male.shape

(22, 33)

In [174]:
filtered_ep_male.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Akkermansia,Alistipes,Bacillus,Bacteroides,...,Lactobacillus,Oscillibacter,Parabacteroides,Prevotella,Pseudomonas,Roseburia,Ruminococcus,Sporobacter,Streptococcus,Unknown
0,ERR1072832,Male,24.93,58,United States of America,['D004827'],2.01364,7.35121,0.024992,26.4344,...,0.00357,1.1532,3.53815,34.9352,0.010711,0.067835,0.25349,0.024992,0.014281,5.95166
1,ERR1076844,Male,21.02,36,United States of America,"['D003248', 'D004827', 'D008171', 'D013959']",0.0,0.010182,0.010182,37.9514,...,0.0,0.295276,1.68341,0.162911,0.01697,0.084849,19.96,0.006788,0.020364,10.5654
2,ERR1089831,Male,22.91,43,United States of America,['D004827'],0.067717,3.40939,0.032386,18.2717,...,0.144266,1.25718,0.618284,0.020609,0.082438,2.04034,4.86972,0.479906,0.026498,18.8783
3,ERR1089867,Male,15.62,6,Canada,['D004827'],0.061837,2.36009,0.097908,38.6375,...,0.030918,0.340101,3.99876,0.005153,0.170051,1.22127,11.8984,0.42255,0.530764,14.6192
4,ERR1089868,Male,15.62,6,Canada,['D004827'],7.8405,4.48589,0.095206,28.1474,...,0.016801,6.54682,1.38329,0.235215,0.0,1.70811,0.341622,0.403226,0.420027,12.0744


In [175]:
for mesh_id_str in filtered_ep_male['Mesh ID']:
    mesh_id_list = ast.literal_eval(mesh_id_str)
    
    mesh_id_string = ', '.join(mesh_id_list)
    
def format_mesh_ids(mesh_id_str):
    mesh_id_list = ast.literal_eval(mesh_id_str)
    return ', '.join(mesh_id_list)

filtered_ep_male['Mesh ID'] = filtered_ep_male['Mesh ID'].apply(format_mesh_ids)

def replace_mesh_ids_with_diseases(mesh_ids_str):
    mesh_ids = mesh_ids_str.split(', ')
    disease_names = [disease_dict.get(mesh_id, "Unknown Disease") for mesh_id in mesh_ids]
    return ', '.join(disease_names)

filtered_ep_male['Mesh ID'] = filtered_ep_male['Mesh ID'].apply(replace_mesh_ids_with_diseases)

filtered_ep_male.to_csv('clean_epilepsy_male_condition.csv')

filtered_ep_male.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Akkermansia,Alistipes,Bacillus,Bacteroides,...,Lactobacillus,Oscillibacter,Parabacteroides,Prevotella,Pseudomonas,Roseburia,Ruminococcus,Sporobacter,Streptococcus,Unknown
0,ERR1072832,Male,24.93,58,United States of America,Epilepsy,2.01364,7.35121,0.024992,26.4344,...,0.00357,1.1532,3.53815,34.9352,0.010711,0.067835,0.25349,0.024992,0.014281,5.95166
1,ERR1076844,Male,21.02,36,United States of America,"Constipation, Epilepsy, Lung Diseases, Thyroid...",0.0,0.010182,0.010182,37.9514,...,0.0,0.295276,1.68341,0.162911,0.01697,0.084849,19.96,0.006788,0.020364,10.5654
2,ERR1089831,Male,22.91,43,United States of America,Epilepsy,0.067717,3.40939,0.032386,18.2717,...,0.144266,1.25718,0.618284,0.020609,0.082438,2.04034,4.86972,0.479906,0.026498,18.8783
3,ERR1089867,Male,15.62,6,Canada,Epilepsy,0.061837,2.36009,0.097908,38.6375,...,0.030918,0.340101,3.99876,0.005153,0.170051,1.22127,11.8984,0.42255,0.530764,14.6192
4,ERR1089868,Male,15.62,6,Canada,Epilepsy,7.8405,4.48589,0.095206,28.1474,...,0.016801,6.54682,1.38329,0.235215,0.0,1.70811,0.341622,0.403226,0.420027,12.0744


### Epilepsy Female

In [176]:
ep_female_df = pd.read_csv('epilepsy_female.csv')
ep_female_df.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Acanthopleuribacter,Acetivibrio,Acetoanaerobium,Acetobacterium,...,Wandonia,Weeksella,Weissella,Wenzhouxiangella,Wolbachia,Xanthomonas,Xenorhabdus,Yersinia,Youngiibacter,Zoogloea
0,ERR1073812,Female,29.71,55,United States of America,"['D001289', 'D001714', 'D003015', 'D003863', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ERR1090471,Female,38.01,47,United States of America,"['D001327', 'D001714', 'D003248', 'D003863', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.004069,0.0,0.0,0.0
2,ERR1091746,Female,31.09,59,United States of America,"['D001327', 'D003920', 'D004827', 'D013959']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001851,0.005553,0.0,0.0
3,ERR1091758,Female,21.79,62,United States of America,"['D001289', 'D001327', 'D001714', 'D002318', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002317,0.0,0.0
4,ERR1160392,Female,35.19,50,United States of America,"['D001714', 'D003248', 'D003863', 'D004827', '...",0.0,0.0,0.001275,0.001275,...,0.0,0.0,0.00255,0.001275,0.001275,0.0,0.001275,0.005101,0.0,0.0


In [177]:
ep_female_df.isnull().sum()

Run ID           0
Sex              0
BMI              0
Age              0
Country          0
                ..
Xanthomonas      0
Xenorhabdus      0
Yersinia         0
Youngiibacter    0
Zoogloea         0
Length: 452, dtype: int64

In [178]:
ep_female_df['BMI'].unique()

array([29.71, 38.01, 31.09, 21.79, 35.19, 26.7 , 19.9 , 20.85, 21.45,
       28.76, 23.49, 21.74, 20.6 , 19.04, 25.73, 18.61, 26.52])

In [179]:
ep_female_df['Age'].unique()

array([55, 47, 59, 62, 50, 39, 45, 13, 64, 51, 40, 42, 33,  7])

In [180]:
ep_female_df['Country'].unique()

array(['United States of America', 'United Kingdom', 'Germany'],
      dtype=object)

In [181]:
mesh_ids_series_ep_female = pd.Series(ep_female_df['Mesh ID'])
flattened_mesh_ids_ep_female = [item for sublist in mesh_ids_series_ep_female.apply(ast.literal_eval) for item in sublist]
unique_mesh_ids_ep_female = pd.Series(flattened_mesh_ids_ep_female).unique()
unique_mesh_ids_ep_female


array(['D001289', 'D001714', 'D003015', 'D003863', 'D003920', 'D004827',
       'D007410', 'D008171', 'D012559', 'D001327', 'D003248', 'D013959',
       'D002318', 'D043183', 'D008881', 'D002446'], dtype=object)

In [182]:
mapped_diseases_ep_female = [
    [disease_dict[code.strip("[]' ")] for code in disease_list.split(',')]
    for disease_list in unique_mesh_ids_ep_female
]

print(mapped_diseases_ep_female)

[['Attention Deficit Disorder with Hyperactivity'], ['Bipolar Disorder'], ['Clostridium Infections'], ['Depression'], ['Diabetes Mellitus'], ['Epilepsy'], ['Intestinal Diseases'], ['Lung Diseases'], ['Schizophrenia'], ['Autoimmune Diseases'], ['Constipation'], ['Thyroid Diseases'], ['Cardiovascular Diseases'], ['Irritable Bowel Syndrome'], ['Migrane Disorders'], ['Celiac Disease']]


In [183]:
bacteria_ep_female_df = ep_female_df.drop(['Run ID', 'Sex', 'BMI', 'Age', 'Country', 'Mesh ID'], axis=1)
att_ep_female_columns = ['Run ID', 'Sex', 'BMI', 'Age', 'Country', 'Mesh ID']
att_ep_female_df = ep_female_df[att_ep_female_columns]

zero_counts_ep_female = (bacteria_ep_female_df == 0.0).sum()

threshold_ep_female = 0.2 * len(bacteria_ep_female_df)

filtered_ep_female = zero_counts_ep_female[zero_counts_ep_female <= threshold_ep_female]

filtered_ep_female_df = bacteria_ep_female_df[filtered_ep_female.index]

filtered_ep_female = pd.concat([att_ep_female_df, filtered_ep_female_df], axis=1)

filtered_ep_female.to_csv('clean_epilepsy_female.csv')

filtered_ep_female.shape

(20, 40)

In [184]:
filtered_ep_female.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Acinetobacter,Akkermansia,Alistipes,Anaerofilum,...,Parasutterella,Prevotella,Pseudomonas,Roseburia,Ruminiclostridium,Ruminococcus,Sporobacter,Streptococcus,Unknown,Veillonella
0,ERR1073812,Female,29.71,55,United States of America,"['D001289', 'D001714', 'D003015', 'D003863', '...",0.010559,0.00528,3.12022,0.026398,...,0.269257,2.41804,0.047516,0.158387,0.23758,0.021118,0.036957,0.047516,59.0993,0.015839
1,ERR1090471,Female,38.01,47,United States of America,"['D001327', 'D001714', 'D003248', 'D003863', '...",0.012207,0.0,0.732452,0.036623,...,0.028484,0.0,0.101729,0.077314,0.183113,0.016277,0.016277,0.109868,75.4873,0.0
2,ERR1091746,Female,31.09,59,United States of America,"['D001327', 'D003920', 'D004827', 'D013959']",0.001851,0.009255,0.87367,0.011106,...,0.003702,27.9445,0.007404,0.634891,0.003702,0.732994,0.012957,0.623785,47.5909,0.42758
3,ERR1091758,Female,21.79,62,United States of America,"['D001289', 'D001327', 'D001714', 'D002318', '...",0.078793,0.366156,0.104285,0.0,...,0.020857,0.0,0.491298,0.002317,0.0,0.013905,0.0,0.027809,90.174,0.0
4,ERR1160392,Female,35.19,50,United States of America,"['D001714', 'D003248', 'D003863', 'D004827', '...",0.028054,0.012752,1.71383,0.029329,...,0.001275,0.091812,0.048456,0.386376,0.972954,0.095638,0.132618,0.146644,72.8695,0.020403


In [185]:
for mesh_id_str in filtered_ep_female['Mesh ID']:
    mesh_id_list = ast.literal_eval(mesh_id_str)
    
    mesh_id_string = ', '.join(mesh_id_list)
    
def format_mesh_ids(mesh_id_str):
    mesh_id_list = ast.literal_eval(mesh_id_str)
    return ', '.join(mesh_id_list)

filtered_ep_female['Mesh ID'] = filtered_ep_female['Mesh ID'].apply(format_mesh_ids)

def replace_mesh_ids_with_diseases(mesh_ids_str):
    mesh_ids = mesh_ids_str.split(', ')
    disease_names = [disease_dict.get(mesh_id, "Unknown Disease") for mesh_id in mesh_ids]
    return ', '.join(disease_names)

filtered_ep_female['Mesh ID'] = filtered_ep_female['Mesh ID'].apply(replace_mesh_ids_with_diseases)

filtered_ep_female.to_csv('clean_epilepsy_female_condition.csv')

filtered_ep_female.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Acinetobacter,Akkermansia,Alistipes,Anaerofilum,...,Parasutterella,Prevotella,Pseudomonas,Roseburia,Ruminiclostridium,Ruminococcus,Sporobacter,Streptococcus,Unknown,Veillonella
0,ERR1073812,Female,29.71,55,United States of America,"Attention Deficit Disorder with Hyperactivity,...",0.010559,0.00528,3.12022,0.026398,...,0.269257,2.41804,0.047516,0.158387,0.23758,0.021118,0.036957,0.047516,59.0993,0.015839
1,ERR1090471,Female,38.01,47,United States of America,"Autoimmune Diseases, Bipolar Disorder, Constip...",0.012207,0.0,0.732452,0.036623,...,0.028484,0.0,0.101729,0.077314,0.183113,0.016277,0.016277,0.109868,75.4873,0.0
2,ERR1091746,Female,31.09,59,United States of America,"Autoimmune Diseases, Diabetes Mellitus, Epilep...",0.001851,0.009255,0.87367,0.011106,...,0.003702,27.9445,0.007404,0.634891,0.003702,0.732994,0.012957,0.623785,47.5909,0.42758
3,ERR1091758,Female,21.79,62,United States of America,"Attention Deficit Disorder with Hyperactivity,...",0.078793,0.366156,0.104285,0.0,...,0.020857,0.0,0.491298,0.002317,0.0,0.013905,0.0,0.027809,90.174,0.0
4,ERR1160392,Female,35.19,50,United States of America,"Bipolar Disorder, Constipation, Depression, Ep...",0.028054,0.012752,1.71383,0.029329,...,0.001275,0.091812,0.048456,0.386376,0.972954,0.095638,0.132618,0.146644,72.8695,0.020403


### Health Male

In [186]:
health_male_df = pd.read_csv('health_male300.csv')
health_male_df.shape
# only a random sample of 300 from the original dataset but the ones without bacteria reported were dropped
# i'm going to remove all the rows with nans because the health dataset was very large and i can come back 
# for additional data if necessary 

(109, 648)

In [187]:
health_male_df.isnull().sum()

Run ID                                                 0
Sex                                                    0
BMI                                                   28
Age                                                    4
Country                                                4
                                                      ..
unclassified Peptostreptococcaceae                     0
unclassified Peptostreptococcaceae (miscellaneous)     0
unclassified Propionibacteriaceae                      0
unclassified Sutterellaceae                            0
unclassified Tissierellia                              0
Length: 648, dtype: int64

In [188]:
health_male_df['BMI'].unique()
# looks reasonable besides the nans
# can replace with the median of the others or the median of the country each belongs to

array([    nan, 21.22  , 28.7   , 30.9   , 22.15  , 20.8   , 27.3   ,
       25.8   , 24.93  , 22.5   , 20.3601, 28.8288, 22.04  , 24.4898,
       17.52  , 22.1607, 27.    , 18.41  , 17.1193, 29.0367, 23.46  ,
       24.7183, 25.37  , 19.38  , 31.3   , 23.67  , 31.5   , 18.02  ,
       19.49  , 22.24  , 33.7   , 26.22  , 24.6   , 22.4914, 25.71  ,
       19.0311, 22.9224, 33.3   , 23.02  , 23.63  , 20.36  , 24.2989,
       30.02  , 26.91  , 28.8   , 31.8   , 23.92  , 28.2   , 27.55  ,
       15.5   , 27.76  , 25.54  , 28.    , 25.75  , 24.3911, 25.38  ,
       35.5   ])

In [189]:
health_male_df.dropna(subset=['BMI'], inplace=True)

In [190]:
health_male_df.isnull().sum()

Run ID                                                0
Sex                                                   0
BMI                                                   0
Age                                                   0
Country                                               0
                                                     ..
unclassified Peptostreptococcaceae                    0
unclassified Peptostreptococcaceae (miscellaneous)    0
unclassified Propionibacteriaceae                     0
unclassified Sutterellaceae                           0
unclassified Tissierellia                             0
Length: 648, dtype: int64

In [191]:
health_male_df['BMI'].unique()

array([21.22  , 28.7   , 30.9   , 22.15  , 20.8   , 27.3   , 25.8   ,
       24.93  , 22.5   , 20.3601, 28.8288, 22.04  , 24.4898, 17.52  ,
       22.1607, 27.    , 18.41  , 17.1193, 29.0367, 23.46  , 24.7183,
       25.37  , 19.38  , 31.3   , 23.67  , 31.5   , 18.02  , 19.49  ,
       22.24  , 33.7   , 26.22  , 24.6   , 22.4914, 25.71  , 19.0311,
       22.9224, 33.3   , 23.02  , 23.63  , 20.36  , 24.2989, 30.02  ,
       26.91  , 28.8   , 31.8   , 23.92  , 28.2   , 27.55  , 15.5   ,
       27.76  , 25.54  , 28.    , 25.75  , 24.3911, 25.38  , 35.5   ])

In [192]:
health_male_df['Age'].unique()

array([ 0.     , 62.     , 63.     , 67.     , 27.     , 34.     ,
       52.     , 20.     , 26.     , 22.     , 31.     , 19.     ,
       10.     , 29.     , 61.     , 39.     ,  0.49589, 30.     ,
       57.     , 65.     , 47.     , 23.     , 49.     , 46.     ,
       64.     ,  4.     , 32.     , 42.     , 28.     , 38.     ,
       59.     , 50.     , 37.     , 54.     , 25.     ,  9.     ,
       66.     , 58.     , 35.     , 72.     ,  2.     , 55.     ])

In [193]:
health_male_df['Country'].unique()

array(['United States of America', 'United Kingdom', 'Austria',
       'New Zealand', 'Canada', 'China', 'Germany', 'India', 'Denmark'],
      dtype=object)

In [194]:
mesh_id_health_male_df = health_male_df['Mesh ID'].unique()
mesh_id_health_male_df

array(["['D006262']"], dtype=object)

In [195]:
mapped_diseases_health_male = [
    [disease_dict[code.strip("[]' ")] for code in disease_list.split(',')]
    for disease_list in mesh_id_health_male_df
]

print(mapped_diseases_health_male)

[['Health']]


In [196]:
bacteria_health_male_df = health_male_df.drop(['Run ID', 'Sex', 'Age', 'Country', 'Mesh ID'], axis=1)
att_health_male_columns = ['Run ID', 'Sex', 'Age', 'Country', 'Mesh ID']
att_health_male_df = health_male_df[att_health_male_columns]

zero_counts_health_male = (bacteria_health_male_df == 0.0).sum()

threshold_health_male = 0.2 * len(bacteria_health_male_df)

filtered_health_male = zero_counts_health_male[zero_counts_health_male <= threshold_health_male]

filtered_health_male_df = bacteria_health_male_df[filtered_health_male.index]

filtered_health_male = pd.concat([att_health_male_df, filtered_health_male_df], axis=1)

filtered_health_male.to_csv('clean_health_male.csv')

filtered_health_male.shape

(81, 19)

In [197]:
filtered_health_male.head()
# still has an age of 0 - but lots of other ages were 0.382 etc.

Unnamed: 0,Run ID,Sex,Age,Country,Mesh ID,BMI,Alistipes,Bacteroides,Bifidobacterium,Blautia,Clostridium,Dorea,Eubacterium,Faecalibacterium,Oscillibacter,Parabacteroides,Roseburia,Ruminococcus,Unknown
1,ERR1075960,Male,0.0,United States of America,['D006262'],21.22,2.29105,21.4258,0.376027,0.918747,0.220964,0.042642,0.546596,7.64072,0.922624,1.15522,0.065902,0.798573,30.3497
3,ERR1854747,Male,62.0,United Kingdom,['D006262'],28.7,3.11017,16.461,1.19921,0.664308,0.155293,0.064705,1.47528,5.86231,3.51997,0.80666,0.52627,2.12665,44.651
4,ERR688529,Male,63.0,Austria,['D006262'],30.9,6.2395,4.79469,1.94063,5.02463,5.94399,3.15984,18.5823,12.7213,0.22463,0.28063,1.19232,11.9998,0.09314
5,ERR1316059,Male,67.0,United States of America,['D006262'],22.15,6.81649,16.4122,0.70338,1.06365,0.217304,0.011437,0.291645,16.0462,4.10019,2.97936,0.165837,0.337393,21.2729
6,SRR5649200,Male,27.0,United States of America,['D006262'],20.8,6.5014,15.2769,4.55914,3.05031,0.087048,0.168655,1.24769,1.41816,0.491458,1.80081,0.937579,4.73323,37.3527


In [198]:
for mesh_id_str in filtered_health_male['Mesh ID']:
    mesh_id_list = ast.literal_eval(mesh_id_str)
    
    mesh_id_string = ', '.join(mesh_id_list)
    
def format_mesh_ids(mesh_id_str):
    mesh_id_list = ast.literal_eval(mesh_id_str)
    return ', '.join(mesh_id_list)

filtered_health_male['Mesh ID'] = filtered_health_male['Mesh ID'].apply(format_mesh_ids)

def replace_mesh_ids_with_diseases(mesh_ids_str):
    mesh_ids = mesh_ids_str.split(', ')
    disease_names = [disease_dict.get(mesh_id, "Unknown Disease") for mesh_id in mesh_ids]
    return ', '.join(disease_names)

filtered_health_male['Mesh ID'] = filtered_health_male['Mesh ID'].apply(replace_mesh_ids_with_diseases)

filtered_health_male.to_csv('clean_health_male_condition.csv')

filtered_health_male.head()

Unnamed: 0,Run ID,Sex,Age,Country,Mesh ID,BMI,Alistipes,Bacteroides,Bifidobacterium,Blautia,Clostridium,Dorea,Eubacterium,Faecalibacterium,Oscillibacter,Parabacteroides,Roseburia,Ruminococcus,Unknown
1,ERR1075960,Male,0.0,United States of America,Health,21.22,2.29105,21.4258,0.376027,0.918747,0.220964,0.042642,0.546596,7.64072,0.922624,1.15522,0.065902,0.798573,30.3497
3,ERR1854747,Male,62.0,United Kingdom,Health,28.7,3.11017,16.461,1.19921,0.664308,0.155293,0.064705,1.47528,5.86231,3.51997,0.80666,0.52627,2.12665,44.651
4,ERR688529,Male,63.0,Austria,Health,30.9,6.2395,4.79469,1.94063,5.02463,5.94399,3.15984,18.5823,12.7213,0.22463,0.28063,1.19232,11.9998,0.09314
5,ERR1316059,Male,67.0,United States of America,Health,22.15,6.81649,16.4122,0.70338,1.06365,0.217304,0.011437,0.291645,16.0462,4.10019,2.97936,0.165837,0.337393,21.2729
6,SRR5649200,Male,27.0,United States of America,Health,20.8,6.5014,15.2769,4.55914,3.05031,0.087048,0.168655,1.24769,1.41816,0.491458,1.80081,0.937579,4.73323,37.3527


### Health Female

In [199]:
health_female_df = pd.read_csv('health_female300.csv')
health_female_df.shape

(122, 807)

In [200]:
health_female_df.isnull().sum()

Run ID                                                 0
Sex                                                    0
BMI                                                   37
Age                                                    6
Country                                                1
                                                      ..
unclassified Peptostreptococcaceae                     0
unclassified Peptostreptococcaceae (miscellaneous)     0
unclassified Propionibacteriaceae                      0
unclassified Ruminococcaceae                           0
unclassified Sutterellaceae                            0
Length: 807, dtype: int64

In [201]:
# doing the same thing here and dropping all the rows with nans
# will pull more from the original dataset later if need be 

health_female_df.dropna(subset=['BMI'], inplace=True)

In [202]:
health_female_df.isnull().sum()

Run ID                                                0
Sex                                                   0
BMI                                                   0
Age                                                   0
Country                                               0
                                                     ..
unclassified Peptostreptococcaceae                    0
unclassified Peptostreptococcaceae (miscellaneous)    0
unclassified Propionibacteriaceae                     0
unclassified Ruminococcaceae                          0
unclassified Sutterellaceae                           0
Length: 807, dtype: int64

In [203]:
health_female_df['Age'].unique()

array([63.     , 26.     , 21.     , 24.     , 35.     , 22.     ,
        2.01644, 18.     , 23.     , 53.     , 56.     , 59.     ,
       31.     , 55.     , 33.     , 75.     , 80.     , 25.     ,
       54.     , 52.     , 27.     , 32.     , 49.     , 69.     ,
       20.     , 64.     , 62.     , 45.     , 57.     , 43.     ,
       36.     , 47.     , 28.     , 60.     , 46.     , 50.     ])

In [204]:
health_female_df['BMI'].unique()

array([21.03  , 19.0311, 20.    , 24.13  , 24.63  , 22.3   , 23.996 ,
       14.9367, 20.01  , 24.3504, 23.7388, 21.46  , 25.74  , 26.4264,
       24.4646, 25.6   , 21.3   , 22.4059, 20.72  , 20.47  , 27.3   ,
       21.31  , 21.7   , 22.6667, 22.03  , 20.05  , 32.    , 23.5   ,
       23.6   , 21.95  , 21.79  , 20.98  , 20.51  , 33.5   , 28.3   ,
       22.4   , 19.8   , 21.1   , 23.63  , 21.29  , 22.7   , 22.2   ,
       30.77  , 22.1   , 24.974 , 23.1473, 27.46  , 28.9   , 40.2   ,
       28.2   , 25.3333, 20.78  , 23.53  , 26.29  , 25.79  , 17.56  ,
       19.6   , 32.2   ,  0.    , 34.    , 23.4   , 23.26  , 23.7332])

In [205]:
health_female_df[health_female_df['BMI'] == 0]

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acaryochloris,Acetoanaerobium,...,unclassified Clostridiales Family XIII. Incertae Sedis,unclassified Dermatophilaceae,unclassified Erysipelotrichaceae,unclassified Erysipelotrichaceae (miscellaneous),unclassified Lachnospiraceae,unclassified Peptostreptococcaceae,unclassified Peptostreptococcaceae (miscellaneous),unclassified Propionibacteriaceae,unclassified Ruminococcaceae,unclassified Sutterellaceae
113,ERR1315958,Female,0.0,46.0,Finland,['D006262'],0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [206]:
# cannot use the average BMI of 46 year olds because she's the only one 
# health_female_df[health_female_df['Age'] == 46]['BMI'].mean()

# cannot use the average BMI of finnish women either because she's the only one 
# health_female_df[health_female_df['Country'] == 'Finland']['BMI'].mean()

# going to drop the row and get more samples later if need be 

health_female_df.drop(health_female_df[health_female_df['BMI'] == 0].index, inplace=True)

In [207]:
health_female_df['BMI'].unique()

array([21.03  , 19.0311, 20.    , 24.13  , 24.63  , 22.3   , 23.996 ,
       14.9367, 20.01  , 24.3504, 23.7388, 21.46  , 25.74  , 26.4264,
       24.4646, 25.6   , 21.3   , 22.4059, 20.72  , 20.47  , 27.3   ,
       21.31  , 21.7   , 22.6667, 22.03  , 20.05  , 32.    , 23.5   ,
       23.6   , 21.95  , 21.79  , 20.98  , 20.51  , 33.5   , 28.3   ,
       22.4   , 19.8   , 21.1   , 23.63  , 21.29  , 22.7   , 22.2   ,
       30.77  , 22.1   , 24.974 , 23.1473, 27.46  , 28.9   , 40.2   ,
       28.2   , 25.3333, 20.78  , 23.53  , 26.29  , 25.79  , 17.56  ,
       19.6   , 32.2   , 34.    , 23.4   , 23.26  , 23.7332])

In [208]:
health_female_df['Country'].unique()

array(['United Kingdom', 'Canada', 'United States of America',
       'New Zealand', 'India', 'Denmark', 'Japan', 'Italy', 'Austria',
       'Germany', 'Brazil', 'China'], dtype=object)

In [209]:
mesh_id_health_female_df = health_female_df['Mesh ID'].unique()
mesh_id_health_female_df

array(["['D006262']"], dtype=object)

In [210]:
mapped_diseases_health_female = [
    [disease_dict[code.strip("[]' ")] for code in disease_list.split(',')]
    for disease_list in mesh_id_health_female_df
]

print(mapped_diseases_health_female)

[['Health']]


In [211]:
bacteria_health_female_df = health_female_df.drop(['Run ID', 'Sex', 'Age', 'Country', 'Mesh ID'], axis=1)
att_health_female_columns = ['Run ID', 'Sex', 'Age', 'Country', 'Mesh ID']
att_health_female_df = health_female_df[att_health_female_columns]

zero_counts_health_female = (bacteria_health_female_df == 0.0).sum()

threshold_health_female = 0.2 * len(bacteria_health_female_df)

filtered_health_female = zero_counts_health_female[zero_counts_health_female <= threshold_health_female]

filtered_health_female_df = bacteria_health_female_df[filtered_health_female.index]

filtered_health_female = pd.concat([att_health_female_df, filtered_health_female_df], axis=1)

filtered_health_female.to_csv('clean_health_female.csv')

filtered_health_female.shape

(84, 18)

In [212]:
filtered_health_female.head()

Unnamed: 0,Run ID,Sex,Age,Country,Mesh ID,BMI,Alistipes,Bacteroides,Blautia,Coprococcus,Dorea,Eubacterium,Faecalibacterium,Oscillibacter,Parabacteroides,Roseburia,Ruminococcus,Unknown
1,ERR1842901,Female,63.0,United Kingdom,['D006262'],21.03,2.78519,38.1333,0.865185,4.58667,0.082963,1.19704,14.323,0.877037,2.37037,0.177778,2.09778,23.52
2,ERR719035,Female,26.0,Canada,['D006262'],19.0311,3.16575,75.4519,0.10565,0.02527,0.39244,3.79275,3.29535,0.44752,3.771,0.70116,0.69206,0.08422
3,SRR5648781,Female,21.0,United States of America,['D006262'],20.0,0.404316,4.4988,1.41709,2.44064,0.00482,0.121919,4.00574,1.73096,0.08024,0.790203,8.84223,60.918
4,ERR2032350,Female,24.0,United States of America,['D006262'],24.13,1.68019,24.4199,6.4735,0.776693,0.076084,1.46145,11.4158,0.519909,4.18146,0.786203,3.78202,22.5875
5,ERR1089968,Female,63.0,United States of America,['D006262'],24.63,3.79776,14.7071,1.12168,0.130957,0.096794,1.52024,3.58139,2.89814,6.18345,0.21067,0.620623,39.6402


In [213]:
for mesh_id_str in filtered_health_female['Mesh ID']:
    mesh_id_list = ast.literal_eval(mesh_id_str)
    
    mesh_id_string = ', '.join(mesh_id_list)
    
def format_mesh_ids(mesh_id_str):
    mesh_id_list = ast.literal_eval(mesh_id_str)
    return ', '.join(mesh_id_list)

filtered_health_female['Mesh ID'] = filtered_health_female['Mesh ID'].apply(format_mesh_ids)

def replace_mesh_ids_with_diseases(mesh_ids_str):
    mesh_ids = mesh_ids_str.split(', ')
    disease_names = [disease_dict.get(mesh_id, "Unknown Disease") for mesh_id in mesh_ids]
    return ', '.join(disease_names)

filtered_health_female['Mesh ID'] = filtered_health_female['Mesh ID'].apply(replace_mesh_ids_with_diseases)

filtered_health_female.to_csv('clean_health_female_condition.csv')

filtered_health_female.head()

Unnamed: 0,Run ID,Sex,Age,Country,Mesh ID,BMI,Alistipes,Bacteroides,Blautia,Coprococcus,Dorea,Eubacterium,Faecalibacterium,Oscillibacter,Parabacteroides,Roseburia,Ruminococcus,Unknown
1,ERR1842901,Female,63.0,United Kingdom,Health,21.03,2.78519,38.1333,0.865185,4.58667,0.082963,1.19704,14.323,0.877037,2.37037,0.177778,2.09778,23.52
2,ERR719035,Female,26.0,Canada,Health,19.0311,3.16575,75.4519,0.10565,0.02527,0.39244,3.79275,3.29535,0.44752,3.771,0.70116,0.69206,0.08422
3,SRR5648781,Female,21.0,United States of America,Health,20.0,0.404316,4.4988,1.41709,2.44064,0.00482,0.121919,4.00574,1.73096,0.08024,0.790203,8.84223,60.918
4,ERR2032350,Female,24.0,United States of America,Health,24.13,1.68019,24.4199,6.4735,0.776693,0.076084,1.46145,11.4158,0.519909,4.18146,0.786203,3.78202,22.5875
5,ERR1089968,Female,63.0,United States of America,Health,24.63,3.79776,14.7071,1.12168,0.130957,0.096794,1.52024,3.58139,2.89814,6.18345,0.21067,0.620623,39.6402


### Depression Male

In [214]:
dep_male_df = pd.read_csv('depression_male.csv')
dep_male_df.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetoanaerobium,Acetobacter,...,Weissella,Wenyingzhuangia,Wenzhouxiangella,Wolbachia,Xanthomonas,Xenorhabdus,Yersinia,Youngiibacter,Zoogloea,Zymomonas
0,ERR1072629,Male,17.67,64,United States of America,"['D001714', 'D003863', 'D007410', 'D008171', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00484,0.0
1,ERR1072937,Male,26.51,53,United States of America,"['D001714', 'D002318', 'D003863', 'D012559']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1073491,Male,28.08,49,United States of America,"['D001714', 'D003863', 'D012559']",0.003157,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ERR1075554,Male,30.74,62,United States of America,"['D001289', 'D001714', 'D003863', 'D008881', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR1075686,Male,45.84,56,United States of America,"['D001714', 'D003863', 'D008171', 'D012559']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [215]:
dep_male_df.isnull().sum()

Run ID           0
Sex              0
BMI              0
Age              0
Country          0
                ..
Xenorhabdus      0
Yersinia         0
Youngiibacter    0
Zoogloea         0
Zymomonas        0
Length: 773, dtype: int64

In [216]:
dep_male_df['BMI'].unique()

array([  17.67,   26.51,   28.08,   30.74,   45.84,   20.28,   22.72,
         20.62,   31.47,   27.37,    0.  ,   26.54,   23.49,   21.52,
         22.24,   33.12,   23.18,   25.1 ,   23.99,   22.84,   20.06,
         40.32,   16.98,   31.97,   24.49,   23.75,   22.15,   19.14,
         25.33,   24.41,   25.38,   24.33,   32.14, 4101.24,   21.29,
         19.61,   22.87,   29.99,   25.09,   30.27,   23.59,   26.26,
         29.68,   21.91,   28.09,   24.99,   23.71,   32.92,   20.45,
         24.68,   28.76,   24.48,   20.89,   27.32,   25.21,   20.52,
         33.72,   27.69,   28.35,   27.98,   24.37,   28.75,   22.28,
         23.24,   24.58,   23.06,   45.19,   29.57,   22.05,   33.47,
         21.62,   27.02,   20.8 ,   24.39,   33.91,   28.97,   32.22,
         29.84,   28.12,   25.4 ,   24.8 ,   21.95,   20.97,   25.82,
         29.41])

In [217]:
# doing the same thing as above to replace the 0 and 4101.24
dep_male_bmi_filter = dep_male_df[(dep_male_df['BMI'] > 15) & (dep_male_df['BMI'] < 50)]
dep_male_bmi_median = dep_male_bmi_filter['BMI'].median()
dep_male_df['BMI'] = dep_male_df['BMI'].apply(lambda x: dep_male_bmi_median if x <= 15 or x >= 50 else x)

dep_male_df['BMI'].unique()

array([17.67, 26.51, 28.08, 30.74, 45.84, 20.28, 22.72, 20.62, 31.47,
       27.37, 24.68, 26.54, 23.49, 21.52, 22.24, 33.12, 23.18, 25.1 ,
       23.99, 22.84, 20.06, 40.32, 16.98, 31.97, 24.49, 23.75, 22.15,
       19.14, 25.33, 24.41, 25.38, 24.33, 32.14, 21.29, 19.61, 22.87,
       29.99, 25.09, 30.27, 23.59, 26.26, 29.68, 21.91, 28.09, 24.99,
       23.71, 32.92, 20.45, 28.76, 24.48, 20.89, 27.32, 25.21, 20.52,
       33.72, 27.69, 28.35, 27.98, 24.37, 28.75, 22.28, 23.24, 24.58,
       23.06, 45.19, 29.57, 22.05, 33.47, 21.62, 27.02, 20.8 , 24.39,
       33.91, 28.97, 32.22, 29.84, 28.12, 25.4 , 24.8 , 21.95, 20.97,
       25.82, 29.41])

In [218]:
dep_male_df['Age'].unique()

array([64, 53, 49, 62, 56, 39, 67, 73,  0, 71, 54, 23, 40, 35, 38, 42, 43,
       21, 61, 34, 72, 27, 32, 25, 24, 65, 47, 51, 63, 46, 52, 45, 68, 48,
       57, 70, 29, 58, 37, 36, 41, 28, 50, 17, 31, 55, 30, 12, 76])

In [219]:
dep_male_df[dep_male_df['Age'] == 0]

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetoanaerobium,Acetobacter,...,Weissella,Wenyingzhuangia,Wenzhouxiangella,Wolbachia,Xanthomonas,Xenorhabdus,Yersinia,Youngiibacter,Zoogloea,Zymomonas
10,ERR1089866,Male,24.68,0,United States of America,"['D001289', 'D001714', 'D003863', 'D012559']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,ERR1090943,Male,24.33,0,United States of America,"['D001289', 'D001327', 'D001714', 'D003248', '...",0.0,0.0,0.0,0.0,...,0.008931,0.0,0.0,0.0,0.0,0.0,0.004465,0.0,0.0,0.0


In [220]:
# going to replace the 0 age's with the output from the following code 
# it narrows down the most likely age based on other samples with the same country and within the same bmi range
dep_male_new_age = dep_male_df[
    (dep_male_df['Country'] == 'United States of America') &
    ((dep_male_df['BMI'] > 22) | (dep_male_df['BMI'] < 27))
]['Age'].mean()

In [221]:
dep_male_df['Age'].replace(0, dep_male_new_age, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dep_male_df['Age'].replace(0, dep_male_new_age, inplace=True)


In [222]:
dep_male_df['Age'].unique()

array([64.        , 53.        , 49.        , 62.        , 56.        ,
       39.        , 67.        , 73.        , 46.26760563, 71.        ,
       54.        , 23.        , 40.        , 35.        , 38.        ,
       42.        , 43.        , 21.        , 61.        , 34.        ,
       72.        , 27.        , 32.        , 25.        , 24.        ,
       65.        , 47.        , 51.        , 63.        , 46.        ,
       52.        , 45.        , 68.        , 48.        , 57.        ,
       70.        , 29.        , 58.        , 37.        , 36.        ,
       41.        , 28.        , 50.        , 17.        , 31.        ,
       55.        , 30.        , 12.        , 76.        ])

In [223]:
dep_male_df['Country'].unique()

array(['United States of America', 'United Kingdom', 'Australia',
       'Sweden', 'Canada', 'New Zealand'], dtype=object)

In [224]:
mesh_ids_series_dep_male = pd.Series(dep_male_df['Mesh ID'])
flattened_mesh_ids_dep_male = [item for sublist in mesh_ids_series_dep_male.apply(ast.literal_eval) for item in sublist]
unique_mesh_ids_dep_male = pd.Series(flattened_mesh_ids_dep_male).unique()
unique_mesh_ids_dep_male


array(['D001714', 'D003863', 'D007410', 'D008171', 'D012559', 'D013959',
       'D043183', 'D002318', 'D001289', 'D008881', 'D003248', 'D002446',
       'D003920', 'D003967', 'D008107', 'D001327', 'D015212', 'D007674',
       'D003015', 'D000067877', 'D004827', 'D010661'], dtype=object)

In [225]:
mapped_diseases_dep_male = [
    [disease_dict[code.strip("[]' ")] for code in disease_list.split(',')]
    for disease_list in unique_mesh_ids_dep_male
]

print(mapped_diseases_dep_male)

[['Bipolar Disorder'], ['Depression'], ['Intestinal Diseases'], ['Lung Diseases'], ['Schizophrenia'], ['Thyroid Diseases'], ['Irritable Bowel Syndrome'], ['Cardiovascular Diseases'], ['Attention Deficit Disorder with Hyperactivity'], ['Migrane Disorders'], ['Constipation'], ['Celiac Disease'], ['Diabetes Mellitus'], ['Diarrhea'], ['Liver Diseases'], ['Autoimmune Diseases'], ['Inflammatory Bowel Diseases'], ['Kidney Diseases'], ['Clostridium Infections'], ['Autism Spectrum Disorder'], ['Epilepsy'], ['Phenylketonurias']]


In [226]:
bacteria_dep_male_df = dep_male_df.drop(['Run ID', 'Sex', 'BMI', 'Age', 'Country', 'Mesh ID'], axis=1)
att_dep_male_columns = ['Run ID', 'Sex', 'BMI', 'Age', 'Country', 'Mesh ID']
att_dep_male_df = dep_male_df[att_dep_male_columns]

zero_counts_dep_male = (bacteria_dep_male_df == 0.0).sum()

threshold_dep_male = 0.2 * len(bacteria_dep_male_df)

filtered_dep_male = zero_counts_dep_male[zero_counts_dep_male <= threshold_dep_male]

filtered_dep_male_df = bacteria_dep_male_df[filtered_dep_male.index]

filtered_dep_male = pd.concat([att_dep_male_df, filtered_dep_male_df], axis=1)

filtered_dep_male.to_csv('clean_dep_male.csv')

filtered_dep_male.shape

(100, 37)

In [227]:
filtered_dep_male.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Akkermansia,Alistipes,Anaerofilum,Anaerotruncus,...,Odoribacter,Oscillibacter,Parabacteroides,Prevotella,Pseudomonas,Roseburia,Ruminococcus,Sporobacter,Streptococcus,Unknown
0,ERR1072629,Male,17.67,64.0,United States of America,"['D001714', 'D003863', 'D007410', 'D008171', '...",2.15845,17.5047,0.033877,0.159706,...,1.40831,1.97454,11.494,0.029037,0.024198,0.503315,1.45671,0.198422,0.00484,21.9813
1,ERR1072937,Male,26.51,53.0,United States of America,"['D001714', 'D002318', 'D003863', 'D012559']",0.031791,1.01733,0.005299,0.105971,...,0.243734,0.927251,0.22254,0.052986,41.1699,0.090076,0.021194,0.11127,0.598739,10.1733
2,ERR1073491,Male,28.08,49.0,United States of America,"['D001714', 'D003863', 'D012559']",0.176768,5.6029,0.025253,0.126263,...,2.07702,0.656566,3.93624,0.022096,0.018939,0.407197,6.1774,0.085227,0.050505,11.0574
3,ERR1075554,Male,30.74,62.0,United States of America,"['D001289', 'D001714', 'D003863', 'D008881', '...",0.360811,3.15526,0.044181,0.191451,...,0.250359,2.03233,0.025772,0.14727,0.003682,1.39538,0.20986,0.224587,0.206178,25.9674
4,ERR1075686,Male,45.84,56.0,United States of America,"['D001714', 'D003863', 'D008171', 'D012559']",0.037344,4.91069,0.012448,1.16388,...,0.485467,0.55393,1.35059,0.024896,1.67424,0.149374,1.27591,0.908695,0.037344,55.3059


In [228]:
for mesh_id_str in filtered_dep_male['Mesh ID']:
    mesh_id_list = ast.literal_eval(mesh_id_str)
    
    mesh_id_string = ', '.join(mesh_id_list)
    
def format_mesh_ids(mesh_id_str):
    mesh_id_list = ast.literal_eval(mesh_id_str)
    return ', '.join(mesh_id_list)

filtered_dep_male['Mesh ID'] = filtered_dep_male['Mesh ID'].apply(format_mesh_ids)

def replace_mesh_ids_with_diseases(mesh_ids_str):
    mesh_ids = mesh_ids_str.split(', ')
    disease_names = [disease_dict.get(mesh_id, "Unknown Disease") for mesh_id in mesh_ids]
    return ', '.join(disease_names)

filtered_dep_male['Mesh ID'] = filtered_dep_male['Mesh ID'].apply(replace_mesh_ids_with_diseases)

filtered_dep_male.to_csv('clean_depression_male_condition.csv')

filtered_dep_male.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Akkermansia,Alistipes,Anaerofilum,Anaerotruncus,...,Odoribacter,Oscillibacter,Parabacteroides,Prevotella,Pseudomonas,Roseburia,Ruminococcus,Sporobacter,Streptococcus,Unknown
0,ERR1072629,Male,17.67,64.0,United States of America,"Bipolar Disorder, Depression, Intestinal Disea...",2.15845,17.5047,0.033877,0.159706,...,1.40831,1.97454,11.494,0.029037,0.024198,0.503315,1.45671,0.198422,0.00484,21.9813
1,ERR1072937,Male,26.51,53.0,United States of America,"Bipolar Disorder, Cardiovascular Diseases, Dep...",0.031791,1.01733,0.005299,0.105971,...,0.243734,0.927251,0.22254,0.052986,41.1699,0.090076,0.021194,0.11127,0.598739,10.1733
2,ERR1073491,Male,28.08,49.0,United States of America,"Bipolar Disorder, Depression, Schizophrenia",0.176768,5.6029,0.025253,0.126263,...,2.07702,0.656566,3.93624,0.022096,0.018939,0.407197,6.1774,0.085227,0.050505,11.0574
3,ERR1075554,Male,30.74,62.0,United States of America,"Attention Deficit Disorder with Hyperactivity,...",0.360811,3.15526,0.044181,0.191451,...,0.250359,2.03233,0.025772,0.14727,0.003682,1.39538,0.20986,0.224587,0.206178,25.9674
4,ERR1075686,Male,45.84,56.0,United States of America,"Bipolar Disorder, Depression, Lung Diseases, S...",0.037344,4.91069,0.012448,1.16388,...,0.485467,0.55393,1.35059,0.024896,1.67424,0.149374,1.27591,0.908695,0.037344,55.3059


### Depression Female

In [229]:
dep_female_df = pd.read_csv('depression_female.csv')
dep_female_df.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetivibrio,Acetoanaerobium,...,Williamsia,Wolbachia,Xanthomonas,Xenococcus,Xenorhabdus,Yersinia,Yokenella,Youngiibacter,Zoogloea,Zymomonas
0,ERR1073023,Female,34.72,35,United States of America,"['D001714', 'D003863', 'D012559']",0.0,0.005583,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ERR1073394,Female,22.41,27,United States of America,"['D001327', 'D001714', 'D003863', 'D003920', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ERR1073395,Female,22.41,27,United States of America,"['D001327', 'D001714', 'D003863', 'D003920', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00247,0.0
3,ERR1073490,Female,25.73,37,United States of America,"['D001714', 'D003863', 'D012559']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ERR1073812,Female,29.71,55,United States of America,"['D001289', 'D001714', 'D003015', 'D003863', '...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [230]:
dep_female_df.isnull().sum()

Run ID           0
Sex              0
BMI              0
Age              0
Country          1
                ..
Yersinia         0
Yokenella        0
Youngiibacter    0
Zoogloea         0
Zymomonas        0
Length: 930, dtype: int64

In [231]:
dep_female_df['Country'].unique()

array(['United States of America', 'Australia', 'United Kingdom',
       'Canada', 'New Zealand', 'Switzerland', nan, 'Norway', 'Ireland'],
      dtype=object)

In [232]:
dep_female_df[dep_female_df['Country'].isna()]
# same woman before and i couldn't accurately replace it then so i'll drop it now too

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Abiotrophia,Acanthopleuribacter,Acetivibrio,Acetoanaerobium,...,Williamsia,Wolbachia,Xanthomonas,Xenococcus,Xenorhabdus,Yersinia,Yokenella,Youngiibacter,Zoogloea,Zymomonas
176,ERR1160486,Female,39.2,40,,"['D001714', 'D003863', 'D012559', 'D043183']",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000916,0.0,0.0,0.0,0.0


In [233]:
dep_female_df.dropna(subset=['Country'], inplace=True)

In [234]:
dep_female_df['Country'].unique()

array(['United States of America', 'Australia', 'United Kingdom',
       'Canada', 'New Zealand', 'Switzerland', 'Norway', 'Ireland'],
      dtype=object)

In [235]:
dep_female_df['Age'].unique()

array([35, 27, 37, 55, 28, 47, 41, 57, 67, 36, 53, 56, 43, 48, 60, 44, 45,
       33, 30, 63, 66, 46, 61, 59, 38, 68, 51, 70, 62, 40, 42, 52, 54, 32,
       39, 13, 19, 26, 49, 22, 29, 65, 34, 64, 58, 69, 31, 50, 21, 24, 25,
       20, 72])

In [236]:
dep_female_df['BMI'].describe()

count     248.000000
mean       36.547419
std       173.540457
min         1.540000
25%        21.790000
50%        24.290000
75%        28.345000
max      2756.040000
Name: BMI, dtype: float64

In [237]:
# going to filter out the outlier bmi's and replace them with the median of the filtered entries 

dep_female_bmi_filter = dep_female_df[(dep_female_df['BMI'] > 15) & (dep_female_df['BMI'] < 50)]
dep_female_bmi_median = dep_female_bmi_filter['BMI'].median()
dep_female_df['BMI'] = dep_female_df['BMI'].apply(lambda x: dep_female_bmi_median if x <= 15 or x >= 50 else x)

dep_female_df['BMI'].describe()

count    248.000000
mean      25.496008
std        5.596399
min       15.180000
25%       21.882500
50%       24.300000
75%       28.297500
max       48.420000
Name: BMI, dtype: float64

In [238]:
mesh_ids_series_dep_female = pd.Series(dep_female_df['Mesh ID'])
flattened_mesh_ids_dep_female = [item for sublist in mesh_ids_series_dep_female.apply(ast.literal_eval) for item in sublist]
unique_mesh_ids_dep_female = pd.Series(flattened_mesh_ids_dep_female).unique()
unique_mesh_ids_dep_female


array(['D001714', 'D003863', 'D012559', 'D001327', 'D003920', 'D003967',
       'D013959', 'D001289', 'D003015', 'D004827', 'D007410', 'D008171',
       'D003248', 'D015212', 'D043183', 'D008107', 'D008881', 'D002318',
       'D007674', 'D002446', 'D000067877'], dtype=object)

In [239]:
mapped_diseases_dep_female = [
    [disease_dict[code.strip("[]' ")] for code in disease_list.split(',')]
    for disease_list in unique_mesh_ids_dep_female
]

print(mapped_diseases_dep_female)

[['Bipolar Disorder'], ['Depression'], ['Schizophrenia'], ['Autoimmune Diseases'], ['Diabetes Mellitus'], ['Diarrhea'], ['Thyroid Diseases'], ['Attention Deficit Disorder with Hyperactivity'], ['Clostridium Infections'], ['Epilepsy'], ['Intestinal Diseases'], ['Lung Diseases'], ['Constipation'], ['Inflammatory Bowel Diseases'], ['Irritable Bowel Syndrome'], ['Liver Diseases'], ['Migrane Disorders'], ['Cardiovascular Diseases'], ['Kidney Diseases'], ['Celiac Disease'], ['Autism Spectrum Disorder']]


In [240]:
bacteria_dep_female_df = dep_female_df.drop(['Run ID', 'Sex', 'BMI', 'Age', 'Country', 'Mesh ID'], axis=1)
att_dep_female_columns = ['Run ID', 'Sex', 'BMI', 'Age', 'Country', 'Mesh ID']
att_dep_female_df = dep_female_df[att_dep_female_columns]

zero_counts_dep_female = (bacteria_dep_female_df == 0.0).sum()

threshold_dep_female = 0.2 * len(bacteria_dep_female_df)

filtered_dep_female = zero_counts_dep_female[zero_counts_dep_female <= threshold_dep_female]

filtered_dep_female_df = bacteria_dep_female_df[filtered_dep_female.index]

filtered_dep_female = pd.concat([att_dep_female_df, filtered_dep_female_df], axis=1)

filtered_dep_female.to_csv('clean_dep_female.csv')

filtered_dep_female.shape

(248, 36)

In [241]:
filtered_dep_female.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Akkermansia,Alistipes,Anaerotruncus,Bacillus,...,Odoribacter,Oscillibacter,Parabacteroides,Prevotella,Pseudomonas,Roseburia,Ruminococcus,Sporobacter,Streptococcus,Unknown
0,ERR1073023,Female,34.72,35,United States of America,"['D001714', 'D003863', 'D012559']",0.005583,3.4167,0.904422,3.6121,...,0.022331,0.614113,3.81867,0.027914,0.005583,1.74743,1.57436,0.217731,0.206565,28.2101
1,ERR1073394,Female,22.41,27,United States of America,"['D001327', 'D001714', 'D003863', 'D003920', '...",0.005104,6.11698,0.053591,0.030623,...,0.604808,1.253,1.81953,0.117389,0.005104,1.43674,0.247537,0.010208,0.010208,3.81514
2,ERR1073395,Female,22.41,27,United States of America,"['D001327', 'D001714', 'D003863', 'D003920', '...",0.00247,6.74687,0.069173,0.027175,...,0.140817,1.339,1.64781,0.027175,0.012352,2.03073,0.303869,0.00247,0.019764,4.92366
3,ERR1073490,Female,25.73,37,United States of America,"['D001714', 'D003863', 'D012559']",0.331675,3.57188,0.290216,0.041459,...,0.424161,2.61194,2.0921,0.031892,0.328486,0.711188,0.373134,0.105243,0.054216,36.5066
4,ERR1073812,Female,29.71,55,United States of America,"['D001289', 'D001714', 'D003015', 'D003863', '...",0.00528,3.12022,0.068634,0.021118,...,0.211182,0.628267,0.950319,2.41804,0.047516,0.158387,0.021118,0.036957,0.047516,59.0993


In [242]:
for mesh_id_str in filtered_dep_female['Mesh ID']:
    mesh_id_list = ast.literal_eval(mesh_id_str)
    
    mesh_id_string = ', '.join(mesh_id_list)
    
def format_mesh_ids(mesh_id_str):
    mesh_id_list = ast.literal_eval(mesh_id_str)
    return ', '.join(mesh_id_list)

filtered_dep_female['Mesh ID'] = filtered_dep_female['Mesh ID'].apply(format_mesh_ids)

def replace_mesh_ids_with_diseases(mesh_ids_str):
    mesh_ids = mesh_ids_str.split(', ')
    disease_names = [disease_dict.get(mesh_id, "Unknown Disease") for mesh_id in mesh_ids]
    return ', '.join(disease_names)

filtered_dep_female['Mesh ID'] = filtered_dep_female['Mesh ID'].apply(replace_mesh_ids_with_diseases)

filtered_dep_female.to_csv('clean_depression_female_condition.csv')

filtered_dep_female.head()

Unnamed: 0,Run ID,Sex,BMI,Age,Country,Mesh ID,Akkermansia,Alistipes,Anaerotruncus,Bacillus,...,Odoribacter,Oscillibacter,Parabacteroides,Prevotella,Pseudomonas,Roseburia,Ruminococcus,Sporobacter,Streptococcus,Unknown
0,ERR1073023,Female,34.72,35,United States of America,"Bipolar Disorder, Depression, Schizophrenia",0.005583,3.4167,0.904422,3.6121,...,0.022331,0.614113,3.81867,0.027914,0.005583,1.74743,1.57436,0.217731,0.206565,28.2101
1,ERR1073394,Female,22.41,27,United States of America,"Autoimmune Diseases, Bipolar Disorder, Depress...",0.005104,6.11698,0.053591,0.030623,...,0.604808,1.253,1.81953,0.117389,0.005104,1.43674,0.247537,0.010208,0.010208,3.81514
2,ERR1073395,Female,22.41,27,United States of America,"Autoimmune Diseases, Bipolar Disorder, Depress...",0.00247,6.74687,0.069173,0.027175,...,0.140817,1.339,1.64781,0.027175,0.012352,2.03073,0.303869,0.00247,0.019764,4.92366
3,ERR1073490,Female,25.73,37,United States of America,"Bipolar Disorder, Depression, Schizophrenia",0.331675,3.57188,0.290216,0.041459,...,0.424161,2.61194,2.0921,0.031892,0.328486,0.711188,0.373134,0.105243,0.054216,36.5066
4,ERR1073812,Female,29.71,55,United States of America,"Attention Deficit Disorder with Hyperactivity,...",0.00528,3.12022,0.068634,0.021118,...,0.211182,0.628267,0.950319,2.41804,0.047516,0.158387,0.021118,0.036957,0.047516,59.0993
