In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler

# Load the data

## Data for ASD meta abundance 

In [23]:
# Read the ASD abundance table from disk, then preview its first five rows.
ASD_ABUNDANCE_CSV = "ASD_meta_abundance.csv"
ASD_data = pd.read_csv(ASD_ABUNDANCE_CSV)
print(ASD_data.head(5))

                                            Taxonomy    A3    A5    A6    A9  \
0  g__Faecalibacterium;s__Faecalibacterium prausn...  4988  5060  2905  5745   
1              g__Hungatella;s__Hungatella hathewayi  5803  5612  4109  1432   
2       g__Clostridium;s__uncultured Clostridium sp.  3793  2795  1355  5558   
3           g__Butyricimonas;s__Butyricimonas virosa    64  1385   725  1553   
4             g__Alistipes;s__Alistipes indistinctus    15    20   723   620   

    A31   A51   A52   A53   A54  ...  B120  B127  B132  B141  B142  B143  \
0  4822  3889  4646  6337  5064  ...  4471  5868  6561  4910  4492  2812   
1  2652  4175  3891   894  4903  ...  2126  4429  2598  4222  4925  5753   
2  5383  3505  5541  4429  4121  ...  4085  6041  6188  3960  4403  2841   
3    40    53    33   175    58  ...  2065    21    27    55    35     8   
4  3261    43    83    37    43  ...    90    22    30  1027  2641     4   

   B152  B156  B158  B164  
0  5303  4205  3430  4563  
1  126

Drop rows containing NaN values

In [24]:
# Discard every row that has at least one missing value (the pandas defaults,
# spelled out), then preview the first five remaining rows.
ASD_data = ASD_data.dropna(axis=0, how="any")
print(ASD_data.head(5))

                                            Taxonomy    A3    A5    A6    A9  \
0  g__Faecalibacterium;s__Faecalibacterium prausn...  4988  5060  2905  5745   
1              g__Hungatella;s__Hungatella hathewayi  5803  5612  4109  1432   
2       g__Clostridium;s__uncultured Clostridium sp.  3793  2795  1355  5558   
3           g__Butyricimonas;s__Butyricimonas virosa    64  1385   725  1553   
4             g__Alistipes;s__Alistipes indistinctus    15    20   723   620   

    A31   A51   A52   A53   A54  ...  B120  B127  B132  B141  B142  B143  \
0  4822  3889  4646  6337  5064  ...  4471  5868  6561  4910  4492  2812   
1  2652  4175  3891   894  4903  ...  2126  4429  2598  4222  4925  5753   
2  5383  3505  5541  4429  4121  ...  4085  6041  6188  3960  4403  2841   
3    40    53    33   175    58  ...  2065    21    27    55    35     8   
4  3261    43    83    37    43  ...    90    22    30  1027  2641     4   

   B152  B156  B158  B164  
0  5303  4205  3430  4563  
1  126

Describe the dataset

In [25]:
# Summary statistics for the numeric columns; the quartiles listed are the
# pandas defaults, written out explicitly.
ASD_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,A3,A5,A6,A9,A31,A51,A52,A53,A54,A59,...,B120,B127,B132,B141,B142,B143,B152,B156,B158,B164
count,5619.0,5619.0,5619.0,5619.0,5619.0,5619.0,5619.0,5619.0,5619.0,5619.0,...,5619.0,5619.0,5619.0,5619.0,5619.0,5619.0,5619.0,5619.0,5619.0,5619.0
mean,13.097704,15.410393,10.348639,20.897847,17.03328,11.923652,15.551878,16.712938,18.305036,18.808863,...,18.201281,20.5727,20.241146,13.412885,14.840363,8.74693,12.918135,13.767396,10.20929,10.017975
std,143.12676,144.472056,104.290974,165.674456,149.263986,116.726055,149.813289,143.751194,150.129719,154.60683,...,141.082455,174.280866,173.026801,135.541296,145.620691,114.043483,119.844978,120.361359,108.498075,118.393981
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
max,5803.0,5612.0,4109.0,5745.0,5383.0,4175.0,5541.0,6337.0,5064.0,6359.0,...,4471.0,6041.0,6561.0,4910.0,4925.0,5753.0,5303.0,4205.0,3856.0,4868.0


## Data for GSE113690 (autism 16S rRNA OTU assignment and abundance)

In [26]:
# Read the GSE113690 OTU assignment/abundance table from disk, then preview
# its first five rows.
GSE113690_OTU_CSV = "GSE113690_Autism_16S_rRNA_OTU_assignment_and_abundance.csv"
OTU_data = pd.read_csv(GSE113690_OTU_CSV)
print(OTU_data.head(5))

    OTU                                           taxonomy  A1  A10  A100  \
0  OTU1  d__Bacteria;_k__norank;_p__Firmicutes;_c__Clos...   0    0     0   
1  OTU2  d__Bacteria;_k__norank;_p__Proteobacteria;_c__...   0    0     0   
2  OTU3  d__Bacteria;_k__norank;_p__Firmicutes;_c__Erys...   0    0     0   
3  OTU4  d__Bacteria;_k__norank;_p__Firmicutes;_c__Baci...   0    0     0   
4  OTU5  d__Bacteria;_k__norank;_p__Tenericutes;_c__Mol...   0    0     1   

   A101  A102  A104  A105  A106  ...  B52  B54  B55  B56  B57  B58  B59  B6  \
0     0     0     0     1     0  ...    0    0    0    0    0    0    0   0   
1     0     0     0     0     0  ...    0    0    0    0    0    0    0   0   
2     0     0     0     0     2  ...    0    0    0    0    0    0    0   0   
3     0     0     0     0     0  ...    0    0    0    0    0    0    0   0   
4     0     0     1     0     3  ...    0    0    0    0    0    0    0   0   

   B60  B61  
0    0    0  
1    0    0  
2    0    0  
3    1

In [27]:
# Discard every row that has at least one missing value (the pandas defaults,
# spelled out), then preview the first five remaining rows.
OTU_data = OTU_data.dropna(axis=0, how="any")
print(OTU_data.head(5))

    OTU                                           taxonomy  A1  A10  A100  \
0  OTU1  d__Bacteria;_k__norank;_p__Firmicutes;_c__Clos...   0    0     0   
1  OTU2  d__Bacteria;_k__norank;_p__Proteobacteria;_c__...   0    0     0   
2  OTU3  d__Bacteria;_k__norank;_p__Firmicutes;_c__Erys...   0    0     0   
3  OTU4  d__Bacteria;_k__norank;_p__Firmicutes;_c__Baci...   0    0     0   
4  OTU5  d__Bacteria;_k__norank;_p__Tenericutes;_c__Mol...   0    0     1   

   A101  A102  A104  A105  A106  ...  B52  B54  B55  B56  B57  B58  B59  B6  \
0     0     0     0     1     0  ...    0    0    0    0    0    0    0   0   
1     0     0     0     0     0  ...    0    0    0    0    0    0    0   0   
2     0     0     0     0     2  ...    0    0    0    0    0    0    0   0   
3     0     0     0     0     0  ...    0    0    0    0    0    0    0   0   
4     0     0     1     0     3  ...    0    0    0    0    0    0    0   0   

   B60  B61  
0    0    0  
1    0    0  
2    0    0  
3    1

In [28]:
# Summary statistics for the numeric columns; the quartiles listed are the
# pandas defaults, written out explicitly.
OTU_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,A1,A10,A100,A101,A102,A104,A105,A106,A108,A109,...,B52,B54,B55,B56,B57,B58,B59,B6,B60,B61
count,1322.0,1322.0,1322.0,1322.0,1322.0,1322.0,1322.0,1322.0,1322.0,1322.0,...,1322.0,1322.0,1322.0,1322.0,1322.0,1322.0,1322.0,1322.0,1322.0,1322.0
mean,24.021936,24.021936,24.021936,24.021936,24.021936,24.021936,24.021936,24.021936,24.021936,24.021936,...,24.021936,24.021936,24.021936,24.021936,24.021936,24.021936,24.021936,24.021936,24.021936,24.021936
std,244.199925,189.799405,139.254461,195.813732,197.50371,203.391886,160.052642,247.470492,231.79715,181.76218,...,246.785018,229.083403,257.203478,196.741465,270.288024,209.47597,315.156941,187.257506,182.356615,225.963926
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
max,5865.0,4808.0,2616.0,5839.0,5701.0,5758.0,3064.0,7685.0,7544.0,3742.0,...,7175.0,6104.0,7449.0,4040.0,6360.0,5148.0,8602.0,4702.0,3696.0,6184.0
