In [22]:
from enum import Enum
import pandas as pd
from scipy.stats import ttest_ind
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats


#Enum for all the categories and their column number
class Col(Enum):
    """Survey categories, each valued with its column number.

    Values run consecutively from 1 (Gender) to 82 (Roommates). Members are
    declared ten at a time by unpacking a range; a range that does not hold
    exactly as many numbers as there are names on its left raises ValueError
    when the class is created, so a miscount cannot go unnoticed.

    Member names double as DataFrame column labels (the cleaning functions
    index with ``data[category.name]``), so ``Relgious`` keeps its existing
    spelling.
    """

    # columns 1-10
    (Gender, Menstrual, SleepPrior, Prescription, OralAntibiotics,
     TopAntibiotics, Eyes, Hands, Race, Feeling) = range(1, 11)

    # columns 11-20
    (Symptoms, Facial, LastVomit, EnvAllergies, Snore,
     GrindTeeth, EarInfect, AllergyTime, Skin, OralSurgery) = range(11, 21)

    # columns 21-30
    (HeartDisease, BadTeeth, EatPrior, DrinkPrior, GumDay,
     YogurtWeek, TeaWeek, CoffeeWeek, SodaWeek, FastFoodWeek) = range(21, 31)

    # columns 31-40
    (EatOutWeek, Straw, AddedSugar, Spicy, ShareDrink,
     MeatWeek, FreshWeek, BrushTongue, BrushTeethWeek, TobaccoWeek) = range(31, 41)

    # columns 41-50
    (FlossWeek, DentistYear, Toothbrush, Mouthwash, Toothpaste,
     WashHandsEat, WashHandsRestroom, ChangeToothbrush, BiteNails, OrthoDevice) = range(41, 51)

    # columns 51-60
    (Device, Stress, HandleStress, Organized, RateStress,
     AvgSleep, AvgWork, Setting, Environment, UpDown) = range(51, 61)

    # columns 61-70
    (Weight, ExerciseMonth, Transportation, Orientation, NearDogsWeek,
     NearCatsWeek, ShareBathroom, BooksYear, Alarms, Patience) = range(61, 71)

    # columns 71-80
    (Lipstick, IntroExtro, Relgious, OptiPessi, SexuallyActive,
     OralSex, Music, AvgGrade, RateHappy, Arts) = range(71, 81)

    # columns 81-82
    (Kiss, Roommates) = range(81, 83)

    
#Enum for type of test
class Test(Enum):
    """Kind of statistical test run for a category.

    runTests picks Ttest when a category has exactly two subcategories and
    Anova otherwise.
    """
    Ttest, Anova = 1, 2
    

def sleepPrior(category, data):
    """Group the SleepPrior answers into three broader ranges.

    Only acts when category is Col.SleepPrior; for any other category data is
    returned untouched. The answers '0'-'3' become 'less than 4', '4'-'7'
    become '4 to 7' and '8'-'12' become 'more than 7'. Only those exact
    strings are replaced. The column is overwritten on data itself, and that
    same DataFrame is returned.
    """
    if category == Col.SleepPrior:
        column = category.name

        #each entry: (raw answers, the more general subcategory replacing them)
        buckets = (
            (['0', '1', '2', '3'], 'less than 4'),
            (['4', '5', '6', '7'], '4 to 7'),
            (['8', '9', '10', '11', '12'], 'more than 7'),
        )
        for rawHours, label in buckets:
            data[column] = data[column].replace(rawHours, label)

    return data


def prescription(category, data):
    """Reduce the free-text Prescription answers to 'yes' / 'no'.

    Only acts when category is Col.Prescription; for any other category data
    is returned untouched.

    'None', 'none' and 'no ' are rewritten to 'no'. Every remaining answer
    that is not 'no', 'No', the text 'nan' or a missing value names a specific
    prescription and is rewritten to 'yes'. Missing values stay missing so
    that unanswered rows are not counted as prescription users.

    The column is overwritten on data itself, and that same DataFrame is
    returned.
    """
    if category != Col.Prescription:
        return data

    column = category.name

    #spelling variants of a negative answer
    data[column] = data[column].replace(['None', 'none', 'no '], 'no')

    answers = data[column]
    # A missing cell is NaN, which never compares equal to the string 'nan';
    # notna() is what actually keeps unanswered rows out of the 'yes' group.
    namesPrescription = answers.notna() & ~answers.isin(['no', 'No', 'nan'])
    data.loc[namesPrescription, column] = 'yes'
    return data


def eyes(category, data):
    """Fold the one-off eye colour 'Green/blue/gold mix' into 'hazel'.

    Only acts when category is Col.Eyes; for any other category data is
    returned untouched. The column is overwritten on data itself, and that
    same DataFrame is returned.
    """
    if category == Col.Eyes:
        column = category.name
        #make the answer more generic
        relabelled = data[column].replace('Green/blue/gold mix', 'hazel')
        data[column] = relabelled

    return data


def perWeek(category, data):
    """Turn range answers that show up as dates back into ranges.

    Acts on the frequency-style categories listed below and returns data
    untouched for every other category. Answers such as '5-Jan' or '10-Jun'
    are rewritten to the ranges '1-5' and '6-10' (presumably the ranges were
    auto-formatted as dates before the sheet was exported -- not visible from
    this code). FlossWeek additionally maps '5-Nov' to '11-15'.

    The column is overwritten on data itself, and that same DataFrame is
    returned.
    """
    rangeCategories = (Col.TeaWeek, Col.CoffeeWeek, Col.SodaWeek, Col.FastFoodWeek,
                       Col.BrushTeethWeek, Col.FlossWeek, Col.BooksYear, Col.GumDay,
                       Col.EatOutWeek, Col.AvgSleep, Col.AvgWork, Col.ExerciseMonth)
    if category not in rangeCategories:
        return data

    column = category.name

    #(answer as it appears, range it stands for), applied in this order
    dateFixes = [
        ('5-Jan', '1-5'),
        ('10-Jun', '6-10'),
        ('15-Nov', '11-15'),
        ('2-Jan', '1-2'),
        ('4-Mar', '3-4'),
        ('6-May', '5-6'),
        ('8-Jul', '7-8'),
        ('10-Sep', '9-10'),
        ('12-Nov', '11-12'),
    ]
    if category == Col.FlossWeek:
        dateFixes.append(('5-Nov', '11-15'))

    for shownAs, meant in dateFixes:
        data[column] = data[column].replace(shownAs, meant)

    return data


def numToWords(category, data):
    """Spell out the 1-5 scale answers as frequency words.

    Only acts for Col.AddedSugar, Col.Straw and Col.Lipstick; for any other
    category data is returned untouched. The answers '1' to '5' become
    'never', 'rarely', 'sometimes', 'often' and 'always'. Only those exact
    strings are replaced. The column is overwritten on data itself, and that
    same DataFrame is returned.
    """
    if category in (Col.AddedSugar, Col.Straw, Col.Lipstick):
        column = category.name
        scale = ('never', 'rarely', 'sometimes', 'often', 'always')

        #position on the scale (starting at 1) is the answer being replaced
        for rating, word in enumerate(scale, start=1):
            data[column] = data[column].replace(str(rating), word)

    return data


def mouthwash(category, data):
    """Reduce the Mouthwash answers to 'yes' / 'no'.

    Only acts when category is Col.Mouthwash; for any other category data is
    returned untouched.

    Every answer other than 'never', 'no', 'No', the text 'nan' or a missing
    value is rewritten to 'yes'; 'never' is then rewritten to 'no'. Missing
    values stay missing so that unanswered rows are not counted as mouthwash
    users, and an existing 'no' stays 'no', which makes a second call on the
    same data a no-op.

    The column is overwritten on data itself, and that same DataFrame is
    returned.
    """
    if category != Col.Mouthwash:
        return data

    column = category.name
    answers = data[column]

    # A missing cell is NaN, which never compares equal to the string 'nan';
    # notna() is what actually keeps unanswered rows out of the 'yes' group.
    usesMouthwash = answers.notna() & ~answers.isin(['never', 'no', 'No', 'nan'])
    data.loc[usesMouthwash, column] = 'yes'

    data[column] = data[column].replace('never', 'no')
    return data


def religious(category, data):
    """Reduce the Relgious answers to 'yes' or a negative answer.

    Only acts when category is Col.Relgious (the enum member is spelled that
    way); for any other category data is returned untouched.

    Every answer other than 'no', 'No', the text 'nan' or a missing value is
    rewritten to 'yes'. Missing values stay missing so that unanswered rows
    are not counted as religious, and a lowercase 'no' is kept as a negative
    answer instead of being turned into 'yes'.

    The column is overwritten on data itself, and that same DataFrame is
    returned.
    """
    if category != Col.Relgious:
        return data

    column = category.name
    answers = data[column]

    # A missing cell is NaN, which never compares equal to the string 'nan';
    # notna() is what actually keeps unanswered rows out of the 'yes' group.
    isReligious = answers.notna() & ~answers.isin(['no', 'No', 'nan'])
    data.loc[isReligious, column] = 'yes'

    # NOTE(review): this function used to finish with replace('never', 'no'),
    # which could never match because 'never' had already become 'yes' above.
    # 'never' still maps to 'yes'; confirm that is the intended reading.
    return data


def orientation(category, data):
    """Tidy up the Orientation answers.

    Only acts when category is Col.Orientation; for any other category data
    is returned untouched. The misspelling 'hetrosexual' is corrected to
    'heterosexual', and 'bicurious' / 'Bicurious' are folded into 'bisexual'
    to generalize the data. The column is overwritten on data itself, and
    that same DataFrame is returned.
    """
    if category == Col.Orientation:
        column = category.name

        #(answer or answers to look for, value written in their place)
        corrections = (
            ('hetrosexual', 'heterosexual'),
            (['bicurious', 'Bicurious'], 'bisexual'),
        )
        for found, canonical in corrections:
            data[column] = data[column].replace(found, canonical)

    return data
    
    
def runTests(data, category, showAllClasses):
    """Test every bacterial class for differences between the answers to one category.

    The category column is cleaned, its distinct answers become the groups
    (subcategories), and each bacterial-class column is compared across the
    groups: an independent two-sample t-test when there are exactly two
    groups, a one-way ANOVA otherwise.

    Printed, in order: the category name, the list of subcategories, each
    reported class with its p-value, and finally one list per row of data
    holding that row's values for the reported classes (an empty list for rows
    whose answer is not one of the subcategories).

    Parameters
    ----------
    data : pandas.DataFrame
        Survey answers followed by the bacterial-class columns. Columns at
        positions 83 up to (not including) 125 are taken as the bacterial
        classes. The category column is cleaned in place on this frame.
    category : Col
        Category to analyse; its name is the column label used in data.
    showAllClasses : bool
        When true every class is reported, otherwise only classes with a
        p-value <= 0.05.

    Returns
    -------
    False when the category is one of the deliberately skipped ones or when
    fewer than two subcategories remain to compare; None otherwise.
    """
    column = category.name

    #decided not to do these groups
    if column in ('Menstrual', 'Race', 'Feeling', 'Symptoms', 'EnvAllergies', 'Skin', 'EatPrior',
                  'DrinkPrior', 'TobaccoWeek', 'Toothpaste', 'BiteNails', 'Transportation', 'Environment'):
        return False

    #fix the inconsistent data (each cleaner only touches its own category)
    for clean in (sleepPrior, prescription, eyes, perWeek, numToWords,
                  mouthwash, religious, orientation):
        data = clean(category, data)

    # Normalise text answers. The column itself is stripped, not just the
    # group labels, so a row holding 'male ' lands in the 'male' group
    # instead of matching no group at all.
    try:
        data[column] = data[column].str.lower().str.strip()
    except AttributeError:
        # non-text columns have no .str accessor; their values are used as is
        pass

    #get list of subcategories, leaving out nans (where people didn't put in an
    #answer) and "don't know" style answers; drop_duplicates keeps first-seen
    #order, so the groups come out in the same order on every run
    notAnAnswer = ('nan', 'don\'t know', 'do not know', 'don?t know')
    subcategory = [answer for answer in data[column].drop_duplicates().tolist()
                   if not pd.isna(answer) and str(answer) not in notAnAnswer]

    #remove outliers for specific categories; nothing happens when the
    #outlier is not among the answers
    outliers = {Col.Spicy: 'sometimes', Col.Hands: 'ambidextrous', Col.BrushTeethWeek: '42689'}
    if category in outliers:
        subcategory = [answer for answer in subcategory if answer != outliers[category]]

    print(column)
    print(subcategory)

    # neither test is defined for fewer than two groups
    if len(subcategory) < 2:
        print('Not enough subcategories to compare\n')
        return False

    #decide on which test to use
    test = Test.Ttest if len(subcategory) == 2 else Test.Anova

    #load dataframes for each subcategory of all bacterial classes
    categoryDF = [data[data[column] == answer].iloc[:, 83:125] for answer in subcategory]

    #go through all bacterial classes and test each one across the subcategories
    sigClasses = []
    for x in range(categoryDF[0].shape[1]):
        className = categoryDF[0].columns[x]
        samples = [group.iloc[:, x] for group in categoryDF]

        if test == Test.Ttest:
            pvalue = ttest_ind(samples[0], samples[1])[1]
        else:
            pvalue = stats.f_oneway(*samples)[1]

        #print classes that are significant
        if pvalue <= 0.05 or showAllClasses:
            #keep track of significant classes
            sigClasses.append(className)

            print(className, "\nP-value: {:4.3f}".format(pvalue))
    print('\n')

    #get and set inputs: for every row, its values for the reported classes;
    #rows whose answer is not a kept subcategory contribute an empty list
    answered = data[column].isin(subcategory)
    classValues = data[sigClasses].values.tolist()
    inputsList = [values if keep else [] for values, keep in zip(classValues, answered)]
    print(inputsList)


class Main():
    """Entry point of the analysis cell.

    The statements below sit directly in the class body, so they run once, at
    the moment the class is defined: MasterSheet.csv is read and the tests are
    run for the Gender category.
    """
    #False: only classes with a p-value <= 0.05 are printed
    showAllClasses = False
    data = pd.read_csv('MasterSheet.csv')

    runTests(data=data, category=Col.Gender, showAllClasses=showAllClasses)

    #To go through the categories one after another instead:
    #     for x in range(1, 82):
    #         runTests(data, Col(x), showAllClasses)
    #(range(1, 82) ends at Col(81); Col(82), Roommates, is not reached)



Gender
['female', 'male']
Bacilli 
P-value: 0.040
Fusobacteria 
P-value: 0.000
Epsilonproteobacteria 
P-value: 0.021
Alphaproteobacteria 
P-value: 0.039
Thermotogae 
P-value: 0.012


[[3.9442973119999998, 0.076904877, -0.33510018199999997, -0.342277971, -0.353762432], [3.9723758339999997, 0.190906131, -0.362045826, -0.357471775, -0.381358487], [3.760585735, 0.602311415, -0.370152097, -0.361497044, -0.376256186], [3.114230698, -0.22210792399999998, -0.339227998, -0.318056067, -0.331408263], [5.343307545, 0.134594481, -0.308472788, -0.30015785300000003, -0.319163419], [3.503269346, 0.480379028, -0.363317868, -0.366886806, -0.377593619], [3.746173966, -0.021308512, -0.35812048399999996, -0.329250886, -0.343685685], [2.187348146, -0.207054766, -0.33773129700000004, -0.328490128, -0.336431133], [3.77699665, 0.203021108, -0.31253190000000003, -0.35530704, -0.35530704], [4.293834467, -0.064573619, -0.349397739, -0.333604432, -0.351576126], [4.398598967, 0.094596186, -0.322446842, -0.331351675

In [4]:
# following https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/_index.ipynb
# as a tutorial

import tensorflow as tf
from tensorflow.keras import layers

# Fetch the MNIST dataset as separate training and test splits
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Divide the inputs by 255.0 (presumably rescales pixel values to the 0-1
# range -- confirm against the dataset's value range)
x_train = x_train / 255.0
x_test = x_test / 255.0

# Look at the training inputs and their labels
print(x_train)
print("y train")
print(y_train)

# model = tf.keras.models.Sequential([
#   tf.keras.layers.Flatten(),
#   tf.keras.layers.Dense(512, activation=tf.nn.relu),
#   tf.keras.layers.Dropout(0.2),
#   tf.keras.layers.Dense(10, activation=tf.nn.softmax)
# ])

# model.compile(optimizer='adam',
#               loss='sparse_categorical_crossentropy',
#               metrics=['accuracy'])

# model.fit(x_train, y_train, epochs=5)

# model.evaluate(x_test, y_test)

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
y train
[5