In [86]:
from enum import Enum
import pandas as pd
from scipy.stats import ttest_ind
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats


#Enum for all the categories and their column number
class Col(Enum):
    """Survey categories, one member per questionnaire column.

    The member *name* doubles as the DataFrame column label (the analysis
    code indexes the sheet with ``data[category.name]``), so names must match
    the sheet header exactly -- including existing spellings such as
    ``Relgious``.  The member *value* is the category's column number
    (presumably its position in the master sheet -- confirm against the CSV).
    """

    # --- general / health background ---
    Gender = 1
    Menstrual = 2
    SleepPrior = 3
    Prescription = 4
    OralAntibiotics = 5
    TopAntibiotics = 6
    Eyes = 7
    Hands = 8
    Race = 9
    Feeling = 10
    Symptoms = 11
    Facial = 12
    LastVomit = 13
    EnvAllergies = 14
    Snore = 15
    GrindTeeth = 16
    EarInfect = 17
    AllergyTime = 18
    Skin = 19
    OralSurgery = 20
    HeartDisease = 21
    BadTeeth = 22

    # --- eating and drinking ---
    EatPrior = 23
    DrinkPrior = 24
    GumDay = 25
    YogurtWeek = 26
    TeaWeek = 27
    CoffeeWeek = 28
    SodaWeek = 29
    FastFoodWeek = 30
    EatOutWeek = 31
    Straw = 32
    AddedSugar = 33
    Spicy = 34
    ShareDrink = 35
    MeatWeek = 36
    FreshWeek = 37

    # --- oral care and hygiene ---
    BrushTongue = 38
    BrushTeethWeek = 39
    TobaccoWeek = 40
    FlossWeek = 41
    DentistYear = 42
    Toothbrush = 43
    Mouthwash = 44
    Toothpaste = 45
    WashHandsEat = 46
    WashHandsRestroom = 47
    ChangeToothbrush = 48
    BiteNails = 49
    OrthoDevice = 50
    Device = 51

    # --- stress, sleep and workload ---
    Stress = 52
    HandleStress = 53
    Organized = 54
    RateStress = 55
    AvgSleep = 56
    AvgWork = 57

    # --- environment and lifestyle ---
    Setting = 58
    Environment = 59
    UpDown = 60
    Weight = 61
    ExerciseMonth = 62
    Transportation = 63
    Orientation = 64
    NearDogsWeek = 65
    NearCatsWeek = 66
    ShareBathroom = 67
    BooksYear = 68
    Alarms = 69

    # --- personality and social ---
    Patience = 70
    Lipstick = 71
    IntroExtro = 72
    Relgious = 73  # spelling kept as-is: the name is used as a column label
    OptiPessi = 74
    SexuallyActive = 75
    OralSex = 76
    Music = 77
    AvgGrade = 78
    RateHappy = 79
    Arts = 80
    Kiss = 81
    Roommates = 82

    
#Enum for type of test
class Test(Enum):
    """Which significance test is applied to a category's answer groups."""

    Ttest = 1  # used when a category has exactly two answer groups
    Anova = 2  # used for any other number of answer groups
    

def sleepPrior(category, data):
    """Bucket the SleepPrior answers into three coarse ranges.

    For any category other than ``Col.SleepPrior`` the frame is returned
    untouched.  Otherwise the column named after the category is rewritten
    in place: '0'-'3' become 'less than 4', '4'-'7' become '4 to 7' and
    '8'-'12' become 'more than 7'.  Values outside those lists are left as
    they are.

    NOTE(review): the values matched are *strings*; this assumes the column
    was loaded as text -- confirm against the CSV.

    Returns the same DataFrame object that was passed in.
    """
    if category == Col.SleepPrior:
        column = category.name
        # (raw answers, replacement label) -- applied in this order
        buckets = (
            (['0', '1', '2', '3'], 'less than 4'),
            (['4', '5', '6', '7'], '4 to 7'),
            (['8', '9', '10', '11', '12'], 'more than 7'),
        )
        for raw_answers, label in buckets:
            data[column] = data[column].replace(raw_answers, label)
    return data


def Prescription(category, data):
    """Collapse the free-text Prescription column into 'yes' / 'no' answers.

    For any category other than ``Col.Prescription`` the frame is returned
    untouched.  Otherwise the column named after the category is rewritten
    in place:

    * 'None', 'none' and 'no ' are normalised to 'no';
    * every remaining answer that is not 'no', 'No' or the string 'nan'
      (i.e. the respondent named a prescription) becomes 'yes';
    * missing answers (NaN) are left missing.

    Returns the same DataFrame object that was passed in.
    """
    if category != Col.Prescription:
        return data

    column = category.name
    data[column] = data[column].replace(['None', 'none', 'no '], 'no')

    answers = data[column]
    # A missing cell is a float NaN, and NaN != 'nan' is True, so comparing
    # against the string alone would turn every unanswered row into 'yes'.
    # notna() keeps those rows missing so they are dropped later on.
    names_a_prescription = answers.notna() & ~answers.isin(['no', 'No', 'nan'])
    data.loc[names_a_prescription, column] = 'yes'
    return data


def eyes(category, data):
    """Fold the one-off eye colour answer 'Green/blue/gold mix' into 'hazel'.

    For any category other than ``Col.Eyes`` the frame is returned
    untouched.  Otherwise the column named after the category is rewritten
    in place.  Returns the same DataFrame object that was passed in.
    """
    if category == Col.Eyes:
        column = category.name
        data[column] = data[column].replace('Green/blue/gold mix', 'hazel')
    return data


def gumDay(category, data):
    """Restore GumDay range answers that are stored as date-like strings.

    For any category other than ``Col.GumDay`` the frame is returned
    untouched.  Otherwise '2-Jan', '4-Mar' and '6-May' in the column named
    after the category are rewritten in place to '1-2', '3-4' and '5-6'
    (presumably the ranges were auto-converted to dates by a spreadsheet --
    confirm against the raw survey export).

    Returns the same DataFrame object that was passed in.
    """
    if category == Col.GumDay:
        column = category.name
        # (stored value, intended range) -- applied in this order
        repairs = (
            ('2-Jan', '1-2'),
            ('4-Mar', '3-4'),
            ('6-May', '5-6'),
        )
        for stored, intended in repairs:
            data[column] = data[column].replace(stored, intended)
    return data
    
    
# Categories that were deliberately left out of the analysis.
_SKIPPED_CATEGORIES = ('Menstrual', 'Race', 'Feeling', 'Symptoms',
                       'EnvAllergies', 'Skin', 'EatPrior', 'DrinkPrior')

# Answers that do not form a group: missing values and "don't know" variants
# ('don?t know' is a mis-encoded apostrophe that occurs in the data).
_NON_ANSWERS = ('nan', 'don\'t know', 'do not know', 'don?t know')

# Per-category answers that are dropped as outliers before testing.
_OUTLIER_ANSWERS = {'Spicy': 'sometimes', 'Hands': 'ambidextrous'}

# Positional slice of the sheet that holds the bacterial class columns.
_CLASS_COLUMNS = slice(83, 125)


def runTests(data, category, showAllClasses):
    """Test whether bacterial classes differ between a category's answers.

    The rows of ``data`` are grouped by their answer in the column named
    ``category.name``.  For every bacterial class column (positions 83-124)
    an independent t-test is run when there are exactly two answer groups,
    otherwise a one-way ANOVA across all groups.  The category name, the
    list of answer groups and the resulting p-values are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        The master sheet.  NOTE: the category column is cleaned *in place*
        (helper fix-ups, lower-casing, whitespace stripping).
    category : Col
        The survey category to analyse.
    showAllClasses : bool
        When true, print the p-value of every class; otherwise only those
        with p <= 0.05.

    Returns
    -------
    False when the category is skipped or has fewer than two answer groups
    to compare; None after the tests have been run.
    """
    column = category.name

    # decided not to do these groups
    if column in _SKIPPED_CATEGORIES:
        return False

    # fix the inconsistent data (each helper only acts on its own category)
    data = sleepPrior(category, data)
    data = Prescription(category, data)
    data = eyes(category, data)
    data = gumDay(category, data)

    # Normalise text answers on the column itself, so that the group labels
    # derived below match the rows they came from.  Numeric columns have no
    # .str accessor (AttributeError) and are left untouched.
    try:
        data[column] = data[column].str.lower().str.strip()
    except AttributeError:
        pass

    # Distinct answers in order of first appearance, without missing values
    # and "don't know" style non-answers.
    subcategory = [answer for answer in data[column].drop_duplicates().tolist()
                   if str(answer) not in _NON_ANSWERS]

    # remove outliers for specific categories (only if actually present)
    outlier = _OUTLIER_ANSWERS.get(column)
    if outlier in subcategory:
        subcategory.remove(outlier)

    # bacterial class measurements of the rows belonging to each answer
    categoryDF = [data[data[column] == answer].iloc[:, _CLASS_COLUMNS]
                  for answer in subcategory]

    print(column)
    print(subcategory)

    # Neither test is defined for fewer than two groups.
    if len(categoryDF) < 2:
        print('Not enough answer groups to compare\n')
        return False

    # decide on which test to use
    test = Test.Ttest if len(categoryDF) == 2 else Test.Anova

    # go through all bacterial classes present in the slice
    for x in range(categoryDF[0].shape[1]):
        samples = [group.iloc[:, x] for group in categoryDF]

        if test == Test.Ttest:
            pvalue = ttest_ind(samples[0], samples[1])[1]
        else:
            pvalue = stats.f_oneway(*samples)[1]

        # print classes that are significant (or all of them on request)
        if pvalue <= 0.05 or showAllClasses:
            print(samples[0].name, "\nP-value: {:4.3f}".format(pvalue))
    print('\n')


class Main():
    # NOTE: this is a class body, not a function -- the statements below run
    # once, at the moment the class definition is executed, and `data` /
    # `showAllClasses` end up as class attributes.

    # Load the master sheet; the path is relative to the working directory.
    data = pd.read_csv('MasterSheet.csv')
    # True: runTests prints the p-value of every class.
    # False: runTests prints only the classes with p <= 0.05.
    showAllClasses = True
    
    # Analyse a single category.
    runTests(data, Col.YogurtWeek, showAllClasses)
    
# Disabled: run the analysis for every category in turn.
# NOTE(review): range(1, 82) stops at 81, so Col(82) (Roommates) would not be
# covered if this loop is re-enabled -- confirm whether that is intended.
#     for x in range(1, 82):
#         runTests(data, Col(x), showAllClasses)



YogurtWeek
[0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 7.0]
Bacilli 
P-value: 0.672
Bacteroidia 
P-value: 0.029
Betaproteobacteria 
P-value: 0.640
Clostridia 
P-value: 0.836
Gammaproteobacteria 
P-value: 0.005
Actinobacteria 
P-value: 0.229
Fusobacteria 
P-value: 0.317
Flavobacteriia 
P-value: 0.559
Sphingobacteriia 
P-value: 0.785
Deltaproteobacteria 
P-value: 0.791
Epsilonproteobacteria 
P-value: 0.984
Erysipelotrichi 
P-value: 0.205
Alphaproteobacteria 
P-value: 0.855
Spirochaetes 
P-value: 0.797
Nostocophycideae 
P-value: 0.681
Mollicutes 
P-value: 0.869
Chlamydiia 
P-value: 0.818
GroupII 
P-value: 0.807
Deinococci 
P-value: 0.802
Synergistia 
P-value: 0.753
Oscillatoriophycideae 
P-value: 0.708
Opitutae 
P-value: 0.785
Thermoprotei 
P-value: 0.823
Anaerolineae 
P-value: 0.775
Deferribacteres 
P-value: 0.797
Nitriliruptoria 
P-value: 0.802
Acidobacteria 
P-value: 0.810
Thermotogae 
P-value: 0.879
Synechococcophycideae 
P-value: 0.803
Thermodesulfobacteria 
P-value: 0.796
Caldithrixae 
P-value: 