In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import trompy as tp
from scipy import stats

In [2]:
# Read the raw measurements from the Excel workbook.
# The workbook location is relative to the notebook's working directory.
PATH_TO_WORKBOOK = Path("../data/")
EXCEL_WORKBOOK = "measurements_b3_1.xlsx"
# presumably `rows` holds the worksheet records and `header` the column names
# (they are passed to pd.DataFrame(rows, columns=header) later) - TODO confirm against trompy.
rows, header = tp.metafilereader(PATH_TO_WORKBOOK / EXCEL_WORKBOOK, sheetname="metafile_niacin")


# TODO(idea): how about keeping the data on a Google Drive, but saving a CSV file for every day?

Show the data set (e.g. its columns) and create a new variable, `groupn`, that encodes the four sex × group combinations.

In [12]:
# Clean up df

def replace_outliers(df, column, min_valid=1, max_factor=2):
    """Replace implausible values in ``column`` with the animal's own mean.

    A value is treated as an outlier when it is below ``min_valid`` or above
    ``max_factor`` times the mean of the whole column (the mean is taken over
    the column as passed in, i.e. before any value is discarded). Outliers,
    and values that were already missing, are replaced by the mean of the
    remaining values recorded for the same ``animal_id``.

    ``df`` is not modified; only the returned values carry the replacements.

    Args:
        df (Pandas dataframe): Must contain ``animal_id`` and ``column``.
        column (str): Name of the numeric column to clean.
        min_valid (float): Values strictly below this are outliers.
        max_factor (float): Values strictly above ``max_factor`` times the
            column mean are outliers.

    Returns:
        list: One cleaned value per row of ``df``, in row order. A value stays
            NaN when its animal has no valid value left to average.
    """
    original = df[column]

    # The upper limit is loop-invariant, so compute it once from the raw column.
    upper_limit = original.mean() * max_factor
    is_outlier = (original < min_valid) | (original > upper_limit)

    # mask() returns a new Series, so the caller's frame is left untouched.
    cleaned = original.mask(is_outlier)

    # Pre-existing NaNs are counted too, because they are replaced as well.
    print("the number of outliers being replaced for", column, "is", int(cleaned.isna().sum()))

    # Group positionally on the animal ids so the result lines up row by row
    # regardless of the frame's index.
    animal_means = cleaned.groupby(df["animal_id"].to_numpy()).transform("mean")

    return cleaned.fillna(animal_means).tolist()
    
def remove_and_clean(df):
    """Drop rows without a previous-day food reading and clean the intake columns.

    Args:
        df (Pandas dataframe): Contains data from niacin experiment

    Returns:
        Dataframe: New dataframe in which rows whose ``food_yesterday`` equals
            -1.0 are removed and ``food_intake`` / ``water_intake`` hold the
            values returned by ``replace_outliers``.
    """
    cleaned = df.query("food_yesterday != -1.0")

    # Food first, then water; each column is cleaned on the frame produced by
    # the previous step.
    for intake_column in ("food_intake", "water_intake"):
        cleaned = cleaned.assign(
            **{intake_column: lambda df_, name=intake_column: replace_outliers(df_, name)}
        )

    return cleaned

# NOTE(review): in the visible part of this notebook `df` is only assigned by
# `df = tweak_df(...)` in a cell further down, so on a fresh kernel
# (Restart & Run All) this line raises NameError. This cell needs to run after that one.
df_food_water = remove_and_clean(df)

the number of outliers being replaced for food_intake is 7
the number of outliers being replaced for water_intake is 5


In [13]:
def get_value(row):
    """Map a row's ``sex`` and ``group`` to a numeric group code.

    Codes: female/control -> 0, female/experimental -> 1,
    male/control -> 2, male/experimental -> 3. Any other combination -> -1.

    Args:
        row: Mapping-like object (e.g. a dataframe row) with 'sex' and 'group'.

    Returns:
        int: The group code, or -1 when the combination is not recognised.
    """
    sex = row['sex']
    if sex == 'female':
        base_code = 0
    elif sex == 'male':
        base_code = 2
    else:
        # Unknown sex: 'group' is not consulted at all.
        return -1

    group = row['group']
    if group == 'control':
        return base_code
    if group == 'experimental':
        return base_code + 1
    return -1

def tweak_df(df):
    """Tidy the raw measurement table.

    Keeps only rows with a non-missing ``bodyweight``, casts the listed
    columns to categorical/float dtypes, and adds ``groupn``, the numeric
    group code computed per row by ``get_value``.

    Args:
        df (Pandas dataframe): Raw measurement rows.

    Returns:
        Dataframe: New dataframe with the filtered rows, converted dtypes and
            the extra ``groupn`` column.
    """
    # Columns not listed here keep the dtype they arrived with.
    column_types = {
        "sex": "category",
        "group": "category",
        "diet": "category",
        "bodyweight": float,
        "food_yesterday": float,
        "food_intake": float,
        "water_intake": float,
    }

    # The python engine is required because the expression calls a method.
    weighed = df.query("bodyweight.notna()", engine="python")
    typed = weighed.astype(column_types)

    group_codes = typed.apply(get_value, axis=1)
    return typed.assign(groupn=group_codes)
    
# Build a dataframe from the worksheet rows/header read at the top of the notebook, then tidy it.
df = tweak_df(pd.DataFrame(rows, columns=header))

In [14]:
# Display the tidied dataframe.
df

Unnamed: 0,animal_id,sex,group,diet,date,niacin,bodyweight,food_yesterday,food_today,food_intake,water_yesterday,water_today,water_intake,groupn
0,B3101,male,control,control,2024-02-12,0.0,21.8,-1.0,60.7,-1.000000,-1.0,50.9,-1.000000,2
1,B3102,male,control,control,2024-02-12,0.0,24.7,-1.0,56,-1.000000,-1.0,50.4,-1.000000,2
2,B3103,male,experimental,control,2024-02-12,0.0,24.9,-1.0,54.5,-1.000000,-1.0,50.5,-1.000000,3
3,B3104,male,experimental,control,2024-02-12,0.0,24.9,-1.0,54.5,-1.000000,-1.0,50.5,-1.000000,3
4,B3105,male,control,control,2024-02-12,0.0,25.4,-1.0,56.3,-1.000000,-1.0,50.9,-1.000000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1179,B3128,female,experimental,experimental,2024-03-25,0.0,22.5,53.5,43.6,3.300000,51.4,38.9,4.166667,1
1180,B3129,female,control,control,2024-03-25,0.0,22.2,37.8,28.9,2.966667,51.4,42.5,2.966667,0
1181,B3130,female,control,control,2024-03-25,0.0,24.0,54.5,44.1,3.466667,51.3,42.5,2.933333,0
1182,B3131,female,experimental,experimental,2024-03-25,0.0,22.2,66.7,57,3.233333,52.0,43.9,2.700000,1


In [15]:
# Check the column dtypes. Columns that are not named in tweak_df's astype
# mapping (e.g. food_today, water_today) keep the dtype they were read with.
df.dtypes

animal_id            object
sex                category
group              category
diet               category
date                 object
niacin              float64
bodyweight          float64
food_yesterday      float64
food_today           object
food_intake         float64
water_yesterday     float64
water_today          object
water_intake        float64
groupn                int64
dtype: object

In [16]:
# List the column names of the tidied dataframe.
df.columns

Index(['animal_id', 'sex', 'group', 'diet', 'date', 'niacin', 'bodyweight',
       'food_yesterday', 'food_today', 'food_intake', 'water_yesterday',
       'water_today', 'water_intake', 'groupn'],
      dtype='object')

In [17]:
# Only the last expression of a cell is echoed, so the animal IDs are printed
# explicitly; otherwise their result is silently discarded.
print(df.animal_id.unique())
df.date.unique()

array(['2024-02-12', '2024-02-13', '2024-02-14', '2024-02-15',
       '2024-02-16', '2024-02-17', '2024-02-18', '2024-02-19',
       '2024-02-20', '2024-02-21', '2024-02-22', '2024-02-23',
       '2024-02-24', '2024-02-25', '2024-02-26', '2024-02-27',
       '2024-02-28', '2024-02-29', '2024-03-01', '2024-03-02',
       '2024-03-03', '2024-03-04', '2024-03-05', '2024-03-06',
       '2024-03-07', '2024-03-08', '2024-03-09', '2024-03-10',
       '2024-03-11', '2024-03-12', '2024-03-13', '2024-03-14',
       '2024-03-15', '2024-03-18', '2024-03-20', '2024-03-22',
       '2024-03-25'], dtype=object)

In [18]:
# Mean of every numeric column for each sex / group / date combination.
# NOTE(review): this runs on `df`, not on the cleaned `df_food_water`, so rows that still
# carry the -1 placeholder values are included in these means.
df.groupby(["sex", "group", "date"]).mean(numeric_only=True).reset_index()

Unnamed: 0,sex,group,date,niacin,bodyweight,food_yesterday,food_intake,water_yesterday,water_intake,groupn
0,female,control,2024-02-12,0.0,20.4125,-1.0000,-1.000000,-1.0000,-1.000000,0.0
1,female,control,2024-02-13,0.0,19.9375,77.4500,2.887500,51.5000,9.012500,0.0
2,female,control,2024-02-14,0.0,20.0625,74.5625,2.937500,51.2875,3.062500,0.0
3,female,control,2024-02-15,0.0,20.0875,71.6250,2.987500,51.5250,3.000000,0.0
4,female,control,2024-02-16,0.0,20.2125,72.7875,3.362500,51.7500,3.150000,0.0
...,...,...,...,...,...,...,...,...,...,...
143,male,experimental,2024-03-15,0.0,28.1250,51.7250,3.025000,48.5750,2.475000,3.0
144,male,experimental,2024-03-18,1.5,28.0375,48.7000,2.854167,50.7125,2.279167,3.0
145,male,experimental,2024-03-20,1.5,28.3625,44.6000,2.881250,51.3500,2.381250,3.0
146,male,experimental,2024-03-22,1.5,28.7875,48.1625,3.018750,46.5875,2.418750,3.0


Performing t-tests on the data
 (normality and equality-of-variances checks have not been performed yet).
But first, outliers are removed from the dataset.

In [19]:
# Females, body weight: one independent two-sample t-test
# (control vs. experimental) for every measurement day.
# NOTE(review): ttest_ind is called with its default settings, and no
# correction is applied for running one test per day.
female_experimental_data = df_food_water.loc[
    df_food_water['sex'].eq('female') & df_food_water['group'].eq('experimental')
]
female_control_data = df_food_water.loc[
    df_food_water["sex"].eq("female") & df_food_water["group"].eq("control")
]

alpha = 0.05  # significance level, identical for every day

for day in df_food_water["date"].unique():
    # Body weights recorded on this day, one Series per group
    control_data = female_control_data.loc[female_control_data['date'] == day, 'bodyweight']
    experimental_data = female_experimental_data.loc[female_experimental_data['date'] == day, 'bodyweight']

    t_statistic, p_value = stats.ttest_ind(control_data, experimental_data)

    # Report the raw test result for the day
    print("Day:", day)
    print("t-statistic:", t_statistic)
    print("p-value:", p_value)

    # Verdict at the chosen significance level
    print(
        "Reject the null hypothesis - There is a significant difference in body weight between the groups at this time point."
        if p_value < alpha
        else "Fail to reject the null hypothesis - There is no significant difference in body weight between the groups at this time point."
    )
    print()

Day: 2024-02-13
t-statistic: -1.2082752965060406
p-value: 0.2469554602734486
Fail to reject the null hypothesis - There is no significant difference in body weight between the groups at this time point.

Day: 2024-02-14
t-statistic: -0.8882038106453871
p-value: 0.38943654021832985
Fail to reject the null hypothesis - There is no significant difference in body weight between the groups at this time point.

Day: 2024-02-15
t-statistic: -1.502752910229976
p-value: 0.15512108630446475
Fail to reject the null hypothesis - There is no significant difference in body weight between the groups at this time point.

Day: 2024-02-16
t-statistic: -0.7177627307884807
p-value: 0.4847092040082437
Fail to reject the null hypothesis - There is no significant difference in body weight between the groups at this time point.

Day: 2024-02-17
t-statistic: -0.7409486090900833
p-value: 0.4709706824307748
Fail to reject the null hypothesis - There is no significant difference in body weight between the groups 

In [20]:
# Females, food intake: one independent two-sample t-test
# (control vs. experimental) for every measurement day.
# NOTE(review): ttest_ind is called with its default settings, and no
# correction is applied for running one test per day.
female_experimental_data = df_food_water.loc[
    df_food_water['sex'].eq('female') & df_food_water['group'].eq('experimental')
]
female_control_data = df_food_water.loc[
    df_food_water["sex"].eq("female") & df_food_water["group"].eq("control")
]

alpha = 0.05  # significance level, identical for every day

for day in df_food_water["date"].unique():
    # Food intake recorded on this day, one Series per group
    control_data = female_control_data.loc[female_control_data['date'] == day, 'food_intake']
    experimental_data = female_experimental_data.loc[female_experimental_data['date'] == day, 'food_intake']

    t_statistic, p_value = stats.ttest_ind(control_data, experimental_data)

    # Report the raw test result for the day
    print("Day:", day)
    print("t-statistic:", t_statistic)
    print("p-value:", p_value)

    # Verdict at the chosen significance level
    print(
        "Reject the null hypothesis - There is a significant difference in food intake between the groups at this time point."
        if p_value < alpha
        else "Fail to reject the null hypothesis - There is no significant difference in food intake between the groups at this time point."
    )
    print()

Day: 2024-02-13
t-statistic: 0.18968247235793748
p-value: 0.8522805928992132
Fail to reject the null hypothesis - There is no significant difference in food intake between the groups at this time point.

Day: 2024-02-14
t-statistic: -0.5417023090757583
p-value: 0.5965399873321169
Fail to reject the null hypothesis - There is no significant difference in food intake between the groups at this time point.

Day: 2024-02-15
t-statistic: 0.05397245024022033
p-value: 0.9577197408319085
Fail to reject the null hypothesis - There is no significant difference in food intake between the groups at this time point.

Day: 2024-02-16
t-statistic: -0.11675588928432197
p-value: 0.9087118725947871
Fail to reject the null hypothesis - There is no significant difference in food intake between the groups at this time point.

Day: 2024-02-17
t-statistic: 0.7867957924694411
p-value: 0.4445168643786436
Fail to reject the null hypothesis - There is no significant difference in food intake between the groups a

In [21]:
# Females, water intake: one independent two-sample t-test
# (control vs. experimental) for every measurement day.
# NOTE(review): ttest_ind is called with its default settings, and no
# correction is applied for running one test per day.
female_experimental_data = df_food_water.loc[
    df_food_water['sex'].eq('female') & df_food_water['group'].eq('experimental')
]
female_control_data = df_food_water.loc[
    df_food_water["sex"].eq("female") & df_food_water["group"].eq("control")
]

alpha = 0.05  # significance level, identical for every day

for day in df_food_water["date"].unique():
    # Water intake recorded on this day, one Series per group
    control_data = female_control_data.loc[female_control_data['date'] == day, 'water_intake']
    experimental_data = female_experimental_data.loc[female_experimental_data['date'] == day, 'water_intake']

    t_statistic, p_value = stats.ttest_ind(control_data, experimental_data)

    # Report the raw test result for the day
    print("Day:", day)
    print("t-statistic:", t_statistic)
    print("p-value:", p_value)

    # Verdict at the chosen significance level
    print(
        "Reject the null hypothesis - There is a significant difference in water intake between the groups at this time point."
        if p_value < alpha
        else "Fail to reject the null hypothesis - There is no significant difference in water intake between the groups at this time point."
    )
    print()

Day: 2024-02-13
t-statistic: 0.6939076658142835
p-value: 0.4990929300529223
Fail to reject the null hypothesis - There is no significant difference in water intake between the groups at this time point.

Day: 2024-02-14
t-statistic: 1.5939701191492737
p-value: 0.13326281504951323
Fail to reject the null hypothesis - There is no significant difference in water intake between the groups at this time point.

Day: 2024-02-15
t-statistic: -0.25819888974716076
p-value: 0.8000104103653621
Fail to reject the null hypothesis - There is no significant difference in water intake between the groups at this time point.

Day: 2024-02-16
t-statistic: -0.2778347225812983
p-value: 0.7852001015366507
Fail to reject the null hypothesis - There is no significant difference in water intake between the groups at this time point.

Day: 2024-02-17
t-statistic: 0.9701425001453265
p-value: 0.34843869825643436
Fail to reject the null hypothesis - There is no significant difference in water intake between the gro

In [22]:
# Males, body weight: one independent two-sample t-test
# (control vs. experimental) for every measurement day.
# NOTE(review): ttest_ind is called with its default settings, and no
# correction is applied for running one test per day.
male_experimental_data = df_food_water.loc[
    df_food_water['sex'].eq('male') & df_food_water['group'].eq('experimental')
]
male_control_data = df_food_water.loc[
    df_food_water["sex"].eq("male") & df_food_water["group"].eq("control")
]

alpha = 0.05  # significance level, identical for every day

for day in df_food_water["date"].unique():
    # Body weights recorded on this day, one Series per group
    control_data = male_control_data.loc[male_control_data['date'] == day, 'bodyweight']
    experimental_data = male_experimental_data.loc[male_experimental_data['date'] == day, 'bodyweight']

    t_statistic, p_value = stats.ttest_ind(control_data, experimental_data)

    # Report the raw test result for the day
    print("Day:", day)
    print("t-statistic:", t_statistic)
    print("p-value:", p_value)

    # Verdict at the chosen significance level
    print(
        "Reject the null hypothesis - There is a significant difference in body weight between the groups at this time point."
        if p_value < alpha
        else "Fail to reject the null hypothesis - There is no significant difference in body weight between the groups at this time point."
    )
    print()

Day: 2024-02-13
t-statistic: -0.11009065766597023
p-value: 0.9139000405415773
Fail to reject the null hypothesis - There is no significant difference in body weight between the groups at this time point.

Day: 2024-02-14
t-statistic: -0.9137080416200226
p-value: 0.3763380423523791
Fail to reject the null hypothesis - There is no significant difference in body weight between the groups at this time point.

Day: 2024-02-15
t-statistic: -1.1349008767288875
p-value: 0.2754746122452408
Fail to reject the null hypothesis - There is no significant difference in body weight between the groups at this time point.

Day: 2024-02-16
t-statistic: -1.447830410173213
p-value: 0.16968693424714965
Fail to reject the null hypothesis - There is no significant difference in body weight between the groups at this time point.

Day: 2024-02-17
t-statistic: -1.1090412261584421
p-value: 0.2861006693401651
Fail to reject the null hypothesis - There is no significant difference in body weight between the groups 

In [23]:
# Males, water intake: one independent two-sample t-test
# (control vs. experimental) for every measurement day.
# NOTE(review): ttest_ind is called with its default settings, and no
# correction is applied for running one test per day.
male_experimental_data = df_food_water.loc[
    df_food_water['sex'].eq('male') & df_food_water['group'].eq('experimental')
]
male_control_data = df_food_water.loc[
    df_food_water["sex"].eq("male") & df_food_water["group"].eq("control")
]

alpha = 0.05  # significance level, identical for every day

for day in df_food_water["date"].unique():
    # Water intake recorded on this day, one Series per group
    control_data = male_control_data.loc[male_control_data['date'] == day, 'water_intake']
    experimental_data = male_experimental_data.loc[male_experimental_data['date'] == day, 'water_intake']

    t_statistic, p_value = stats.ttest_ind(control_data, experimental_data)

    # Report the raw test result for the day
    print("Day:", day)
    print("t-statistic:", t_statistic)
    print("p-value:", p_value)

    # Verdict at the chosen significance level
    print(
        "Reject the null hypothesis - There is a significant difference in water intake between the groups at this time point."
        if p_value < alpha
        else "Fail to reject the null hypothesis - There is no significant difference in water intake between the groups at this time point."
    )
    print()

Day: 2024-02-13
t-statistic: -1.1237046501291088
p-value: 0.2800380891051075
Fail to reject the null hypothesis - There is no significant difference in water intake between the groups at this time point.

Day: 2024-02-14
t-statistic: -0.41547809001357516
p-value: 0.6840896806261858
Fail to reject the null hypothesis - There is no significant difference in water intake between the groups at this time point.

Day: 2024-02-15
t-statistic: -0.8715444587568315
p-value: 0.3981568940749517
Fail to reject the null hypothesis - There is no significant difference in water intake between the groups at this time point.

Day: 2024-02-16
t-statistic: -0.5544041192379071
p-value: 0.5880498939951981
Fail to reject the null hypothesis - There is no significant difference in water intake between the groups at this time point.

Day: 2024-02-17
t-statistic: 0.4065725086860543
p-value: 0.690469113407749
Fail to reject the null hypothesis - There is no significant difference in water intake between the grou

In [24]:
# Males, food intake: one independent two-sample t-test
# (control vs. experimental) for every measurement day.
# NOTE(review): ttest_ind is called with its default settings, and no
# correction is applied for running one test per day.
male_experimental_data = df_food_water.loc[
    df_food_water['sex'].eq('male') & df_food_water['group'].eq('experimental')
]
male_control_data = df_food_water.loc[
    df_food_water["sex"].eq("male") & df_food_water["group"].eq("control")
]

alpha = 0.05  # significance level, identical for every day

for day in df_food_water["date"].unique():
    # Food intake recorded on this day, one Series per group
    control_data = male_control_data.loc[male_control_data['date'] == day, "food_intake"]
    experimental_data = male_experimental_data.loc[male_experimental_data['date'] == day, 'food_intake']

    t_statistic, p_value = stats.ttest_ind(control_data, experimental_data)

    # Report the raw test result for the day
    print("Day:", day)
    print("t-statistic:", t_statistic)
    print("p-value:", p_value)

    # Verdict at the chosen significance level
    print(
        "Reject the null hypothesis - There is a significant difference in food intake between the groups at this time point."
        if p_value < alpha
        else "Fail to reject the null hypothesis - There is no significant difference in food intake between the groups at this time point."
    )
    print()

Day: 2024-02-13
t-statistic: -0.2613286875312944
p-value: 0.7976442396508406
Fail to reject the null hypothesis - There is no significant difference in food intake between the groups at this time point.

Day: 2024-02-14
t-statistic: -0.20873254938846433
p-value: 0.8376634743078235
Fail to reject the null hypothesis - There is no significant difference in food intake between the groups at this time point.

Day: 2024-02-15
t-statistic: -0.691950971492164
p-value: 0.5002838330524217
Fail to reject the null hypothesis - There is no significant difference in food intake between the groups at this time point.

Day: 2024-02-16
t-statistic: -0.7819012143768049
p-value: 0.44729549275175495
Fail to reject the null hypothesis - There is no significant difference in food intake between the groups at this time point.

Day: 2024-02-17
t-statistic: -0.2182178902360036
p-value: 0.8304081425975807
Fail to reject the null hypothesis - There is no significant difference in food intake between the groups 