In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import trompy as tp

In [None]:
# Locate the Excel workbook and read the rows and header of the niacin metafile sheet
PATH_TO_WORKBOOK = Path("../data/")
EXCEL_WORKBOOK = "measurements_b3_1.xlsx"
rows, header = tp.metafilereader(
    PATH_TO_WORKBOOK.joinpath(EXCEL_WORKBOOK),
    sheetname="metafile_niacin",
)


# TODO: consider keeping the data on Google Drive and saving a separate CSV file for every day

In [None]:
def get_value(row):
    """Encode the sex/group combination of a row as an integer code.

    Codes: 0 = female control, 1 = female experimental, 2 = male control,
    3 = male experimental. Any other combination gives -1.

    Args:
        row: Mapping-like object (e.g. a dataframe row) with 'sex' and 'group'.

    Returns:
        int: Group code, or -1 when the combination is not recognised.
    """
    sex = row['sex']
    if sex == 'female':
        offset = 0
    elif sex == 'male':
        offset = 2
    else:
        return -1

    # 'group' is only looked at once the sex is known
    group = row['group']
    if group == 'control':
        return offset
    if group == 'experimental':
        return offset + 1
    return -1

def tweak_df(df):
    """Drop rows without a bodyweight, set column dtypes and add the group code.

    Args:
        df (Pandas dataframe): Raw rows read from the metafile.

    Returns:
        Dataframe: Typed dataframe with an extra ``groupn`` column produced by
        ``get_value``.
    """
    column_types = {
        "sex": "category",
        "group": "category",
        "diet": "category",
        "bodyweight": float,
        "food_yesterday": float,
        "food_intake": float,
        "water_intake": float,
    }

    weighed = df.query("bodyweight.notna()", engine="python")
    typed = weighed.astype(column_types)
    return typed.assign(groupn=lambda df_: df_.apply(get_value, axis=1))


df = tweak_df(pd.DataFrame(rows, columns=header))

In [None]:
# Display the dataframe returned by tweak_df
df

In [None]:
# Check the dtype of every column of df
df.dtypes

In [None]:
# List the column names of df
df.columns

In [None]:
# Only the last expression of a cell is displayed, so the animal ids are printed explicitly
print(df.animal_id.unique())
df.date.unique()

In [None]:
# Mean of every numeric column for each sex/group/date combination
df.groupby(["sex", "group", "date"]).mean(numeric_only=True).reset_index()

In [None]:
# Mean and SEM of every numeric column per sex/group/date
grouping = ["sex", "group", "date"]
mean = df.groupby(grouping).mean(numeric_only=True).reset_index()
sem = df.groupby(grouping).sem(numeric_only=True).reset_index()

# Wide panel for the time course, narrow panel for the most recent day
f, ax = plt.subplots(ncols=2, figsize=(12, 6), sharey=True,
                     gridspec_kw={"width_ratios": [3, 1]},)
time_ax, today_ax = ax

# One x position per day (the grouped frame is split over four sex/group series)
x = np.arange(len(mean.date) // 4)

# (selection, marker, marker face colour, line colour, legend label), in plotting order
series_styles = [
    ("sex == 'female' and group == 'control'", "s", "white", "red", "female, control"),
    ("sex == 'male' and group == 'control'", "o", "white", "blue", "male, control"),
    ("sex == 'female' and group == 'experimental'", "s", "red", "red", "female, exptl"),
    ("sex == 'male' and group == 'experimental'", "o", "blue", "blue", "male, exptl"),
]

# Time course of each series with SEM error bars
for selection, marker, face, colour, label in series_styles:
    time_ax.errorbar(x,
                     mean.query(selection).bodyweight,
                     yerr=sem.query(selection).bodyweight,
                     marker=marker, markerfacecolor=face, color=colour,
                     label=label)

# Axis limits, ticks and labels of the time course
time_ax.set_xlim([-0.5,35])
time_ax.set_xticks([0,7,14,21,28,35])
time_ax.set_ylabel("Bodyweight (g)")
time_ax.set_xlabel("Days")
time_ax.legend()

# Individual values of the most recent day in the second panel
today = df.date.max()
today_df = df.query("date == @today")

palette = ["red", "red", "blue", "blue"]

sns.stripplot(data=today_df, x="groupn", y="bodyweight", hue="groupn", palette=palette, ax=today_ax)
sns.pointplot(data=today_df, x='groupn', y='bodyweight', capsize=.2, color="grey", markers='o', join=False, ax=today_ax)

today_ax.set_xticklabels(["F-C", "F-E", "M-C", "M-E"])
today_ax.set_xlabel("")
today_ax.set_ylabel("Bodyweight (g)")
today_ax.legend().remove()

In [None]:
# Inspect today_df, the rows of df whose date equals `today`
today_df

In [None]:
# Most recent date found in df (df.date.max())
today

In [None]:
# First rows of df
df.head()

In [None]:
# Food intake figure

# Clean up df

def replace_outliers(df, column):
    """Return the values of ``column`` with outliers replaced by the animal's mean.

    A value is treated as an outlier when it is below 1 or above twice the mean
    of the whole column. Each outlier (and any value that was already missing)
    is replaced by the mean of the remaining values of the same ``animal_id``.
    The dataframe passed in is not modified.

    Args:
        df (Pandas dataframe): Must contain ``column`` and ``animal_id``.
        column (str): Name of the numeric column to clean.

    Returns:
        list: Cleaned values in the row order of ``df``.
    """
    original = df[column]

    # The threshold is computed once, from the unmodified column
    upper_limit = original.mean() * 2
    masked = original.mask((original < 1) | (original > upper_limit))

    print("the number of outliers being replaced for", column, "is", masked.isna().sum())

    # Per-animal mean of the values that survived the outlier filter
    animal_mean = masked.groupby(df["animal_id"]).transform("mean")

    return masked.fillna(animal_mean).tolist()
    
def remove_and_clean(df):
    """Drop placeholder rows and replace implausible intake values.

    Rows whose ``food_yesterday`` equals -1.0 are discarded; afterwards the
    ``food_intake`` and ``water_intake`` columns are rebuilt with
    ``replace_outliers``.

    Args:
        df (Pandas dataframe): Contains data from niacin experiment

    Returns:
        Dataframe: Cleaned dataframe
    """
    valid_rows = df.query("food_yesterday != -1.0")
    return valid_rows.assign(
        food_intake=lambda df_: replace_outliers(df_, "food_intake"),
        water_intake=lambda df_: replace_outliers(df_, "water_intake"),
    )


df_food_water = remove_and_clean(df)

In [None]:
# Find mean and SEM of all data
mean_fw = df_food_water.groupby(["sex", "group", "date"]).mean(numeric_only=True).reset_index()
sem_fw = df_food_water.groupby(["sex", "group", "date"]).sem(numeric_only=True).reset_index()

# Generate figure with two subplots
f, ax = plt.subplots(ncols=2, figsize=(12, 6), sharey=True,
                     gridspec_kw={"width_ratios": [3, 1]},)

# (selection, marker, marker face colour, line colour, legend label), in plotting order
series_styles = [
    ("sex == 'female' and group == 'control'", "s", "white", "red", "female, control"),
    ("sex == 'male' and group == 'control'", "o", "white", "blue", "male, control"),
    ("sex == 'female' and group == 'experimental'", "s", "red", "red", "female, exptl"),
    ("sex == 'male' and group == 'experimental'", "o", "blue", "blue", "male, exptl"),
]

# Plot data for each group in first subplot
for selection, marker, face, colour, label in series_styles:
    series_mean = mean_fw.query(selection).food_intake
    series_sem = sem_fw.query(selection).food_intake

    # x positions are taken from the series itself, so this cell does not depend
    # on the `mean` frame created in the bodyweight cell
    x = np.arange(len(series_mean))

    ax[0].errorbar(x,
                   series_mean,
                   yerr=series_sem,
                   marker=marker, markerfacecolor=face, color=colour,
                   label=label)

# Set x axis limits and ticks (weekly ticks up to day 35)
ax[0].set_xlim([-0.5,35])
ax[0].set_xticks([0,7,14,21,28,35])

ax[0].set_ylim([0,4.5])

# Add labels
ax[0].set_ylabel("Food intake (g)")
ax[0].set_xlabel("Days")

ax[0].legend()

# Plot today's data in second subplot
# NOTE(review): this panel uses the uncleaned `df`, while the time course uses
# `df_food_water` -- confirm that showing raw values here is intended
today = df.date.max()
today_df = df.query("date == @today")

palette = ["red", "red", "blue", "blue"]

sns.stripplot(data=today_df, x="groupn", y="food_intake", hue="groupn", palette=palette, ax=ax[1])
sns.pointplot(data=today_df, x='groupn', y='food_intake', capsize=.2, color="grey", markers='o', join=False, ax=ax[1])

ax[1].set_xticklabels(["F-C", "F-E", "M-C", "M-E"])
ax[1].set_xlabel("")
ax[1].set_ylabel("Food intake (g)")
ax[1].legend().remove()


In [None]:
# Find mean and SEM of all data
mean_fw = df_food_water.groupby(["sex", "group", "date"]).mean(numeric_only=True).reset_index()
sem_fw = df_food_water.groupby(["sex", "group", "date"]).sem(numeric_only=True).reset_index()

# Generate figure with two subplots
f, ax = plt.subplots(ncols=2, figsize=(12, 6), sharey=True,
                     gridspec_kw={"width_ratios": [3, 1]},)

# (selection, marker, marker face colour, line colour, legend label), in plotting order
series_styles = [
    ("sex == 'female' and group == 'control'", "s", "white", "red", "female, control"),
    ("sex == 'male' and group == 'control'", "o", "white", "blue", "male, control"),
    ("sex == 'female' and group == 'experimental'", "s", "red", "red", "female, exptl"),
    ("sex == 'male' and group == 'experimental'", "o", "blue", "blue", "male, exptl"),
]

# Plot data for each group in first subplot
for selection, marker, face, colour, label in series_styles:
    series_mean = mean_fw.query(selection).water_intake
    series_sem = sem_fw.query(selection).water_intake

    # x positions are taken from the series itself, so this cell does not depend
    # on the `mean` frame created in the bodyweight cell
    x = np.arange(len(series_mean))

    ax[0].errorbar(x,
                   series_mean,
                   yerr=series_sem,
                   marker=marker, markerfacecolor=face, color=colour,
                   label=label)

# Set x axis limits and ticks
ax[0].set_xlim([-0.5,35])
ax[0].set_xticks([0,7,14,21,28,35])

# NOTE(review): same y range as the food intake figure -- confirm it suits water intake
ax[0].set_ylim([0,4.5])

# Add labels
ax[0].set_ylabel("Water intake (mL)")
ax[0].set_xlabel("Days")

ax[0].legend()

# Plot today's data in second subplot
# NOTE(review): this panel uses the uncleaned `df`, while the time course uses
# `df_food_water` -- confirm that showing raw values here is intended
today = df.date.max()
today_df = df.query("date == @today")

palette = ["red", "red", "blue", "blue"]

sns.stripplot(data=today_df, x="groupn", y="water_intake", hue="groupn", palette=palette, ax=ax[1])
sns.pointplot(data=today_df, x='groupn', y='water_intake', capsize=.2, color="grey", markers='o', join=False, ax=ax[1])

ax[1].set_xticklabels(["F-C", "F-E", "M-C", "M-E"])
ax[1].set_xlabel("")
ax[1].set_ylabel("Water intake (mL)")
ax[1].legend().remove()


In [None]:
# Rows of df recorded on 2024-02-13
df.query("date == '2024-02-13'")

In [None]:
# Mean of every numeric column of df_food_water for each sex/group/date combination
df_food_water.groupby(["sex", "group", "date"]).mean(numeric_only=True).reset_index()

In [None]:
# Only the last expression of a cell is displayed, so the mean is printed explicitly
print(mean_fw.query("sex == 'female' and group == 'control'").water_intake)
sem_fw.query("sex == 'female' and group == 'control'").water_intake

In [None]:
# List of figures

# body weight graphs
# body weight of CON vs ND, male vs female
# bar graph and daily line graph
# most recent day change from last 3 days? (barscatter)


# food intake



# niacin concentration



#individual mice showing all on one graph

In [None]:
# NOTE: I'm not really sure what happens in the following cells;
# they seem to duplicate the food/water cleaning code above.

In [None]:
# Food intake figure

# Clean up df

def replace_outliers(df, column):
    """Return the values of ``column`` with outliers replaced by the animal's mean.

    A value is treated as an outlier when it is below 1 or above twice the mean
    of the whole column. Each outlier (and any value that was already missing)
    is replaced by the mean of the remaining values of the same ``animal_id``.
    The dataframe passed in is not modified.

    Args:
        df (Pandas dataframe): Must contain ``column`` and ``animal_id``.
        column (str): Name of the numeric column to clean.

    Returns:
        list: Cleaned values in the row order of ``df``.
    """
    original = df[column]

    # The threshold is computed once, from the unmodified column
    upper_limit = original.mean() * 2
    masked = original.mask((original < 1) | (original > upper_limit))

    # Per-animal mean of the values that survived the outlier filter
    animal_mean = masked.groupby(df["animal_id"]).transform("mean")

    return masked.fillna(animal_mean).tolist()
    
def remove_and_clean(df):
    """Drop placeholder rows and replace implausible intake values.

    Rows whose ``food_yesterday`` equals -1.0 are discarded; afterwards the
    ``food_intake`` and ``water_intake`` columns are rebuilt with
    ``replace_outliers``.

    Args:
        df (Pandas dataframe): Contains data from niacin experiment

    Returns:
        Dataframe: Cleaned dataframe
    """
    valid_rows = df.query("food_yesterday != -1.0")
    return valid_rows.assign(
        food_intake=lambda df_: replace_outliers(df_, "food_intake"),
        water_intake=lambda df_: replace_outliers(df_, "water_intake"),
    )


df_food_water = remove_and_clean(df)

# replace_outliers(df, "food_intake")

In [None]:
# Inspect the dataframe returned by remove_and_clean
df_food_water

In [None]:
# Scratch check: np.isnan applied to np.nan
a = np.nan
np.isnan(a)

In [None]:
# Mean of every numeric column of df for each animal
df.groupby(["animal_id"]).mean(numeric_only=True).reset_index()

In [None]:
#starting from here my own (pathetic?) attempts at coding

In [None]:
#Now I want the same graph but then with the four subgroups 
#Plot here will depict mean of all animals per day
#But that's not what I wanted, I want the mean per animal for all days
import matplotlib.patches as mpatches

def remove_outliers(df, column):
    """Return the values of ``column`` with outliers replaced by NaN.

    A value is treated as an outlier when it is below 1 or above twice the mean
    of the whole column. The dataframe passed in is not modified.

    Args:
        df (Pandas dataframe): Must contain ``column``.
        column (str): Name of the numeric column to filter.

    Returns:
        list: Values in the row order of ``df``, with NaN in place of outliers.
    """
    series = df[column]

    # Compute the threshold once instead of once per row
    upper_limit = series.mean() * 2

    return series.mask((series < 1) | (series > upper_limit)).tolist()

def remove_and_clean(df):
    """Drop placeholder rows and blank out implausible intake values.

    Rows whose ``food_yesterday`` equals -1.0 are discarded; afterwards the
    ``food_intake`` and ``water_intake`` columns are rebuilt with
    ``remove_outliers``.

    Args:
        df (Pandas dataframe): Contains data from niacin experiment
    Returns:
        Dataframe: Cleaned dataframe
    """
    valid_rows = df.query("food_yesterday != -1.0")
    return valid_rows.assign(
        food_intake=lambda df_: remove_outliers(df_, "food_intake"),
        water_intake=lambda df_: remove_outliers(df_, "water_intake"),
    )

# Clean food and water intake: drop the -1 rows and blank out the outliers
df_food_water = remove_and_clean(df)

# Average every numeric column per sex/group/date
mean_fw = (df_food_water
           .groupby(["sex", "group", "date"])
           .mean(numeric_only=True)
           .reset_index())

# Colour lookup keyed by the groupn code (as a string)
colors = {"3": "dodgerblue", "2": "lightskyblue", "1": "mediumslateblue", "0": "purple"}
color_list = []
for group in mean_fw['groupn']:
    color_list.append(colors[str(int(group))])

# Scatter of mean food intake against mean water intake, coloured by subgroup
ax = mean_fw.plot.scatter(x="food_intake", y="water_intake", c=color_list, grid=True)

# One legend patch per subgroup, ordered by groupn code
group_labels = {
    "0": "female control",
    "1": "female experimental",
    "2": "male control",
    "3": "male experimental",
}
legend_handles = [mpatches.Patch(color=colors[code], label=text)
                  for code, text in group_labels.items()]
ax.legend(handles=legend_handles, loc='upper left')

# Axis labels and title
ax.set_xlabel("Food intake (Mean)")
ax.set_ylabel("Water intake (Mean)")
ax.set_title("Mean Food intake vs Mean Water intake")

# Each point is one sex/group/date combination, i.e. a mean per day, not per animal
unique_combinations = mean_fw[['sex', 'group', 'date']].drop_duplicates()
print(unique_combinations)

In [None]:
# A new attempt to get an overview of all animals: their mean food intake plotted against their mean water intake

# First the same steps as in the previous cells: cleaning the dataframe.
def remove_outliers(df, column):
    """Return the values of ``column`` with outliers replaced by NaN.

    A value is treated as an outlier when it is below 1 or above twice the mean
    of the whole column. The dataframe passed in is not modified.

    Args:
        df (Pandas dataframe): Must contain ``column``.
        column (str): Name of the numeric column to filter.

    Returns:
        list: Values in the row order of ``df``, with NaN in place of outliers.
    """
    series = df[column]

    # Compute the threshold once instead of once per row
    upper_limit = series.mean() * 2

    return series.mask((series < 1) | (series > upper_limit)).tolist()

def remove_and_clean(df):
    """Drop placeholder rows and blank out implausible intake values.

    Rows whose ``food_yesterday`` equals -1.0 are discarded; afterwards the
    ``food_intake`` and ``water_intake`` columns are rebuilt with
    ``remove_outliers``.

    Args:
        df (Pandas dataframe): Contains data from niacin experiment
    Returns:
        Dataframe: Cleaned dataframe
    """
    valid_rows = df.query("food_yesterday != -1.0")
    return valid_rows.assign(
        food_intake=lambda df_: remove_outliers(df_, "food_intake"),
        water_intake=lambda df_: remove_outliers(df_, "water_intake"),
    )

# Clean food and water intake: drop the -1 rows and blank out the outliers
df_food_water = remove_and_clean(df)

# Average every numeric column per animal
mean_fw = (df_food_water
           .groupby(["animal_id"])
           .mean(numeric_only=True)
           .reset_index())

# Colour lookup keyed by the groupn code (as a string)
colors = {"3": "dodgerblue", "2": "lightskyblue", "1": "mediumslateblue", "0": "purple"}
color_list = []
for group in mean_fw['groupn']:
    color_list.append(colors[str(int(group))])

# Scatter of mean food intake against mean water intake, one point per animal
ax = mean_fw.plot.scatter(x="food_intake", y="water_intake", c=color_list, grid=True)

# One legend patch per subgroup, ordered by groupn code
group_labels = {
    "0": "female control",
    "1": "female experimental",
    "2": "male control",
    "3": "male experimental",
}
legend_handles = [mpatches.Patch(color=colors[code], label=text)
                  for code, text in group_labels.items()]
ax.legend(handles=legend_handles, loc='upper left')

# Axis labels and title
ax.set_xlabel("Food intake (Mean)")
ax.set_ylabel("Water intake (Mean)")
ax.set_title("Mean Food intake vs Mean Water intake")

In [None]:
# New project: Try to run a t-test on the data to see if there are significant differences between the groups
# Then, display each individual animal's course in body weight, water and food intake
# Might be a bit much: perhaps plot it in multiple plots

In [None]:
#good, but now 4 subplots of the groups in one graph

In [None]:
# Subset the DataFrame into the four sex/group combinations
female_experimental_data = df[(df["sex"] == "female") & (df["group"] == "experimental")]
female_control_data = df[(df["sex"] == "female") & (df["group"] == "control")]
male_experimental_data = df[(df["sex"] == "male") & (df["group"] == "experimental")]
male_control_data = df[(df["sex"] == "male") & (df["group"] == "control")]

# One panel per subgroup, in left-to-right order
panels = [
    ("Female experimental data", female_experimental_data),
    ("Female control data", female_control_data),
    ("Male experimental data", male_experimental_data),
    ("Male control data", male_control_data),
]

# Create a subplot
fig, ax = plt.subplots(ncols=4, figsize=(18, 10), sharey=True,
                       gridspec_kw={"width_ratios": [1, 1, 1, 1]})

# Colour of lines
palette = ["orange", "magenta", "green", "black", "red", "pink", "cyan", "purple"]

for axis, (title, group_data) in zip(ax, panels):
    # x positions follow the sorted dates of this subgroup
    dates = sorted(group_data["date"].unique())
    date_index = pd.Index(dates)

    # One line per animal; each value is placed on the position of its own date,
    # so an animal with a missing day no longer breaks the plot
    for i, sample in enumerate(group_data["animal_id"].unique()):
        sample_data = group_data[group_data["animal_id"] == sample].sort_values("date")
        axis.plot(date_index.get_indexer(sample_data["date"]),
                  sample_data["bodyweight"],
                  marker='s', linestyle='-',
                  markerfacecolor="white",
                  color=palette[i % len(palette)],  # Use the color from the palette based on the index
                  label=f"animal_id {sample}")

    # Set x axis limits and ticks; the labels are the dates of this subgroup
    axis.set_xlim([-0.5, len(dates) - 0.5])
    axis.set_xticks(np.arange(len(dates)))
    axis.set_xticklabels(dates)

    # Add labels and show which line belongs to which animal
    axis.set_ylabel("Bodyweight (g)")
    axis.set_xlabel("Days")
    axis.set_title(title)
    axis.legend()

In [None]:
#do the same for food intake
#cleaning dataframe
def remove_outliers(df, column):
    """Return the values of ``column`` with outliers replaced by NaN.

    A value is treated as an outlier when it is below 1 or above twice the mean
    of the whole column. The dataframe passed in is not modified.

    Args:
        df (Pandas dataframe): Must contain ``column``.
        column (str): Name of the numeric column to filter.

    Returns:
        list: Values in the row order of ``df``, with NaN in place of outliers.
    """
    series = df[column]

    # Compute the threshold once instead of once per row
    upper_limit = series.mean() * 2

    return series.mask((series < 1) | (series > upper_limit)).tolist()

def remove_and_clean(df):
    """Drop placeholder rows and blank out implausible intake values.

    Rows whose ``food_yesterday`` equals -1.0 are discarded; afterwards the
    ``food_intake`` and ``water_intake`` columns are rebuilt with
    ``remove_outliers``.

    Args:
        df (Pandas dataframe): Contains data from niacin experiment
    Returns:
        Dataframe: Cleaned dataframe
    """
    valid_rows = df.query("food_yesterday != -1.0")
    return valid_rows.assign(
        food_intake=lambda df_: remove_outliers(df_, "food_intake"),
        water_intake=lambda df_: remove_outliers(df_, "water_intake"),
    )

# Cleans the dataframe for food intake and water intake: removes the outliers using "remove_and_clean" and removes
# all of the -1 values
df_food_water = remove_and_clean(df)

# Subset the DataFrame into the four sex/group combinations
female_experimental_data = df_food_water[(df_food_water['sex'] == 'female') & (df_food_water['group'] == 'experimental')]
female_control_data = df_food_water[(df_food_water["sex"] == "female") & (df_food_water["group"] == "control")]
male_experimental_data = df_food_water[(df_food_water["sex"] == "male") & (df_food_water["group"] == "experimental")]
male_control_data = df_food_water[(df_food_water["sex"] == "male") & (df_food_water["group"] == "control")]

# One panel per subgroup, in left-to-right order
panels = [
    ("Female experimental data", female_experimental_data),
    ("Female control data", female_control_data),
    ("Male experimental data", male_experimental_data),
    ("Male control data", male_control_data),
]

# Create a subplot
fig, ax = plt.subplots(ncols=4, figsize=(30, 15), sharey=True,
                       gridspec_kw={"width_ratios": [1, 1, 1, 1]})

# Colour of lines
palette = ["orange", "magenta", "green", "black", "red", "pink", "cyan", "purple"]

for axis, (title, group_data) in zip(ax, panels):
    # x positions follow the sorted dates of this subgroup
    dates = sorted(group_data["date"].unique())
    date_index = pd.Index(dates)

    # One line per animal; each value is placed on the position of its own date,
    # so an animal with a missing day no longer breaks the plot
    for i, sample in enumerate(group_data["animal_id"].unique()):
        sample_data = group_data[group_data["animal_id"] == sample].sort_values("date")
        axis.plot(date_index.get_indexer(sample_data["date"]),
                  sample_data["food_intake"],
                  marker='s', linestyle='-',
                  markerfacecolor="white",
                  color=palette[i % len(palette)],  # Use the color from the palette based on the index
                  label=f"animal_id {sample}")

    # Set x axis limits and ticks; the labels are the dates of this subgroup
    axis.set_xlim([-0.5, len(dates) - 0.5])
    axis.set_xticks(np.arange(len(dates)))
    axis.set_xticklabels(dates)

    # Add labels and show which line belongs to which animal
    axis.set_ylabel("Food intake (g)")
    axis.set_xlabel("Days")
    axis.set_title(title)
    axis.legend()


In [None]:
#do the same for water intake
#cleaning dataframe
def remove_outliers(df, column):
    """Return the values of ``column`` with outliers replaced by NaN.

    A value is treated as an outlier when it is below 1 or above twice the mean
    of the whole column. The dataframe passed in is not modified.

    Args:
        df (Pandas dataframe): Must contain ``column``.
        column (str): Name of the numeric column to filter.

    Returns:
        list: Values in the row order of ``df``, with NaN in place of outliers.
    """
    series = df[column]

    # Compute the threshold once instead of once per row
    upper_limit = series.mean() * 2

    return series.mask((series < 1) | (series > upper_limit)).tolist()

def remove_and_clean(df):
    """Drop placeholder rows and blank out implausible intake values.

    Rows whose ``food_yesterday`` equals -1.0 are discarded; afterwards the
    ``food_intake`` and ``water_intake`` columns are rebuilt with
    ``remove_outliers``.

    Args:
        df (Pandas dataframe): Contains data from niacin experiment
    Returns:
        Dataframe: Cleaned dataframe
    """
    valid_rows = df.query("food_yesterday != -1.0")
    return valid_rows.assign(
        food_intake=lambda df_: remove_outliers(df_, "food_intake"),
        water_intake=lambda df_: remove_outliers(df_, "water_intake"),
    )

# Cleans the dataframe for food intake and water intake: removes the outliers using "remove_and_clean" and removes
# all of the -1 values
df_food_water = remove_and_clean(df)

# Subset the DataFrame into the four sex/group combinations
female_experimental_data = df_food_water[(df_food_water['sex'] == 'female') & (df_food_water['group'] == 'experimental')]
female_control_data = df_food_water[(df_food_water["sex"] == "female") & (df_food_water["group"] == "control")]
male_experimental_data = df_food_water[(df_food_water["sex"] == "male") & (df_food_water["group"] == "experimental")]
male_control_data = df_food_water[(df_food_water["sex"] == "male") & (df_food_water["group"] == "control")]

# One panel per subgroup, in left-to-right order
panels = [
    ("Female experimental data", female_experimental_data),
    ("Female control data", female_control_data),
    ("Male experimental data", male_experimental_data),
    ("Male control data", male_control_data),
]

# Create a subplot
fig, ax = plt.subplots(ncols=4, figsize=(30, 15), sharey=True,
                       gridspec_kw={"width_ratios": [1, 1, 1, 1]})

# Colour of lines
palette = ["orange", "magenta", "green", "black", "red", "pink", "cyan", "purple"]

for axis, (title, group_data) in zip(ax, panels):
    # x positions follow the sorted dates of this subgroup
    dates = sorted(group_data["date"].unique())
    date_index = pd.Index(dates)

    # One line per animal; each value is placed on the position of its own date,
    # so an animal with a missing day no longer breaks the plot
    for i, sample in enumerate(group_data["animal_id"].unique()):
        sample_data = group_data[group_data["animal_id"] == sample].sort_values("date")
        axis.plot(date_index.get_indexer(sample_data["date"]),
                  sample_data["water_intake"],
                  marker='s', linestyle='-',
                  markerfacecolor="white",
                  color=palette[i % len(palette)],  # Use the color from the palette based on the index
                  label=f"animal_id {sample}")

    # Set x axis limits and ticks; the labels are the dates of this subgroup
    axis.set_xlim([-0.5, len(dates) - 0.5])
    axis.set_xticks(np.arange(len(dates)))
    axis.set_xticklabels(dates)

    # Add labels
    axis.set_ylabel("Water intake (ml)")
    axis.set_xlabel("Days")
    axis.set_title(title)
    axis.legend()

In [None]:
# List the column names of df
df.columns

In [None]:
# Next, try to perform a t-test and get the result in the graph
# TODO: add the resulting p-values to the figures

# The internet example used undefined names (stats, student_scores, mu) and a
# one-sample test. Here control and experimental bodyweights on the most recent
# day are compared separately for each sex with Welch's t-test (no
# equal-variance assumption).
from scipy import stats

latest_date = df.date.max()
latest_df = df.query("date == @latest_date")

for sex in ["female", "male"]:
    # Missing values are dropped before testing
    control = latest_df.query("sex == @sex and group == 'control'").bodyweight.dropna()
    experimental = latest_df.query("sex == @sex and group == 'experimental'").bodyweight.dropna()

    t_stat, p_value = stats.ttest_ind(control, experimental, equal_var=False)
    print("Sex:", sex)
    print("T statistic:", t_stat)
    print("P-value:", p_value)
