## Observations and Insights
#### Add your analysis here

---
Ramicane and Capomulin had the best results for decreasing tumor volume based on the Summary Statistics.  They also had the largest number of recorded data points of all the regimens.
Infubinol had one final tumor volume result that fell within the range of the average final tumor volume for mice on the Capomulin and Ramicane treatments.  However, this result appears to be an outlier (it lies outside the 1.5 x IQR bounds), which once again points towards the better results from the Capomulin and Ramicane treatments.
For mice on the Capomulin regimen, the average tumor volume increases as mouse weight increases: the correlation coefficient between the two is 0.84, a strong positive relationship.

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress

# Study data files (paths are relative to the current working directory)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset.
# Left join on the single shared key "Mouse ID": every study_results row is
# kept and gains the metadata columns of its mouse. The key is named once;
# repeating the same column in the `on` list adds nothing to the join.
mouse_data = pd.merge(study_results, mouse_metadata, on="Mouse ID", how="left")

# Display the data table for preview
mouse_data.head()

In [None]:
# Count the distinct mouse IDs present in the merged dataset.
unique_mice = pd.unique(mouse_data["Mouse ID"])
len(unique_mice)

In [None]:
# Find the mice whose (Mouse ID, Timepoint) pair appears more than once.
# keep=False flags every copy of a repeated pair, not only the later ones.
is_repeated = mouse_data.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
duplicate_mice_df = mouse_data.loc[is_repeated]

# Distinct IDs among the flagged rows
dup_mice = duplicate_mice_df["Mouse ID"].unique()
dup_mice

In [None]:
# Optional: Get all the data for the duplicate mouse ID(s).
# isin() selects the rows of every ID listed in dup_mice, so this covers more
# than one duplicated mouse and gives an empty frame (instead of the
# IndexError that indexing dup_mice[0] would raise) when there are none.
dup_mouse_df = mouse_data.loc[mouse_data["Mouse ID"].isin(dup_mice), :]
dup_mouse_df

In [None]:
# Create a clean DataFrame by dropping every duplicated mouse by its ID.
# ~isin() removes all rows of each ID in dup_mice; with an empty dup_mice the
# data is returned unchanged rather than failing on dup_mice[0].
clean_mouse_data = mouse_data[~mouse_data["Mouse ID"].isin(dup_mice)]

In [None]:
# Count the distinct mouse IDs that remain in the clean DataFrame.
clean_mice = pd.unique(clean_mouse_data["Mouse ID"])
len(clean_mice)

## Summary Statistics

In [None]:
# Summary statistics of the tumor volume for each regimen:
# mean, median, variance, standard deviation, and SEM.
drugs_group = clean_mouse_data.groupby("Drug Regimen")
tumor_vol = drugs_group["Tumor Volume (mm3)"]

# One Series per statistic, each indexed by drug regimen
mean_tumor_vol = tumor_vol.mean()
med_tumor_vol = tumor_vol.median()
tumor_vol_var = tumor_vol.var()
tumor_vol_stddev = tumor_vol.std()
tumor_vol_stderr = tumor_vol.sem()

# Place the Series side by side; `keys` supplies the column headers.
summary_statistics = pd.concat(
    [mean_tumor_vol, med_tumor_vol, tumor_vol_var, tumor_vol_stddev, tumor_vol_stderr],
    axis=1,
    keys=[
        "Mean Tumor Volume",
        "Median Tumor Volume",
        "Tumor Volume Variance",
        "Tumor Volume Std. Dev.",
        "Tumor Volume Std. Err.",
    ],
)
summary_statistics

In [None]:
# Same summary table (mean, median, variance, standard deviation, SEM of the
# tumor volume per regimen), produced with a single groupby/agg call.
tumor_data = clean_mouse_data.loc[:, ["Drug Regimen", "Tumor Volume (mm3)"]]
summary_functions = ["mean", "median", "var", "std", "sem"]
tumor_data.groupby("Drug Regimen").agg(summary_functions)

## Bar and Pie Charts

In [None]:
# Bar plot (pandas) of the number of data points recorded for each treatment
# over the course of the study.
# Count the "Mouse ID" entries per regimen and order them from most to fewest.
mice_by_treatment = (
    drugs_group["Mouse ID"]
    .count()
    .to_frame()
    .sort_values(by="Mouse ID", ascending=False)
)

# Draw the bars straight from the DataFrame
mice_by_treatment.plot.bar(figsize=(6, 5), legend=False)
plt.ylabel("Number of Data Points")
plt.tight_layout()

In [None]:
# Bar plot (pyplot) of the number of data points recorded for each treatment
# over the course of the study, reusing the counts in mice_by_treatment.
drugs = mice_by_treatment.index
data_point_counts = mice_by_treatment["Mouse ID"]

# One bar position per regimen; the ticks sit on the same positions
x_axis = np.arange(len(mice_by_treatment))
tick_locations = list(x_axis)

plt.figure(figsize=(6, 5))
plt.bar(x_axis, data_point_counts, color="blue", alpha=0.5, align="center")
plt.xticks(tick_locations, drugs, rotation="vertical")

# Axis limits: some padding around the bars and headroom above the tallest one
plt.xlim(-0.75, len(x_axis))
plt.ylim(0, data_point_counts.max() + 10)

plt.xlabel("Drug Regimen")
plt.ylabel("Number of Data Points")

# tight_layout() adjusts the spacing so the rotated tick labels fit
plt.tight_layout()
plt.show()

In [None]:
# Pie plot (pandas) of the female versus male split.
# Count the "Mouse ID" entries per sex and order them from most to fewest.
gender_group = clean_mouse_data.groupby("Sex")
mice_by_gender = (
    gender_group["Mouse ID"]
    .count()
    .to_frame()
    .sort_values(by="Mouse ID", ascending=False)
)

# subplots=True draws one pie per column (here the single "Mouse ID" column)
mice_by_gender.plot.pie(subplots=True, figsize=(4, 4), autopct="%1.1f%%", legend=False)
plt.ylabel("Sex")
plt.tight_layout()

In [None]:
# Pie plot (pyplot) of the female versus male split, reusing mice_by_gender.
labels = mice_by_gender.index        # wedge labels
sizes = mice_by_gender["Mouse ID"]   # wedge sizes
colors = ["blue", "orange"]          # wedge colors

# autopct prints each wedge's percentage inside the pie
pie_options = dict(
    labels=labels,
    colors=colors,
    autopct="%1.1f%%",
    shadow=False,
    startangle=0,
)
plt.pie(sizes, **pie_options)

# Equal axis scaling keeps the pie circular
plt.axis("equal")
plt.ylabel("Sex")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume of each mouse across the treatment regimens.
# Step 1: the last (greatest) timepoint recorded for each mouse.
mice_ID_group = clean_mouse_data.groupby("Mouse ID")
greatest_timepoint = mice_ID_group["Timepoint"].max().to_frame()

# Step 2: left-join back onto the full data on (Mouse ID, Timepoint) so each
# mouse keeps only the row measured at its last timepoint.
merge_keys = ["Mouse ID", "Timepoint"]
last_tumor_data = greatest_timepoint.merge(clean_mouse_data, how="left", on=merge_keys)
last_tumor_data

In [None]:
# The four regimens of interest (also reused later as plot labels)
treatment_list = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Collects one Series of final tumor volumes per regimen (for plotting)
tumor_vol_list = []

# For each regimen, compute the IQR and report the values that fall outside
# the 1.5 * IQR bounds as potential outliers.
for drug in treatment_list:
    # Final tumor volumes of the mice on this drug
    on_this_drug = last_tumor_data["Drug Regimen"] == drug
    treatment_tumors = last_tumor_data.loc[on_this_drug, "Tumor Volume (mm3)"]
    tumor_vol_list.append(treatment_tumors)

    # Quartiles and the bounds derived from them
    quartiles = treatment_tumors.quantile([.25, .5, .75])
    lowerq, upperq = quartiles[0.25], quartiles[0.75]
    iqr = upperq - lowerq
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    # A value is a potential outlier when it lies below or above the bounds
    below_lower = treatment_tumors < lower_bound
    above_upper = treatment_tumors > upper_bound
    outliers = treatment_tumors.loc[below_lower | above_upper]
    print(f"{drug}'s potential outliers: {outliers}.")

In [None]:
# Box plot of the final tumor volume of each mouse across the four regimens
# of interest; outlier points are drawn as large red circles.
fig1, ax1 = plt.subplots()
flierprops = {
    "marker": "o",
    "markerfacecolor": "r",
    "markersize": 12,
    "linestyle": "none",
    "markeredgecolor": "black",
}
ax1.boxplot(tumor_vol_list, flierprops=flierprops)
ax1.set_ylabel("Final Tumor Volume (mm3)")

# Tick positions are shifted by one because the boxes are placed at 1, 2, 3, ...
x_axis = np.arange(len(treatment_list))
tick_locations = list(x_axis + 1)
plt.xticks(tick_locations, treatment_list)
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
# All rows recorded for mice on the Capomulin regimen
capo_mice_df = clean_mouse_data.loc[clean_mouse_data["Drug Regimen"]=="Capomulin"]
capo_mice = capo_mice_df["Mouse ID"]

# Pick the mouse of the first Capomulin row BY POSITION. capo_mice keeps the
# row labels of clean_mouse_data, so capo_mice[0] would be a label lookup that
# only succeeds if the row labelled 0 happens to be a Capomulin row.
capo_mouse_id = capo_mice.iloc[0]
capo_tumor_df = capo_mice_df.loc[capo_mice_df["Mouse ID"]==capo_mouse_id, :]

capo_tumor_vol = capo_tumor_df["Tumor Volume (mm3)"]
capo_time = capo_tumor_df["Timepoint"]
plt.plot(capo_time, capo_tumor_vol)
# The title is built from the selected ID so it always names the plotted mouse
plt.title(f"Capomulin treatment of mouse {capo_mouse_id}")
plt.xlabel("Timepoint (days)")
plt.ylabel("Tumor Volume (mm3)")
plt.show()

In [None]:
# Scatter plot of mouse weight versus average tumor volume for the
# Capomulin regimen (one point per mouse).
capo_mice_IDs = capo_mice_df.groupby("Mouse ID")

# Per-mouse means of both quantities, computed in one pass
capo_means = capo_mice_IDs[["Weight (g)", "Tumor Volume (mm3)"]].mean()
x_values = capo_means["Weight (g)"]
y_values = capo_means["Tumor Volume (mm3)"]

plt.scatter(x_values, y_values)
plt.xlabel("Weight (g)")
plt.ylabel("Tumor Volume (mm3)")
plt.show()

## Correlation and Regression

In [None]:
# Correlation coefficient and linear regression model for mouse weight
# versus average tumor volume on the Capomulin regimen.
correlation = st.pearsonr(x_values, y_values)
corr_rounded = round(correlation[0], 2)

# Least-squares line: tumor volume = slope * weight + intercept
fit = linregress(x_values, y_values)
slope, intercept, rvalue, pvalue, stderr = fit
regress_values = x_values.mul(slope).add(intercept)

# Scatter of the per-mouse data with the fitted line drawn in red
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.xlabel("Weight (g)")
plt.ylabel("Tumor Volume (mm3)")
print(f"The correlation between mouse weight and the average tumor volume is {corr_rounded}")
plt.show()