In [None]:
# Observations and Insights
#### Add your analysis here

#Overall, Capomulin has the most data points available to analyze and study in this exercise.
#Following Capomulin, the second best-represented drug is Ramicane, as it has the second most data points.
#Going further down and analyzing the charts, Ramicane visibly has fewer outliers than Capomulin.
#This may support the idea that Ramicane is a better treatment drug than Capomulin.

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress

# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each file into its own DataFrame
study_results = pd.read_csv(study_results_path)
mouse_metadata = pd.read_csv(mouse_metadata_path)

# Attach each mouse's metadata to its study rows; the left join on "Mouse ID"
# keeps every row of study_results
combined_data = study_results.merge(mouse_metadata, how="left", on="Mouse ID")

# Preview the first rows of the combined table
combined_data.head()

In [None]:
# Count the distinct values in the "Mouse ID" column of the combined data.
all_mouse_ids = combined_data["Mouse ID"].unique()
len(all_mouse_ids)

In [None]:
# Flag every row whose (Mouse ID, Timepoint) pair repeats an earlier row, then
# collect the distinct IDs of the mice those flagged rows belong to.
is_repeat_row = combined_data.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate_IDs = combined_data.loc[is_repeat_row, "Mouse ID"].unique()

duplicate_IDs

In [None]:
# Optional: Get all the data for the duplicate mouse ID(s).
# Filter on duplicate_IDs rather than a hard-coded ID string so this cell
# stays correct if the input data (and therefore the duplicated mice) changes.
combined_data.loc[combined_data["Mouse ID"].isin(duplicate_IDs)]


In [None]:
# Build the clean DataFrame: keep only rows whose mouse is NOT one of the
# duplicated IDs (all rows for a duplicated mouse are dropped).
is_duplicate_mouse = combined_data["Mouse ID"].isin(duplicate_IDs)
cleaned_data = combined_data.loc[~is_duplicate_mouse]

cleaned_data

In [None]:
# Count the distinct values in the "Mouse ID" column of the clean DataFrame.
clean_mouse_ids = cleaned_data["Mouse ID"].unique()
len(clean_mouse_ids)

## Summary Statistics

In [None]:
# Summary statistics table of the tumor volume for each regimen: mean, median,
# variance, standard deviation, and SEM.
# Straightforward method: compute each statistic as its own Series, then put
# them all in a DataFrame at the end.

summary_stats = cleaned_data.groupby(["Drug Regimen"])
tumor_volume_by_regimen = summary_stats["Tumor Volume (mm3)"]

summary_mean = tumor_volume_by_regimen.mean()
summary_median = tumor_volume_by_regimen.median()
summary_variance = tumor_volume_by_regimen.var()
summary_std = tumor_volume_by_regimen.std()
summary_sem = tumor_volume_by_regimen.sem()

# Column label -> per-regimen Series; the dict order sets the column order
summary_columns = {
    "Mean Tumor Volume": summary_mean,
    "Median Tumor Volume": summary_median,
    "Tumor Volume Variance": summary_variance,
    "Tumor Volume Std. Dev.": summary_std,
    "Tumor Volume Std. Err.": summary_sem,
}
summary_stats_df = pd.DataFrame(summary_columns)

summary_stats_df


In [None]:
# Summary statistics table of the tumor volume for each regimen (mean, median,
# variance, standard deviation, SEM), produced with a single groupby/agg call.

# Column to aggregate -> list of statistics to compute for it
tumor_volume_aggregations = {"Tumor Volume (mm3)": ["mean", "median", "var", "std", "sem"]}
summary_stats_2 = cleaned_data.groupby(["Drug Regimen"]).agg(tumor_volume_aggregations)

summary_stats_2



## Bar and Pie Charts

In [None]:
# Bar plot (pandas plotting) of how many rows of the clean data belong to each
# drug regimen over the course of the study.
bar_plot = cleaned_data["Drug Regimen"].value_counts()

# Series.plot returns the Axes it drew on; label that Axes directly
bar_axes = bar_plot.plot(kind="bar")
bar_axes.set_xlabel("Drug Regimen")
bar_axes.set_ylabel("Number of Data Points")

plt.show()


In [None]:
# Same per-regimen bar plot, this time drawn with pyplot from the counts
# already held in bar_plot.
x_axis = bar_plot.index
y_axis = bar_plot.values

plt.bar(x_axis, y_axis)
# Rotate the regimen names so they are drawn vertically
plt.xticks(rotation=90)
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Data Points")

plt.show()

In [None]:
# Pie plot (pandas plotting) of the "Sex" column: one slice per distinct value,
# sized by how many rows of the clean data carry that value.
pie_plot = cleaned_data["Sex"].value_counts()

# autopct prints each slice's share with one decimal place
pie_plot.plot.pie(autopct="%1.1f%%")

plt.show()

In [None]:
# Same female-versus-male pie plot, drawn with pyplot from the counts in pie_plot.

# Slice labels come from the index; `percentage` holds the raw counts, which
# plt.pie turns into percentages through autopct
labels = pie_plot.index
percentage = pie_plot.values

plt.pie(
    percentage,
    labels=labels,
    autopct="%1.1f%%",
)
plt.ylabel("Sex")

plt.show()


## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume of each mouse across the treatment regimens.

# Step 1: last (greatest) timepoint recorded for each mouse, as a two-column
# frame of "Mouse ID" and "Timepoint"
final_timepoint = (
    cleaned_data.groupby(["Mouse ID"])["Timepoint"]
    .max()
    .reset_index()
)

# Step 2: left-join back onto the clean data on (Mouse ID, Timepoint) so each
# mouse's row carries the measurements taken at its last timepoint
final_tumor_volume = pd.merge(
    final_timepoint,
    cleaned_data,
    on=["Mouse ID", "Timepoint"],
    how="left",
)

final_tumor_volume

In [None]:
# Put 4 treatment names into a list for use with a for loop (and later for plot labels)
treatment_list = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# List of per-regimen tumor volume Series (for plotting); one Series is
# appended for each entry of treatment_list, in the same order
tumor_vol_list = []

# For each treatment in the list, calculate the IQR and quantitatively
# determine if there are any potential outliers.
for drug in treatment_list:

    # Final tumor volumes of the mice on this drug
    tumor = final_tumor_volume.loc[final_tumor_volume['Drug Regimen'] == drug, 'Tumor Volume (mm3)']

    # Add this regimen's Series to the plotting list
    tumor_vol_list.append(tumor)

    # Quartiles must be taken from this regimen's Series: tumor_vol_list is a
    # plain Python list and has no .quantile() method
    quartiles = tumor.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    # Values more than 1.5 * IQR outside the quartiles count as potential outliers
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    outliers = tumor.loc[(tumor < lower_bound) | (tumor > upper_bound)]

    print(f"{drug}'s potential outliers: {outliers}")

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest.

# Collect each regimen's final tumor volumes directly from final_tumor_volume,
# so the figure always has one box per entry in treatment_list regardless of
# what tumor_vol_list currently holds.
boxplot_data = [
    final_tumor_volume.loc[final_tumor_volume['Drug Regimen'] == drug, 'Tumor Volume (mm3)']
    for drug in treatment_list
]

fig1, ax1 = plt.subplots()

# Draw outlier points as large red circles so they stand out
flierprops = dict(marker='o', markerfacecolor='red', markersize=12,
                  linestyle='none')
ax1.set_ylabel('Final Tumor Volume (mm3)')
ax1.set_xlabel('Drug Regimen')
ax1.boxplot(boxplot_data, flierprops=flierprops)

# Tick positions 1..len(treatment_list), labelled with the regimen names in order
x_axis = np.arange(len(treatment_list))
tick = [value + 1 for value in x_axis]
plt.xticks(tick, treatment_list)

plt.show()

## Line and Scatter Plots

In [None]:
# Line plot of timepoint versus tumor volume for one Capomulin-treated mouse (l509).

# Rows for the Capomulin regimen, then the subset belonging to mouse l509
is_capomulin = cleaned_data['Drug Regimen'] == 'Capomulin'
cap_data = cleaned_data[is_capomulin]

is_l509 = cap_data['Mouse ID'] == 'l509'
mouse_l509 = cap_data[is_l509]
mouse_l509_tumor = mouse_l509['Tumor Volume (mm3)']

plt.plot(mouse_l509['Timepoint'], mouse_l509_tumor)

plt.ylabel('Tumor Volume (mm3)')
plt.xlabel('Timepoint (days)')

plt.show()

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

# Select the two numeric columns BEFORE averaging: calling .mean() on the whole
# grouped frame would also try to average the text columns, which raises a
# TypeError on pandas >= 2.0. Grouping once also avoids repeating the work.
cap_mouse_means = cap_data.groupby('Mouse ID')[['Tumor Volume (mm3)', 'Weight (g)']].mean()
cap_mean = cap_mouse_means['Tumor Volume (mm3)']
cap_weight = cap_mouse_means['Weight (g)']

plt.scatter(cap_weight, cap_mean)

plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')

plt.show()

## Correlation and Regression

In [None]:
# Linear regression of average tumor volume on mouse weight for the Capomulin
# mice, drawn over the scatter plot of the same data.
fit = linregress(cap_weight, cap_mean)
slope = fit.slope
intercept = fit.intercept
rvalue = fit.rvalue
pvalue = fit.pvalue
stderr = fit.stderr

# Fitted tumor volume at each observed weight
regression = cap_weight * slope + intercept

# Text form of the fitted line, rounded to two decimals
rounded_slope = str(round(slope, 2))
rounded_intercept = str(round(intercept, 2))
line_eq = "y = " + rounded_slope + "x + " + rounded_intercept

plt.scatter(cap_weight, cap_mean)
plt.plot(cap_weight, regression, "r-")
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')

# rvalue from linregress is reported here as the correlation
print(f"The correlation between mouse weight and the average tumor volume is {round(rvalue,2)}")
plt.show()