## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import seaborn as sns
import numpy as np
sns.set_style("darkgrid")

# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the per-mouse metadata and the study measurements, in that order
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Join the two tables on whatever column(s) they share into one dataset
merged_pd = mouse_metadata.merge(study_results)

# Preview the combined table
merged_pd

In [None]:
# Count the distinct Mouse ID values in the merged data.
# dropna=False keeps the count identical to len(pd.unique(...)), which would include NaN.
merged_pd["Mouse ID"].nunique(dropna=False)

In [None]:
# Find study rows that repeat an earlier (Mouse ID, Timepoint) pair.
print(f"There are {len(mouse_metadata)} entries in Mouse MetaData and {len(study_results)} in Study Results")

# duplicated(keep="first") flags every repeat of a pair after its first occurrence.
# This is a single vectorised pass instead of comparing each row with all earlier rows.
is_repeat = study_results.duplicated(subset=["Mouse ID", "Timepoint"], keep="first")

# Both results stay lists of row Series: later cells index into `duplicate`
# and rebuild a DataFrame from `nonDuplicate`.
duplicate = [row for _, row in study_results[is_repeat].iterrows()]
nonDuplicate = [row for _, row in study_results[~is_repeat].iterrows()]

for row in duplicate:
    print(f"duplicate found: {row['Mouse ID']}")

print(f"Detected {len(nonDuplicate)} unique entries for tumors, {len(duplicate)} duplicate entries")

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# Collect every Mouse ID that produced a duplicate row, then show all of its merged
# records (not just the first duplicate row). With no duplicates this is an empty table.
duplicate_mouse_ids = {row["Mouse ID"] for row in duplicate}
merged_pd[merged_pd["Mouse ID"].isin(duplicate_mouse_ids)]

In [None]:
# Create a clean DataFrame by dropping every mouse that has duplicated measurements.
# The IDs are taken from the duplicate rows detected earlier instead of a hard-coded
# value, so the cleaning follows the data.
# NOTE(review): the previous version dropped the literal ID "g989" — confirm that is
# the only mouse appearing in `duplicate`.
duplicate_mouse_ids = {row["Mouse ID"] for row in duplicate}
mouse_metadata = mouse_metadata[~mouse_metadata["Mouse ID"].isin(duplicate_mouse_ids)]

# Default (inner) merge: measurements whose mouse was removed from the metadata
# have no match and are dropped as well.
actual_merged_pd = pd.merge(mouse_metadata, pd.DataFrame(nonDuplicate))
actual_merged_pd.head()

In [None]:
# Count the distinct Mouse ID values left in the cleaned data.
# dropna=False keeps the count identical to len(pd.unique(...)), which would include NaN.
actual_merged_pd["Mouse ID"].nunique(dropna=False)

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen.
#
# The tumor-volume column is selected *before* aggregating. Aggregating the whole
# frame also tries to average the string columns, which raises a TypeError on
# pandas >= 2.0.
tumor_by_regimen = actual_merged_pd.groupby(by="Drug Regimen")["Tumor Volume (mm3)"]

mean = tumor_by_regimen.mean().to_frame(name="Mean Volume")
medianpd = tumor_by_regimen.median().to_frame()
variancepd = tumor_by_regimen.var().to_frame()
stdvpd = tumor_by_regimen.std().to_frame()
sempd = tumor_by_regimen.sem().to_frame()

# Assemble the individual statistics into a single summary dataframe,
# one column per statistic, in a fixed left-to-right order.
remaining_stats = [
    ("Median", medianpd),
    ("Variance", variancepd),
    ("Standard Deviation", stdvpd),
    ("SEM", sempd),
]
for position, (label, stat_frame) in enumerate(remaining_stats, start=1):
    mean.insert(loc=position, column=label, value=stat_frame["Tumor Volume (mm3)"])

mean

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen, using a single aggregation call.
statsd = actual_merged_pd.groupby(by="Drug Regimen")["Tumor Volume (mm3)"].agg(
    ["mean", "median", "var", "std", "sem"]
)

# Display the table; an assignment alone produces no cell output.
statsd

## Bar and Pie Charts

In [None]:
# Bar plot (pandas) of how many measurements were recorded for each drug regimen.
# Per-regimen non-null counts of every column; the Timepoint column is the one plotted.
measurementspd = actual_merged_pd.groupby(by="Drug Regimen").count()
timepoint_counts = measurementspd["Timepoint"]
timepoint_counts.plot.bar();

In [None]:
# Bar plot (pyplot) of how many measurements were recorded for each drug regimen.
regimen_names = list(measurementspd.index)
measurement_counts = measurementspd["Timepoint"]
plt.bar(x=regimen_names, height=measurement_counts);

In [None]:
# Pie plot (pandas) of the female versus male split among the mice.
# Per-sex non-null counts of every metadata column; the Mouse ID counts size the wedges.
mouseGender = mouse_metadata.groupby(by="Sex").count()
mouseGender.plot.pie(y="Mouse ID")

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot.
# Label each wedge with its group name and percentage; without labels the two
# wedges cannot be told apart.
plt.pie(x=mouseGender["Mouse ID"], labels=mouseGender.index, autopct="%1.1f%%");

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
drugs = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Use a boolean mask rather than a helper column, so the shared cleaned
# dataframe is not permanently modified by this cell.
on_selected_drug = actual_merged_pd["Drug Regimen"].isin(drugs)

# Start by getting the last (greatest) timepoint for each mouse.
# The per-mouse max is taken over every column, then the columns that are not
# needed (or whose max would be misleading) are dropped.
latest_Tumor = (
    actual_merged_pd.loc[on_selected_drug]
    .groupby(by="Mouse ID")
    .agg("max")
    .drop(columns=["Sex", "Tumor Volume (mm3)", "Metastatic Sites", "Age_months"])
)

# Merge this group df with the original measurements to get the tumor volume
# at the last timepoint: pair each measurement with its mouse's final timepoint...
paired = study_results.merge(
    latest_Tumor,
    how="inner",
    left_on="Mouse ID",
    right_on="Mouse ID",
    suffixes=("_Original", "_Filtered"),
)

# ...and keep only the measurement taken at that final timepoint. Non-inplace
# rename/drop return a new frame, avoiding edits to a filtered slice.
taken_at_final = paired["Timepoint_Original"] == paired["Timepoint_Filtered"]
latest_Tumor_size = (
    paired.loc[taken_at_final]
    .rename({"Timepoint_Filtered": "Timepoint"}, axis=1)
    .drop(columns=["Timepoint_Original"])
)
latest_Tumor_size


In [None]:
# Treatments of interest (also used later for plot labels), and for reference the
# full set of regimens present in the metadata.
print(f"Drugs: {drugs}")
alldrugs = set(mouse_metadata["Drug Regimen"])
print(f"The full list of drugs is: {alldrugs}")

# Final tumor volumes per drug (for plotting), the 1.5*IQR tolerance per drug,
# and the volumes flagged as potential outliers per drug.
tumor_volume_data = {}
outlier_tolerance = {}
outlier_tumors = {}

for drug in drugs:
    # Locate the rows for mice on this drug and get their final tumor volumes.
    on_drug = latest_Tumor_size["Drug Regimen"] == drug
    volumes = latest_Tumor_size.loc[on_drug, "Tumor Volume (mm3)"]
    tumor_volume_data[drug] = list(volumes)

    # Calculate the IQR from the first and third quartiles.
    lower_q, upper_q = volumes.quantile([0.25, 0.75])
    iqr = upper_q - lower_q
    outlier_tolerance[drug] = 1.5 * iqr

    # The outlier fences are measured from the quartiles (Q1 - 1.5*IQR and
    # Q3 + 1.5*IQR), not from the median.
    lower_bound = lower_q - outlier_tolerance[drug]
    upper_bound = upper_q + outlier_tolerance[drug]

    median_tumorsize = np.median(volumes)
    print(
        f"Median tumor size for {drug} is {median_tumorsize} and the IQR is {iqr} "
        f"(Q1 = {lower_q}, Q3 = {upper_q}). "
        f"The outlier range is below {lower_bound} or above {upper_bound}."
    )

    # Determine outliers using the upper and lower bounds.
    outlier_tumors[drug] = [
        tumor for tumor in tumor_volume_data[drug]
        if tumor < lower_bound or tumor > upper_bound
    ]

outlier_tumors
        
    

In [None]:
# Box plots of each mouse's final tumor volume, one panel per regimen of interest.
fig, ax = plt.subplots(2, 2, figsize=(10, 10))

# ax.flat walks the 2x2 grid row by row: (0,0), (0,1), (1,0), (1,1).
panel_regimens = ("Capomulin", "Ramicane", "Infubinol", "Ceftamin")
for panel, regimen in zip(ax.flat, panel_regimens):
    panel.boxplot(tumor_volume_data[regimen])
    panel.set_title(regimen)

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
# NOTE(review): nothing in this cell checks the regimen of mouse "c139" — confirm
# against the metadata that it was actually treated with Capomulin.
an_mouse = study_results[study_results["Mouse ID"] == "c139"]

# plt.plot draws the connected line the task asks for; the markers keep the
# individual measurements visible.
plt.plot(an_mouse["Timepoint"], an_mouse["Tumor Volume (mm3)"], marker="o")
plt.title("Size of Mouse C139's Tumor over time")
plt.xlabel("Timepoint")
plt.ylabel("Volume of Tumors, in cubic millimeters")
plt.show()

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
cap_mice = actual_merged_pd[actual_merged_pd["Drug Regimen"] == "Capomulin"]

# Average every numeric column per mouse. numeric_only=True skips the string
# columns, which would otherwise raise a TypeError on pandas >= 2.0.
cap_condensed = cap_mice.groupby(by="Mouse ID").mean(numeric_only=True)

# Then average those per-mouse means across mice that share the same weight,
# leaving one point per distinct weight.
cap_condenseder = cap_condensed.groupby(by="Weight (g)").mean()

plt.scatter(cap_condenseder.index, cap_condenseder["Tumor Volume (mm3)"]);
plt.title("Average Tumor size of mice enrolled in Capomulin Regimen by weight")
plt.xlabel("Weight in Grams")
plt.ylabel("Average Tumor Volume for Mice of that weight after Capomulin regimen")

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen
weights = np.asarray(cap_condenseder.index, dtype=float)
avg_volumes = np.asarray(cap_condenseder["Tumor Volume (mm3)"], dtype=float)

# Pearson correlation coefficient between weight and average tumor volume.
correlation = st.pearsonr(weights, avg_volumes)[0]
print(f"The correlation coefficient between mouse weight and average tumor volume is {correlation:.2f}")

# Least-squares straight line: volume = m * weight + b.
m, b = np.polyfit(weights, avg_volumes, 1)
print(f"Linear regression model: volume = {m:.2f} * weight + {b:.2f}")

plt.scatter(weights, avg_volumes);
plt.plot(weights, m * weights + b)
plt.title("Average Tumor size of mice enrolled in Capomulin Regimen by weight")
plt.xlabel("Weight in Grams")
plt.ylabel("Average Tumor Volume for Mice of that weight after Capomulin regimen")