## Observations and Insights 

1) Capomulin and Ramicane have a significantly lower mean than the other drug regimens.
2) Infubinol has an outlier.
3) There is a positive correlation between mouse weight and average tumor volume.

In [1]:
%matplotlib notebook

In [39]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as sts
from scipy.stats import linregress
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
merged_df = pd.merge(mouse_metadata, study_results, on="Mouse ID", how="outer")
merged_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [3]:
# Checking the number of mice in the DataFrame.
mouseCount = merged_df["Mouse ID"].value_counts()
mouseCount

g989    13
m601    10
j365    10
b559    10
k894    10
        ..
f932     1
n482     1
o848     1
t573     1
l872     1
Name: Mouse ID, Length: 249, dtype: int64

In [4]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
dupMouse=merged_df.loc[merged_df.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"].unique()
dupMouse

array(['g989'], dtype=object)

In [5]:
# # Optional: Get all the data for the duplicate mouse ID. 
g989Dup  = merged_df[merged_df['Mouse ID'] == "g989"]
print(g989Dup.index) 
print()
g989Dup.head(13)

Int64Index([908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920], dtype='int64')



Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [6]:
# # Create a clean DataFrame by dropping the duplicate mouse by its ID.
cleaned_df = merged_df[merged_df["Mouse ID"] != "g989"]
cleaned_df


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [7]:
# Checking the number of mice in the clean DataFrame.
cleaned_df["Mouse ID"].value_counts()

m601    10
j365    10
w350    10
b559    10
k894    10
        ..
t573     1
o848     1
f932     1
d133     1
n482     1
Name: Mouse ID, Length: 248, dtype: int64

## Summary Statistics

In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and 
# SEM of the tumor volume for each regimen
# # This method is the most straightforward, creating multiple series and putting them all together at the end.

# groupby Drug Regimen
regimenGroup = cleaned_df.groupby(["Drug Regimen"])
print(regimenGroup)
regimenGroup.count()

# calculate stats
mean = regimenGroup["Tumor Volume (mm3)"].mean()

median = regimenGroup["Tumor Volume (mm3)"].median()

variance = regimenGroup["Tumor Volume (mm3)"].var()

stDev = regimenGroup["Tumor Volume (mm3)"].std()

sem = regimenGroup["Tumor Volume (mm3)"].sem()

# create dataframe
summRegimenGroup = pd.DataFrame({"Mean": mean, 
                                    "Median": median, 
                                    "Variance": variance, 
                                    "Standard Deviation": stDev,
                                    "Standard Error": sem})
summRegimenGroup


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002199DB92E08>


Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,Standard Error
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [9]:
# # Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# # This method produces everything in a single groupby function.
summregimenGroupB = cleaned_df.groupby("Drug Regimen").agg({"Tumor Volume (mm3)": 
                                                            ['mean', 'median', 'var', 'std', 'sem']})
summregimenGroupB


Unnamed: 0_level_0,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3)
Unnamed: 0_level_1,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


## Bar Plots

In [19]:
# Generate a bar plot showing the number of mice per time point for each treatment 
# # throughout the course of the study using pandas.

# grab columns
miceTP_df = cleaned_df[["Drug Regimen", "Mouse ID"]]
miceTP_df

# groupby and count mice
miceTPGroup_df = miceTP_df.groupby(["Drug Regimen"]).count()
type(miceTPGroup_df)

# sort in ascending order
miceTPGroup_df = miceTPGroup_df.sort_values(["Mouse ID"], ascending=False)

# Use DataFrame.plot() in order to create a bar chart of the data
miceTPGroup_df.plot(kind="bar", )

# Set a lable for the y axis
plt.ylabel("Number of Data Points")
plt.show()
plt.tight_layout()

<IPython.core.display.Javascript object>

In [62]:
# Generate a bar plot showing the number of mice per time point for each treatment 
# throughout the course of the study using pyplot.

# grab columns
miceTP_df = cleaned_df[["Drug Regimen", "Mouse ID"]]
miceTP_df

# groupby
miceTPGroup_df = miceTP_df.groupby(["Drug Regimen"])
miceTPGroup_df.count()

# reset index to be sorted by Mouse Count
miceTPData = miceTPGroup_df["Mouse ID"].count().to_frame(name = "Mouse ID Count").reset_index()
miceTPData.count().head()

# sort by Mouse Count
miceTPData = miceTPData.sort_values(["Mouse ID Count"], ascending=False)
miceTPData 

# set x and y 
mouseCount = miceTPData["Mouse ID Count"]
drugRegimen = miceTPData["Drug Regimen"]

# create bar chart with x and y labels
x_axis = np.arange(len(mouseCount))
plt.bar(x_axis, mouseCount, color='b', alpha=1, align="center")
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, drugRegimen, rotation='vertical')
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Data Points")
plt.show()
plt.tight_layout()

<IPython.core.display.Javascript object>

## Pie Plots

In [63]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

# remove duplicates to have one mouse per ID
gender = merged_df[["Mouse ID", "Sex"]].drop_duplicates("Mouse ID")
gender

# groupby Sex
genderGroup = gender.groupby(["Sex"])

# reset to sort by count
gender_df = genderGroup["Sex"].count().to_frame(name = "Gender Count").reset_index()
gender_df

# sort by count
gender_df = gender_df.sort_values(["Sex"], ascending=False)
gender_df

# set labels for pie
labels = ["Male", "Female"]

# create pie plot with y label
genderPie = gender_df.plot.pie(y="Gender Count",labels=labels, autopct="%.1f%%")
plt.show()
plt.axis("equal")

<IPython.core.display.Javascript object>

(-1.1016630746735625,
 1.1000791940320744,
 -1.1017685314144794,
 1.1009347297692742)

In [64]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# drop duplicate Mouse IDs
gender = merged_df[["Mouse ID", "Sex"]].drop_duplicates("Mouse ID")
gender

# groupby sex to get a count
genderGroup = gender.groupby(["Sex"])
genderGroup.count()

# reset to be able to sort
gender_df = genderGroup["Sex"].count().to_frame(name = "Gender Count").reset_index()
gender_df.count().head()

# sort by count
gender_df = gender_df.sort_values(["Sex"], ascending=False)
gender_df

# set x and y
genderCount = gender_df["Gender Count"]
gender = gender_df["Sex"] 

# create pie plot with y label
x_axis = np.arange(0, len(genderCount))
colors = ["lightcoral", "lightskyblue"]
genderPie2 = plt.pie(genderCount, labels=gender, colors=colors,autopct="%1.1f%%", shadow=True)

plt.ylabel("Gender Count")
plt.axis("equal")
plt.show

<IPython.core.display.Javascript object>

<function matplotlib.pyplot.show(*args, **kw)>

## Quartiles, Outliers and Boxplots

In [65]:
# get all drug names
cleaned_df["Drug Regimen"].unique()

array(['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin',
       'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'],
      dtype=object)

In [66]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. 

# grab needed columns
tumorVolumeData = cleaned_df[["Drug Regimen", "Mouse ID", "Tumor Volume (mm3)",  "Timepoint"]]
tumorVolumeData 

# set index to Drug Regimen
tumorVolumeData = tumorVolumeData.set_index(["Drug Regimen"])
tumorVolumeData

# drop all drugs that are not in the top 4
dropDrug = tumorVolumeData.drop(['Placebo', 'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'])
dropDrug

# get last value by mouse ID to grab final tumor volume rows
lastMouse = dropDrug.groupby(["Mouse ID"]).tail(1)
lastMouse

# sort by Drug Regimen
lastMouse_df = lastMouse.sort_values(["Drug Regimen"], ascending=True)
lastMouse_df

Unnamed: 0_level_0,Mouse ID,Tumor Volume (mm3),Timepoint
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Capomulin,j246,38.753265,35
Capomulin,v923,40.658124,45
Capomulin,w150,39.952347,10
Capomulin,j119,38.125164,45
Capomulin,l509,41.483008,45
...,...,...,...
Ramicane,r921,43.419381,30
Ramicane,w678,43.166373,5
Ramicane,y449,44.183451,15
Ramicane,r811,37.225650,45


In [73]:
# Calculate the IQR and quantitatively determine if there are any potential outliers. 

# grab last tumor volumes per mouse
lastMouse_IQR = lastMouse_df[["Tumor Volume (mm3)"]]
lastMouse_IQR

# create a dataframe for capomulin and create a tumor series for future use
capomulin = lastMouse_IQR.loc[lastMouse_df.index == "Capomulin"]
capomulinTumor = capomulin["Tumor Volume (mm3)"]
# calculate IQR and outliers 
capoQuart = capomulinTumor.quantile([.25,.75])
lowerq = capoQuart[.25]
upperq = capoQuart[.75]
capoIQR = round(upperq-lowerq,3)
capoLowBound = round(lowerq - (1.5*capoIQR), 3)
capoUpBound = round(upperq + (1.5*capoIQR),3)
print(f"*Capomulin* Tumor IQR is: {capoIQR}mm3")
print(f"*Capomulin* Tumors smaller than {capoLowBound}mm3 and larger than {capoUpBound}mm3 could be outliers.")
print()

# create a dataframe for ramicane and create a tumor series for future use
ramicane = lastMouse_IQR.loc[lastMouse_df.index == "Ramicane"]
ramicaneTumor = ramicane["Tumor Volume (mm3)"]
# calculate IQR and outliers 
ramiQuart = ramicaneTumor.quantile([.25,.75])
lowerq = ramiQuart[.25]
upperq = ramiQuart[.75]
ramiIQR = round(upperq-lowerq,3)
ramiLowBound = round(lowerq - (1.5*ramiIQR), 3)
ramiUpBound = round(upperq + (1.5*ramiIQR),3)
print(f"*Ramicane* Tumor IQR is: {ramiIQR}mm3")
print(f"*Ramicane* Tumors smaller than {ramiLowBound}mm3 and larger than {ramiUpBound}mm3 could be outliers.")
print()

# create a dataframe for Infubinol and create a tumor series for future use
infubinol = lastMouse_IQR.loc[lastMouse_df.index == "Infubinol"]
infubinolTumor = infubinol["Tumor Volume (mm3)"]
# calculate IQR and outliers
infuQuart = infubinolTumor.quantile([.25,.75])
lowerq = infuQuart[.25]
upperq = infuQuart[.75]
infuIQR = round(upperq-lowerq,3)
infuLowBound = round(lowerq - (1.5*infuIQR), 3)
infuUpBound = round(upperq + (1.5*infuIQR),3)
print(f"*Infubinol* Tumor IQR is: {infuIQR}mm3")
print(f"*Infubinol* Tumors smaller than {infuLowBound}mm3 and larger than {infuUpBound }mm3 could be outliers.")
print()

# create a dataframe for Ceftamin and create a tumor series for future use
ceftamin = lastMouse_IQR.loc[lastMouse_df.index == "Ceftamin"]
ceftaminTumor = ceftamin["Tumor Volume (mm3)"]
# calculate IQR and outliers for capomulin
ceftQuart = ceftaminTumor.quantile([.25,.75])
lowerq = ceftQuart[.25]
upperq = ceftQuart[.75]
ceftIQR = round(upperq-lowerq,3)
ceftLowBound = round(lowerq - (1.5*ceftIQR), 3)
ceftUpBound = round(upperq + (1.5*ceftIQR),3)
print(f"*Ceftamin* Tumor IQR is: {ceftIQR}mm3")
print(f"*Ceftamin* Tumors smaller than {ceftLowBound}mm3 and larger than {ceftUpBound}mm3 could be outliers.")

*Capomulin* Tumor IQR is: 7.782mm3
*Capomulin* Tumors smaller than 20.704mm3 and larger than 51.832mm3 could be outliers.

*Ramicane* Tumor IQR is: 9.099mm3
*Ramicane* Tumors smaller than 17.912mm3 and larger than 54.308mm3 could be outliers.

*Infubinol* Tumor IQR is: 11.477mm3
*Infubinol* Tumors smaller than 36.833mm3 and larger than 82.741mm3 could be outliers.

*Ceftamin* Tumor IQR is: 15.578mm3
*Ceftamin* Tumors smaller than 25.355mm3 and larger than 87.667mm3 could be outliers.


In [68]:
# create a boxplot using tumor series', turn outlier red
fig, ax = plt.subplots()
redCircle = dict(markerfacecolor='r', marker='o', markersize=10)
ax.boxplot([capomulinTumor, ramicaneTumor, infubinolTumor, ceftaminTumor], labels=["Capomulin", "Ramicane", "Infubitol", "Ceftamin"], flierprops=redCircle)              
ax.set_ylabel("Final Tumor Volume (mm3)")                

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Final Tumor Volume (mm3)')

## Line and Scatter Plots

In [69]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

# isolate a mouse
capoTime  = cleaned_df[cleaned_df['Mouse ID'] == "j119"]

# grab columns needed for line plot
capoTime = capoTime[["Timepoint", "Tumor Volume (mm3)"]]

# set the index
capoTime = capoTime.set_index(["Timepoint"])
capoTime

# creat plot with title and x and y labels
capoTime.plot(title="Capomulin Treatment of Mouse j119") 
plt.xlabel("Timepoint (days)")
plt.ylabel("Tumor Volume (mm3)")

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Tumor Volume (mm3)')

In [70]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

# isolate capomulin
capoWeight  = cleaned_df[cleaned_df['Drug Regimen'] == "Capomulin"]
capoWeight

# grab columns needed
capoWeight = capoWeight[["Mouse ID","Weight (g)", "Tumor Volume (mm3)"]]
capoWeight

# groupby mouse ID
capoWeightGroup = capoWeight.groupby(["Mouse ID"]).mean()
capoWeightGroup

# rename columns for a different y label
capoWeightGroup = capoWeightGroup.rename(columns={"Tumor Volume (mm3)": "Average Tumor Volume (mm3)"})
capoWeightGroup

# create plot
capoWeightGroup.plot(kind="scatter",x="Weight (g)", y="Average Tumor Volume (mm3)", title="") 


<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x2199f408608>

In [71]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen

# grab columns from above
capoWeightCorr_df = capoWeightGroup[["Weight (g)", "Average Tumor Volume (mm3)"]]
capoWeightCorr_df

# set x and y
weight= capoWeightCorr_df["Weight (g)"]
avgTumVol = capoWeightCorr_df["Average Tumor Volume (mm3)"]

# calculate correlation
correlation = sts.pearsonr(weight,avgTumVol)
print(f' The correlation between mouse weight and the average tumor volume is {round(correlation[0],2)}.')

# create scatterplot with line regression, x and y labels
(slope, intercept, rvalue, pvalue, stderr) = linregress(weight,avgTumVol)
regress_values = weight * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(weight,avgTumVol)
plt.plot(weight,regress_values,"r-")
plt.annotate(line_eq,(6,10),fontsize=15,color="red")
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.show()

 The correlation between mouse weight and the average tumor volume is 0.84.


<IPython.core.display.Javascript object>