## Observations and Insights 

In [1]:
%matplotlib notebook

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
mouse_metadata
study_results
combine_dataset = pd.merge(mouse_metadata, study_results, on= "Mouse ID" )
# Display the data table for preview
combine_dataset

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [3]:
# Checking the number of mice.
combine_dataset["Mouse ID"].nunique()

249

In [4]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicate_df = combine_dataset[combine_dataset.duplicated("Mouse ID")]
duplicate_df.value_counts("Mouse ID")

Mouse ID
g989    12
l471     9
k894     9
k862     9
k483     9
        ..
o813     1
n630     1
w678     1
w746     1
m756     1
Length: 237, dtype: int64

In [5]:
# Optional: Get all the data for the duplicate mouse ID. 



In [6]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
combine_clean = combine_dataset.drop_duplicates(subset = "Mouse ID", keep = "last")
combine_clean = combine_clean.reset_index()
combine_clean

Unnamed: 0,index,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,9,k403,Ramicane,Male,21,16,45,22.050126,1
1,19,s185,Capomulin,Female,3,17,45,23.343598,1
2,29,x401,Capomulin,Female,16,15,45,28.484033,0
3,39,m601,Capomulin,Male,22,17,45,28.430964,1
4,49,g791,Ramicane,Male,11,16,45,29.128472,1
...,...,...,...,...,...,...,...,...,...
244,1859,z314,Stelasyn,Female,21,28,5,45.934712,0
245,1862,z435,Propriva,Female,12,26,10,48.710661,0
246,1872,z581,Infubinol,Female,24,25,45,62.754451,3
247,1882,z795,Naftisol,Female,13,29,45,65.741070,3


In [7]:
# Checking the number of mice in the clean DataFrame.
combine_clean["Mouse ID"].nunique()

249

## Summary Statistics

In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
combine_clean["Tumor Volume (mm3)"].describe()
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.
combine_drug_group = combine_clean.groupby("Drug Regimen")
combine_drug_mean = pd.DataFrame(combine_drug_group["Tumor Volume (mm3)"].mean())
combine_drug_median = pd.DataFrame(combine_drug_group["Tumor Volume (mm3)"].median())
combine_drug_var = pd.DataFrame(combine_drug_group["Tumor Volume (mm3)"].var())
combine_drug_std = pd.DataFrame(combine_drug_group["Tumor Volume (mm3)"].std())
combine_drug_SEM = pd.DataFrame(combine_drug_group["Tumor Volume (mm3)"].sem())

merge_1 = pd.merge(combine_drug_mean, combine_drug_median, on = "Drug Regimen")
merge_1rename = merge_1.rename(columns = {"Tumor Volume (mm3)_x": "Tumor Vol. Mean",
                                         "Tumor Volume (mm3)_y": "Tumor Vol Median"})
merge_2 = pd.merge(merge_1rename, combine_drug_var, on = "Drug Regimen")
merge_2rename = merge_2.rename(columns = {"Tumor Volume (mm3)": " Tumor Vol Variance"})


merge_3 = pd.merge(merge_2rename, combine_drug_std, on = "Drug Regimen")
merge_3rename = merge_3.rename(columns = {"Tumor Volume (mm3)": " Tumor Vol StrdDev"})

merge_4 = pd.merge(merge_3rename, combine_drug_SEM, on = "Drug Regimen")
tumor_vol_by_drug_df = merge_4.rename(columns = {"Tumor Volume (mm3)": " Tumor Vol SEM"})
tumor_vol_by_drug_df

Unnamed: 0_level_0,Tumor Vol. Mean,Tumor Vol Median,Tumor Vol Variance,Tumor Vol StrdDev,Tumor Vol SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,36.667568,38.125164,32.663378,5.715188,1.143038
Ceftamin,57.753977,59.851956,69.982735,8.365568,1.673114
Infubinol,58.178246,60.16518,74.010875,8.602957,1.720591
Ketapril,62.806191,64.487812,98.92133,9.94592,1.989184
Naftisol,61.205757,63.283288,106.029927,10.297083,2.059417
Placebo,60.508414,62.030594,78.759797,8.874672,1.774934
Propriva,56.736964,55.84141,69.349002,8.327605,1.665521
Ramicane,36.19139,36.561652,32.166354,5.671539,1.134308
Stelasyn,61.001707,62.19235,90.331586,9.504293,1.940056
Zoniferol,59.181258,61.840058,76.862027,8.767099,1.75342


In [9]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
drug_mean_rename=combine_drug_mean.rename(columns = {"Tumor Volume (mm3)":"Mean"})
drug_mean_rename
drug_median_rename=combine_drug_median.rename(columns = {"Tumor Volume (mm3)":"Median"})
drug_median_rename
drug_var_rename=combine_drug_var.rename(columns = {"Tumor Volume (mm3)":"Variance"})
drug_var_rename
drug_std_rename=combine_drug_std.rename(columns = {"Tumor Volume (mm3)":"Standard Deviation"})
drug_std_rename
drug_SEM_rename=combine_drug_SEM.rename(columns = {"Tumor Volume (mm3)":"SEM"})
drug_SEM_rename


# Using the aggregation method, produce the same summary statistics in a single line



Unnamed: 0_level_0,SEM
Drug Regimen,Unnamed: 1_level_1
Capomulin,1.143038
Ceftamin,1.673114
Infubinol,1.720591
Ketapril,1.989184
Naftisol,2.059417
Placebo,1.774934
Propriva,1.665521
Ramicane,1.134308
Stelasyn,1.940056
Zoniferol,1.75342


## Bar and Pie Charts

In [19]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pandas.

drugs = ["Capomulin","Ceftamin","Infubinol","Ketapril","Naftisol",
         "Placebo","Propriva","Ramicane","Stelasyn","Zoniferol"]
mice_per_drug = [25,25,25,25,25,25,25,25,24,25]
x_axis = np.arange(len(drugs))


In [20]:
plt.bar(x_axis, mice_per_drug, color = 'b', alpha = 0.5)


<BarContainer object of 10 artists>

In [12]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.



In [13]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

gender_df = combine_clean["Sex"].value_counts()
gender_df

labels = ["Male","Female"]
size = [125,124]
colors = ["red","blue"]
explode = (0.1,0)

plt.pie(size, explode = explode, labels = labels, colors = colors)


([<matplotlib.patches.Wedge at 0x7fab84ab96d0>,
  <matplotlib.patches.Wedge at 0x7fab84ab9c40>],
 [Text(-0.007570010156471628, 1.1999761226567098, 'Male'),
  Text(0.006939175976765524, -1.0999781124353174, 'Female')])

In [14]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [15]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse
combine_tp_vol = combine_clean[["Mouse ID","Drug Regimen","Timepoint","Tumor Volume (mm3)"]]
combine_tp_vol
# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


Unnamed: 0,Mouse ID,Drug Regimen,Timepoint,Tumor Volume (mm3)
0,k403,Ramicane,45,22.050126
1,s185,Capomulin,45,23.343598
2,x401,Capomulin,45,28.484033
3,m601,Capomulin,45,28.430964
4,g791,Ramicane,45,29.128472
...,...,...,...,...
244,z314,Stelasyn,5,45.934712
245,z435,Propriva,10,48.710661
246,z581,Infubinol,45,62.754451
247,z795,Naftisol,45,65.741070


In [16]:
# Put treatments into a list for for loop (and later for plot labels)
treatment = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
tumor_vol = []
for treatments in treatment:
    #if treatment == "Capomulin":
    tumor_vol.append(combine_tp_vol["Tumor Volume (mm3)"])
tumor_vol        
# Create empty list to fill with tumor vol data (for plotting)
#tumor_vol = []


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

[0      22.050126
 1      23.343598
 2      28.484033
 3      28.430964
 4      29.128472
          ...    
 244    45.934712
 245    48.710661
 246    62.754451
 247    65.741070
 248    73.867845
 Name: Tumor Volume (mm3), Length: 249, dtype: float64,
 0      22.050126
 1      23.343598
 2      28.484033
 3      28.430964
 4      29.128472
          ...    
 244    45.934712
 245    48.710661
 246    62.754451
 247    65.741070
 248    73.867845
 Name: Tumor Volume (mm3), Length: 249, dtype: float64,
 0      22.050126
 1      23.343598
 2      28.484033
 3      28.430964
 4      29.128472
          ...    
 244    45.934712
 245    48.710661
 246    62.754451
 247    65.741070
 248    73.867845
 Name: Tumor Volume (mm3), Length: 249, dtype: float64,
 0      22.050126
 1      23.343598
 2      28.484033
 3      28.430964
 4      29.128472
          ...    
 244    45.934712
 245    48.710661
 246    62.754451
 247    65.741070
 248    73.867845
 Name: Tumor Volume (mm3), Length: 249, 

In [17]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [18]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
combine_dataset[["Mouse ID", "Drug Regimen", "Timepoint", "Tumor Volume (mm3)"]]
capomulin_dataset = combine_dataset.loc[combine_dataset["Drug Regimen"]=="Capomulin"]
capomulin_dataset
capo_mouse_data = capomulin_dataset.loc[capomulin_dataset["Mouse ID"]=="s185"]
capo_mouse_data

x_axis = []

for timepoint in capo_mouse_data["Timepoint"]:
    x_axis.append(timepoint)


y_axis = []

for tumor_vol in capo_mouse_data["Tumor Volume (mm3)"]:
    y_axis.append(tumor_vol)


plt.plot(x_axis,y_axis)


[<matplotlib.lines.Line2D at 0x7fab84ac30a0>]

In [24]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [25]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
