In [2]:
### Observations and Insights

In [18]:
### Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import json as json
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
mouse_study = pd.merge(mouse_metadata, study_results, how="left", on=["Mouse ID","Mouse ID"])
                                

# Display the data table for preview
mouse_study

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [19]:
 # Checking the number of mice.
    
mouse_study["Mouse ID"].count()
total_mice = mouse_study["Mouse ID"].count()
total_mice

1893

In [54]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
mouse_summary = mouse_study.drop_duplicates(subset = ['Mouse ID', 'Timepoint'], keep="first").reset_index(drop = True)
mouse_summary


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1883,z969,Naftisol,Male,9,30,25,63.145652,2
1884,z969,Naftisol,Male,9,30,30,65.841013,3
1885,z969,Naftisol,Male,9,30,35,69.176246,4
1886,z969,Naftisol,Male,9,30,40,70.314904,4


In [56]:
# Optional: Get all the data for the duplicate mouse ID. 
duplicates = mouse_study.duplicated(subset=["Mouse ID", "Timepoint"], keep="first")
duplicates_loc = mouse_study.loc[duplicates == True]

duplicates_loc

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [57]:
# Checking the number of mice in the clean DataFrame.
mouse_summary["Mouse ID"].nunique()
total_clean_mice = mouse_summary["Mouse ID"].nunique()
total_clean_mice

249

In [58]:
### Summary Statistics

# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
drugs_list = mouse_summary["Drug Regimen"].unique()
drugs_list.sort()
print(drugs_list)


# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 

# calculating mean tumor volume 
tumor_volume = mouse_summary.groupby(["Drug Regimen"]).mean()["Tumor Volume (mm3)"]
tumor_volume_mean = pd.DataFrame(tumor_volume)
tumor_volume_mean


['Capomulin' 'Ceftamin' 'Infubinol' 'Ketapril' 'Naftisol' 'Placebo'
 'Propriva' 'Ramicane' 'Stelasyn' 'Zoniferol']


Unnamed: 0_level_0,Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,40.675741
Ceftamin,52.591172
Infubinol,52.884795
Ketapril,55.235638
Naftisol,54.331565
Placebo,54.033581
Propriva,52.393463
Ramicane,40.216745
Stelasyn,54.233149
Zoniferol,53.236507


In [59]:
# calculating median tumor volume
tumor_volume = mouse_summary.groupby(["Drug Regimen"]).median()["Tumor Volume (mm3)"]
tumor_volume_median = pd.DataFrame(tumor_volume)
tumor_volume_median

Unnamed: 0_level_0,Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,41.557809
Ceftamin,51.776157
Infubinol,51.820584
Ketapril,53.698743
Naftisol,52.509285
Placebo,52.288934
Propriva,50.909965
Ramicane,40.673236
Stelasyn,52.431737
Zoniferol,51.818479


In [60]:
# calculating variance of tumor volume
tumor_volume = mouse_summary.groupby(["Drug Regimen"]).var()["Tumor Volume (mm3)"]
tumor_volume_variance = pd.DataFrame(tumor_volume)
tumor_volume_variance


Unnamed: 0_level_0,Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,24.947764
Ceftamin,39.290177
Infubinol,43.128684
Ketapril,68.553577
Naftisol,66.173479
Placebo,61.168083
Propriva,43.138803
Ramicane,23.486704
Stelasyn,59.450562
Zoniferol,48.533355


In [61]:
# calculating standard deviation of tumor volume
tumor_volume = mouse_summary.groupby(["Drug Regimen"]).std()["Tumor Volume (mm3)"]
tumor_volume_std = pd.DataFrame(tumor_volume)
tumor_volume_std

Unnamed: 0_level_0,Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,4.994774
Ceftamin,6.268188
Infubinol,6.567243
Ketapril,8.279709
Naftisol,8.134708
Placebo,7.821003
Propriva,6.568014
Ramicane,4.846308
Stelasyn,7.710419
Zoniferol,6.966589


In [62]:
# calculating SEM
tumor_volume = mouse_summary.groupby(["Drug Regimen"]).sem()["Tumor Volume (mm3)"]
tumor_volume_sem = pd.DataFrame(tumor_volume)
tumor_volume_sem

Unnamed: 0_level_0,Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,0.329346
Ceftamin,0.469821
Infubinol,0.492236
Ketapril,0.60386
Naftisol,0.596466
Placebo,0.581331
Propriva,0.525862
Ramicane,0.320955
Stelasyn,0.573111
Zoniferol,0.516398


In [80]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# Assemble the resulting series into a single summary dataframe.
drug = mouse_summary["Drug Regimen"] == "Capomulin"
mouse_summary[drug]
Capomulin = mouse_summary[drug]
Capomulin_df = pd.DataFrame(Capomulin["Tumor Volume (mm3)"].describe())
Capomulin_df



Unnamed: 0,Tumor Volume (mm3)
count,230.0
mean,40.675741
std,4.994774
min,23.343598
25%,37.685933
50%,41.557809
75%,45.0
max,48.158209


In [81]:
drug = mouse_summary["Drug Regimen"] == "Ceftamin"
mouse_summary[drug]
Ceftamin = mouse_summary[drug]
Ceftamin_df = pd.DataFrame(Ceftamin["Tumor Volume (mm3)"].describe())
Ceftamin_df

Unnamed: 0,Tumor Volume (mm3)
count,178.0
mean,52.591172
std,6.268188
min,45.0
25%,47.208427
50%,51.776157
75%,56.801438
max,68.923185


In [82]:
drug = mouse_summary["Drug Regimen"] == "Infubinol"
mouse_summary[drug]
Infubinol = mouse_summary[drug]
Infubinol_df = pd.DataFrame(Infubinol["Tumor Volume (mm3)"].describe())
Infubinol_df

Unnamed: 0,Tumor Volume (mm3)
count,178.0
mean,52.884795
std,6.567243
min,36.321346
25%,47.312353
50%,51.820584
75%,57.314444
max,72.226731


In [83]:
drug = mouse_summary["Drug Regimen"] == "Ketapril"
mouse_summary[drug]
Ketapril = mouse_summary[drug]
Ketapril_df = pd.DataFrame(Ketapril["Tumor Volume (mm3)"].describe())
Ketapril_df

Unnamed: 0,Tumor Volume (mm3)
count,188.0
mean,55.235638
std,8.279709
min,45.0
25%,48.232987
50%,53.698743
75%,60.870951
max,78.567014


In [84]:
drug = mouse_summary["Drug Regimen"] == "Naftisol"
mouse_summary[drug]
Naftisol = mouse_summary[drug]
Naftisol_df = pd.DataFrame(Naftisol["Tumor Volume (mm3)"].describe())
Naftisol_df

Unnamed: 0,Tumor Volume (mm3)
count,186.0
mean,54.331565
std,8.134708
min,45.0
25%,47.285874
50%,52.509285
75%,59.963034
max,76.668817


In [85]:
drug = mouse_summary["Drug Regimen"] == "Placebo"
mouse_summary[drug]
Placebo = mouse_summary[drug]
Placebo_df = pd.DataFrame(Placebo["Tumor Volume (mm3)"].describe())
Placebo_df

Unnamed: 0,Tumor Volume (mm3)
count,181.0
mean,54.033581
std,7.821003
min,45.0
25%,47.459053
50%,52.288934
75%,59.916934
max,73.212939


In [86]:
drug = mouse_summary["Drug Regimen"] == "Propriva"
mouse_summary[drug]
Propriva = mouse_summary[drug]
Propriva_df = pd.DataFrame(Propriva["Tumor Volume (mm3)"].describe())
Propriva_df

Unnamed: 0,Tumor Volume (mm3)
count,156.0
mean,52.393463
std,6.568014
min,45.0
25%,47.046068
50%,50.909965
75%,56.491585
max,72.455421


In [87]:
drug = mouse_summary["Drug Regimen"] == "Ramicane"
mouse_summary[drug]
Ramicane = mouse_summary[drug]
Ramicane_df = pd.DataFrame(Ramicane["Tumor Volume (mm3)"].describe())
Ramicane_df

Unnamed: 0,Tumor Volume (mm3)
count,228.0
mean,40.216745
std,4.846308
min,22.050126
25%,36.674635
50%,40.673236
75%,45.0
max,47.622816


In [88]:
drug = mouse_summary["Drug Regimen"] == "Stelasyn"
mouse_summary[drug]
Stelasyn = mouse_summary[drug]
Stelasyn_df = pd.DataFrame(Stelasyn["Tumor Volume (mm3)"].describe())
Stelasyn_df

Unnamed: 0,Tumor Volume (mm3)
count,181.0
mean,54.233149
std,7.710419
min,45.0
25%,48.047139
50%,52.431737
75%,58.719297
max,75.12369


In [89]:
drug = mouse_summary["Drug Regimen"] == "Zoniferol"
mouse_summary[drug]
Zoniferol = mouse_summary[drug]
Zoniferol_df = pd.DataFrame(Zoniferol["Tumor Volume (mm3)"].describe())
Zoniferol_df

Unnamed: 0,Tumor Volume (mm3)
count,182.0
mean,53.236507
std,6.966589
min,45.0
25%,47.337876
50%,51.818479
75%,57.954259
max,73.324432


In [90]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
tumor_summary = mouse_summary.groupby(["Drug Regimen"])

tumor_volume_mean = tumor_summary["Tumor Volume (mm3)"].mean()
tumor_volume_median = tumor_summary["Tumor Volume (mm3)"].median()
tumor_volume_variance = tumor_summary["Tumor Volume (mm3)"].var()
tumor_volume_std = tumor_summary["Tumor Volume (mm3)"].std()
tumor_volume_sem = tumor_summary["Tumor Volume (mm3)"].sem()

tumor_summary = pd.DataFrame({
                        "Mean": tumor_volume_mean,
                        "Median": tumor_volume_median,
                        "Variance": tumor_volume_variance,
                        "Standard Deviation": tumor_volume_std,
                        "SEM": tumor_volume_sem
})

tumor_summary

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.393463,50.909965,43.138803,6.568014,0.525862
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [None]:
### Bar and Pie Charts

# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

In [None]:
### Quartiles, Outliers and Boxplots


# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint

In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

In [None]:
### Line and Scatter Plots


# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin



In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

In [None]:
###  Correlation and Regression


# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen