## Observations and Insights 

In [66]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as sts
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
mouseStudy_df = pd.merge(mouse_metadata,study_results,how='outer',on='Mouse ID')
# Display the data table for preview
mouseStudy_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [67]:
#group/sort the dataframe
mouseStudy_df = mouseStudy_df.groupby("Mouse ID").apply(lambda x:x.sort_values("Timepoint")).reset_index(drop=True)
mouseStudy_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,a203,Infubinol,Female,20,23,0,45.000000,0
1,a203,Infubinol,Female,20,23,5,48.508468,0
2,a203,Infubinol,Female,20,23,10,51.852437,1
3,a203,Infubinol,Female,20,23,15,52.777870,1
4,a203,Infubinol,Female,20,23,20,55.173336,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [68]:
# Checking the number of mice.
numMice = len(mouseStudy_df['Mouse ID'].unique())
uniqueMiceID = mouseStudy_df['Mouse ID'].unique()
numMice

249

In [69]:
# Optional: Get all the data for the duplicate mouse ID. 
duplicatedMiceData_df = mouseStudy_df[mouseStudy_df.duplicated(['Mouse ID','Timepoint'])]
duplicatedMiceData_df
# print(f"Duplicated rows based on 'Mouse ID' and 'Timepoint' are:\n{duplicatedMiceTimepoint_df}")

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
587,g989,Propriva,Female,21,26,0,45.0,0
589,g989,Propriva,Female,21,26,5,47.570392,0
591,g989,Propriva,Female,21,26,10,49.880528,0
593,g989,Propriva,Female,21,26,15,53.44202,0
595,g989,Propriva,Female,21,26,20,54.65765,1


In [70]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicatedMiceID = duplicatedMiceData_df['Mouse ID'].unique()
duplicatedMiceID

array(['g989'], dtype=object)

In [71]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
mouseClean_df = mouseStudy_df.drop_duplicates(subset=['Mouse ID','Timepoint']).reset_index(drop=True)
mouseClean_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,a203,Infubinol,Female,20,23,0,45.000000,0
1,a203,Infubinol,Female,20,23,5,48.508468,0
2,a203,Infubinol,Female,20,23,10,51.852437,1
3,a203,Infubinol,Female,20,23,15,52.777870,1
4,a203,Infubinol,Female,20,23,20,55.173336,1
...,...,...,...,...,...,...,...,...
1883,z969,Naftisol,Male,9,30,25,63.145652,2
1884,z969,Naftisol,Male,9,30,30,65.841013,3
1885,z969,Naftisol,Male,9,30,35,69.176246,4
1886,z969,Naftisol,Male,9,30,40,70.314904,4


In [72]:
# Checking the number of mice in the clean DataFrame.
check_numMice = len(mouseClean_df['Mouse ID'].unique())
check_uniqueMiceID = mouseClean_df['Mouse ID'].unique()
check_numMice

249

## Summary Statistics

In [101]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM (standard error of the mean) of the tumor volume for each regimen
tumorMeanSummary = mouseClean_df[["Drug Regimen","Tumor Volume (mm3)"]].groupby("Drug Regimen").mean()
tumorMedianSummary = mouseClean_df[["Drug Regimen","Tumor Volume (mm3)"]].groupby("Drug Regimen").median()
tumorVarianceSummary = mouseClean_df[["Drug Regimen","Tumor Volume (mm3)"]].groupby("Drug Regimen").var(ddof=0)
tumorStdevSummary = mouseClean_df[["Drug Regimen","Tumor Volume (mm3)"]].groupby("Drug Regimen").std()
tumorSEMSummary = mouseClean_df[["Drug Regimen","Tumor Volume (mm3)"]].groupby("Drug Regimen").sem()

# This method is the most straighforward, creating multiple series and putting them all together at the end.
tumorSummary1 = pd.DataFrame.merge(tumorMeanSummary,tumorMedianSummary,how="outer",left_index=True,right_index=True,suffixes=('_Mean','_Median'))
tumorSummary2 = pd.DataFrame.merge(tumorVarianceSummary,tumorStdevSummary,how="outer",left_index=True,right_index=True,suffixes=('_Var','_Stdev'))
tumorSummary3 = pd.DataFrame.merge(tumorSummary2,tumorSEMSummary,how="outer",left_index=True,right_index=True).rename(columns={'Tumor Volume (mm3)':'Tumor Volume (mm3)_SEM'})
tumorSummary_df = pd.DataFrame.merge(tumorSummary1,tumorSummary3,how="outer",left_index=True,right_index=True)
tumorSummary_df = tumorSummary_df.reset_index()
tumorSummary_df

Unnamed: 0,Drug Regimen,Tumor Volume (mm3)_Mean,Tumor Volume (mm3)_Median,Tumor Volume (mm3)_Var,Tumor Volume (mm3)_Stdev,Tumor Volume (mm3)_SEM
0,Capomulin,40.675741,41.557809,24.839296,4.994774,0.329346
1,Ceftamin,52.591172,51.776157,39.069446,6.268188,0.469821
2,Infubinol,52.884795,51.820584,42.886388,6.567243,0.492236
3,Ketapril,55.235638,53.698743,68.18893,8.279709,0.60386
4,Naftisol,54.331565,52.509285,65.817708,8.134708,0.596466
5,Placebo,54.033581,52.288934,60.830138,7.821003,0.581331
6,Propriva,52.393463,50.909965,42.862273,6.568014,0.525862
7,Ramicane,40.216745,40.673236,23.383692,4.846308,0.320955
8,Stelasyn,54.233149,52.431737,59.122106,7.710419,0.573111
9,Zoniferol,53.236507,51.818479,48.266689,6.966589,0.516398


In [105]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method produces everything in a single groupby function
tumorSummary_dfQuick = mouseClean_df[["Drug Regimen","Tumor Volume (mm3)"]].groupby("Drug Regimen").agg(['mean','median','var','std','sem'])
tumorSummary_dfQuick = tumorSummary_dfQuick.reset_index()
#MultiIndex column names
tumorSummary_dfQuick

Unnamed: 0_level_0,Drug Regimen,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3)
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,var,std,sem
0,Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
1,Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
2,Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
3,Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
4,Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
5,Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
6,Propriva,52.393463,50.909965,43.138803,6.568014,0.525862
7,Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
8,Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
9,Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [118]:
#will return a df with all columns under the 'Tumor Volume (mm3)' index
tumorSummary_dfQuick[('Tumor Volume (mm3)')]

Unnamed: 0,mean,median,var,std,sem
0,40.675741,41.557809,24.947764,4.994774,0.329346
1,52.591172,51.776157,39.290177,6.268188,0.469821
2,52.884795,51.820584,43.128684,6.567243,0.492236
3,55.235638,53.698743,68.553577,8.279709,0.60386
4,54.331565,52.509285,66.173479,8.134708,0.596466
5,54.033581,52.288934,61.168083,7.821003,0.581331
6,52.393463,50.909965,43.138803,6.568014,0.525862
7,40.216745,40.673236,23.486704,4.846308,0.320955
8,54.233149,52.431737,59.450562,7.710419,0.573111
9,53.236507,51.818479,48.533355,6.966589,0.516398


In [119]:
#will return the specific values for 'mean' column under the column index 'Tumor Volume (mm3)'
tumorSummary_dfQuick[('Tumor Volume (mm3)','mean')]

0    40.675741
1    52.591172
2    52.884795
3    55.235638
4    54.331565
5    54.033581
6    52.393463
7    40.216745
8    54.233149
9    53.236507
Name: (Tumor Volume (mm3), mean), dtype: float64

## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 



In [10]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
