## Observations and Insights 

In [130]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame (mouse metadata first, then study results)
mouse_metadata, study_results = (
    pd.read_csv(csv_path) for csv_path in (mouse_metadata_path, study_results_path)
)

# Outer-join the two tables on "Mouse ID" so that an ID present in only one
# of the files is still kept, then preview the first rows of the result
combined_data = mouse_metadata.merge(study_results, how="outer", on="Mouse ID")
combined_data.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [131]:
# Checking the number of mice in the DataFrame: grouping on "Mouse ID" gives
# one row per mouse, holding the non-null count of every other column.
combined_data.groupby(by="Mouse ID").count()

Unnamed: 0_level_0,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Mouse ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a203,10,10,10,10,10,10,10
a251,10,10,10,10,10,10,10
a262,10,10,10,10,10,10,10
a275,10,10,10,10,10,10,10
a366,7,7,7,7,7,7,7
...,...,...,...,...,...,...,...
z435,3,3,3,3,3,3,3
z578,10,10,10,10,10,10,10
z581,10,10,10,10,10,10,10
z795,10,10,10,10,10,10,10


In [132]:
# Find the mice (by Mouse ID) that have more than one row for the same Mouse ID and Timepoint.


In [133]:
# Optional: Get all the data for the duplicate mouse ID. 


In [134]:
# Create a clean DataFrame by removing repeated (Mouse ID, Timepoint) rows.
# drop_duplicates keeps the first occurrence of each repeated pair, so the
# affected mouse stays in the data with one reading per timepoint.
# NOTE(review): if the intent is to remove the duplicated mouse entirely,
# filter on its Mouse ID instead - confirm which behaviour is wanted.
#
# The result is assigned back rather than using inplace=True, so the frame
# object produced by the merge cell is not mutated behind earlier outputs.
combined_data = combined_data.drop_duplicates(subset=["Mouse ID", "Timepoint"])
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1888 entries, 0 to 1892
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Mouse ID            1888 non-null   object 
 1   Drug Regimen        1888 non-null   object 
 2   Sex                 1888 non-null   object 
 3   Age_months          1888 non-null   int64  
 4   Weight (g)          1888 non-null   int64  
 5   Timepoint           1888 non-null   int64  
 6   Tumor Volume (mm3)  1888 non-null   float64
 7   Metastatic Sites    1888 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 132.8+ KB


In [135]:
# Checking the number of mice in the clean DataFrame.


## Summary Statistics

In [162]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM
# of the tumor volume, computed for every (Drug Regimen, Timepoint) group.

# Select the tumor-volume column BEFORE aggregating. Reducing the whole frame
# first also tries to aggregate the text columns (Mouse ID, Sex), which raises
# a TypeError on pandas >= 2.0, where numeric_only defaults to False.
tumor_by_group = combined_data.groupby(["Drug Regimen", "Timepoint"])["Tumor Volume (mm3)"]

# pandas aggregation name -> column label used in the summary table
stat_labels = {
    "mean": "Tummor Volume MEAN (mm3)",
    "median": "Tummor Volume MED (mm3)",
    "var": "Tummor Volume VAR (mm3)",
    "std": "Tummor Volume STD (mm3)",
    "sem": "Tummor Volume SEM (mm3)",
}

# A single agg call computes all five statistics side by side on the shared
# (Drug Regimen, Timepoint) index, so no chain of outer merges is needed.
stat_sum = tumor_by_group.agg(list(stat_labels)).rename(columns=stat_labels)

# Keep exposing the single-column frames this cell has always defined, in case
# other cells refer to them by name.
tumor_mean, tumor_med, tumor_var, tumor_std, tumor_sem = (
    stat_sum[[label]] for label in stat_labels.values()
)

stat_sum.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Tummor Volume MEAN (mm3),Tummor Volume MED (mm3),Tummor Volume VAR (mm3),Tummor Volume STD (mm3),Tummor Volume SEM (mm3)
Drug Regimen,Timepoint,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Capomulin,0,45.0,45.0,0.0,0.0,0.0
Capomulin,5,44.266086,45.597064,5.030889,2.242964,0.448593
Capomulin,10,43.084291,43.421014,12.344133,3.513422,0.702684
Capomulin,15,42.064317,42.79816,16.878693,4.108369,0.838617
Capomulin,20,40.716325,40.716428,19.035028,4.362915,0.909731


## Bar Plots

In [137]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas. 

In [138]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pyplot.

## Pie Plots

In [139]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [140]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, Outliers and Boxplots

In [141]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [142]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and Scatter Plots

In [143]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [144]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen



## Correlation and Regression

In [145]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
