In [1]:
# Import dependencies
import pandas as pd
# `plt` must alias the pyplot submodule: the top-level `matplotlib` package
# does not expose plotting functions such as `plt.bar` or `plt.show`.
import matplotlib.pyplot as plt
import scipy.stats as st

In [2]:
# Relative paths to the two input CSV files (they are read in the cells below)
mouse_metadata_file, study_results_file = (
    "Resources/Mouse_metadata.csv",
    "Resources/Study_results.csv",
)

In [8]:
# Load the mouse metadata CSV into a DataFrame and preview its first five rows
mouse_metadata_master = pd.read_csv(filepath_or_buffer=mouse_metadata_file)
mouse_metadata_master.head(n=5)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [7]:
# Load the study results CSV into a DataFrame and preview its first five rows
study_results_master = pd.read_csv(filepath_or_buffer=study_results_file)
study_results_master.head(n=5)

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [69]:
# Combine study results (left) with mouse metadata (right),
# matching rows on the shared "Mouse ID" column
compiled_data_master = study_results_master.merge(mouse_metadata_master, on="Mouse ID")
compiled_data_master

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22
...,...,...,...,...,...,...,...,...
1888,m601,25,33.118756,1,Capomulin,Male,22,17
1889,m601,30,31.758275,1,Capomulin,Male,22,17
1890,m601,35,30.834357,1,Capomulin,Male,22,17
1891,m601,40,31.378045,1,Capomulin,Male,22,17


In [36]:
# Check that every (Mouse ID, Timepoint) pair appears only once.
# keep=False flags every row of a repeated pair, not just the later copies.
# Many thanks to Stephanie Richards for her help!
duplicates = compiled_data_master.loc[
    compiled_data_master.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
]
duplicates

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
860,g989,0,45.0,0,Propriva,Female,21,26
861,g989,0,45.0,0,Propriva,Female,21,26
862,g989,5,48.786801,0,Propriva,Female,21,26
863,g989,5,47.570392,0,Propriva,Female,21,26
864,g989,10,51.745156,0,Propriva,Female,21,26
865,g989,10,49.880528,0,Propriva,Female,21,26
866,g989,15,51.325852,1,Propriva,Female,21,26
867,g989,15,53.44202,0,Propriva,Female,21,26
868,g989,20,55.326122,1,Propriva,Female,21,26
869,g989,20,54.65765,1,Propriva,Female,21,26


In [39]:
# Drop every record belonging to a mouse that has duplicated
# (Mouse ID, Timepoint) rows. The affected IDs are read from the
# `duplicates` frame computed earlier instead of being hard-coded,
# so this cell stays correct if the input data changes.
duplicate_mouse_ids = duplicates["Mouse ID"].unique()
cleaned_master = compiled_data_master.loc[
    ~compiled_data_master["Mouse ID"].isin(duplicate_mouse_ids)
]
cleaned_master

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22
...,...,...,...,...,...,...,...,...
1888,m601,25,33.118756,1,Capomulin,Male,22,17
1889,m601,30,31.758275,1,Capomulin,Male,22,17
1890,m601,35,30.834357,1,Capomulin,Male,22,17
1891,m601,40,31.378045,1,Capomulin,Male,22,17


In [None]:
# Summary Statistics Table
# mean, median, variance, STD, SEM of tumor volume for EACH drug regimen

In [63]:
# Group the cleaned data by drug regimen, then compute one tumor-volume
# statistic per regimen: mean, median, variance and standard deviation
drug_groups = cleaned_master.groupby("Drug Regimen")
mean_tumor_volume = drug_groups["Tumor Volume (mm3)"].agg("mean")
median_tumor_volume = drug_groups["Tumor Volume (mm3)"].agg("median")
variance_tumor_volume = drug_groups["Tumor Volume (mm3)"].agg("var")
STD_tumor_volume = drug_groups["Tumor Volume (mm3)"].agg("std")

Drug Regimen
Capomulin    4.994774
Ceftamin     6.268188
Infubinol    6.567243
Ketapril     8.279709
Naftisol     8.134708
Placebo      7.821003
Propriva     6.622085
Ramicane     4.846308
Stelasyn     7.710419
Zoniferol    6.966589
Name: Tumor Volume (mm3), dtype: float64

In [66]:
# Standard error of the mean of tumor volume for each drug regimen
SEM_tumor_volume = drug_groups["Tumor Volume (mm3)"].agg("sem")
SEM_tumor_volume

Drug Regimen
Capomulin    0.329346
Ceftamin     0.469821
Infubinol    0.492236
Ketapril     0.603860
Naftisol     0.596466
Placebo      0.581331
Propriva     0.544332
Ramicane     0.320955
Stelasyn     0.573111
Zoniferol    0.516398
Name: Tumor Volume (mm3), dtype: float64

In [67]:
# Assemble the per-regimen statistics side by side into one summary table;
# each Series becomes a column labelled by the matching entry in `keys`
Summary_Statistics_Table = pd.concat(
    [
        mean_tumor_volume,
        median_tumor_volume,
        variance_tumor_volume,
        STD_tumor_volume,
        SEM_tumor_volume,
    ],
    axis=1,
    keys=[
        "Mean of Tumor Volume",
        "Median of Tumor Volume",
        "Variance of Tumor Volume",
        "Standard Deviation of Tumor Volume",
        "SEM of Tumor Volume",
    ],
)
Summary_Statistics_Table

Unnamed: 0_level_0,Mean of Tumor Volume,Median of Tumor Volume,Variance of Tumor Volume,Standard Deviation of Tumor Volume,SEM of Tumor Volume
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [None]:
# Make two identical bar charts with two different methods
# First with DataFrame.plot()
# Second with Matplotlib pyplot