In [121]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset: a left join keeps every study
# measurement and attaches the matching mouse's metadata to it.
# 'Mouse ID' is the only join key, so it is passed once instead of as a
# list that repeats the same column name.
mouse_data = pd.merge(study_results, mouse_metadata, how='left', on='Mouse ID')

# Display the data table for preview
mouse_data

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,f932,0,45.000000,0,Ketapril,Male,15,29
2,g107,0,45.000000,0,Ketapril,Female,2,29
3,a457,0,45.000000,0,Ketapril,Female,11,30
4,c819,0,45.000000,0,Ketapril,Male,21,25
...,...,...,...,...,...,...,...,...
1888,r944,45,41.581521,2,Capomulin,Male,12,25
1889,u364,45,31.023923,3,Capomulin,Male,18,17
1890,p438,45,61.433892,1,Ceftamin,Female,11,26
1891,x773,45,58.634971,4,Placebo,Female,21,30


In [122]:
# Print a summary of the merged frame: column names, non-null counts and dtypes
mouse_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1893 entries, 0 to 1892
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Mouse ID            1893 non-null   object 
 1   Timepoint           1893 non-null   int64  
 2   Tumor Volume (mm3)  1893 non-null   float64
 3   Metastatic Sites    1893 non-null   int64  
 4   Drug Regimen        1893 non-null   object 
 5   Sex                 1893 non-null   object 
 6   Age_months          1893 non-null   int64  
 7   Weight (g)          1893 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 133.1+ KB


In [123]:
# Count the distinct Mouse IDs present in the merged data
mouse_data['Mouse ID'].nunique(dropna=False)

249

In [124]:
# Locate mice that have more than one row for the same timepoint.
# keep=False flags every member of a duplicated (Mouse ID, Timepoint) pair;
# grouping by that pair and taking the minimum collapses each pair to one row.
(
    mouse_data
    .loc[lambda frame: frame.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)]
    .groupby(by=['Mouse ID', 'Timepoint'])
    .min()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
Mouse ID,Timepoint,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
g989,0,45.0,0,Propriva,Female,21,26
g989,5,47.570392,0,Propriva,Female,21,26
g989,10,49.880528,0,Propriva,Female,21,26
g989,15,51.325852,0,Propriva,Female,21,26
g989,20,54.65765,1,Propriva,Female,21,26


In [125]:
# Create a new dataframe without any mouse that has duplicated measurements.
# The IDs to drop are derived from the data (a mouse is dropped when it has
# more than one row for the same timepoint) rather than typed in by hand,
# so the cell stays correct if the input files change.
duplicate_mouse_ids = mouse_data.loc[
    mouse_data.duplicated(subset=['Mouse ID', 'Timepoint']), 'Mouse ID'
].unique()

# Remove every row belonging to those mice, not just the repeated rows
cleaned_data = mouse_data.loc[~mouse_data['Mouse ID'].isin(duplicate_mouse_ids)]

In [126]:
# Preview the filtered dataframe
cleaned_data

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,f932,0,45.000000,0,Ketapril,Male,15,29
2,g107,0,45.000000,0,Ketapril,Female,2,29
3,a457,0,45.000000,0,Ketapril,Female,11,30
4,c819,0,45.000000,0,Ketapril,Male,21,25
...,...,...,...,...,...,...,...,...
1888,r944,45,41.581521,2,Capomulin,Male,12,25
1889,u364,45,31.023923,3,Capomulin,Male,18,17
1890,p438,45,61.433892,1,Ceftamin,Female,11,26
1891,x773,45,58.634971,4,Placebo,Female,21,30


In [127]:
# Show the leading rows of each drug-regimen group.
# The grouping is built here from cleaned_data so the cell does not depend on
# `group_drug`, which is only assigned in a later cell and is therefore
# undefined when the notebook is run top to bottom on a fresh kernel.
cleaned_data.set_index('Drug Regimen').groupby(['Drug Regimen']).head()

Unnamed: 0_level_0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Sex,Age_months,Weight (g)
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Capomulin,b128,0,45.0,0,Female,9,22
Ketapril,f932,0,45.0,0,Male,15,29
Ketapril,g107,0,45.0,0,Female,2,29
Ketapril,a457,0,45.0,0,Female,11,30
Ketapril,c819,0,45.0,0,Male,21,25
Ketapril,h246,0,45.0,0,Male,13,30
Naftisol,f993,0,45.0,0,Male,21,28
Naftisol,z234,0,45.0,0,Female,19,27
Naftisol,b559,0,45.0,0,Male,20,26
Naftisol,x930,0,45.0,0,Male,13,26


In [128]:
# Count the distinct Mouse IDs that remain after cleaning
cleaned_data['Mouse ID'].nunique(dropna=False)

248

In [129]:
# Re-index the cleaned data by drug regimen ...
drug_regimen = cleaned_data.set_index(keys='Drug Regimen')
# ... and group the rows that share a regimen.
group_drug = drug_regimen.groupby(by=['Drug Regimen'])


In [130]:
# Mean of every numeric column per drug regimen.
# Non-numeric columns (e.g. 'Mouse ID', 'Sex') are excluded up front: older
# pandas dropped them silently, but pandas >= 2.0 raises a TypeError when a
# groupby reduction meets string data.
group_drug_mean = (
    drug_regimen
    .select_dtypes(include='number')
    .groupby(['Drug Regimen'])
    .mean()
)

# Label the tumor-volume column for the summary table
drug_mean = group_drug_mean.rename(columns={"Tumor Volume (mm3)": "Mean Tumor Volume"})

In [131]:
# Median of every numeric column per drug regimen.
# Non-numeric columns (e.g. 'Mouse ID', 'Sex') are excluded up front: older
# pandas dropped them silently, but pandas >= 2.0 raises a TypeError when a
# groupby reduction meets string data.
group_drug_median = (
    drug_regimen
    .select_dtypes(include='number')
    .groupby(['Drug Regimen'])
    .median()
)

# Label the tumor-volume column for the summary table
drug_median = group_drug_median.rename(columns={"Tumor Volume (mm3)": "Median Tumor Volume"})

In [132]:
# Standard deviation of every numeric column per drug regimen.
# Non-numeric columns (e.g. 'Mouse ID', 'Sex') are excluded up front: older
# pandas dropped them silently, but pandas >= 2.0 raises a TypeError when a
# groupby reduction meets string data.
group_drug_std = (
    drug_regimen
    .select_dtypes(include='number')
    .groupby(['Drug Regimen'])
    .std()
)

# Label the tumor-volume column for the summary table
drug_std = group_drug_std.rename(columns={"Tumor Volume (mm3)": "Tumor Volume Std. Dev."})

In [133]:
# Standard error of the mean of every numeric column per drug regimen.
# Non-numeric columns (e.g. 'Mouse ID', 'Sex') are excluded up front: older
# pandas dropped them silently, but pandas >= 2.0 raises a TypeError when a
# groupby reduction meets string data.
group_drug_sem = (
    drug_regimen
    .select_dtypes(include='number')
    .groupby(['Drug Regimen'])
    .sem()
)

# Label the tumor-volume column for the summary table
drug_sem = group_drug_sem.rename(columns={"Tumor Volume (mm3)": "Tumor Volume Std. Err."})



In [134]:
# Variance of every numeric column per drug regimen.
# Non-numeric columns (e.g. 'Mouse ID', 'Sex') are excluded up front: older
# pandas dropped them silently, but pandas >= 2.0 raises a TypeError when a
# groupby reduction meets string data.
group_drug_var = (
    drug_regimen
    .select_dtypes(include='number')
    .groupby(['Drug Regimen'])
    .var()
)

# Label the tumor-volume column for the summary table
drug_var = group_drug_var.rename(columns={"Tumor Volume (mm3)": "Tumor Volume Variance"})



In [143]:
# Assemble the per-regimen tumor-volume summary table: one column per
# statistic, aligned side by side on the shared 'Drug Regimen' index.
pd.concat(
    [
        drug_mean['Mean Tumor Volume'],
        drug_median['Median Tumor Volume'],
        drug_var['Tumor Volume Variance'],
        drug_std['Tumor Volume Std. Dev.'],
        drug_sem['Tumor Volume Std. Err.'],
    ],
    axis=1,
)

Unnamed: 0_level_0,Mean Tumor Volume,Median Tumor Volume,Tumor Volume Variance,Tumor Volume Std. Dev.,Tumor Volume Std. Err.
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398
