In [23]:
#import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

In [8]:
#csv files
mouse_metadata_path = "mouse_metadata.csv"
study_results_path = "pymaceuticals_data.csv"

In [9]:
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

In [10]:
#merge csv files
data_complete = pd.merge(mouse_metadata, study_results, how="left", on = ["Mouse ID", "Mouse ID"])

In [11]:
data_complete.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [13]:
#total mouse count
total_mice = data_complete["Mouse ID"].count()
total_mice

1893

In [14]:
#identify duplicate data
duplicate_data = data_complete[data_complete.duplicated(["Mouse ID", "Timepoint"])]
print(duplicate_data)

    Mouse ID Drug Regimen     Sex  Age_months  Weight (g)  Timepoint  \
909     g989     Propriva  Female          21          26          0   
911     g989     Propriva  Female          21          26          5   
913     g989     Propriva  Female          21          26         10   
915     g989     Propriva  Female          21          26         15   
917     g989     Propriva  Female          21          26         20   

     Tumor Volume (mm3)  Metastatic Sites  
909           45.000000                 0  
911           47.570392                 0  
913           49.880528                 0  
915           53.442020                 0  
917           54.657650                 1  


In [16]:
data_complete2 = data_complete.drop_duplicates(["Mouse ID", "Timepoint"])
data_complete2

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [17]:
total_mice2 = data_complete2["Mouse ID"].count()
total_mice2

1888

In [47]:
data_complete3 = data_complete2.set_index(["Drug Regimen"])
data_complete3

Unnamed: 0_level_0,Mouse ID,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ramicane,k403,Male,21,16,0,45.000000,0
Ramicane,k403,Male,21,16,5,38.825898,0
Ramicane,k403,Male,21,16,10,35.014271,1
Ramicane,k403,Male,21,16,15,34.223992,1
Ramicane,k403,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...
Naftisol,z969,Male,9,30,25,63.145652,2
Naftisol,z969,Male,9,30,30,65.841013,3
Naftisol,z969,Male,9,30,35,69.176246,4
Naftisol,z969,Male,9,30,40,70.314904,4


In [36]:
total_mice_per_regimen = data_complete2.groupby(["Drug Regimen"]).count()["Mouse ID"]
total_mice_per_regimen

Drug Regimen
Capomulin    230
Ceftamin     178
Infubinol    178
Ketapril     188
Naftisol     186
Placebo      181
Propriva     156
Ramicane     228
Stelasyn     181
Zoniferol    182
Name: Mouse ID, dtype: int64

In [37]:
mean_tumor_volume_per_regimen = data_complete2.groupby(["Drug Regimen"]).mean()["Tumor Volume (mm3)"]
mean_tumor_volume_per_regimen

Drug Regimen
Capomulin    40.675741
Ceftamin     52.591172
Infubinol    52.884795
Ketapril     55.235638
Naftisol     54.331565
Placebo      54.033581
Propriva     52.393463
Ramicane     40.216745
Stelasyn     54.233149
Zoniferol    53.236507
Name: Tumor Volume (mm3), dtype: float64

In [38]:
median_tumor_volume_per_regimen = data_complete2.groupby(["Drug Regimen"]).median()["Tumor Volume (mm3)"]
median_tumor_volume_per_regimen

Drug Regimen
Capomulin    41.557809
Ceftamin     51.776157
Infubinol    51.820584
Ketapril     53.698743
Naftisol     52.509285
Placebo      52.288934
Propriva     50.909965
Ramicane     40.673236
Stelasyn     52.431737
Zoniferol    51.818479
Name: Tumor Volume (mm3), dtype: float64

In [41]:
variance_tumor_volume_per_regimen = data_complete2.groupby(["Drug Regimen"]).var()["Tumor Volume (mm3)"]
variance_tumor_volume_per_regimen

Drug Regimen
Capomulin    24.947764
Ceftamin     39.290177
Infubinol    43.128684
Ketapril     68.553577
Naftisol     66.173479
Placebo      61.168083
Propriva     43.138803
Ramicane     23.486704
Stelasyn     59.450562
Zoniferol    48.533355
Name: Tumor Volume (mm3), dtype: float64

In [42]:
std_tumor_volume_per_regimen = data_complete2.groupby(["Drug Regimen"]).std()["Tumor Volume (mm3)"]
std_tumor_volume_per_regimen

Drug Regimen
Capomulin    4.994774
Ceftamin     6.268188
Infubinol    6.567243
Ketapril     8.279709
Naftisol     8.134708
Placebo      7.821003
Propriva     6.568014
Ramicane     4.846308
Stelasyn     7.710419
Zoniferol    6.966589
Name: Tumor Volume (mm3), dtype: float64

In [45]:
sem_tumor_volume_per_regimen = data_complete2.groupby(["Drug Regimen"]).sem()["Tumor Volume (mm3)"]
sem_tumor_volume_per_regimen

Drug Regimen
Capomulin    0.329346
Ceftamin     0.469821
Infubinol    0.492236
Ketapril     0.603860
Naftisol     0.596466
Placebo      0.581331
Propriva     0.525862
Ramicane     0.320955
Stelasyn     0.573111
Zoniferol    0.516398
Name: Tumor Volume (mm3), dtype: float64

In [49]:
stats_df = pd.DataFrame({"Drug Regimen": data_complete3,
                         "Total Mice": total_mice_per_regimen,
                        "Mean": mean_tumor_volume_per_regimen,
                        "Median": median_tumor_volume_per_regimen,
                        "Variance": variance_tumor_volume_per_regimen,
                        "Std Dev": std_tumor_volume_per_regimen,
                        "SEM": sem_tumor_volume_per_regimen})
stats_df

ValueError: Shape of passed values is (7, 7), indices imply (10, 7)

In [32]:
stats2_df = data_complete2.groupby(data_complete2["Drug Regimen"])["Tumor Volume (mm3)"].agg({"mean": np.mean,
                                                                                              "median": np.median,
                                                                                              "variance": np.var,
                                                                                              "std": np.std,
                                                                                              "sem": st.sem})
stats2_df

SpecificationError: nested renamer is not supported