## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "Mouse_metadata.csv"
study_results_path = "Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset

study_results=pd.DataFrame(study_results)
print(study_results)

all=pd.merge(mouse_metadata, study_results, on="Mouse ID")

all 
#Display the data table for preview
all.tail()

     Mouse ID  Timepoint  Tumor Volume (mm3)  Metastatic Sites
0        b128          0           45.000000                 0
1        f932          0           45.000000                 0
2        g107          0           45.000000                 0
3        a457          0           45.000000                 0
4        c819          0           45.000000                 0
...       ...        ...                 ...               ...
1888     r944         45           41.581521                 2
1889     u364         45           31.023923                 3
1890     p438         45           61.433892                 1
1891     x773         45           58.634971                 4
1892     b879         45           72.555239                 2

[1893 rows x 4 columns]


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4
1892,z969,Naftisol,Male,9,30,45,73.867845,4


In [2]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
##counting how many different type of "Timepoints" are

test=all.groupby("Mouse ID")
test_t=test["Timepoint"].value_counts() #this identifies how many times the value Timepoint is repeated
test_t=pd.DataFrame(test_t)
repeated=test_t.loc[test_t['Timepoint'] > 1] #So, if a Timepoint value is greater than 1, it means that is repeated at least once. 
repeated

Unnamed: 0_level_0,Unnamed: 1_level_0,Timepoint
Mouse ID,Timepoint,Unnamed: 2_level_1
g989,0,2
g989,5,2
g989,10,2
g989,15,2
g989,20,2


In [3]:
look_repeated=all.loc[all["Mouse ID"] =="g989"] #will get a subset with only the repetitions for this mouse
indices_repeated =list(look_repeated.index) #obtaining the indexes to use them to drop rows by index
cleaned_df=all.drop(indices_repeated)

In [4]:
cleaned_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [5]:
# Optional: Get all the data for the duplicate mouse ID. 



In [6]:
# Checking the number of mice in the clean DataFrame.
number_mices=cleaned_df[["Mouse ID"]].nunique()
number_mices

Mouse ID    248
dtype: int64

## Summary Statistics

In [25]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

drug_regimen=all.groupby("Drug Regimen")

drug_regimen_mean=drug_regimen[["Tumor Volume (mm3)"]].mean().round(2)
drug_regimen_mean=drug_regimen_mean.rename(columns={"Tumor Volume (mm3)":"Tum Vol Avg (mm3)"})

drug_regimen_med=drug_regimen[["Tumor Volume (mm3)"]].median().round(2)
drug_regimen_med=drug_regimen_med.rename(columns={"Tumor Volume (mm3)":"Tum Vol Median"})

drug_regimen_var=drug_regimen[["Tumor Volume (mm3)"]].var().round(2)
drug_regimen_var=drug_regimen_var.rename(columns={"Tumor Volume (mm3)":"Tum Vol Variance"})

drug_regimen_std=drug_regimen[["Tumor Volume (mm3)"]].std().round(2)
drug_regimen_std=drug_regimen_std.rename(columns={"Tumor Volume (mm3)":"Std Dev"})

drug_regimen_sem=drug_regimen[["Tumor Volume (mm3)"]].sem().round(2)
drug_regimen_sem=drug_regimen_sem.rename(columns={"Tumor Volume (mm3)":"SEM"})

sum_stats_regimen=pd.concat([drug_regimen_mean,drug_regimen_med,drug_regimen_var,drug_regimen_std,drug_regimen_sem], axis=1)
sum_stats_regimen = pd.DataFrame(sum_stats_regimen)
sum_stats_regimen


Unnamed: 0_level_0,Tum Vol Avg (mm3),Tum Vol Median,Tum Vol Variance,Std Dev,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.68,41.56,24.95,4.99,0.33
Ceftamin,52.59,51.78,39.29,6.27,0.47
Infubinol,52.88,51.82,43.13,6.57,0.49
Ketapril,55.24,53.7,68.55,8.28,0.6
Naftisol,54.33,52.51,66.17,8.13,0.6
Placebo,54.03,52.29,61.17,7.82,0.58
Propriva,52.32,50.85,42.35,6.51,0.51
Ramicane,40.22,40.67,23.49,4.85,0.32
Stelasyn,54.23,52.43,59.45,7.71,0.57
Zoniferol,53.24,51.82,48.53,6.97,0.52


In [31]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
drug_regimen_tum=drug_regimen[["Tumor Volume (mm3)"]]
drug_regimen_tum=drug_regimen_tum.agg(["mean","median","var","std","sem"]).round(2)
drug_regimen_tum
# Using the aggregation method, produce the same summary statistics in a single line


Unnamed: 0_level_0,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3)
Unnamed: 0_level_1,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Capomulin,40.68,41.56,24.95,4.99,0.33
Ceftamin,52.59,51.78,39.29,6.27,0.47
Infubinol,52.88,51.82,43.13,6.57,0.49
Ketapril,55.24,53.7,68.55,8.28,0.6
Naftisol,54.33,52.51,66.17,8.13,0.6
Placebo,54.03,52.29,61.17,7.82,0.58
Propriva,52.32,50.85,42.35,6.51,0.51
Ramicane,40.22,40.67,23.49,4.85,0.32
Stelasyn,54.23,52.43,59.45,7.71,0.57
Zoniferol,53.24,51.82,48.53,6.97,0.52


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.



In [10]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
