## Observations and Insights 

In [3]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)
print(mouse_metadata)
print(study_results)

    Mouse ID Drug Regimen     Sex  Age_months  Weight (g)
0       k403     Ramicane    Male          21          16
1       s185    Capomulin  Female           3          17
2       x401    Capomulin  Female          16          15
3       m601    Capomulin    Male          22          17
4       g791     Ramicane    Male          11          16
..       ...          ...     ...         ...         ...
244     z314     Stelasyn  Female          21          28
245     z435     Propriva  Female          12          26
246     z581    Infubinol  Female          24          25
247     z795     Naftisol  Female          13          29
248     z969     Naftisol    Male           9          30

[249 rows x 5 columns]
     Mouse ID  Timepoint  Tumor Volume (mm3)  Metastatic Sites
0        b128          0           45.000000                 0
1        f932          0           45.000000                 0
2        g107          0           45.000000                 0
3        a457          0    

In [32]:
# Combine the data into a single dataset
mouse_study_data = pd.merge(mouse_metadata, study_results, on="Mouse ID", how = "outer")

# Display the data table for preview
mouse_study_data

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [39]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
cleaned_mouse_data = mouse_study_data.drop_duplicates(subset = 'Mouse ID', keep = 'last')
cleaned_mouse_data

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
9,k403,Ramicane,Male,21,16,45,22.050126,1
19,s185,Capomulin,Female,3,17,45,23.343598,1
29,x401,Capomulin,Female,16,15,45,28.484033,0
39,m601,Capomulin,Male,22,17,45,28.430964,1
49,g791,Ramicane,Male,11,16,45,29.128472,1
...,...,...,...,...,...,...,...,...
1859,z314,Stelasyn,Female,21,28,5,45.934712,0
1862,z435,Propriva,Female,12,26,10,48.710661,0
1872,z581,Infubinol,Female,24,25,45,62.754451,3
1882,z795,Naftisol,Female,13,29,45,65.741070,3


In [40]:
# Checking the number of mice in the clean DataFrame.
clean_number_mice = len(cleaned_mouse_data['Mouse ID'])
clean_number_mice

249

## Summary Statistics

In [61]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
drug_grouping_count = cleaned_mouse_data.groupby(['Drug Regimen']).count()
drug_grouping_mean = cleaned_mouse_data.groupby(['Drug Regimen']).mean()
drug_grouping_median = cleaned_mouse_data.groupby(['Drug Regimen']).median()
drug_grouping_var = cleaned_mouse_data.groupby(['Drug Regimen']).var()
drug_grouping_stddev = cleaned_mouse_data.groupby(['Drug Regimen']).std()
drug_grouping_sem = cleaned_mouse_data.groupby(['Drug Regimen']).sem()
#drug_grouping_mean
#drug_grouping_median
#drug_grouping_var
#drug_grouping_stddev
#drug_grouping_sem
drug_grouping_count

Unnamed: 0_level_0,Mouse ID,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Capomulin,25,25,25,25,25,25,25
Ceftamin,25,25,25,25,25,25,25
Infubinol,25,25,25,25,25,25,25
Ketapril,25,25,25,25,25,25,25
Naftisol,25,25,25,25,25,25,25
Placebo,25,25,25,25,25,25,25
Propriva,25,25,25,25,25,25,25
Ramicane,25,25,25,25,25,25,25
Stelasyn,24,24,24,24,24,24,24
Zoniferol,25,25,25,25,25,25,25


In [72]:
drug_grouping_stats = pd.DataFrame(drug_grouping_count['Tumor Volume (mm3)'])
drug_grouping_stats = drug_grouping_stats.rename(columns = {"Tumor Volume (mm3)": "Count"})
drug_grouping_stats['Mean'] = drug_grouping_mean['Tumor Volume (mm3)']
drug_grouping_stats['Median'] = drug_grouping_median['Tumor Volume (mm3)']
drug_grouping_stats['Variance'] = drug_grouping_var['Tumor Volume (mm3)']
drug_grouping_stats['Standard Deviation'] = drug_grouping_stddev['Tumor Volume (mm3)']
drug_grouping_stats['Standard Error'] = drug_grouping_sem['Tumor Volume (mm3)']
drug_grouping_stats

Unnamed: 0_level_0,Count,Mean,Median,Variance,Standard Deviation,Standard Error
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Capomulin,25,36.667568,38.125164,32.663378,5.715188,1.143038
Ceftamin,25,57.753977,59.851956,69.982735,8.365568,1.673114
Infubinol,25,58.178246,60.16518,74.010875,8.602957,1.720591
Ketapril,25,62.806191,64.487812,98.92133,9.94592,1.989184
Naftisol,25,61.205757,63.283288,106.029927,10.297083,2.059417
Placebo,25,60.508414,62.030594,78.759797,8.874672,1.774934
Propriva,25,56.736964,55.84141,69.349002,8.327605,1.665521
Ramicane,25,36.19139,36.561652,32.166354,5.671539,1.134308
Stelasyn,24,61.001707,62.19235,90.331586,9.504293,1.940056
Zoniferol,25,59.181258,61.840058,76.862027,8.767099,1.75342


## Bar and Pie Charts

In [73]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.
cleaned_mouse_data.plot(x = drug_grouping_stats['Drug Regimen'], y = drug_grouping_count, kind = 'bar')


KeyError: 'Drug Regimen'

In [80]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.
x_axis = drug_grouping_count.index
y_axis = drug_grouping_count
#plt.bar(x_axis, y_axis)
y_axis

Unnamed: 0_level_0,Mouse ID,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Capomulin,25,25,25,25,25,25,25
Ceftamin,25,25,25,25,25,25,25
Infubinol,25,25,25,25,25,25,25
Ketapril,25,25,25,25,25,25,25
Naftisol,25,25,25,25,25,25,25
Placebo,25,25,25,25,25,25,25
Propriva,25,25,25,25,25,25,25
Ramicane,25,25,25,25,25,25,25
Stelasyn,24,24,24,24,24,24,24
Zoniferol,25,25,25,25,25,25,25


In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
