## Observations and Insights 

In [1]:
%matplotlib notebook

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import sem
import random
import string
import scipy.stats as st
from scipy import stats

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)


In [3]:
# Checking the number of mice.
length_mouse_metadata = len(mouse_metadata['Mouse ID'].unique())
length_study_results = len(study_results['Mouse ID'].unique())
print(f'The number of unique mouse in the mouse metadata file is {length_mouse_metadata}.')
print(f'The number of unique mouse in the study results file is {length_study_results}.')

The number of unique mouse in the mouse metadata file is 249.
The number of unique mouse in the study results file is 249.


In [4]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
# Optional: Get all the data for the duplicate mouse ID. 
grouped_study_results = study_results.groupby(['Mouse ID','Timepoint']).count()
grouped_study_results.loc[grouped_study_results['Tumor Volume (mm3)']>1]


Unnamed: 0_level_0,Unnamed: 1_level_0,Tumor Volume (mm3),Metastatic Sites
Mouse ID,Timepoint,Unnamed: 2_level_1,Unnamed: 3_level_1
g989,0,2,2
g989,5,2,2
g989,10,2,2
g989,15,2,2
g989,20,2,2


In [5]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
cleaned_study_results = study_results.loc[study_results['Mouse ID'] != 'g989']
cleaned_mouse_metadata = mouse_metadata.loc[mouse_metadata['Mouse ID'] != 'g989']

In [6]:
# Combine the data into a single dataset
merged_df = pd.merge(cleaned_mouse_metadata,cleaned_study_results, on="Mouse ID", how = "outer")
# Display the data table for preview
merged_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1875,z969,Naftisol,Male,9,30,25,63.145652,2
1876,z969,Naftisol,Male,9,30,30,65.841013,3
1877,z969,Naftisol,Male,9,30,35,69.176246,4
1878,z969,Naftisol,Male,9,30,40,70.314904,4


In [7]:
# Checking the number of mice in the clean DataFrame.
length_merged_df = len(merged_df['Mouse ID'].unique())
length_cleaned_study_results = len(cleaned_study_results['Mouse ID'].unique())
length_cleaned_mouse_metadata = len(cleaned_mouse_metadata['Mouse ID'].unique())
print(f'The number of mice in the merged DataFrame is {length_merged_df}.')
print(f'The number of mice in the cleaned study result file is {length_cleaned_study_results}.')
print(f'The number of mice in the cleaned mouse metadata file is {length_cleaned_mouse_metadata}.')

The number of mice in the merged DataFrame is 248.
The number of mice in the cleaned study result file is 248.
The number of mice in the cleaned mouse metadata file is 248.


## Summary Statistics

In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
grouped_drug_regimen = merged_df.groupby(merged_df['Drug Regimen'])
tumor_data = grouped_drug_regimen['Tumor Volume (mm3)']
#Finding the mean for Tumor Volume (mm3)
mean_tumor = tumor_data.mean()

#Finding the median for Tumor Volume (mm3)
median_tumor = tumor_data.median()

#Finding the variance for Tumor Volume (mm3)
variance_tumor = tumor_data.var(ddof=1)
#[np.var(grouped_drug_regimen, ddof = 0) for x in tumor_data]

#standard deviation
sd_tumor = tumor_data.std(ddof=1)

#SEM 
std_error_tumor = tumor_data.sem(ddof=1)

# This method is the most straighforward, creating multiple series and putting them all together at the end.
summary_stats_tumor = pd.DataFrame([mean_tumor,median_tumor,variance_tumor,sd_tumor,std_error_tumor]).transpose()
summary_stats_tumor.columns.values[0] = 'Mean of Tumor Volume (mm3)'
summary_stats_tumor.columns.values[1] = 'Median of Tumor Volume (mm3)'
summary_stats_tumor.columns.values[2] = 'Variance of Tumor Volume (mm3)'
summary_stats_tumor.columns.values[3] = 'Std Deviation of Tumor Volume (mm3)'
summary_stats_tumor.columns.values[4] = 'SEM of Tumor Volume (mm3)'
summary_stats_tumor

Unnamed: 0_level_0,Mean of Tumor Volume (mm3),Median of Tumor Volume (mm3),Variance of Tumor Volume (mm3),Std Deviation of Tumor Volume (mm3),SEM of Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [9]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function
tumor_data_gb = grouped_drug_regimen['Tumor Volume (mm3)']
tumor_data_gb.describe()
tumor_data_gb.agg(["mean","median","var","std","sem"])

Unnamed: 0_level_0,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


## Bar and Pie Charts

In [10]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 

new_merged_df = merged_df[['Drug Regimen','Mouse ID']]
grouped_merged = new_merged_df.groupby(['Drug Regimen'])
bar_plot_data= grouped_merged.count()
x_axis = merged_df['Drug Regimen'].unique().tolist()
sorted_x_axis = sorted(x_axis)
sorted_x_axis

plt.figure(figsize=(6,9))

plt.bar(sorted_x_axis,bar_plot_data['Mouse ID'], align='center')
plt.title('Total number of mice for each Drug Regimen')
plt.xlabel('Drug Regimen')
plt.xticks(rotation=45)
plt.ylabel('Count of mice')


<IPython.core.display.Javascript object>

Text(0, 0.5, 'Count of mice')

In [11]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
new_merged_df = merged_df[['Drug Regimen','Mouse ID']]
new_merged_df
grouped_merged = new_merged_df.groupby(['Drug Regimen'])
bar_plot_data['Mouse ID']= pd.to_numeric(grouped_merged['Mouse ID'].count())
bar_plot_data
bar_plot_pandas = bar_plot_data.plot(kind="bar", figsize = (6,9.5),title = 'Total number of mice for each Drug Regimen',legend = False)
bar_plot_pandas.set_xlabel('Drug Regimen')
bar_plot_pandas.set_ylabel('Count of mice')

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Count of mice')

In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
#sizes
fig = plt.figure()
ax = plt.axes()

sizes = merged_df.value_counts('Sex').tolist()

#labels
labels = merged_df['Sex'].unique().tolist()

plt.pie(sizes, labels = labels, autopct ="%1.1f%%")
plt.axis("equal")
plt.title('Distribution of female and male in the study')


<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Distribution of female and male in the study')

In [13]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

new= pd.DataFrame(merged_df['Sex'].value_counts())
new
pie_plot_pandas = new.plot.pie(y= 'Sex',autopct ="%1.1f%%", title = 'Distribution of female and male in the study', legend = False, xlabel=False)
pie_plot_pandas.set_ylabel('')

<IPython.core.display.Javascript object>

Text(0, 0.5, '')

## Quartiles, Outliers and Boxplots

In [14]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin
filter_merged_Capomulin = merged_df.loc[merged_df['Drug Regimen']=='Capomulin']
filter_merged_Ramicane = merged_df.loc[merged_df['Drug Regimen']=='Ramicane']
filter_merged_Infubinol = merged_df.loc[merged_df['Drug Regimen']=='Infubinol']
filter_merged_Ceftamin = merged_df.loc[merged_df['Drug Regimen']=='Ceftamin']
filter_merged_df = pd.concat([filter_merged_Capomulin,filter_merged_Ramicane,filter_merged_Infubinol,filter_merged_Ceftamin])

# Start by getting the last (greatest) timepoint for each mouse
filter_MouseID = filter_merged_df.groupby('Mouse ID')
max_timepoint= pd.DataFrame(filter_MouseID['Timepoint'].max())

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint
final_tumor_volume = filter_merged_df.merge(max_timepoint, how ='inner', left_on=['Mouse ID','Timepoint'], right_on = ['Mouse ID','Timepoint'] )
final_tumor_volume.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,s185,Capomulin,Female,3,17,45,23.343598,1
1,x401,Capomulin,Female,16,15,45,28.484033,0
2,m601,Capomulin,Male,22,17,45,28.430964,1
3,f966,Capomulin,Male,16,17,20,30.485985,0
4,u364,Capomulin,Male,18,17,45,31.023923,3


In [15]:
# Put treatments into a list for for loop (and later for plot labels)
list_drugs = final_tumor_volume['Drug Regimen'].unique().tolist()
list_drugs
# Create empty list to fill with tumor vol data (for plotting)
tumor_volume_data_list = []

# Calculate the IQR and quantitatively determine if there are any potential outliers. 
#Capomulin

for drug_name in list_drugs:
    tumor_volume= final_tumor_volume.loc[final_tumor_volume['Drug Regimen']== drug_name]

    quartiles = tumor_volume['Tumor Volume (mm3)'].quantile([0.25,0.5,0.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    print(f"The lower quartile of {drug_name} is: {round(lowerq,2)}")
    print(f"The upper quartile of {drug_name}  is: {round(upperq,2)}")
    print(f"The interquartile range of {drug_name} is: {round(iqr,2)}")
    print(f"The the median of {drug_name} is: {round(quartiles[0.5],2)} ")

    lower_bound = lowerq - (1.5*iqr)
    upper_bound = upperq + (1.5*iqr)
    print(f"Values below {round(lower_bound,2)} could be outliers for {drug_name}.")
    print(f"Values above {round(upper_bound,2)} could be outliers for {drug_name}.")
    print('\n')
    
    #print('\n')


The lower quartile of Capomulin is: 32.38
The upper quartile of Capomulin  is: 40.16
The interquartile range of Capomulin is: 7.78
The the median of Capomulin is: 38.13 
Values below 20.7 could be outliers for Capomulin.
Values above 51.83 could be outliers for Capomulin.


The lower quartile of Ramicane is: 31.56
The upper quartile of Ramicane  is: 40.66
The interquartile range of Ramicane is: 9.1
The the median of Ramicane is: 36.56 
Values below 17.91 could be outliers for Ramicane.
Values above 54.31 could be outliers for Ramicane.


The lower quartile of Infubinol is: 54.05
The upper quartile of Infubinol  is: 65.53
The interquartile range of Infubinol is: 11.48
The the median of Infubinol is: 60.17 
Values below 36.83 could be outliers for Infubinol.
Values above 82.74 could be outliers for Infubinol.


The lower quartile of Ceftamin is: 48.72
The upper quartile of Ceftamin  is: 64.3
The interquartile range of Ceftamin is: 15.58
The the median of Ceftamin is: 59.85 
Values below 

In [16]:
#Boxplot

fig1, ax1 = plt.subplots()
ax1.set_ylim(20, 90)
drug1= final_tumor_volume.loc[final_tumor_volume['Drug Regimen']== 'Capomulin']
drug2= final_tumor_volume.loc[final_tumor_volume['Drug Regimen']== 'Ramicane']
drug3= final_tumor_volume.loc[final_tumor_volume['Drug Regimen']== 'Infubinol']
drug4= final_tumor_volume.loc[final_tumor_volume['Drug Regimen']== 'Ceftamin']

data1 = drug1['Tumor Volume (mm3)']
data2 = drug2['Tumor Volume (mm3)']
data3 = drug3['Tumor Volume (mm3)']
data4 = drug4['Tumor Volume (mm3)']
all_data = [data1,data2,data3,data4]

ax1.boxplot(all_data,sym='ro')

x_axis = range(1,len(list_drugs)+1)
tick_drug = [value for value in x_axis]
plt.xticks(tick_drug, list_drugs)


<IPython.core.display.Javascript object>

([<matplotlib.axis.XTick at 0x1f8040b4d90>,
  <matplotlib.axis.XTick at 0x1f8040b4d60>,
  <matplotlib.axis.XTick at 0x1f80408e820>,
  <matplotlib.axis.XTick at 0x1f8041218e0>],
 [Text(1, 0, 'Capomulin'),
  Text(2, 0, 'Ramicane'),
  Text(3, 0, 'Infubinol'),
  Text(4, 0, 'Ceftamin')])

## Line and Scatter Plots

In [17]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

#Find list of mice that were treated with Capomulin
capomulin_mice = merged_df.loc[merged_df['Drug Regimen']== 'Capomulin']
list_capomulin_mice = capomulin_mice['Mouse ID'].unique().tolist()
n = random.randint(0,len(list_capomulin_mice)+1)
timepoint_capomulin_mice = capomulin_mice.groupby('Mouse ID')['Timepoint'].apply(list).tolist()
tumorSize_capomulin_mice = capomulin_mice.groupby('Mouse ID')['Tumor Volume (mm3)'].apply(list).tolist()


#Data for plotting chart
x_axis= timepoint_capomulin_mice[n]
points = tumorSize_capomulin_mice[n]
mouse_id = list_capomulin_mice[n]

fig2, ax2 = plt.subplots()
plt.plot(x_axis,points)
plt.title(f'Timepoint vs Tumor Volume for Mouse ID: {mouse_id}')
plt.xlabel('Timepoint')
plt.ylabel('Tumor Volume (mm3)')
plt.xticks(np.arange(0, max(x_axis), step=5)) 



<IPython.core.display.Javascript object>

([<matplotlib.axis.XTick at 0x1f80415edf0>,
  <matplotlib.axis.XTick at 0x1f80415edc0>,
  <matplotlib.axis.XTick at 0x1f80415d8e0>,
  <matplotlib.axis.XTick at 0x1f80417d910>,
  <matplotlib.axis.XTick at 0x1f80417de20>,
  <matplotlib.axis.XTick at 0x1f804184370>,
  <matplotlib.axis.XTick at 0x1f804184880>,
  <matplotlib.axis.XTick at 0x1f804184d90>,
  <matplotlib.axis.XTick at 0x1f80418a2e0>],
 [Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, '')])

In [18]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

capomulin_mice2 = merged_df.loc[merged_df['Drug Regimen']== 'Capomulin']
weight_tumorVol_capomulin = (capomulin_mice2.groupby('Mouse ID').mean())[['Weight (g)','Tumor Volume (mm3)']]

fig3, ax3 = plt.subplots()
plt.scatter(weight_tumorVol_capomulin['Weight (g)'],weight_tumorVol_capomulin['Tumor Volume (mm3)'], facecolors = 'red', edgecolors = 'black')
plt.title('Weight vs Average Tumor Volume for Capomulin treated mice')
plt.xlabel('Weight of mice')
plt.ylabel('Average Tumor Volume(mm3)')



wt_slope, wt_int, wt_r, wt_p, wt_std_err = stats.linregress(weight_tumorVol_capomulin['Weight (g)'],weight_tumorVol_capomulin['Tumor Volume (mm3)'])
wt_fit = wt_slope * weight_tumorVol_capomulin['Weight (g)'] + wt_int
plt.plot(weight_tumorVol_capomulin['Weight (g)'],wt_fit,"g--")
line_eq = "y = " + str(round(wt_slope,2)) + "x + " + str(round(wt_int,2))
plt.annotate(line_eq,(16,36),fontsize=15,color="green")

<IPython.core.display.Javascript object>

Text(16, 36, 'y = 0.95x + 21.55')

## Correlation and Regression

In [19]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
print(f"The correlation coefficient between the weight of mice and average tumor volume in mm3 is {round(st.pearsonr(weight_tumorVol_capomulin['Weight (g)'],weight_tumorVol_capomulin['Tumor Volume (mm3)'])[0],2)}")

The correlation coefficient between the weight of mice and average tumor volume in mm3 is 0.84
