## Observations and Insights

1. The standard error margins for Ramicane and Capomulin were markedly lower than those for the other drug regimens in this study, while also drawing from larger samples.

2. No potential outliers were detected across the four regimens analyzed (Capomulin, Ramicane, Infubinol, Ceftamin).

3. There was a moderate positive correlation (0.5257) between mouse weight and average tumor volume for the Capomulin regimen.

## Dependencies and starter code

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st
import scipy as sp
%matplotlib notebook

# Study data files
mouse_metadata = "data/Mouse_metadata.csv"
study_results = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata)
study_results = pd.read_csv(study_results)

# Combine the data into a single dataset
merge_mouse = pd.merge(mouse_metadata, study_results, on="Mouse ID", how="outer")
merge_mouse.head()

## Summary statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM
# of the tumor volume for each regimen
reg_mea = pd.DataFrame(merge_mouse.groupby(['Drug Regimen']).mean()['Tumor Volume (mm3)'])
reg_mea = reg_mea.rename(columns={'Tumor Volume (mm3)':'Mean'})
reg_med = pd.DataFrame(merge_mouse.groupby(['Drug Regimen']).median()['Tumor Volume (mm3)'])
reg_med = reg_med.rename(columns={'Tumor Volume (mm3)':'Median'})
reg_var = pd.DataFrame(merge_mouse.groupby(['Drug Regimen']).var()['Tumor Volume (mm3)'])
reg_var = reg_var.rename(columns={'Tumor Volume (mm3)':'Variance'})
reg_std = pd.DataFrame(merge_mouse.groupby(['Drug Regimen']).std()['Tumor Volume (mm3)'])
reg_std = reg_std.rename(columns={'Tumor Volume (mm3)':'StDev'})
reg_sem = pd.DataFrame(merge_mouse.groupby(['Drug Regimen']).sem()['Tumor Volume (mm3)'])
reg_sem = reg_sem.rename(columns={'Tumor Volume (mm3)':'SEM'})
regimen = [reg_mea, reg_med, reg_var, reg_std, reg_sem]
regimen = pd.concat(regimen,axis=1,join='inner')
regimen

## Bar plots

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pandas
reg_pandas = merge_mouse['Drug Regimen'].value_counts()
reg_pandas_bar = reg_pandas.plot(kind='bar', facecolor='red')
plt.title('Data Points by Treatment Regimen')
plt.xlabel('Drug Regimen')
plt.ylabel('Data Points')

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot
reg_pyplot = ['Capomulin', 'Ramicane', 'Ketapril', 'Naftisol', 'Zoniferol', 'Placebo', 'Stelasyn', 'Ceftamin',
              'Infubinol', 'Propriva']
dat_points = [230, 228, 188, 186, 182, 181, 181, 178, 178, 161]
x_axis = np.arange(len(dat_points))
plt.bar(x_axis, dat_points, color='b', align='center')
tick_locations = [value for value in x_axis]
plt.xticks(rotation=90)
plt.xticks(tick_locations, reg_pyplot)
plt.xlim(-0.75, len(x_axis)-0.25)
plt.ylim(0, max(dat_points)+10)
plt.title('Data Points by Treatment Regimen')
plt.xlabel('Drug Regimen')
plt.ylabel('Data Points')
plt.show()

## Pie plots

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
mal_fem = pd.DataFrame(merge_mouse['Sex'].value_counts())
mal_fem

In [None]:
gen_pan = pd.DataFrame({'Gender':[958, 935]}, index = ['Male', 'Female'])
chart_x = gen_pan.plot.pie(y='Gender', figsize=(5, 5), autopct='%1.1f%%', startangle=60)

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
gen_pyp = pd.DataFrame(merge_mouse['Sex'].value_counts())
labels = ['Male', 'Female']
plt.pie(gen_pyp.values.ravel(), labels=labels, autopct='%1.1f%%', startangle=60)
plt.title('Gender')
plt.legend()
plt.tight_layout()
plt.show()

## Quartiles, outliers and boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens.
# Calculate the IQR and quantitatively determine if there are any potential outliers.
final_vol = merge_mouse.loc[(merge_mouse['Timepoint'] == 45)]
capo = final_vol.groupby(['Drug Regimen']).get_group('Capomulin')
capo.head()

In [None]:
rami = final_vol.groupby(['Drug Regimen']).get_group('Ramicane')
rami.head()

In [None]:
infu = final_vol.groupby(['Drug Regimen']).get_group('Infubinol')
infu.head()

In [None]:
ceft = final_vol.groupby(['Drug Regimen']).get_group('Ceftamin')
ceft.head()

In [None]:
capo_quar = capo.quantile([.25,.5,.75])
capo_quar

In [None]:
rami_quar = rami.quantile([.25,.5,.75])
rami_quar

In [None]:
infu_quar = infu.quantile([.25,.5,.75])
infu_quar

In [None]:
ceft_quar = ceft.quantile([.25,.5,.75])
ceft_quar

In [None]:
capo_low = capo_quar.iloc[0,3]
capo_upp = capo_quar.iloc[2,3]
capo_iqr = (capo_upp - capo_low).round(3)
rami_low = rami_quar.iloc[0,3]
rami_upp = rami_quar.iloc[2,3]
rami_iqr = (rami_upp - rami_low).round(3)
infu_low = infu_quar.iloc[0,3]
infu_upp = infu_quar.iloc[2,3]
infu_iqr = (infu_upp - infu_low).round(3)
ceft_low = ceft_quar.iloc[0,3]
ceft_upp = ceft_quar.iloc[2,3]
ceft_iqr = (ceft_upp - ceft_low).round(3)
print(f"The IQR for Capomulin is {capo_iqr}.")
print(f"The IQR for Ramicane is {rami_iqr}.")
print(f"The IQR for Infubinol is {infu_iqr}.")
print(f"The IQR for Ceftamin is {ceft_iqr}.")

In [None]:
capo_lob = (capo_low - (1.5 * capo_iqr)).round(3)
capo_upb = (capo_upp + (1.5 * capo_iqr)).round(3)
rami_lob = (rami_low - (1.5 * rami_iqr)).round(3)
rami_upb = (rami_upp + (1.5 * rami_iqr)).round(3)
infu_lob = (infu_low - (1.5 * infu_iqr)).round(3)
infu_upb = (infu_upp + (1.5 * infu_iqr)).round(3)
ceft_lob = (ceft_low - (1.5 * ceft_iqr)).round(3)
ceft_upb = (ceft_upp + (1.5 * ceft_iqr)).round(3)
print(f"Potential outliers for Capomulin are values below {capo_lob} and above {capo_upb}.")
print(f"Potential outliers for Ramicane are values below {rami_lob} and above {rami_upb}.")
print(f"Potential outliers for Infubinol are values below {infu_lob} and above {infu_upb}.")
print(f"Potential outliers for Ceftamin are values below {ceft_lob} and above {ceft_upb}.")

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
regi_fin = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
data = [capo['Tumor Volume (mm3)'], rami['Tumor Volume (mm3)'], infu['Tumor Volume (mm3)'], ceft['Tumor Volume (mm3)']]
fig, axs = plt.subplots(figsize=(8, 5))
axs.set_title('Treatment of Tumors in Mice')
axs.set_xlabel('Drug Regimen')
axs.set_ylabel('Final Tumor Volume (mm3)')
axs.set_xticklabels(np.repeat(regi_fin, 1))
axs.boxplot(data)
plt.show()

## Line and scatter plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
capo_line = merge_mouse.groupby(['Mouse ID']).get_group('y793')
capo_line

In [None]:
time_pt = capo_line['Timepoint']
volume1 = capo_line['Tumor Volume (mm3)']
plt.plot(time_pt, volume1)
plt.xticks(np.arange(min(time_pt), max(time_pt)+1, 5))
plt.title('Results for Capomulin on Mouse ID y793')
plt.xlabel('Timepoint (days)')
plt.ylabel('Tumor Volume (mm3)')
plt.grid(linestyle='dashed')
plt.show()

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
capo_sca = merge_mouse.groupby(['Drug Regimen']).get_group('Capomulin')
capo_sca.head()

In [None]:
weight = capo_sca['Weight (g)']
tu_vol = capo_sca['Tumor Volume (mm3)']
plt.scatter(weight, tu_vol, marker="o", facecolors="teal", edgecolors="black")
plt.xlim(14,26)
plt.ylim(20,50)
plt.xticks(np.arange(min(weight), max(weight)+1, 1))
plt.title('Mouse Weight vs. Average Tumor Volume')
plt.xlabel('Weight (g)')
plt.ylabel('Tumor Volume (mm3)')
plt.grid(linestyle='dashed')
plt.show()

In [None]:
# Calculate the correlation coefficient and linear regression model for mouse weight
# and average tumor volume for the Capomulin regimen
cor_w_v = st.pearsonr(capo_sca['Weight (g)'],capo_sca['Tumor Volume (mm3)'])
cor_w_v

In [None]:
mod_w_v = st.linregress(capo_sca['Weight (g)'],capo_sca['Tumor Volume (mm3)'])
mod_w_v

In [None]:
slope = 0.9609669432428346
intercept = 21.489827213299584

In [None]:
trend = capo_sca['Weight (g)'] * slope + intercept
plt.scatter(weight, tu_vol, marker="o", facecolors="teal", edgecolors="black")
plt.xlim(14,26)
plt.ylim(20,50)
plt.xticks(np.arange(min(weight), max(weight)+1, 1))
plt.title('Mouse Weight vs. Average Tumor Volume')
plt.xlabel('Weight (g)')
plt.ylabel('Tumor Volume (mm3)')
plt.plot(capo_sca['Weight (g)'], trend, color='red')
plt.grid(linestyle='dashed')
plt.show()