In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress
import numpy as np


# Read the mouse metadata and the study results straight into DataFrames.
# Both paths are relative, so the CSV files are looked up in the
# notebook's working directory.
mouse_metadata = pd.read_csv("Pymaceuticals_data_Mouse_metadata.csv")
study_results = pd.read_csv("Pymaceuticals_data_Study_results.csv")

In [None]:
# Join the study results with the mouse metadata on 'Mouse ID'. The outer
# join keeps IDs that appear in only one of the two tables.
combined_data_df = study_results.merge(mouse_metadata, on='Mouse ID', how='outer')
combined_data_df

In [None]:
# Summary statistics (mean, median, variance, standard deviation and SEM) of
# the tumor volume for each drug regimen.
#
# The tumor-volume column is selected *before* aggregating. Aggregating the
# whole frame also tries to reduce the text columns (e.g. 'Mouse ID', 'Sex'),
# which raises a TypeError on pandas >= 2.0, where non-numeric columns are no
# longer dropped silently.
tumor_by_regimen = combined_data_df.groupby('Drug Regimen')['Tumor Volume (mm3)']

tum_mean = tumor_by_regimen.mean()
tum_median1 = tumor_by_regimen.median()
tum_vars1 = tumor_by_regimen.var()
tum_stdev1 = tumor_by_regimen.std()
tum_stderr1 = tumor_by_regimen.sem()

In [None]:
# Assemble the per-regimen statistics into one table, one column per statistic.
summary_table1_df = pd.DataFrame({
    'Mean Tumor Volume': tum_mean,
    'Median Tumor Volume': tum_median1,
    'Tumor Volume Variance': tum_vars1,
    'Tumor Volume Std. Dev.': tum_stdev1,
    'Tumor Volume Std. Err.': tum_stderr1,
})
summary_table1_df

In [None]:
# Number of non-null tumor-volume measurements recorded for each drug regimen,
# kept both as a Series and as a one-column table for plotting.
drug_sum1 = combined_data_df.groupby('Drug Regimen')['Tumor Volume (mm3)'].count()
summary_table2_df = drug_sum1.to_frame(name='Number of Data Points')

In [None]:
# Bar chart of the number of data points per treatment regimen, drawn through
# the pandas plotting accessor.
bar1 = summary_table2_df.plot.bar()
bar1.set_ylabel("Number of Data Points")

In [None]:
# Bar chart of the number of data points per treatment regimen, drawn with
# matplotlib's pyplot interface directly rather than through pandas.
regimen_counts = summary_table2_df["Number of Data Points"]

plt.bar(regimen_counts.index, regimen_counts.values)
plt.xticks(rotation=90)  # vertical tick labels, same as the pandas bar-plot default
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Data Points")
plt.show()

In [None]:
# Count the rows (non-null 'Mouse ID' entries) recorded for each sex
sex_num1 = combined_data_df.groupby('Sex')['Mouse ID'].count()

# Same counts as a one-column table for display
summary_table3_df = sex_num1.to_frame(name='Gender')
summary_table3_df

In [None]:
 # Generate a pie plot showing the distribution of female versus male mice using pyplot
# NOTE(review): labels and colors are applied positionally, so this assumes
# sex_num1 lists the female count first and the male count second — confirm.
plt.pie(
    sex_num1,
    labels=["Females", "Males"],
    colors=["orange", "blue"],
    explode=(0, 0),
    autopct="%1.1f%%",
    shadow=True,
    startangle=180,
)
plt.show()

In [None]:
# Calculate the final tumor volume of each mouse across four of the most
# promising treatment regimens, then calculate the IQR and quantitatively
# determine whether there are any potential outliers.
# TODO: the per-regimen sections below are placeholders and are not implemented yet.
# Quantitatively determine capomulin outliers


# Quantitatively determine ramicane outliers


# Quantitatively determine infubinol outliers


# Quantitatively determine ceftamin outliers

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest.
# The data is assembled in this cell: a mouse's "final" tumor volume is the
# volume recorded at its greatest timepoint.
regimens_of_interest = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
# assumes these spellings match the values in 'Drug Regimen' — TODO confirm

# The outer merge may leave rows with no measurement; they cannot be a final reading.
measured_df = combined_data_df.dropna(subset=['Timepoint', 'Tumor Volume (mm3)'])

# Row label of each mouse's last recorded timepoint (first match on ties)
last_row_labels = measured_df.groupby('Mouse ID')['Timepoint'].idxmax()
final_volume_df = measured_df.loc[last_row_labels]

# One array of final volumes per regimen, in the order listed above
final_volumes = [
    final_volume_df.loc[
        final_volume_df['Drug Regimen'] == regimen, 'Tumor Volume (mm3)'
    ].to_numpy()
    for regimen in regimens_of_interest
]

fig1, ax1 = plt.subplots()
ax1.boxplot(final_volumes)
# Boxes are drawn in list order, so the tick labels line up with the regimens
ax1.set_xticklabels(regimens_of_interest)
ax1.set_xlabel('Drug Regimen')
ax1.set_ylabel('Final Tumor Volume (mm3)')
plt.show()


In [None]:
# All rows for a single mouse (ID "m601"), then just its regimen and timepoints
is_m601 = combined_data_df['Mouse ID'] == "m601"
mouse_guy = combined_data_df[is_m601]
mouse_guy1 = mouse_guy.loc[:, ['Drug Regimen', 'Timepoint']]
mouse_guy1

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin.
# Plot from mouse_guy (all columns for mouse m601): mouse_guy1 only holds the
# regimen and timepoint columns, so it cannot supply the tumor-volume axis.
# Sorting by timepoint keeps the line running left to right whatever the row order.
line_ax = mouse_guy.sort_values('Timepoint').plot.line(
    x='Timepoint', y='Tumor Volume (mm3)', legend=False
)
# The returned Axes is kept in a local name; nothing is assigned onto the
# pyplot module itself.
line_ax.set_title("Capomulin treatment of mouse m601")
line_ax.set_xlabel("Timepoint (days)")
line_ax.set_ylabel("Tumor Volume(mm3)")
plt.show()



In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

# Keep only the Capomulin rows so the averages describe that regimen alone.
# assumes the regimen is spelled 'Capomulin' in 'Drug Regimen' — TODO confirm
capomulin_df = combined_data_df.loc[combined_data_df['Drug Regimen'] == 'Capomulin']

# Average tumor volume at each distinct mouse weight. The column is selected
# before averaging because a whole-frame mean also hits the text columns and
# raises a TypeError on pandas >= 2.0.
scat_graph1 = capomulin_df.groupby('Weight (g)')['Tumor Volume (mm3)'].mean()

# Table indexed by weight, with the weight repeated as an ordinary column so
# it can be used as the x-axis of a scatter plot.
summary_table4_df = scat_graph1.to_frame(name="Average Tumor Volume (mm3)")
summary_table4_df["Weight"] = summary_table4_df.index.to_numpy()

scatter_ax = summary_table4_df.plot.scatter(
    x='Weight', y='Average Tumor Volume (mm3)', color='red'
)
scatter_ax.set_xlabel('Weight (g)')
scatter_ax.set_ylabel('Average Tumor Volume (mm3)')
plt.show()

In [None]:
# Calculate the Pearson correlation coefficient between the weight and the
# average tumor volume columns of summary_table4_df.
weight_values = summary_table4_df["Weight"]
avg_volume_values = summary_table4_df["Average Tumor Volume (mm3)"]

# pearsonr returns (coefficient, p-value); only the coefficient is reported
correlation, _ = st.pearsonr(weight_values, avg_volume_values)
print("The correlation between weight and average tumor volume is " + str(round(correlation, 2)))


In [None]:
# Fit a least-squares line of average tumor volume on weight and draw it over
# the scatter of the same points. Both axes are read from the same table so
# the x and y values are guaranteed to be paired row by row.
x_values = summary_table4_df["Weight"]
y_values = summary_table4_df["Average Tumor Volume (mm3)"]

slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
# Place the equation relative to the axes (upper-left corner) rather than at
# fixed data coordinates, which may lie outside the plotted range and leave
# the text invisible.
plt.annotate(line_eq, (0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.show()