In [None]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random as rd
import scipy.stats as st

# Locations of the two CSV inputs (relative to the notebook's working directory)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the mouse metadata table and the study results table
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join both tables on their shared "Mouse ID" column; metadata columns come first
data = mouse_metadata.merge(study_results, on="Mouse ID")

# Preview the first rows of the combined table
data.head()

In [None]:
# Count the distinct mouse IDs present in the merged data
mice = len(pd.unique(data["Mouse ID"]))
print(f"{mice} mice were observed in this study")

In [None]:
# Flag rows whose (Mouse ID, Timepoint) pair already appeared earlier in the table.
# The result is a boolean Series aligned with `data`: the first occurrence of a pair
# stays False, every later repetition of that pair is True.
duplicates = data.duplicated(subset=["Mouse ID", "Timepoint"], keep="first")

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# `duplicates` only marks the repeated rows themselves, so indexing `data` with it would
# show just those repetitions. To see everything recorded for the affected mice, collect
# their IDs first and then select every row that belongs to one of those IDs.
duplicate_ids = data.loc[duplicates, "Mouse ID"].unique()
duplicate_mice = data[data["Mouse ID"].isin(duplicate_ids)]
duplicate_mice

In [None]:
# Build the cleaned DataFrame.
# No mouse is removed: when a (Mouse ID, Timepoint) pair occurs more than once, only the
# last row recorded for that pair is kept.
deduplicated = data.drop_duplicates(subset=["Mouse ID", "Timepoint"], keep="last")

# Identifier-friendly column names, so columns can also be reached as attributes
column_aliases = {
    "Drug Regimen": "Drug_Regimen",
    "Weight (g)": "Weight",
    "Tumor Volume (mm3)": "Tumor_Volume",
    "Metastatic Sites": "Metastatic_Sites",
}
data_clean = deduplicated.rename(columns=column_aliases)
data_clean.head()

In [None]:
# Number of distinct mouse IDs that remain after the clean-up
clean_mice = len(pd.unique(data_clean["Mouse ID"]))
print(f"{clean_mice} mice were observed in this study, after cleaning the database")

In [None]:
# Summary statistics of the tumor volume for each drug regimen:
# mean, median, variance, standard deviation and standard error of the mean (SEM).

# Group the cleaned measurements by regimen and select the tumor volume column once
by_drug = data_clean.groupby(['Drug_Regimen'])
tumor_by_drug = by_drug['Tumor_Volume']

# One Series per statistic, each indexed by drug regimen
mean = tumor_by_drug.mean()
median = tumor_by_drug.median()
variance = tumor_by_drug.var()
sd = tumor_by_drug.std()
st_error = tumor_by_drug.sem()

# Assemble the individual Series into a single summary table (one row per regimen)
summary_columns = {
    "Mean": mean,
    "Median": median,
    "Variance": variance,
    "Standard Deviation": sd,
    "SEM": st_error,
}
pd.DataFrame(summary_columns)


In [None]:
# Bar plot (pandas) of the total number of measurements taken on each drug regimen.
# `value_counts` returns one count per regimen, sorted from most to fewest measurements.
measurements = data_clean['Drug_Regimen'].value_counts()

ax = measurements.plot(kind='bar', title='Measurements per drug regimen', facecolor='b', width=0.8)
ax.set_xlabel('Drug Regimen')
ax.set_ylabel('# of Measurements')
# Leave one bar-width of padding on both sides of the bars
ax.set_xlim(-1, len(measurements))
# Keep the original 0-240 range, but grow it if a regimen ever has more measurements
# so that the tallest bar is never clipped
ax.set_ylim(0, max(240, measurements.max() + 10))
plt.show()

In [None]:
# Bar plot (pyplot) of the total number of measurements taken on each drug regimen.
# Reuses the per-regimen counts in `measurements`; its index supplies the bar labels.
plt.bar(measurements.index.tolist(), measurements, color='b', width=0.8)
plt.xticks(rotation='vertical')
plt.title('Measurements per drug regimen')
plt.xlabel('Drug Regimen')
plt.ylabel('# of Measurements')
# Leave one bar-width of padding on both sides of the bars
plt.xlim(-1, len(measurements))
# Keep the original 0-240 range, but grow it if a regimen ever has more measurements
# so that the tallest bar is never clipped
plt.ylim(0, max(240, measurements.max() + 10))
plt.show()

In [None]:
# Pie plot (pandas) of the distribution of female versus male mice.
# `data_clean` holds one row per measurement, so counting 'Sex' on it directly would weight
# each mouse by its number of timepoints. Keep a single row per mouse before counting so
# the chart really describes mice.
# NOTE(review): assumes every Mouse ID maps to exactly one Sex value - confirm against the metadata.
mice_once = data_clean.drop_duplicates(subset='Mouse ID')
gender = mice_once['Sex'].value_counts()
gender.plot(kind='pie', title='Distribution of female vs male mice', autopct="%1.1f%%")

In [None]:
# Same pie chart drawn directly with pyplot.
# The wedge labels are taken from the index of the `gender` counts.
labels = gender.index.to_list()
plt.pie(gender, autopct="%1.1f%%", labels=labels)
plt.title('Distribution of female vs male mice')
plt.ylabel('Sex')


In [None]:
# Final tumor volume of each mouse for four treatment regimens:
# Capomulin, Ramicane, Infubinol and Ceftamin.

# The promising treatment regimens
p_treatments = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Order the measurements by Timepoint (ascending) and keep, for every mouse, only the
# row that comes last in that order, i.e. its final observation.
chronological = data_clean.sort_values(by="Timepoint")
f_points = chronological.drop_duplicates(subset=["Mouse ID"], keep="last")

# Restrict the final observations to the four promising regimens
is_promising = f_points["Drug_Regimen"].isin(p_treatments)
fp_points = f_points[is_promising]

fp_points


In [None]:
# Interquartile range and potential-outlier bounds of the final tumor volume,
# reported separately for each promising treatment regimen.
df_fil = fp_points[['Mouse ID', 'Drug_Regimen', 'Tumor_Volume']]


def iqr_bounds(volumes):
    """Return ``(iqr, lower_bound, upper_bound)`` for a Series of tumor volumes.

    ``iqr`` is the distance between the 25% and 75% quantiles. The bounds sit
    1.5 * IQR below the lower quartile and 1.5 * IQR above the upper quartile;
    values outside them are potential outliers.
    """
    quartiles = volumes.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq
    return iqr, lowerq - (1.5 * iqr), upperq + (1.5 * iqr)


separator = '------------------------------------------------------------------------------------------'

# Final tumor volumes keyed by treatment name, filled while reporting each regimen
volumes_by_treatment = {}
for treatment in p_treatments:
    treatment_vol = df_fil[df_fil['Drug_Regimen'] == treatment]['Tumor_Volume']
    volumes_by_treatment[treatment] = treatment_vol

    iqr, lbound, ubound = iqr_bounds(treatment_vol)
    print(separator)
    print(f"The interquartile range of the tumor volume for the {treatment} treatment is: {round(iqr,2)}")
    print(f"Values below {round(lbound,2)} could be outliers for the {treatment} treatment.")
    print(f"Values above {round(ubound,2)} could be outliers for the {treatment} treatment.")
    print(separator)

# Per-treatment volume Series under the names the box-plot cell reads
ca_vol = volumes_by_treatment['Capomulin']
r_vol = volumes_by_treatment['Ramicane']
i_vol = volumes_by_treatment['Infubinol']
ce_vol = volumes_by_treatment['Ceftamin']


In [None]:
# Box plot of the final tumor volume of each mouse, one box per regimen of interest.
# Outlier points (fliers) are drawn as red diamonds.
red_diamond = {'markerfacecolor': 'r', 'marker': 'D'}
final_volumes = [ca_vol, r_vol, i_vol, ce_vol]

fig1, ax1 = plt.subplots()
ax1.boxplot(final_volumes, labels=p_treatments, flierprops=red_diamond, patch_artist=True)
ax1.set_title('Final Tumor Volume per Treatment Regimen ')
ax1.set_ylabel('Tumor Volume (mm3)')
plt.show()

In [None]:
# Line plot of tumor volume vs. time point for one mouse treated with Capomulin.

# Measurements of the Capomulin regimen only
capo_data = data_clean[data_clean['Drug_Regimen'] == 'Capomulin']

# Pick one of the Capomulin mice at random.
# NOTE(review): the generator is not seeded, so a different mouse (and figure) may be
# produced on every run - seed `rd` if a reproducible figure is required.
mouse_selected = rd.choice(capo_data['Mouse ID'].unique())

# All measurements of the selected mouse
mouse_df = capo_data[capo_data['Mouse ID'] == mouse_selected]

plt.plot(mouse_df['Timepoint'], mouse_df['Tumor_Volume'], color='red', label='Tumor Volume', lw=3)
plt.legend(loc="best")
plt.title('Tumor Volume Changes for ' + mouse_selected + ' mouse')
plt.xlabel('Time Points')
# One tick every 5 time units, from the first to the last timepoint of this mouse (inclusive)
plt.xticks(np.arange(min(mouse_df['Timepoint']), max(mouse_df['Timepoint'])+1, 5))
plt.ylabel('Tumor Volume (mm3)')
# Call show(); a bare `plt.show` only echoes the function object as the cell output
plt.show()


In [None]:
# Scatter plot for the Capomulin regimen: per-mouse mean tumor volume on the x axis,
# per-mouse mean weight on the y axis.
scatter_data = capo_data.groupby('Mouse ID')[['Tumor_Volume', 'Weight']].mean()
scatter_data.plot.scatter(
    x='Tumor_Volume',
    y='Weight',
    title='Average Tumor Volume vs. Mouse Weight for the Capomulin Regimen',
)
plt.show()

In [None]:
# Pearson correlation between mouse weight and average tumor volume (Capomulin regimen).
# `st.pearsonr` returns the coefficient first, followed by the p-value.
weight = scatter_data['Weight']
volume = scatter_data['Tumor_Volume']
correlation = st.pearsonr(volume, weight)

print(f"The correlation coefficient between mouse weight and average tumor volume is {round(correlation[0],2)}")


In [None]:
# Least-squares line of mouse weight as a function of average tumor volume (Capomulin regimen)
slope, intercept, rvalue, pvalue, std_err = st.linregress(volume, weight)

# Weight predicted by the fitted line for each observed average tumor volume
weight_fit = intercept + slope * volume

# Scatter of the observations with the fitted line drawn on top as a dashed line
scatter_data.plot.scatter(
    x='Tumor_Volume',
    y='Weight',
    title='Average Tumor Volume vs. Mouse Weight for the Capomulin Regimen',
)
plt.plot(volume, weight_fit, "--")
plt.show()
