In [1]:
%matplotlib notebook

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
import statistics as stat
from scipy.stats import linregress
%matplotlib inline

In [7]:
# Read the two study CSV files (mouse metadata, then study results) from Resources/
mouse_metadata_df, study_results_df = (
    pd.read_csv(csv_path)
    for csv_path in ("Resources/Mouse_metadata.csv", "Resources/Study_results.csv")
)

In [None]:
# Preview the first rows of the mouse metadata DataFrame (already loaded above)
mouse_metadata_df.head()

In [None]:
# Display the study results DataFrame (already loaded above)
study_results_df

In [None]:
# Left-join the study results onto the mouse metadata, matching rows by "Mouse ID"
combined_df = mouse_metadata_df.merge(study_results_df, how="left", on="Mouse ID")
combined_df

In [None]:
# Number of distinct Mouse IDs in the merged DataFrame
total_mice = pd.unique(combined_df["Mouse ID"])
len(total_mice)

In [None]:
# Count how many rows each Mouse ID has in the merged DataFrame.
# Note: this rebinds total_mice from the array of unique IDs to a Series of per-mouse row counts.
total_mice = combined_df["Mouse ID"].value_counts()
total_mice

In [None]:
# Show every row recorded for Mouse ID "g989"
combined_df[combined_df["Mouse ID"].eq("g989")]

In [None]:
# Rows whose (Mouse ID, Timepoint) pair repeats an earlier row
is_repeat = combined_df.duplicated(subset=["Mouse ID", "Timepoint"])
combined_df[is_repeat]

In [None]:
# Build the cleaned DataFrame by dropping every row that belongs to Mouse ID "g989"
clean_df = combined_df[combined_df["Mouse ID"].ne("g989")]
clean_df.head()

In [None]:
# (rows, columns) of the cleaned DataFrame
clean_df.shape

In [None]:
# Number of distinct Mouse IDs remaining in the cleaned DataFrame
total_mice = pd.unique(clean_df["Mouse ID"])
len(total_mice)

# Summary Statistics

In [None]:
# Per-regimen descriptive statistics of the tumor volume, each rounded to 2 decimals:
# mean, median, variance, standard deviation and standard error of the mean (SEM).

# Group the tumor volume column by drug regimen once and reuse it for every statistic
tumor_by_regimen = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]

Mean = tumor_by_regimen.mean().round(2)
Median = tumor_by_regimen.median().round(2)
Variance = tumor_by_regimen.var().round(2)
Standard_Dev = tumor_by_regimen.std().round(2)
SEM = tumor_by_regimen.sem().round(2)

In [None]:
# Display the per-regimen mean tumor volume computed above
Mean

In [None]:
# Display the per-regimen median tumor volume computed above
Median

In [None]:
# Display the per-regimen tumor volume variance computed above
Variance

In [None]:
# Display the per-regimen tumor volume standard deviation computed above
Standard_Dev

In [None]:
# Display the per-regimen standard error of the mean computed above
SEM

In [None]:
# Assemble the per-regimen statistics into a single summary DataFrame.
# The values are read from the Mean / Median / Variance / Standard_Dev / SEM Series
# computed earlier instead of being typed in by hand, so the table always reflects
# the current data. Each Series is reindexed to Mean's index so rows stay aligned
# with the 'Drug Regimen' column.
regimen_index = Mean.index

Stat_data = {
    'Drug Regimen': regimen_index.tolist(),
    'Mean': Mean.tolist(),
    'Median': Median.reindex(regimen_index).tolist(),
    'Variance': Variance.reindex(regimen_index).tolist(),
    'Std_Dev': Standard_Dev.reindex(regimen_index).tolist(),
    'SEM': SEM.reindex(regimen_index).tolist(),
}

Summary_df = pd.DataFrame(Stat_data)
Summary_df

In [None]:
# Histogram of every tumor volume measurement, as a visual check of the shape of
# the distribution (the next cell runs a formal normality test on the same column)
plt.hist(clean_df["Tumor Volume (mm3)"])
plt.title("Distribution of tumor volume")
plt.xlabel("Tumor Volume (mm3)")
plt.ylabel("Number of measurements")
# plt.show() renders the figure and keeps the raw (counts, bins, patches) tuple out of the output
plt.show()

In [None]:
# Run scipy's normality test on the tumor volume column and display the result
st.normaltest(clean_df["Tumor Volume (mm3)"])

In [None]:
# Number of rows (recorded timepoints) per drug regimen in the cleaned data.
# Despite the _df suffix, value_counts() returns a Series indexed by regimen name.
drug_reg_df = clean_df["Drug Regimen"].value_counts()
drug_reg_df

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested
# for each drug regimen using Pandas.
bar = drug_reg_df
# Series.plot returns the Axes it drew on; set the title and labels on that Axes
# so they are guaranteed to land on this figure.
ax = bar.plot(kind='bar')
ax.set_title("The total number of timepoints for all mice tested for each drug regimen")
ax.set_xlabel("Drug Regimen")
ax.set_ylabel("Timepoint")
# Nothing is drawn after show(): with the inline backend, later pyplot calls start a new,
# empty figure. The old plt.legend("best") call was also dropped because the string was
# interpreted as legend labels rather than as a location.
plt.show()

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested
# for each drug regimen using Matplotlib.

# Bar heights and bar labels both come from drug_reg_df so each count is paired with
# its own regimen. (clean_df['Drug Regimen'].unique() lists regimens in order of first
# appearance, which is not the order value_counts() returns them in.)
y_axis = drug_reg_df
x_axis = drug_reg_df.index.to_numpy()
plt.bar(x_axis, y_axis)
plt.title("The total number of timepoints for all mice tested for each drug regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Timepoint")
# Rotate the regimen names so they do not overlap
plt.xticks(rotation=90)
plt.show()

In [None]:
# Count the rows of the cleaned data for each value of the Sex column
total_sex = clean_df['Sex'].value_counts()
total_sex

In [None]:
# Settings for the pie chart of the Sex column counts (drawn in the next cell)
pie = total_sex
# Take the labels from the counts themselves so each wedge is always named after the
# category it represents, whichever category happens to be the larger one.
labels = total_sex.index.tolist()
colors = ["lightcoral", "lightskyblue"]
# Pull the first wedge slightly out of the pie
explode = [0.1, 0]

In [None]:
# Draw the pie chart with matplotlib, using the settings prepared in the previous cell
pie_style = dict(
    explode=explode,
    labels=labels,
    colors=colors,
    autopct="%1.1f%%",
    shadow=True,
    startangle=45,
)
plt.pie(pie, **pie_style)

plt.show()

In [None]:
# Draw the same Sex-count pie chart through the pandas plotting accessor
pie_chart = total_sex
explode = [0.1, 0]
pie_chart.plot.pie(
    fontsize=12,
    explode=explode,
    shadow=True,
    autopct="%1.1f%%",
    startangle=40,
)

# Quartiles, Outliers, and Box Plot





In [None]:
# Greatest Timepoint recorded for each mouse (a Series indexed by Mouse ID)
timepoint_by_mouse = clean_df.groupby("Mouse ID")["Timepoint"]
sel_df = timepoint_by_mouse.max()
sel_df

In [None]:
# Merge each mouse's maximum Timepoint (sel_df) with clean_df on the
# (Mouse ID, Timepoint) pair, attaching the full measurement row for that timepoint.
# No `how` is passed, so pandas' default join type applies.

new_df = pd.merge(sel_df,clean_df, on=(["Mouse ID","Timepoint"]))
new_df.head()

In [None]:
# Regimens of interest; the list order also fixes the positions used in tumor_vol
prt_drug = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# One Series of tumor volumes (taken from new_df) per regimen, in prt_drug order
tumor_vol = [
    new_df.loc[new_df["Drug Regimen"] == drug, "Tumor Volume (mm3)"]
    for drug in prt_drug
]


In [None]:
# Quartiles, IQR and 1.5*IQR outlier fences for tumor_vol[0]
# (the first regimen in prt_drug, "Capomulin")
quartiles = tumor_vol[0].quantile([.25, .5, .75])
lowerq, upperq = quartiles.loc[.25], quartiles.loc[.75]
iqr = upperq - lowerq

fence_width = 1.5 * iqr
lowerq_bound = lowerq - fence_width
upper_bound = upperq + fence_width

In [None]:
# Lower outlier fence (Q1 - 1.5*IQR) for tumor_vol[0]
lowerq_bound

In [None]:
# Upper outlier fence (Q3 + 1.5*IQR) for tumor_vol[0]
upper_bound

In [None]:
# Quartiles, IQR and 1.5*IQR outlier fences for tumor_vol[1]
# (the second regimen in prt_drug, "Ramicane")
quartiles_0 = tumor_vol[1].quantile([.25, .5, .75])
lowerq_0, upperq_0 = quartiles_0.loc[.25], quartiles_0.loc[.75]
iqr_0 = upperq_0 - lowerq_0

fence_width_0 = 1.5 * iqr_0
lowerq_bound_0 = lowerq_0 - fence_width_0
upper_bound_0 = upperq_0 + fence_width_0

In [None]:
# Lower outlier fence (Q1 - 1.5*IQR) for tumor_vol[1]
lowerq_bound_0

In [None]:
# Upper outlier fence (Q3 + 1.5*IQR) for tumor_vol[1]
upper_bound_0

In [None]:
# Quartiles, IQR and 1.5*IQR outlier fences for tumor_vol[2]
# (the third regimen in prt_drug, "Infubinol")
quartiles_1 = tumor_vol[2].quantile([.25, .5, .75])
lowerq_1, upperq_1 = quartiles_1.loc[.25], quartiles_1.loc[.75]
iqr_1 = upperq_1 - lowerq_1

fence_width_1 = 1.5 * iqr_1
lowerq_bound_1 = lowerq_1 - fence_width_1
upper_bound_1 = upperq_1 + fence_width_1

In [None]:
# Lower outlier fence (Q1 - 1.5*IQR) for tumor_vol[2]
lowerq_bound_1

In [None]:
# Upper outlier fence (Q3 + 1.5*IQR) for tumor_vol[2]
upper_bound_1

In [None]:
# Quartiles, IQR and 1.5*IQR outlier fences for tumor_vol[3]
# (the fourth regimen in prt_drug, "Ceftamin")
quartiles_2 = tumor_vol[3].quantile([.25, .5, .75])
lowerq_2, upperq_2 = quartiles_2.loc[.25], quartiles_2.loc[.75]
iqr_2 = upperq_2 - lowerq_2

fence_width_2 = 1.5 * iqr_2
lowerq_bound_2 = lowerq_2 - fence_width_2
upper_bound_2 = upperq_2 + fence_width_2

In [None]:
# Lower outlier fence (Q1 - 1.5*IQR) for tumor_vol[3]
lowerq_bound_2

In [None]:
# Upper outlier fence (Q3 + 1.5*IQR) for tumor_vol[3]
upper_bound_2

In [None]:
# Number of recorded rows for each mouse on the Capomulin regimen
on_capomulin = clean_df["Drug Regimen"] == "Capomulin"
cap_mouse = clean_df.loc[on_capomulin, "Mouse ID"].value_counts()
cap_mouse

In [None]:
# Select every row recorded for Mouse ID "l897", ready for plotting
l897 = clean_df[clean_df["Mouse ID"].eq("l897")]
l897

In [None]:
# Generate a line plot of tumor volume vs. time point for mouse l897.
# Timepoint goes on the x-axis and tumor volume on the y-axis; plotting each column
# separately against the DataFrame's row index would not show volume over time.
plt.plot(l897['Timepoint'], l897['Tumor Volume (mm3)'], label="Tumor Volume")
plt.title("Tumor volume over time for mouse l897")
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3)")
plt.legend()
plt.show()

In [None]:
# Subset of the cleaned data for the Capomulin regimen only
capo_df = clean_df[clean_df["Drug Regimen"].eq("Capomulin")]
capo_df.head()

In [None]:
# Per-mouse mean of every numeric column for the Capomulin mice.
# numeric_only=True skips the text columns (e.g. "Drug Regimen", "Sex"); without it,
# recent pandas versions raise a TypeError when asked to average strings.
average_capo = capo_df.groupby(["Mouse ID"]).mean(numeric_only=True)
average_capo.head()

In [None]:
# Scatter plot for the Capomulin mice: per-mouse mean tumor volume on the x-axis
# against per-mouse mean weight on the y-axis (both taken from average_capo)
tumor_volume = average_capo['Tumor Volume (mm3)']
mouse_weight = average_capo['Weight (g)']
plt.scatter(tumor_volume, mouse_weight)
plt.title("Mouse weight vs. average tumor volume (Capomulin)")
plt.xlabel("Average Tumor Volume (mm3)")
plt.ylabel("Mouse Weight (g)")
plt.show()

In [None]:
# Pearson correlation between average tumor volume and mouse weight, rounded to 2 decimals
capo_corr = round(st.pearsonr(tumor_volume, mouse_weight)[0], 2)
print(f"The correlation coefficient between mouse weight and average tumor volume for the Capomulin regimen is {capo_corr}")

In [None]:
# Least-squares straight-line fit with tumor_volume as x and mouse_weight as y;
# reg holds the coefficients, highest power first (slope, then intercept)
reg = np.polyfit(x=tumor_volume, y=mouse_weight, deg=1)
reg

In [None]:
# Overlay the fitted regression line (red) on the scatter of the Capomulin per-mouse averages.
# The fitted values come from evaluating the polynomial in reg at each tumor volume.
trend = np.polyval(reg, tumor_volume)
plt.scatter(x=tumor_volume, y=mouse_weight)
plt.plot(tumor_volume, trend, color='r')