In [None]:
#import dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats

#hide warning messages
import warnings
warnings.filterwarnings('ignore')

#load the mouse/drug table and the clinical trial table from the Resources folder
#(files are read in the order listed, one per target name)
mouse_drug_data, clinical_trial_data = (
    pd.read_csv(csvPath)
    for csvPath in ("Resources/mouse_drug_data.csv", "Resources/clinicaltrial_data.csv")
)

# Clean and Combine

In [None]:
#the one Mouse ID that shows up more often than it should in the raw tables
duplicateMouseId = "g989"

#count entries per Mouse ID; kept in variables so the counts can be inspected in a later cell
drugIdCounts = mouse_drug_data["Mouse ID"].value_counts() #g989 has 2 entries, maximum of 1
trialIdCounts = clinical_trial_data["Mouse ID"].value_counts() #g989 has 13 entries, maximum of 10

#locate the g989 rows in the drug data
#each g989 row has a different drug: stelasyn, propriva
g989Drug = mouse_drug_data.loc[mouse_drug_data["Mouse ID"] == duplicateMouseId, :]

#locate the g989 rows in the clinical data
#g989 has 2 simultaneous timelines, unable to parse which timeline corresponds to which drug
g989Trial = clinical_trial_data.loc[clinical_trial_data["Mouse ID"] == duplicateMouseId, :]

#remove g989 data from both data sets
clean_mouse_drug_data = mouse_drug_data.loc[mouse_drug_data["Mouse ID"] != duplicateMouseId, :]
clean_clinical_trial_data = clinical_trial_data.loc[clinical_trial_data["Mouse ID"] != duplicateMouseId, :]

#merge data tables
#validate="one_to_many" makes pandas raise a MergeError if a Mouse ID is still repeated in the
#drug table, instead of silently multiplying that mouse's clinical rows in the merged table
drugData = pd.merge(clean_mouse_drug_data, clean_clinical_trial_data,
                    on="Mouse ID", how="outer", validate="one_to_many")
drugData.head(15)

# Tumor Response to Treatment

In [None]:
#group data by drug and timepoint
groupedDrugData = drugData.groupby(["Drug", "Timepoint"])

#find avg tumor volume and metastatic sites for each drug at each timepoint
#numeric_only=True keeps the text "Mouse ID" column out of the mean: pandas 2.0+ raises a
#TypeError when asked to average it, while older versions dropped it silently
drugAvgs = groupedDrugData.mean(numeric_only=True)
drugAvgs.head(15)

In [None]:
#drugs to compare, and the timepoints (0 to 45 in steps of 5) used as the x axis
drugList = ["Capomulin", "Infubinol", "Ketapril", "Placebo"]
timepoints = np.arange(0, 50, 5)

#avg tumor volume for every drug/timepoint pair
tumorVolAvg = drugAvgs["Tumor Volume (mm3)"]
#pull out one series per drug, following the order of drugList
capo_tumorVolAvg, infu_tumorVolAvg, keta_tumorVolAvg, plac_tumorVolAvg = (
    tumorVolAvg.loc[drugName, :] for drugName in drugList
)

#scatter each drug's averages against the timepoints, one marker shape per drug
tumorVolSeries = (capo_tumorVolAvg, infu_tumorVolAvg, keta_tumorVolAvg, plac_tumorVolAvg)
capo_tumorVolAvgPlot, infu_tumorVolAvgPlot, keta_tumorVolAvgPlot, plac_tumorVolAvgPlot = (
    plt.scatter(timepoints, avgSeries, marker=markerShape, label=drugName, alpha=.75)
    for drugName, markerShape, avgSeries in zip(drugList, "DsoX", tumorVolSeries)
)

#titles and axis labels
plt.title("Average Tumor Volume Over Treatment")
plt.xlabel("Days")
plt.ylabel("Tumor Volume (mm^3)")
#faint horizontal reference line at y = 45 spanning x = 0 to 45
plt.hlines(45, 0, 45, alpha=0.25)
plt.legend(loc="best")
plt.tight_layout()
plt.show()

In [None]:
def _tumorVolSem(drug, time):
    """Return the standard error of the tumor volumes recorded for one drug at one timepoint."""
    isMatch = (drugData["Drug"] == drug) & (drugData["Timepoint"] == time)
    return stats.sem(drugData.loc[isMatch, "Tumor Volume (mm3)"])

#std error for every drug/timepoint pair: all timepoints of the first drug, then the next drug, ...
tumorStdErrors = [_tumorVolSem(drug, time) for drug in drugList for time in timepoints]

#cut the flat list into one run of 10 values per drug
capo_tumorStdErrors = tumorStdErrors[0:10]
infu_tumorStdErrors = tumorStdErrors[10:20]
keta_tumorStdErrors = tumorStdErrors[20:30]
plac_tumorStdErrors = tumorStdErrors[30:40]

#avg tumor volume per drug with its std error as the error bar; markers only, no connecting line
errorStyle = {"linestyle": "None", "alpha": .75}
capo_tumorErrorPlot = plt.errorbar(timepoints, capo_tumorVolAvg, yerr=capo_tumorStdErrors,
                                   marker="D", label="Capomulin", **errorStyle)
infu_tumorErrorPlot = plt.errorbar(timepoints, infu_tumorVolAvg, yerr=infu_tumorStdErrors,
                                   marker="s", label="Infubinol", **errorStyle)
keta_tumorErrorPlot = plt.errorbar(timepoints, keta_tumorVolAvg, yerr=keta_tumorStdErrors,
                                   marker="o", label="Ketapril", **errorStyle)
plac_tumorErrorPlot = plt.errorbar(timepoints, plac_tumorVolAvg, yerr=plac_tumorStdErrors,
                                   marker="X", label="Placebo", **errorStyle)

#titles and axis labels
plt.title("Average Tumor Volume Over Treatment")
plt.xlabel("Days")
plt.ylabel("Tumor Volume (mm^3)")
#faint horizontal reference line at y = 45 spanning x = 0 to 45
plt.hlines(45, 0, 45, alpha=0.25)
plt.legend(loc="best")
plt.tight_layout()
plt.show()

# Metastatic Response to Treatment

In [None]:
#avg metastatic sites for every drug/timepoint pair
metastaticAvg = drugAvgs["Metastatic Sites"]

#separate metastatic avg list by drug
capo_metastaticAvg = metastaticAvg.loc["Capomulin", :]
infu_metastaticAvg = metastaticAvg.loc["Infubinol", :]
keta_metastaticAvg = metastaticAvg.loc["Ketapril", :]
plac_metastaticAvg = metastaticAvg.loc["Placebo", :]

#plot metastatic avg over time, one marker shape per drug
capo_metastaticAvgPlot = plt.scatter(timepoints, capo_metastaticAvg, marker = "D", label = "Capomulin", alpha = .75)
infu_metastaticAvgPlot = plt.scatter(timepoints, infu_metastaticAvg, marker = "s", label = "Infubinol", alpha = .75)
keta_metastaticAvgPlot = plt.scatter(timepoints, keta_metastaticAvg, marker = "o", label = "Ketapril", alpha = .75)
plac_metastaticAvgPlot = plt.scatter(timepoints, plac_metastaticAvg, marker = "X", label = "Placebo", alpha = .75)

#formatting and titles ("Metastatic" spelled the same way as the "Metastatic Sites" column)
plt.title("Average Metastatic Sites Over Treatment")
plt.xlabel("Days")
plt.ylabel("Metastatic Sites")
plt.legend(loc="best")
plt.grid(alpha = .4)
plt.tight_layout()
plt.show()

In [None]:
#find std errors of the metastatic site counts for each drug and timepoint
#(all timepoints of the first drug in drugList, then the next drug, ...)
metastaticStdErrors = [stats.sem(drugData.loc[(drugData["Drug"] == drug)
                                         & (drugData["Timepoint"] == time), "Metastatic Sites"])
                  for drug in drugList for time in timepoints]

#separate std errors by drug: one run of 10 values per drug
capo_metastaticStdErrors = [metastaticStdErrors[i] for i in range(10)]
infu_metastaticStdErrors = [metastaticStdErrors[i] for i in range(10, 20)]
keta_metastaticStdErrors = [metastaticStdErrors[i] for i in range(20, 30)]
plac_metastaticStdErrors = [metastaticStdErrors[i] for i in range(30, 40)]

#plot metastatic avg with error bars; markers only, no connecting line
capo_metastaticErrorPlot = plt.errorbar(timepoints, capo_metastaticAvg, capo_metastaticStdErrors, marker = "D", linestyle="None", label = "Capomulin", alpha = .75)
infu_metastaticErrorPlot = plt.errorbar(timepoints, infu_metastaticAvg, infu_metastaticStdErrors, marker = "s", linestyle="None", label = "Infubinol", alpha = .75)
keta_metastaticErrorPlot = plt.errorbar(timepoints, keta_metastaticAvg, keta_metastaticStdErrors, marker = "o", linestyle="None", label = "Ketapril", alpha = .75)
plac_metastaticErrorPlot = plt.errorbar(timepoints, plac_metastaticAvg, plac_metastaticStdErrors, marker = "X", linestyle="None", label = "Placebo", alpha = .75)

#formatting and titles
#this chart shows metastatic sites, so it must not carry the tumor volume title and y label
plt.title("Average Metastatic Sites Over Treatment")
plt.xlabel("Days")
plt.ylabel("Metastatic Sites")
plt.legend(loc="best")
plt.grid(alpha = .4)
plt.tight_layout()
plt.show()

# Survival Rates

In [None]:
#count the non-missing entries in every column for each drug/timepoint group
drugCounts = groupedDrugData.count()
#keep only the Mouse ID tally, as a one-column table labelled "Mouse Count"
mouseCounts = drugCounts[["Mouse ID"]].rename(columns={"Mouse ID": "Mouse Count"})
mouseCounts.head(15)

In [None]:
#separate mouse count list by drug
capo_mouseCounts = mouseCounts.loc["Capomulin", :]
infu_mouseCounts = mouseCounts.loc["Infubinol", :]
keta_mouseCounts = mouseCounts.loc["Ketapril", :]
plac_mouseCounts = mouseCounts.loc["Placebo", :]

#convert mouse counts to % of starting mice (iloc[0, 0] is the count in the first row)
capo_survivalPerc = capo_mouseCounts / capo_mouseCounts.iloc[0, 0] * 100
infu_survivalPerc = infu_mouseCounts / infu_mouseCounts.iloc[0, 0] * 100
keta_survivalPerc = keta_mouseCounts / keta_mouseCounts.iloc[0, 0] * 100
plac_survivalPerc = plac_mouseCounts / plac_mouseCounts.iloc[0, 0] * 100

#plot each drug's survival % over time
#the handles get their own names so the tumor volume scatter handles
#(capo_tumorVolAvgPlot, ...) are not overwritten by this cell
capo_survivalPlot = plt.scatter(timepoints, capo_survivalPerc, marker = "D", label = "Capomulin", alpha = .75)
infu_survivalPlot = plt.scatter(timepoints, infu_survivalPerc, marker = "s", label = "Infubinol", alpha = .75)
keta_survivalPlot = plt.scatter(timepoints, keta_survivalPerc, marker = "o", label = "Ketapril", alpha = .75)
plac_survivalPlot = plt.scatter(timepoints, plac_survivalPerc, marker = "X", label = "Placebo", alpha = .75)

#formatting and titles
plt.title("Survival Rate Over Treatment")
plt.xlabel("Days")
plt.ylabel("Survival Rate (%)")
plt.legend(loc="best")
plt.grid(alpha = .4)
plt.tight_layout()
plt.show()

# Summary Bar Graph

In [None]:
#calculate tumor % change from the first to the last recorded timepoint of each drug
#.iloc selects by position; a bare integer key like [9] may instead be looked up as an index
#label (there is no label 9 among timepoints 0, 5, ... 45) or rely on pandas' deprecated
#positional fallback, depending on the pandas version and the index of the series
capo_tumorPercChange = (capo_tumorVolAvg.iloc[-1] / capo_tumorVolAvg.iloc[0] * 100) - 100
infu_tumorPercChange = (infu_tumorVolAvg.iloc[-1] / infu_tumorVolAvg.iloc[0] * 100) - 100
keta_tumorPercChange = (keta_tumorVolAvg.iloc[-1] / keta_tumorVolAvg.iloc[0] * 100) - 100
plac_tumorPercChange = (plac_tumorVolAvg.iloc[-1] / plac_tumorVolAvg.iloc[0] * 100) - 100

#list tumor % changes, in the same order as drugList
tumorPercList = [capo_tumorPercChange, infu_tumorPercChange, keta_tumorPercChange, plac_tumorPercChange]

#create table with drug and tumor % change
tumorPercTable = pd.DataFrame({"Drug": drugList, "Tumor Percent Change": tumorPercList})
tumorPercTable

In [None]:
#draw one bar per drug from the percent change table
tumorBarChart = plt.bar(tumorPercTable["Drug"], tumorPercTable["Tumor Percent Change"])

#negative change: green bar with its label at y = -5
#otherwise: red bar with a white label at y = 2
for i, (bar, percChange) in enumerate(zip(tumorBarChart, tumorPercList)):
    #the values are already percentages, so scale back to a fraction for the % format
    percText = format(percChange / 100, ".2%")
    if percChange < 0:
        bar.set_color("limegreen")
        plt.text(-.2 + i, -5, percText)
    else:
        bar.set_color("firebrick")
        plt.text(-.2 + i, 2, percText, color = "w")

#zero line across the full width of the chart, then titles and axis labels
plt.hlines(0, -.5, 3.5, alpha=0.25)
plt.xlim(-.5, 3.5)
plt.title("Tumor Percent Total Change by Drug")
plt.xlabel("Drug")
plt.ylabel("Tumor Percent Change (%)")
plt.tight_layout()
plt.show()