# Pymaceuticals Inc.
---

### Observations and Insights
- The total number of measurements taken per drug regimen ranged from roughly 150 to 240.
- The mice were split almost evenly between female (49%) and male (51%).
- On the mice tested, Capomulin & Ramicane had the higher tumor volume compared to other drugs.
- For the Capomulin drug regimen, the average tumor volume increased as mouse weight increased.


In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

import numpy as np
import random

import operator   # needed to make sort work

# Relative locations of the two CSV inputs
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

In [None]:
# Display the mouse metadata table loaded from the CSV
mouse_metadata

In [None]:
# Display the study results table loaded from the CSV
study_results

In [None]:
# Keep the original data and work on independent copies.
# Plain assignment would only bind a second name to the same DataFrame,
# so .copy() is required for the frames read from disk to stay untouched.
mm_df = mouse_metadata.copy()
sr_df = study_results.copy()

In [None]:
# Join the mouse metadata with the study measurements on the shared "Mouse ID" key
combo_df = mm_df.merge(sr_df, on="Mouse ID")
combo_df.head()

In [None]:
# Build a NaN-free frame without modifying combo_df.
# NOTE: the following cells read combined_df rather than combo_df.
rows_without_nan = combo_df.dropna(axis=0, how="any")
combined_df = rows_without_nan
combined_df

In [None]:
# Count the distinct mouse IDs listed in the metadata table.
# dropna=False keeps the count identical to len(pd.unique(...)).
num_mice = mm_df["Mouse ID"].nunique(dropna=False)
print('Number of mice: ', num_mice)

In [None]:
# Find the rows whose (Mouse ID, Timepoint) pair occurs more than once.
#   ** Note:  the duplicated mouse is expected to be 'g989'
# keep=False flags every occurrence of a repeated pair, not just the later
# ones, so all data for the duplicated mouse/timepoint pairs is shown.
dup_mask = combined_df.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)
duplicate = combined_df.loc[dup_mask]

print("Duplicate Row(s) :")

# Show the duplicated rows
duplicate

In [None]:
# Create the clean DataFrame and check how many mice remain in it.

# A mouse is considered duplicated when it has more than one row for the same
# timepoint; every row of such a mouse is removed. Deriving the IDs from the
# data (instead of hard-coding one ID) keeps this correct if the input changes.
repeated_rows = combined_df.duplicated(subset=['Mouse ID', 'Timepoint'])
duplicate_mouse_ids = combined_df.loc[repeated_rows, 'Mouse ID'].unique()
clean_df = combined_df[~combined_df['Mouse ID'].isin(duplicate_mouse_ids)]

num_mice = clean_df["Mouse ID"].nunique(dropna=False)
print('Number of mice after removal of duplicate: ', num_mice)
print()

In [None]:
# Display the cleaned DataFrame (duplicate mouse removed)
clean_df

In [None]:
# Count the distinct drug regimens present in the cleaned data.
num_drug_regimen = clean_df["Drug Regimen"].nunique(dropna=False)
print('Number of Drug Regimen: ', num_drug_regimen)
print()

## Summary Statistics

In [None]:
# Summary statistics of tumor volume for each drug regimen:
# mean, median, variance, standard deviation and SEM, computed in a single
# groupby/agg call so the result is one summary DataFrame.
#
# The statistics are taken from clean_df so the measurements of the
# duplicated mouse do not skew them.
tumor_stats = ['mean', 'median', 'var', 'std', 'sem']
summary_stats_df = clean_df.groupby('Drug Regimen').agg({'Tumor Volume (mm3)': tumor_stats})

summary_stats_df

## Bar and Pie Charts

#### Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.

#### NOTE:  USE clean_df 

In [None]:
# Distinct regimen names found in clean_df, and how many there are
drugs = pd.unique(clean_df["Drug Regimen"])
num_drug_regimen = len(drugs)
print('Number of Drug Regimen: ', num_drug_regimen)
print()

drugs

In [None]:
# Array of the distinct drug regimen names in clean_df (recomputed for reference)
drugs = pd.unique(clean_df["Drug Regimen"])
drugs

In [None]:
# For reference only: list the column names of clean_df
clean_df.columns

In [None]:
# -- for reference only -- to be used for charts below

# Relabel the 'Tumor Volume (mm3)' column header of the summary table.
# NOTE(review): the new label says "Mean", but the columns grouped under it
# also hold median, variance, std and SEM -- confirm the intended wording.
summary_stats_df = summary_stats_df.rename(columns={'Tumor Volume (mm3)': 'Mean Tumor Volume'})

# Recompute the same statistics from the de-duplicated data for a quick preview
new_df = clean_df.groupby('Drug Regimen').agg({'Tumor Volume (mm3)': ['mean', 'median', 'var', 'std', 'sem']})
new_df.head()

In [None]:
# Total number of tumor-volume measurements recorded per drug regimen.
# Counted on clean_df so the duplicated mouse's rows are not included.
grp_by_drug_measurement = clean_df.groupby('Drug Regimen').agg({'Tumor Volume (mm3)': 'count'})
grp_by_drug_measurement

#### The following cells will be used for chart references below:

In [None]:
# Array of the distinct drug regimen names in clean_df (chart reference)
drugs = pd.unique(clean_df["Drug Regimen"])
drugs

In [None]:
# Per-regimen measurement counts as a Series indexed by Drug Regimen
measurements = grp_by_drug_measurement.loc[:, 'Tumor Volume (mm3)']
measurements

In [None]:
# Order the per-regimen measurement counts from largest to smallest.
# (The bare type(measurements) call was dropped: mid-cell it produced no
# output and had no effect.)
measurements = measurements.sort_values(ascending=False)
measurements

## BAR chart using PANDAS

In [None]:
# Generate a bar plot showing the total number of measurements taken
# on each drug regimen using PANDAS.
measurements.plot(kind="bar")

plt.title("Total Number of Measurements per Drug")
plt.ylabel("Number of Measurements")
plt.legend()

# Render the figure. The original bare `plt.close` (no parentheses) never
# called the function and only echoed the function object as cell output.
plt.show()

## BAR chart using PYPLOT

In [None]:
# Generate a bar plot showing the total number of measurements taken
# on each drug regimen using PYPLOT.

#%matplotlib notebook

# x-axis: one bar per drug regimen
# y-axis: number of measurements per regimen --> variable: measurements
x_axis = np.arange(len(measurements))

# Bars are centred on their tick positions
plt.bar(x_axis, measurements, facecolor="red", alpha=0.75, align="center")

# Label each bar with the regimen it belongs to. The labels come from
# measurements.index so they always match the bar heights; a separately
# built list of names can be in a different order than the counts.
tick_locations = list(x_axis)
plt.xticks(tick_locations, measurements.index, rotation=65)

# Leave a small margin to the left of the first and right of the last bar
plt.xlim(-0.75, len(x_axis) - 0.25)

# Head-room above the tallest bar
plt.ylim(0, measurements.max() + 25)

# Title and axis labels (the bars count measurements, not mice)
plt.title("Total Number of Measurements per Drug")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Measurements")

# tight_layout() keeps the rotated tick labels inside the figure
plt.tight_layout()

# plt.savefig("../Images/<xyz.png")
plt.show()

# Release the figure so later pyplot calls start from a fresh one
plt.close()

####  Generate a pie plot showing the distribution of female versus male mice using PANDAS.

In [None]:
# Number of mice of each sex in the metadata table; the count column is
# still named 'Sex' in mice_df
mice_df = mm_df.groupby('Sex').agg(Sex=('Sex', 'count'))

# Same table with the count column given a descriptive name
ren_mice_df = mice_df.rename(columns={'Sex': 'Count'})
ren_mice_df

In [None]:
# Pull the two per-sex counts out of mice_df as scalars
num_female = mice_df.at['Female', 'Sex']
num_male = mice_df.at['Male', 'Sex']

print("Female: ", num_female, "  Male: ", num_male)

# PIE chart using PANDAS

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using PANDAS.

mice_df.plot(kind="pie", subplots=True, figsize=(4, 4),
             colors=["pink", "blue"], autopct="%1.1f%%", startangle=40)

# Render the figure. The original bare `plt.close` (no parentheses) never
# called the function and only echoed the function object as cell output.
plt.show()

# PIE chart using PYPLOT

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Wedge labels, in the same order as the values below
labels = ["Females", "Males"]

# Wedge sizes: the female and male mouse counts computed earlier
sizes = [num_female, num_male]

# Wedge colors
colors = ["pink", "blue"]

# Offset the "Females" wedge slightly from the centre
explode = (0.1, 0)

# autopct prints each wedge's share as a percentage
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct="%1.1f%%", shadow=True, startangle=140)

plt.title("Distribution of Female vs Male Mice")

plt.show()

# Release the figure; the original `plt.close` lacked parentheses and so
# never actually closed anything.
plt.close()

For reference only:
import matplotlib
matplotlib.axes.Axes.pie
matplotlib.pyplot.pie
matplotlib.axes.Axes.legend
matplotlib.pyplot.legend

## Quartiles, Outliers and Boxplots

#### Calculate the final tumor volume of each mouse across four of the treatment regimens:  
#### Capomulin, Ramicane, Infubinol, and Ceftamin
#### Start by getting the last (greatest) timepoint for each mouse
#### Merge this group df with the original dataframe to get the tumor volume at the last timepoint

In [None]:
# Put treatments into a list for for loop (and later for plot labels)
#treatment_list = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Create empty list to fill with tumor vol data (for plotting)
#tumor_vol_list = []

#subset_df = sorted_clean_df[['Mouse ID', 'Drug Regimen', 'Timepoint', 'Tumor Volume (mm3)']]

#tumor_vol_list = subset1_df['Tumor Volume (mm3)']
#tumor_vol_list.head()

In [None]:
# For reference: list the column names of clean_df
clean_df.columns

In [None]:
# Order the cleaned measurements from the latest timepoint to the earliest
sorted_clean_df = clean_df.sort_values(by='Timepoint', ascending=False)
sorted_clean_df

Expected results:

Capomulin's potential outliers: Series([], Name: Tumor Volume (mm3), dtype: float64)

Ramicane's potential outliers: Series([], Name: Tumor Volume (mm3), dtype: float64)

Infubinol's potential outliers: 31    36.321346

Name: Tumor Volume (mm3), dtype: float64

Ceftamin's potential outliers: Series([], Name: Tumor Volume (mm3), dtype: float64)


#### Calculate the IQR and quantitatively determine if there are any potential outliers for drug in treatment_list:
    # Locate the rows which contain mice on each drug and get the tumor volumes
    # add subset 
    # Determine outliers using upper and lower bounds

In [None]:
# Keep only the columns needed for the tumor-volume analysis, then only the
# rows for the four regimens of interest
wanted_columns = ['Mouse ID', 'Drug Regimen', 'Timepoint', 'Tumor Volume (mm3)']
regimens_of_interest = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

subset_df = sorted_clean_df[wanted_columns]
in_regimens = subset_df['Drug Regimen'].isin(regimens_of_interest)
subset1_df = subset_df[in_regimens]
subset1_df

In [None]:
# Rows of the four-regimen subset that belong to Capomulin
is_capomulin = subset1_df['Drug Regimen'] == "Capomulin"
Capomulin_df = subset1_df.loc[is_capomulin]
Capomulin_df.head()

In [None]:
# Capomulin tumor volumes, sorted ascending.
# sort_values must be *called*: without the parentheses the name is bound to
# the method object itself rather than to a sorted Series.
Capomulin_tum_vol = Capomulin_df['Tumor Volume (mm3)'].sort_values()
Capomulin_tum_vol

In [None]:
# Rows of the four-regimen subset that belong to Ramicane
is_ramicane = subset1_df['Drug Regimen'] == "Ramicane"
Ramicane_df = subset1_df.loc[is_ramicane]
Ramicane_df.head()

In [None]:
# Ramicane tumor volumes, sorted ascending.
# sort_values must be *called*: without the parentheses the name is bound to
# the method object itself rather than to a sorted Series.
Ramicane_tum_vol = Ramicane_df['Tumor Volume (mm3)'].sort_values()
Ramicane_tum_vol

In [None]:
# Rows of the four-regimen subset that belong to Infubinol
is_infubinol = subset1_df['Drug Regimen'] == "Infubinol"
Infubinol_df = subset1_df.loc[is_infubinol]
Infubinol_df.head()

In [None]:
# Infubinol tumor volumes, sorted ascending.
# sort_values must be *called*: without the parentheses the name is bound to
# the method object itself rather than to a sorted Series.
Infubinol_tum_vol = Infubinol_df['Tumor Volume (mm3)'].sort_values()
Infubinol_tum_vol

In [None]:
# Rows of the four-regimen subset that belong to Ceftamin
is_ceftamin = subset1_df['Drug Regimen'] == "Ceftamin"
Ceftamin_df = subset1_df.loc[is_ceftamin]
Ceftamin_df.head()

In [None]:
# Ceftamin tumor volumes, sorted ascending so this Series is prepared the
# same way as the other three regimens' tumor-volume Series.
Ceftamin_tum_vol = Ceftamin_df['Tumor Volume (mm3)'].sort_values()
Ceftamin_tum_vol

#sorted_tumor_vol_list = tumor_vol_list.sort_values(['Tumor Volume (mm3)'], ascending=False)

#TEST CELL only
#tumor_vol_list = [40.159220, 38.407618, 67918767, 64.729837]
#For each numeric attribute of dataframe 
#sorted_clean_df.plot.box() 
#Capomulin_df.plot.box() 
#individual attribute box plot 
#plt.boxplot(sorted_clean_df['Timepoint']) 
#plt.show() 

## Generate a box plot of the final tumor volume of each mouse across four regimens of interest

In [None]:
# Box plot of the final tumor volume of each mouse across the four regimens
# of interest, plus a printed list of potential outliers per regimen.
#   x-axis: treatment_list
#   y-axis: tumor_vol_list ==> Final Tumor Volume (mm3)

treatment_list = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Last (greatest) timepoint recorded for each mouse ...
last_timepoint = clean_df.groupby("Mouse ID")["Timepoint"].max().reset_index()

# ... merged back onto the measurements to get the tumor volume at that timepoint
final_vol_df = pd.merge(last_timepoint, clean_df, on=["Mouse ID", "Timepoint"], how="left")

# One Series of final tumor volumes per regimen, in treatment_list order
tumor_vol_list = []
for drug in treatment_list:
    final_vols = final_vol_df.loc[final_vol_df["Drug Regimen"] == drug, "Tumor Volume (mm3)"]
    tumor_vol_list.append(final_vols)

    # Values more than 1.5 * IQR outside the quartiles are potential outliers
    lower_q, upper_q = final_vols.quantile([0.25, 0.75])
    iqr = upper_q - lower_q
    lower_bound = lower_q - 1.5 * iqr
    upper_bound = upper_q + 1.5 * iqr
    outliers = final_vols[(final_vols < lower_bound) | (final_vols > upper_bound)]
    print(drug + "'s potential outliers:", outliers)

x_axis = treatment_list
y_axis = tumor_vol_list

# Setting up the plot
fig, ax = plt.subplots()

# One box per regimen; boxes are drawn at positions 1..len(treatment_list)
ax.boxplot([vols.values for vols in tumor_vol_list])
ax.set_xticklabels(treatment_list)
ax.set_ylim(20, 80)

ax.set_title('Final Tumor Volume Across Four of the Treatment Regimens')
ax.set_xlabel("Drug Regimen")
ax.set_ylabel("Final Tumor Volume (mm3)")

plt.tight_layout()

plt.show()

# A box that sits lower on the y-axis means smaller final tumor volumes

# Line and Scatter Plots

### Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin

In [None]:
# For reference: display the cleaned DataFrame used by the plots below
clean_df

In [None]:
# Columns needed for the tumor volume vs. time point line plot
line_plot_columns = ['Drug Regimen', 'Mouse ID', 'Weight (g)', 'Timepoint', 'Tumor Volume (mm3)']
drug_tumor_timept_df = clean_df.loc[:, line_plot_columns]
drug_tumor_timept_df

In [None]:
# Restrict the line-plot columns to the Capomulin rows
on_capomulin = drug_tumor_timept_df['Drug Regimen'] == 'Capomulin'
Capomulin_only_tumor_timept_df = drug_tumor_timept_df.loc[on_capomulin]
Capomulin_only_tumor_timept_df

In [None]:
# Measurements of mouse l509 under the Capomulin regimen.
# The regimen filter makes the frame match its name, and sorting by
# Timepoint guarantees the line plot built from it runs left to right in time.
capomulin_rows = drug_tumor_timept_df[drug_tumor_timept_df['Drug Regimen'] == 'Capomulin']
Capomulin_only_mouse_l509_df = (
    capomulin_rows[capomulin_rows['Mouse ID'] == 'l509'].sort_values('Timepoint')
)
Capomulin_only_mouse_l509_df

In [None]:
# x values for the line plot: the timepoints recorded for mouse l509
timepoint = Capomulin_only_mouse_l509_df.loc[:, 'Timepoint']
timepoint

In [None]:
# y values for the line plot: the tumor volumes recorded for mouse l509
tumor_vol = Capomulin_only_mouse_l509_df.loc[:, 'Tumor Volume (mm3)']
tumor_vol

In [None]:
#plt.figure()
#plt.plot(timepoint, tumor_vol)

## Line plot of tumor volume vs. time point for Mouse l509 treated with Capomulin

In [None]:
# Start from a clean slate, then draw on an explicit figure/axes pair
plt.close()
line_fig, line_ax = plt.subplots()

line_ax.plot(timepoint, tumor_vol)

# Title and axis labels
line_ax.set_title("Capomulin Treatment of Mouse l509")
line_ax.set_xlabel("Timepoint(days)")
line_ax.set_ylabel("Tumor Volume (mm3)")

# Tidy the spacing around the axes
line_fig.tight_layout()

# plt.savefig("../Images/<xyz.png")
plt.show()

## Scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

In [None]:
# All cleaned measurements taken under the Capomulin regimen
capomulin_mask = clean_df['Drug Regimen'] == 'Capomulin'
Capomulin_only = clean_df.loc[capomulin_mask]
Capomulin_only

In [None]:
# Per-mouse mean weight and mean tumor volume for the Capomulin mice
averaged_columns = ['Weight (g)', 'Tumor Volume (mm3)']
Capomulin_weight_tumor_vol = Capomulin_only.groupby('Mouse ID')[averaged_columns].mean()
Capomulin_weight_tumor_vol.head()

In [None]:
# Per-mouse mean weight, as a Series indexed by Mouse ID
Capomulin_weight = Capomulin_weight_tumor_vol.loc[:, 'Weight (g)']
Capomulin_weight.head()

In [None]:
# Per-mouse mean tumor volume, as a Series indexed by Mouse ID
Capomulin_t_vol = Capomulin_weight_tumor_vol.loc[:, 'Tumor Volume (mm3)']
Capomulin_t_vol.head()

## Scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

In [None]:
# Smallest per-mouse weight among the Capomulin mice;
# used as the lower x-axis bound of the scatter plot
min_weight = Capomulin_weight.min()
min_weight

In [None]:
# Largest per-mouse weight among the Capomulin mice;
# used as the upper x-axis bound of the scatter plot
max_weight = Capomulin_weight.max()
max_weight

In [None]:
# Smallest per-mouse average tumor volume among the Capomulin mice;
# used as the lower y-axis bound of the scatter plot
min_tumor_vol = Capomulin_t_vol.min()
min_tumor_vol

In [None]:
# Largest per-mouse average tumor volume among the Capomulin mice;
# used as the upper y-axis bound of the scatter plot
max_tumor_vol = Capomulin_t_vol.max()
max_tumor_vol

## Scatter Plot

In [None]:
# Scatter plot of per-mouse average tumor volume against mouse weight for the
# Capomulin regimen.
# Switch to the interactive notebook backend so that the correlation /
# regression line can be drawn over the scatter plot.

%matplotlib notebook 

#----------------------------------------------
# Dependencies
# NOTE(review): `datasets` from sklearn is not referenced by any visible cell
# -- confirm whether this import is still needed.
from matplotlib import pyplot as plt
from scipy.stats import linregress
import numpy as np
from sklearn import datasets
import pandas as pd
#----------------------------------------------

# Upper bound for x_axis below: the heaviest Capomulin mouse weight
x_limit = max_weight

# Values from 0 up to (not including) x_limit in steps of 2
# NOTE(review): x_axis is not used by the plotting calls in this cell
x_axis = np.arange(0, x_limit, 2)

# Per-mouse average tumor volumes
# NOTE(review): `data` is not used by the plotting calls in this cell
#data = [random.random() for value in x_axis]

data = Capomulin_t_vol

# Draw the scatter plot: weight on x, average tumor volume on y.
# The size of each marker is set from the mouse's weight (s=Capomulin_weight).

plt.scatter(Capomulin_weight_tumor_vol['Weight (g)'], Capomulin_weight_tumor_vol['Tumor Volume (mm3)'], 
            marker="o", facecolors="red", edgecolors="black", s=Capomulin_weight, alpha=0.75)

# x limits: the observed weight range padded by 0.5 on each side
plt.xlim(min_weight-0.5, max_weight+0.5)   

# y limits: the observed average-tumor-volume range padded by 1 on each side
plt.ylim(min_tumor_vol-1, max_tumor_vol+1)

# Set a Title and labels
plt.title("Average Tumor Volume vs Mouse Weight")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")

# tight_layout() adjusts the spacing around the axes
plt.tight_layout()

# Save our graph and show the graph
# plt.savefig("../Images/<xyz.png")
plt.show()

# The figure is deliberately left open here (close call commented out)
#plt.close  

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen

In [None]:
# Pearson correlation between mouse weight and average tumor volume
# for the Capomulin mice
weight_and_volume = Capomulin_weight_tumor_vol[['Weight (g)', 'Tumor Volume (mm3)']]
mouse_weight = weight_and_volume['Weight (g)']
tumor_volume = weight_and_volume['Tumor Volume (mm3)']

correlation = st.pearsonr(mouse_weight, tumor_volume)
correlation

## Scatter Plot -- Line

In [None]:
# Inputs for the linear regression: weight as x, average tumor volume as y
x_values, y_values = (
    Capomulin_weight_tumor_vol[column]
    for column in ('Weight (g)', 'Tumor Volume (mm3)')
)

In [None]:
# Fit a straight line to average tumor volume as a function of mouse weight.
# linregress is reached through the `st` alias from the setup cell, so this
# cell does not depend on the import inside the scatter-plot cell.
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_values, y_values)

# Fitted y value for every observed x
regress_values = x_values * slope + intercept

# Human-readable equation; the sign is rendered explicitly so a negative
# intercept reads "y = ax - b" rather than "y = ax + -b"
sign = "+" if intercept >= 0 else "-"
line_eq = "y = " + str(round(slope, 2)) + "x " + sign + " " + str(round(abs(intercept), 2))

In [None]:
# Scatter plot of the data with the fitted regression line drawn over it
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")

# Anchor the equation at the top-left of the data so it falls inside the
# axes; the previous anchor (6, 10) lay outside the plotted weight / volume
# range.
plt.annotate(line_eq, (x_values.min(), y_values.max()), fontsize=15, color="red")

plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.show()
# The trailing bare `plt.close` was removed: without parentheses it never
# closed anything and only echoed the function object as cell output.