In [178]:
%matplotlib notebook


In [179]:
#Dependancies

import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as sts
from statsmodels.graphics.gofplots import qqplot
import numpy as np
import random


In [180]:
# Study data files
mouse_metadata = "Pymaceuticals/data/Mouse_metadata.csv"
study_results = "Pymaceuticals/data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata)
study_results = pd.read_csv(study_results)

# Combine the data into a single data
combined_pymaceutical_data = pd.merge(study_results,mouse_metadata, how="left", on="Mouse ID")

combined_pymaceutical_data.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


In [181]:
# Summary Statistics

#Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen 
tumor_vol_groupby_mean = combined_pymaceutical_data.groupby("Drug Regimen").mean()["Tumor Volume (mm3)"]
tumor_vol_groupby_median = combined_pymaceutical_data.groupby("Drug Regimen").median()["Tumor Volume (mm3)"]
tumor_vol_groupby_std_Dev = combined_pymaceutical_data.groupby("Drug Regimen").std()["Tumor Volume (mm3)"]
tumor_vol_groupby_var = combined_pymaceutical_data.groupby("Drug Regimen")["Tumor Volume (mm3)"].var()
tumor_vol_groupby_std = combined_pymaceutical_data.groupby("Drug Regimen")["Tumor Volume (mm3)"].std()

tumor_vol_dataframe = pd.DataFrame({"Mean Tumor Volume":tumor_vol_groupby_mean,
                                   "Median Tumor Volume":tumor_vol_groupby_median,
                                    "Tumor Volume Variance":tumor_vol_groupby_var,
                                   "Tumor Volume Std.Dev.":tumor_vol_groupby_std_Dev,
                                  })
tumor_vol_dataframe

Unnamed: 0_level_0,Mean Tumor Volume,Median Tumor Volume,Tumor Volume Variance,Tumor Volume Std.Dev.
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774
Ceftamin,52.591172,51.776157,39.290177,6.268188
Infubinol,52.884795,51.820584,43.128684,6.567243
Ketapril,55.235638,53.698743,68.553577,8.279709
Naftisol,54.331565,52.509285,66.173479,8.134708
Placebo,54.033581,52.288934,61.168083,7.821003
Propriva,52.322552,50.854632,42.35107,6.50777
Ramicane,40.216745,40.673236,23.486704,4.846308
Stelasyn,54.233149,52.431737,59.450562,7.710419
Zoniferol,53.236507,51.818479,48.533355,6.966589


In [182]:
tumor_vol_groupby_sem = combined_pymaceutical_data.groupby("Drug Regimen").sem()["Tumor Volume (mm3)"]
        
tumor_vol_dataframe = pd.DataFrame({"Mean":tumor_vol_groupby_mean,
                                   "Median":tumor_vol_groupby_median,
                                    "Variance":tumor_vol_groupby_var,
                                   "std":tumor_vol_groupby_std_Dev,
                                     "Sem":tumor_vol_groupby_sem

                                  })
tumor_vol_dataframe

Unnamed: 0_level_0,Mean,Median,Variance,std,Sem
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.322552,50.854632,42.35107,6.50777,0.512884
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [184]:
# Generating a bar plot showing the number of data points for each treatment regimen using pandas

#generating y and x Axis
points = [230,230,180,180,175,175,175,170,170,160]
x_axis = np.arange(len(points))

#making the bar chart
plt.bar(x_axis,points,alpha=0.6, align="center")


#placing x_axis headers
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations,["Capomulin","Ramicane","Ketapril","Naftisol","Zoniferol","Stelasyn","Placebo",
                          "Infubinol","Ceftamin","Propriva"],
          rotation="vertical")

# Sets the x limits of the current chart
plt.xlim(-0.75, len(x_axis)-0.25)

# Sets the y limits of the current chart
plt.ylim(0, max(points)+10)

# Assigning title and labels

plt.title("Treatment Regimen ")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Data Points")

plt.show()

<IPython.core.display.Javascript object>

In [185]:
# Generating a bar plot showing the number of data points for each treatment regimen using pyplot
drugs =["Capomulin","Ramicane","Ketapril","Naftisol","Zoniferol","Stelasyn","Placebo",
                          "Infubinol","Ceftamin","Propriva"]
# Create a bar chart based upon the above data
number_of_data_points= [230,230,180,180,175,175,175,170,170,160]
x_axis = x_axis = np.arange(len(number_of_data_points))

# Create a bar chart based upon the above data
plt.bar(x_axis, number_of_data_points, align="center")

# Create the ticks for our bar chart's x axis
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, drugs)

# Set the limits of the x axis
plt.xlim(-0.75, len(x_axis)-0.25)

# Set the limits of the y axis
plt.ylim(0, max(number_of_data_points)+10)

plt.title("Treatment Regimen ")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Data Points")

plt.show()

<IPython.core.display.Javascript object>

In [186]:
#Generating a pie plot showing the distribution of females versus male mice using pandas

#mice sex count

mice_gender_count = combined_pymaceutical_data["Sex"].value_counts()

mice_gender_count.head()




Male      958
Female    935
Name: Sex, dtype: int64

In [187]:
# Labels for the sections of the pie chart
labels = ["Male","Female"]

# The values of each section of the pie chart
sizes = [958,935]


# The colors of each section of the pie chart
colors = ["blue","darkorange"]

#Creates the pie chart based upon the values above
# Automatically finds the percentages of each part of the pie chart
plt.pie(sizes,labels=labels, colors=colors,
        autopct="%1.1f%%")

# Tells matplotlib that we want a pie chart with equal axes
plt.axis("equal")

plt.title("Sex", loc="left")


plt.show()


<IPython.core.display.Javascript object>

In [175]:
# Generating a pie plot showing the distribution of female mice versus male mice using pyplot



In [188]:
# Quartiles, Outliers and Boxplots

# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers.  

#loc The drug Regimens
capomulin = combined_pymaceutical_data.loc[combined_pymaceutical_data["Drug Regimen"] == "Capomulin",:]
ramicane= combined_pymaceutical_data.loc[combined_pymaceutical_data["Drug Regimen"] == "Ramicane", :]
infubinol = combined_pymaceutical_data.loc[combined_pymaceutical_data["Drug Regimen"] == "Infubinol", :]
ceftamin = combined_pymaceutical_data.loc[combined_pymaceutical_data["Drug Regimen"] == "Ceftamin", :]

#Capomulin Grouping and merging
capomulin_grouping = capomulin.groupby('Mouse ID').max()['Timepoint']
capomulin_last_tumor_vol = pd.DataFrame(capomulin_grouping)
capomulinmerge = pd.merge(capomulin_last_tumor_vol, combined_pymaceutical_data, on=("Mouse ID","Timepoint"))

#finding the quartiles and Outliers for Capomulin

capo_final_tumor_vol_mouse = capomulinmerge["Tumor Volume (mm3)"]

quartiles = capo_final_tumor_vol_mouse.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq


lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)

# Printing the quartiles and outliers of Capomulin

print(f"The lower quartile of Capomulin is: {lowerq}")
print(f"The upper quartile of Capomulin is: {upperq}")

print(f"Capomulin above {upper_bound} could be outliers.")
print(f"Capomulin below {lower_bound} could be outliers.")
print(f"The interquartile range of Capomulin is: {iqr}")

print(f".................................................")


#Ramicane Grouping and merging

ramicane_grouping = ramicane.groupby('Mouse ID').max()['Timepoint']
ramicane_last_tumor_vol = pd.DataFrame(ramicane_grouping)
ramicanemerge = pd.merge(ramicane_last_tumor_vol, combined_pymaceutical_data, on=("Mouse ID","Timepoint"))
#Outliers for Ramicane
rami_final_tumor_vol_mouse = ramicanemerge["Tumor Volume (mm3)"]

#finding the quartiles and Outliers for Ramicane

quartiles = rami_final_tumor_vol_mouse.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq


lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)

# Printing the quartiles and outliers of Ramicane


print(f"The lower quartile of Ramicane is: {lowerq}")
print(f"The upper quartile of Ramicane is: {upperq}")

print(f"Ramicane above {upper_bound} could be outliers.")
print(f"Ramicane below {lower_bound} could be outliers.")
print(f"The interquartile range of Ramicane is: {iqr}")

print(f".................................................")

# Infubinol Grouping and merging

infubinol_grouping = infubinol.groupby('Mouse ID').max()['Timepoint']
infubinol_last_tumor_vol = pd.DataFrame(infubinol_grouping)
infubinolmerge = pd.merge(infubinol_last_tumor_vol, combined_pymaceutical_data, on=("Mouse ID","Timepoint"))


#finding the quartiles and Outliers for Infubinol
infu_final_tumor_vol_mouse = infubinolmerge["Tumor Volume (mm3)"]

quartiles = infu_final_tumor_vol_mouse.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq


lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)

# Printing the quartiles and outliers of infubinol

print(f"The lower quartile of Infubinol is: {lowerq}")
print(f"The upper quartile of Infubinol is: {upperq}")

print(f"Infubinol above {upper_bound} could be outliers.")
print(f"Infubinol below {lower_bound} could be outliers.")
print(f"The interquartile range of Infubinol is: {iqr}")


print(f".................................................")


# Ceftamin Grouping and merging

ceftamin_grouping = ceftamin.groupby('Mouse ID').max()['Timepoint']
ceftamin_last_tumor_vol = pd.DataFrame(ceftamin_grouping)
ceftaminmerge = pd.merge(ceftamin_last_tumor_vol, combined_pymaceutical_data, on=("Mouse ID","Timepoint"))

#Finding the quartiles and Outliers of Ceftamin

ceft_final_tumor_vol_mouse = ceftaminmerge["Tumor Volume (mm3)"]

quartiles = ceft_final_tumor_vol_mouse.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq


lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)


# Printing the quartiles and outliers
print(f"The lower quartile of Ceftamin is: {lowerq}")
print(f"The upper quartile of Ceftamin is: {upperq}")

print(f"Ceftamin above {upper_bound} could be outliers.")
print(f"Ceftamin below {lower_bound} could be outliers.")
print(f"The interquartile range of Ceftamin is: {iqr}")


The lower quartile of Capomulin is: 32.37735684
The upper quartile of Capomulin is: 40.1592203
Capomulin above 51.83201549 could be outliers.
Capomulin below 20.70456164999999 could be outliers.
The interquartile range of Capomulin is: 7.781863460000004
.................................................
The lower quartile of Ramicane is: 31.56046955
The upper quartile of Ramicane is: 40.65900627
Ramicane above 54.30681135 could be outliers.
Ramicane below 17.912664470000003 could be outliers.
The interquartile range of Ramicane is: 9.098536719999998
.................................................
The lower quartile of Infubinol is: 54.04860769
The upper quartile of Infubinol is: 65.52574285
Infubinol above 82.74144559000001 could be outliers.
Infubinol below 36.83290494999999 could be outliers.
The interquartile range of Infubinol is: 11.477135160000003
.................................................
The lower quartile of Ceftamin is: 48.72207785
The upper quartile of Ceftamin is: 6

In [189]:
# Generating the boxplot

box_plot = [capomulinmerge,ramicanemerge,infubinolmerge,ceftaminmerge]

fig4, ax4 = plt.subplots()
ax4.set_title('Drug Regimens Effects on Final Tumor Volume')
ax4.set_xlabel('Drug Regimen')
ax4.set_ylabel('Final Tumor Volume (mm3)')
ax1.boxplot(box_plot, labels=["Capomulin","Ramicane","Infubinol","Ceftamin"])

# ,labels = ["Capomulin","Ramicane","Infubinol","Ceftamin"])
plt.show()


<IPython.core.display.Javascript object>

ValueError: cannot copy sequence with size 25 to array axis with dimension 8

In [None]:
#Line and scatter plots

#Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

#locating Mouse ID l509 

mousel509 = combined_pymaceutical_data.loc[combined_pymaceutical_data["Mouse ID"] =="l509"]
mousel509

In [198]:
# Generating the line plot 

column1 = mousel509["Timepoint"]
column2 = mousel509["Tumor Volume (mm3)"]

plt.plot(column1, column2,linewidth=1, markersize=12)

plt.title('Capomulin treatmeant of mouse l509')
plt.xlabel('Timepoint (Days)')
plt.ylabel('Tumor Volume (mm3)')

plt.savefig('linechart')
plt.show()


<IPython.core.display.Javascript object>