# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.


In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np



# Locations of the two CSV inputs (relative paths)
mouse_metadata_path, study_results_path = (
    "data/Mouse_metadata.csv",
    "data/Study_results.csv",
)

# Load each CSV into its own DataFrame
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join the two tables. No `on=` is given, so pandas joins on the columns the
# two frames have in common, using its default inner join.
mouse_df = mouse_metadata.merge(study_results)

# Preview the first rows of the merged table
mouse_df.head()

In [None]:
# Count the distinct Mouse IDs in the merged data (missing IDs are not counted).
# Series.nunique reference: http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.nunique.html
number_mouse = mouse_df["Mouse ID"].nunique(dropna=True)
number_mouse

In [None]:
# Every row should be uniquely identified by its (Mouse ID, Timepoint) pair.
# Flag each repeat of a pair after its first occurrence, then collect the
# distinct Mouse IDs that own the flagged rows. Despite the `_df` suffix, the
# result is an array of IDs, not a DataFrame.
dups_mice_id_df = mouse_df[
    mouse_df.duplicated(["Mouse ID", "Timepoint"], keep="first")
]["Mouse ID"].unique()
dups_mice_id_df

In [None]:
# Optional: Get all the data for the duplicate mouse ID.

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# drop_duplicates() would only delete the repeated rows and keep the first copy
# of each, leaving the affected mouse in the data. Instead, find every mouse
# that has a repeated (Mouse ID, Timepoint) pair and exclude ALL of its rows.
# Series.isin reference: https://pandas.pydata.org/docs/reference/api/pandas.Series.isin.html
clean_df = mouse_df[
    ~mouse_df["Mouse ID"].isin(
        mouse_df.loc[
            mouse_df.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"
        ]
    )
]
clean_df

In [None]:
# Count the distinct Mouse IDs that remain in the cleaned DataFrame.
number_mice = clean_df["Mouse ID"].nunique(dropna=True)
number_mice

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each drug regimen.

# Select the tumor-volume column BEFORE aggregating. Aggregating the whole
# grouped frame first also runs over the non-numeric columns, which raises a
# TypeError on pandas >= 2.0 and does needless work on older versions.
tumor_by_regimen = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]

summary_mean = tumor_by_regimen.mean()
summary_median = tumor_by_regimen.median()
summary_variance = tumor_by_regimen.var()
summary_stdv = tumor_by_regimen.std()
summary_sem = tumor_by_regimen.sem()

# Assemble the resulting series into a single summary DataFrame
# (one row per regimen, one column per statistic).
summary_table = pd.DataFrame(
    {
        "Mean Tumor Volume": summary_mean,
        "Median Tumor Volume": summary_median,
        "Tumor Volume Variance": summary_variance,
        "Tumor Volume Std. Dev.": summary_stdv,
        "Tumor Volume Std. Err.": summary_sem,
    }
)

summary_table

In [None]:
# A more advanced method to generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen (only one method is required in the solution)

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using Pandas.
# Count the rows per regimen; selecting the column first avoids counting every column.
mice_drug = clean_df.groupby("Drug Regimen")["Mouse ID"].count()

plt_chart = mice_drug.plot(kind="bar", color="b", figsize=(6, 8))
plt_chart.set_xlabel("Drug Regimen")
plt_chart.set_ylabel("# of Observed Mouse Timepoints")

# tight_layout() has to run before show(): once show() has rendered the
# figure, a later layout adjustment no longer affects what was displayed.
plt.tight_layout()
plt.show()


In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using pyplot.
# Row counts per regimen; groupby sorts the result by regimen name.
y_axis = clean_df.groupby("Drug Regimen")["Mouse ID"].count()

# Take the labels from the counts' own index so every bar is paired with its
# regimen. Using clean_df["Drug Regimen"].unique() here would give the names in
# first-appearance order, which does not match the sorted counts.
type_drug = y_axis.index.to_numpy()
x_axis = type_drug

plt.bar(x_axis, y_axis, facecolor="blue", alpha=0.75, align="edge")
plt.xlabel("Drug Regimen")
plt.ylabel("# of Observed Mouse Timepoints")
plt.show()

In [None]:
# Pie chart, drawn through the pandas plotting API, of the share of rows per sex.
# NOTE(review): value_counts() counts rows (one per mouse/timepoint), not
# distinct mice — confirm this is the intended distribution.
gender_data = clean_df.loc[:, "Sex"].value_counts()
gender_data.plot(kind="pie", autopct="%1.1f%%")
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# NOTE(review): value_counts() counts rows (one per mouse/timepoint), not
# distinct mice — confirm this is the intended distribution.
gender_data = clean_df["Sex"].value_counts()

# pyplot does not label wedges on its own, so pass the category names
# explicitly; without them the two slices cannot be told apart.
plt.pie(gender_data, labels=gender_data.index, autopct="%1.1f%%", shadow=True)
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original DataFrame to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for the for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers.


    # Locate the rows which contain mice on each drug and get the tumor volumes


    # add subset


    # Determine outliers using upper and lower bounds


In [None]:
# Generate a box plot that shows the distribution of the tumor volume for each treatment group.


## Line and Scatter Plots

In [None]:
# Line plot of tumor volume against timepoint for one Capomulin-treated mouse (ID l509)
# Narrow the data in two steps: first to the Capomulin regimen, then to the one mouse.
capomulin_df = clean_df[clean_df["Drug Regimen"] == "Capomulin"]
mouse_capo_df = capomulin_df[capomulin_df["Mouse ID"] == "l509"]

# The two columns being plotted
x_axis, tumor_vol = mouse_capo_df["Timepoint"], mouse_capo_df["Tumor Volume (mm3)"]

plt.plot(x_axis, tumor_vol, color="blue")
plt.xlabel("Timepoint (Days)")
plt.ylabel("Tumor Volume (mm3)")
plt.title("Capomulin treatment of mouse l509")
plt.show()

In [None]:
# Generate a scatter plot of mouse weight vs. the average observed tumor volume for the entire Capomulin regimen
# Group the Capomulin rows by mouse so each mouse contributes one point.
avg_capomulin_df = clean_df.loc[clean_df["Drug Regimen"] == "Capomulin"].groupby("Mouse ID")

# Reduce each mouse to one scalar per axis. .unique() would return an array per
# mouse instead of a number, which is not valid numeric plotting input.
# Presumably the weight is constant per mouse, so its mean is that weight — verify.
mouse_weight_df = avg_capomulin_df["Weight (g)"].mean()
avg_tumor_df = avg_capomulin_df["Tumor Volume (mm3)"].mean()

plt.scatter(mouse_weight_df, avg_tumor_df)
plt.xlabel("Weight (g)")
plt.ylabel("Avg Tumor Volume (mm3)")
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and a linear regression model
# for mouse weight and average observed tumor volume for the entire Capomulin regimen
avg_capomulin_df = clean_df.loc[clean_df["Drug Regimen"] == "Capomulin"].groupby("Mouse ID")

# One scalar per mouse on each axis. .unique() would return an array per mouse,
# which the regression cannot consume as numeric data.
# Presumably the weight is constant per mouse, so its mean is that weight — verify.
mouse_weight_df = avg_capomulin_df["Weight (g)"].mean()
avg_tumor_df = avg_capomulin_df["Tumor Volume (mm3)"].mean()

# linregress lives in scipy.stats, which this notebook imports as `st`.
# Its rvalue is the Pearson correlation coefficient of the two inputs.
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(mouse_weight_df, avg_tumor_df)
print(f"The correlation between mouse weight and the average tumor volume is {rvalue:.2f}")

# Fitted tumor volume for each observed weight, plus a readable equation label
regress_values = mouse_weight_df * slope + intercept
intercept_sign = "+" if intercept >= 0 else "-"
line_eq = f"y = {slope:.2f}x {intercept_sign} {abs(intercept):.2f}"

# Scatter of the observations with the fitted line and its equation on top
plt.scatter(mouse_weight_df, avg_tumor_df)
plt.plot(mouse_weight_df, regress_values, color="red")
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", color="red")
plt.xlabel("Weight (g)")
plt.ylabel("Avg Tumor Volume (mm3)")
plt.show()