# Pymaceuticals Inc.
----

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from sklearn import datasets
from scipy.stats import linregress


# Load the two study CSV files from the working directory.
study_results = pd.read_csv("Study_results.csv")
mouse_metadata = pd.read_csv("Mouse_metadata.csv")

# Left-join the mouse metadata onto the study results by 'Mouse ID', so every
# study row is kept and gains the metadata columns of its mouse.
mice_data = study_results.merge(mouse_metadata, how='left', on='Mouse ID')
mice_df = pd.DataFrame(mice_data)

# Preview the first rows of the merged table.
mice_df.head()

In [None]:
# Distinct mouse IDs in the merged data; the cell displays how many there are.
mouse_counts = mice_df.loc[:, 'Mouse ID'].unique()
len(mouse_counts)

In [None]:
# Find the mice whose ID appears more than once for the same timepoint.
# duplicated() flags every repeat of a (Mouse ID, Timepoint) pair after its
# first occurrence; the IDs of the flagged rows are then de-duplicated.
repeated_rows = mice_df.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate = mice_df.loc[repeated_rows, 'Mouse ID'].unique()
duplicate

In [None]:
# Collect every row that belongs to one of the mice listed in `duplicate`.
# isin() yields a boolean Series that is True where the row's Mouse ID is
# one of those IDs; it is used as a row mask.
is_duplicate_mouse = mice_df["Mouse ID"].isin(duplicate)
duplicate_data = mice_df.loc[is_duplicate_mouse]
duplicate_data

In [None]:
# Build the clean DataFrame by removing every row of the duplicated mice.
# The whole mouse is dropped by ID, not just its repeated timepoints.
clean_mice_df = mice_df[~mice_df['Mouse ID'].isin(duplicate)]
clean_mice_df

In [None]:
# Number of distinct mice remaining in the clean DataFrame.
mice_counts = clean_mice_df.loc[:, 'Mouse ID'].unique()
len(mice_counts)

## Summary Statistics

In [None]:
# Summary statistics of the tumor volume for each drug regimen:
# mean, median, variance, standard deviation and standard error of the mean.
# Each result is a Series indexed by 'Drug Regimen'; they are assembled into
# a single summary DataFrame in the following cell.

# Group once and reuse the same grouping for every statistic.
by_regimen = clean_mice_df.groupby(["Drug Regimen"])
tumor_col = 'Tumor Volume (mm3)'

mice_tumor_mean = by_regimen.mean(numeric_only=True)[tumor_col]
mice_tumor_median = by_regimen.median(numeric_only=True)[tumor_col]
mice_tumor_variance = by_regimen.var(numeric_only=True)[tumor_col]
mice_tumor_std = by_regimen.std(numeric_only=True)[tumor_col]
mice_tumor_SEM = by_regimen.sem(numeric_only=True)[tumor_col]

In [None]:
# Assemble the five per-regimen statistics into one summary table.
# All five Series share the 'Drug Regimen' index, so the DataFrame constructor
# aligns them row by row and the dict keys become the column names. No merge
# is needed: chaining merges of Series that all carry the same name makes the
# '_x'/'_y' suffixes collide, which recent pandas versions reject.
summary_stat = pd.DataFrame({
    'Mean Tumor Volume': mice_tumor_mean,
    'Median Tumor Volume': mice_tumor_median,
    'Tumor Volume Variance': mice_tumor_variance,
    'Tumor Volume Std. Dev.': mice_tumor_std,
    'Tumor Volume Std. Err.': mice_tumor_SEM,
})
summary_stat



In [None]:
# Same summary statistics (mean, median, variance, standard deviation, SEM of
# the tumor volume per drug regimen), produced with one groupby/agg call.
# The result has a two-level column index: (column name, statistic).
tumor_statistics = ['mean', 'median', 'var', 'std', 'sem']
summary_stats = clean_mice_df.groupby('Drug Regimen').agg(
    {'Tumor Volume (mm3)': tumor_statistics}
)
summary_stats

## Bar and Pie Charts

In [None]:
# Bar plot (pandas) of the number of timepoint rows recorded for each drug regimen.

# Count the 'Timepoint' entries per regimen, then order the regimens from the
# most to the fewest observations so the bars descend left to right.
timepoints_by_drug = (
    clean_mice_df.groupby('Drug Regimen')['Timepoint']
    .count()
    .sort_values(ascending=False)
)
timepoints_by_drug.plot.bar()



In [None]:
# Bar plot (pyplot) of the number of timepoint rows recorded for each drug regimen.
regimen_counts = clean_mice_df.groupby('Drug Regimen')['Timepoint'].count()
timepoints_by_drug = regimen_counts.sort_values(ascending=False)

# x positions are the regimen names, bar heights are the row counts.
drugs = timepoints_by_drug.index
timepoints = timepoints_by_drug.values

plt.bar(drugs, timepoints)
# Rotate the regimen names so they do not overlap.
plt.xticks(drugs, rotation=90)
plt.show()

In [None]:
# Pie plot (pandas) of the female versus male distribution.
# size() counts the rows of the clean data for each value of 'Sex'.
mice_by_sex = clean_mice_df.groupby('Sex').size()

mice_by_sex.plot(
    kind='pie',
    colors=['orange', 'blue'],
    autopct="%1.1f%%",
    startangle=180,
)


In [None]:
# Pie plot (pyplot) of the female versus male distribution.
# size() counts the rows of the clean data for each value of 'Sex'.
mice_by_sex = clean_mice_df.groupby('Sex').size()
gender = mice_by_sex.index
gender_by_number = mice_by_sex.values
colors = ['orange', 'blue']
plt.pie(gender_by_number, labels=gender, colors=colors, autopct="%1.1f%%", startangle=180)
# show must be called; the bare name `plt.show` only references the function
# and never renders the figure outside an inline notebook backend.
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume of each mouse (used below for Capomulin, Ramicane,
# Infubinol and Ceftamin).

# Step 1: the greatest (last) timepoint recorded for every mouse.
mice_group = clean_mice_df.groupby('Mouse ID')['Timepoint'].max()

# Step 2: join back onto the clean data by (Mouse ID, Timepoint) so only the
# rows measured at each mouse's last timepoint are kept.
clean2_mice_df = pd.merge(
    mice_group,
    clean_mice_df,
    how='left',
    on=['Mouse ID', 'Timepoint'],
)
clean2_mice_df.head()



In [None]:
#Thank you Daina from BCS Learning Assistant and classmate Zalak Gajjar

# Regimens to compare, in the order their volumes are stored and plotted.
drug_list = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# One Series of final tumor volumes per regimen, appended in drug_list order.
tumor_volume = []

for drug in drug_list:
    # Final tumor volumes of the mice on this regimen.
    on_this_drug = clean2_mice_df['Drug Regimen'] == drug
    vol = clean2_mice_df.loc[on_this_drug, 'Tumor Volume (mm3)']
    tumor_volume.append(vol)

    # Interquartile range and the 1.5 * IQR fences around it.
    q1 = vol.quantile(0.25)
    q3 = vol.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Anything outside the fences is reported as a potential outlier.
    above_fence = vol > upper_bound
    below_fence = vol < lower_bound
    outliers = vol.loc[above_fence | below_fence]
    print(f"{drug}'s potential outliers: {outliers}.")




In [None]:
# Box plot of the final tumor volumes, one box per regimen.
# `labels` must list the regimens in the same order as `tumor_volume`.
labels = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Draw outlier points as large red circles so they stand out.
outlier_style = {'markerfacecolor': 'red', 'marker': 'o', 'markersize': 14}

fig1, ax1 = plt.subplots()
ax1.boxplot(tumor_volume, labels=labels, flierprops=outlier_style)
ax1.set_ylabel('Final Tumor Volume (mm3)')

plt.show()

## Line and Scatter Plots

In [None]:
# Line plot of tumor volume vs. time point for a mouse treated with Capomulin
# (mouse l509).

# Select the mouse's rows with a boolean mask rather than iterating over the
# whole table row by row. The all-timepoint table (clean_mice_df) is required
# here: the final-timepoint table keeps only one row per mouse.
mouse_l509 = clean_mice_df.loc[clean_mice_df['Mouse ID'] == 'l509']
timepoints_L509 = mouse_l509['Timepoint'].tolist()
tumor_L509 = mouse_l509['Tumor Volume (mm3)'].tolist()

plt.plot(timepoints_L509, tumor_L509)
plt.xlabel('Timepoint (days)')
# These are the mouse's individual measurements, not averages.
plt.ylabel('Tumor Volume (mm3)')
plt.title("Capomulin treatment of mouse l509")

plt.show()



In [None]:
# Scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen.

# Average over every recorded timepoint (clean_mice_df). The final-timepoint
# table (clean2_mice_df) holds a single row per mouse, so a per-mouse mean
# taken from it would just be the last measurement, not an average.
mice_reduced = clean_mice_df.loc[clean_mice_df['Drug Regimen'] == 'Capomulin']

# Per-mouse mean tumor volume and mean weight, both indexed by 'Mouse ID' so
# the two Series line up point for point.
capomulin_by_mouse = mice_reduced.groupby('Mouse ID')
average_tumor = capomulin_by_mouse['Tumor Volume (mm3)'].mean()
weight_mouse = capomulin_by_mouse['Weight (g)'].mean()

plt.scatter(weight_mouse, average_tumor)
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.show()

In [None]:
# Correlation coefficient and linear regression model for mouse weight and
# average tumor volume for the Capomulin regimen.
# Uses weight_mouse and average_tumor computed in the scatter-plot cell.

# Least-squares fit of average tumor volume (y) against weight (x);
# rvalue is the correlation coefficient of the two Series.
(slope, intercept, rvalue, pvalue, stderr) = linregress(weight_mouse, average_tumor)
regress_values = weight_mouse * slope + intercept
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

plt.scatter(weight_mouse, average_tumor)
plt.plot(weight_mouse, regress_values, "r-")
# Anchor the equation in axes-fraction coordinates (upper-left corner) so it
# stays visible whatever the data range is; a fixed data-coordinate position
# is only drawn when that point happens to lie inside the axes.
plt.annotate(line_eq, (0.05, 0.9), xycoords='axes fraction', fontsize=15, color="red")
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
print(f"The correlation between mouse weight and the average tumor volume is {round(rvalue,2)}")
plt.show()