In [None]:
#%matplotlib notebook

## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data count and the study results()
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)
type(study_results)
study_results.nunique()

In [None]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint
study_results[study_results.duplicated(['Mouse ID', 'Timepoint'])]


In [None]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint
#Dropping all the columns for mice with duplicate timepoints
indexNames = study_results[ study_results['Mouse ID'] == 'g989' ].index
study_results.drop(indexNames , inplace=True)



In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
study_results1=pd.DataFrame(study_results)

In [None]:
study_results1.nunique()

In [None]:
## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe
df_right = pd.merge( mouse_metadata, study_results1, on='Mouse ID', how='right')
df_right.head()
#df_right.nunique()
df_group= df_right[['Drug Regimen','Tumor Volume (mm3)']]
df_group.head()
stat_anl=df_group.groupby('Drug Regimen').mean()
stat_anl.columns=['Mean']
#stat_anlm=df_group.groupby('Drug Regimen').mode()
stat_anlme=df_group.groupby('Drug Regimen').median()
stat_anlme.columns=['Median']
stat_anlstd=df_group.groupby('Drug Regimen').std()
stat_anlstd.columns=['Std Deviation']
stat_anlsem=df_group.groupby('Drug Regimen').sem()
stat_anlsem.columns=['SEM']
stat_anlvar= df_group.groupby('Drug Regimen').var()
stat_anlvar.columns=['Variance']
stat_anlvar

tot_sum= stat_anl.join(stat_anlme)
tot_sum1= tot_sum.join(stat_anlstd)
tot_sum2= tot_sum1.join(stat_anlsem)
tot_sum3 =tot_sum2.join(stat_anlvar)
tot_sum3


In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line
Agg_sum=df_right['Tumor Volume (mm3)'].groupby([df_right['Drug Regimen']]).agg(["mean","median","var","sem","std"])
Agg_sum


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pandas.
Mice_count=df_right['Mouse ID'].groupby([df_right['Drug Regimen']]).count()
Mice_count.index


Mice_count.plot(kind="bar", figsize=(10,3))

# Set a title for the chart
plt.title("Total Number of Unique Mice Tested per Drug")
plt.show()
plt.tight_layout()

In [None]:
Mice_count
Mice_count.columns=["Total_Mice"]
y_lim=Mice_count.max()

In [None]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.
plt.figure(figsize=(10,4))
plt.bar(Mice_count.index, Mice_count, color='r', alpha=0.5, align="center")
#plt.xticks(tick_locations, rain_df["State"], rotation="vertical")

plt.ylim(0, y_lim+10)
plt.title("Total Number of Unique Mice Tested per Drug")
plt.xlabel("Drug Regimen")
plt.ylabel("Total Mice")


In [None]:
#Generate a pie plot showing the distribution of female versus male mice using pandas

Mice_df = df_right.groupby('Sex')['Mouse ID'].nunique()
Mice_df
sex_chart=Mice_df.plot(kind="pie", autopct="%1.1f%%", figsize = (5,5))
sex_chart.set_ylabel("Distribution of female vesus male mouse")
plt.show()
plt.tight_layout()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
plt.figure(figsize=(5,5))
label=  ["Female", "Male"]
plt.pie(Mice_df, autopct="%1.1f%%",labels=label)
plt.ylabel("Distribution of female vesus male mouse")


## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the origial dataframe to get the tumor volume at the last timepoint
df_right
Last_tum=df_right['Tumor Volume (mm3)'].groupby(df_right['Mouse ID']).last()
#Last_tum_drug=Last_tum.loc["Capomulin",:]

Last_tum1=pd.DataFrame(Last_tum)
Last_tum1.columns=['Last_Tumor_Size']
Last_tum1

merged_tumor=pd.merge(mouse_metadata, Last_tum1, on='Mouse ID', how='right')
merged_tumor.set_index('Drug Regimen', inplace=True)
merged_tumor.index
merged_tumor_filt=merged_tumor.loc[('Capomulin','Ramicane','Infubinol', 'Ceftamin'), :]
merged_tumor_filt

In [None]:
# Put treatments into a list for for loop (and later for plot labels)
treat_drug = merged_tumor_filt.index.unique()
treat_drug

# Create empty list to fill with tumor vol data (for plotting)
tumor_vol_data=merged_tumor_filt['Last_Tumor_Size']
tumor_vol_data

# Calculate the IQR and quantitatively determine if there are any potential outliers. 
quartiles = tumor_vol_data.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq
    
    # Locate the rows which contain mice on each drug and get the tumor volumes
merged_tumor_filt1=merged_tumor_filt.loc["Capomulin","Last_Tumor_Size"]
merged_tumor_filt2=merged_tumor_filt.loc["Ramicane","Last_Tumor_Size"]
merged_tumor_filt3=merged_tumor_filt.loc["Infubinol","Last_Tumor_Size"]
merged_tumor_filt4=merged_tumor_filt.loc["Ceftamin","Last_Tumor_Size"]

# Calculate the IQR and quantitatively determine if there are any potential outliers for Capomulin. 
quartiles = merged_tumor_filt1.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr1 = upperq-lowerq
iqr1


# Calculate the IQR and quantitatively determine if there are any potential outliers for Ramicane. 
quartiles = merged_tumor_filt2.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr2 = upperq-lowerq
iqr2

# Calculate the IQR and quantitatively determine if there are any potential outliers for Infubinol. 
quartiles = merged_tumor_filt3.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr3 = upperq-lowerq
#iqr

# Calculate the IQR and quantitatively determine if there are any potential outliers for Ceftamin. 
quartiles = merged_tumor_filt4.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr4 = upperq-lowerq

#Print upper and lower bounds for each drug
lower_bound1 = lowerq - (1.5*iqr1)
upper_bound1= upperq + (1.5*iqr1)
print(f"Values below {lower_bound1} could be outliers for Capamulin")
print(f"Values above {upper_bound1} could be outliers for Capamulin")

lower_bound2 = lowerq - (1.5*iqr2)
upper_bound2= upperq + (1.5*iqr2)
print(f"Values below {lower_bound2} could be outliers for Ramicane")
print(f"Values above {upper_bound2} could be outliers for Ramicane")

lower_bound3 = lowerq - (1.5*iqr3)
upper_bound3= upperq + (1.5*iqr3)
print(f"Values below {lower_bound3} could be outliers for Infubinol")
print(f"Values above {upper_bound3} could be outliers for Infubinol")

lower_bound4 = lowerq - (1.5*iqr4)
upper_bound4= upperq + (1.5*iqr4)
print(f"Values below {lower_bound4} could be outliers for Ceftamin")
print(f"Values above {upper_bound4} could be outliers for Ceftamin")

 

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
df_cap=df_right.loc[:,['Drug Regimen','Tumor Volume (mm3)','Timepoint','Mouse ID']]
df_cap.set_index('Drug Regimen', inplace=True)
df_cap1=df_cap.loc["Capomulin",:]

df_cap1.set_index('Mouse ID', inplace=True)
df_cap2=df_cap1.loc["s185",:]
               
plt.figure(figsize=(8,8))            
plt.plot(df_cap2['Tumor Volume (mm3)'], df_cap2['Timepoint'], color='r', alpha=0.5, linewidth=2, marker='o')

plt.ylim(0, 55)
plt.xlim(20,50)
plt.title("Tumor Volume Vs Timepoint for Capomulin for mouse S185")
plt.xlabel("Tumor Volume mm3")
plt.ylabel("Timepoint")


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

df_mv=df_right.loc[:,['Drug Regimen','Tumor Volume (mm3)','Weight (g)','Mouse ID']]
df_mv.set_index('Drug Regimen', inplace=True)
df_mv1=df_mv.loc["Capomulin",:]

avg_tum=df_mv1['Tumor Volume (mm3)'].groupby(df_mv1['Mouse ID']).mean()

avg_tum1=pd.DataFrame(avg_tum)
avg_tum1.columns=['Average']
#avg_tum1.set_index('Mouse ID', inplace=True)
mv_wt=df_mv1['Weight (g)'].groupby(df_mv1['Mouse ID']).unique()
mv_wt1=pd.DataFrame(mv_wt)


#avg_tum1.index
mv_wt1
#x_axis = np.arange(0, 10, 0.1)
#y_axis=

plt.title("Average Tumor Volume Vs Mouse Weight for Capomulin")
plt.xlabel("Mouse Weight")
plt.ylabel("Tumor Volume (mm3)")
#plt.figure(figsize=(8,8))   
plt.ylim(30,55)
plt.xlim(10,30)
plt.scatter(mv_wt1['Weight (g)'], avg_tum1['Average'], marker="o", facecolor="red",edgecolors="black",alpha=0.75, s=40)
plt.show()
 


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen


#Observations:  
#1.From one of the studies it was concluded with treatment for Capomulin tumor sizes decreased as the timepoint increased
#2.Also Capomulin average tumore volume is between the range of 40-45 mm3 when the mouse weight is between 17.5 and 25 months.
#3.Most mices were tested with Capomulin and Ramicane.
#4.The male and female popluation mice tested were equally tested.
 
