# Observations and Insights

#### Analysis

The data indicates that the drugs Ramicane and Capomulin are the most effective in reducing Tumor Volume (mm3). 

There were no outliers for Capomulin, Ramicane, or Ceftamin. Infubinol was the only drug with an outlier.

The Capomulin drug was less effective as the mouse's weight increased.

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress

# Locations of the two CSV inputs (relative to the notebook's working directory)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the per-mouse metadata and the per-timepoint study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join the two tables on their shared "Mouse ID" column (default inner join)
combined_mice_data = mouse_metadata.merge(study_results, on="Mouse ID")

# Preview the merged table
combined_mice_data

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [2]:
# Checking the number of mice.
# NOTE(review): Series.count() tallies the non-null "Mouse ID" entries, i.e. one
# per row of the merged table, so this value is the number of records rather
# than the number of distinct mice. `.nunique()` would give the latter —
# confirm which figure is intended before relying on it.
mice_count = combined_mice_data["Mouse ID"].count()
mice_count

1893

In [3]:
# Find the Mouse IDs that have more than one record for the same
# (Mouse ID, Timepoint) pair.
repeated_rows = combined_mice_data.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate_mice = combined_mice_data.loc[repeated_rows, "Mouse ID"].unique()
duplicate_mice

array(['g989'], dtype=object)

In [4]:
# Optional: Get all the data for the duplicate mouse ID.
# Filter on the IDs found in `duplicate_mice` instead of a hard-coded ID so the
# cell stays correct if the input data (and therefore the duplicated mouse) changes.
duplicate_mice_id = combined_mice_data.loc[combined_mice_data["Mouse ID"].isin(duplicate_mice)]
duplicate_mice_id

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [6]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Every row belonging to a mouse listed in `duplicate_mice` is removed (not just
# the repeated rows); `~` negates the boolean mask instead of comparing to False.
is_duplicate_mouse = combined_mice_data["Mouse ID"].isin(duplicate_mice)
mice_duplicates_removed = combined_mice_data[~is_duplicate_mouse]
mice_duplicates_removed

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [None]:
# Checking the number of mice in the clean DataFrame.
# The table holds one row per mouse per timepoint, so count distinct IDs
# (nunique) rather than rows (count) to get the number of mice.
final_mice = mice_duplicates_removed["Mouse ID"].nunique()
final_mice

## Summary Statistics

In [None]:
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen:
# mean, median, variance, standard deviation, and SEM of the tumor volume.

# Select the tumor-volume column *before* aggregating. Aggregating the whole
# frame first tries to reduce the text columns as well, which raises TypeError
# on pandas >= 2.0 and computes statistics that are immediately discarded.
tumor_by_regimen = mice_duplicates_removed.groupby("Drug Regimen")['Tumor Volume (mm3)']

mean = tumor_by_regimen.mean()
print(f"The mean tumor volume for each regimen is {mean}")

median = tumor_by_regimen.median()
print(f"The median tumor volume for each regimen is {median}")

variance = tumor_by_regimen.var()
print(f"The tumor volume variance for each regimen is {variance}")

std = tumor_by_regimen.std()
print(f"The standard deviation of tumor volume for each regimen is {std}")

sem = tumor_by_regimen.sem()
print(f"The SEM value for tumor volume for each regimen is {sem}")

# Assemble the resulting series into a single summary dataframe
# (one row per drug regimen, one column per statistic).
stat_summary = pd.DataFrame({"Mean": mean,
                            "Median": median,
                            "Variance": variance,
                            "Standard Deviation": std,
                            "SEM": sem})

In [None]:
# Display the per-regimen summary-statistics table assembled in the previous cell.
stat_summary

## Bar and Pie Charts

In [None]:
# Count the recorded timepoints (rows) per drug regimen; this Series feeds the
# Pandas bar plot drawn in the next cell.
rows_by_regimen = mice_duplicates_removed.groupby("Drug Regimen")
list1 = rows_by_regimen["Timepoint"].count()
list1

In [None]:
# Integer positions, one per regimen, and the same positions as a plain list.
x_axis = np.arange(len(list1))
ticks = list(x_axis)

# Bar plot via the Pandas plotting accessor, then label both axes on the
# returned Axes object.
bar_charts_one = list1.plot.bar(title="Drug Regimens vs Number of Mice Tested")
bar_charts_one.set(xlabel="Drug Regimen", ylabel="Number of Mice Tested")
plt.show()

In [None]:
# Same chart as above, drawn directly with pyplot: one bar per drug regimen,
# whose height is the number of rows (timepoints) recorded for that regimen.
counts = mice_duplicates_removed["Drug Regimen"].value_counts()
regimen_names = counts.index.values
rows_per_regimen = counts.values

plt.bar(regimen_names, rows_per_regimen)
plt.title("Drug Regimens vs Number of Mice Tested")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Mice Tested")
# Rotate the regimen names so they do not overlap
plt.xticks(rotation=90)
plt.show()

In [None]:
# Pie chart (Pandas plotting accessor) of how the rows split between
# female and male mice, with each wedge labelled by its percentage.
counts = mice_duplicates_removed["Sex"].value_counts()
counts.plot.pie(autopct='%1.1f%%')
plt.show()

In [None]:
# Number of rows recorded for each sex (non-null "Mouse ID" entries per group).
rows_by_sex = mice_duplicates_removed.groupby("Sex")
list2 = rows_by_sex["Mouse ID"].count()
list2

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
labels = ["Female", "Male"]
colors = ["orange", "steelblue"]

# Derive the wedge sizes from the cleaned data instead of hard-coding them
# (and as numbers, not strings), so the chart cannot drift out of sync with
# the dataset. reindex() guarantees the sizes follow the order of `labels`;
# a sex that is absent from the data gets a zero-sized wedge.
sex_counts = mice_duplicates_removed["Sex"].value_counts()
sizes = sex_counts.reindex(labels, fill_value=0).tolist()

plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%')
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse.
# Only "Timepoint" is reduced here: taking max() of the text "Drug Regimen"
# column is a lexical maximum, not a lookup, so the regimen is picked up from
# the merge below instead. reset_index() turns "Mouse ID" back into an ordinary
# column so the merge keys are plain columns on both sides.
last_timepoints = (
    mice_duplicates_removed.groupby("Mouse ID")["Timepoint"].max().reset_index()
)

# Merge this group df with the cleaned dataframe to get the regimen and tumor
# volume recorded at each mouse's last timepoint. The cleaned frame is used
# (not the raw merge) so the excluded duplicate mouse can never reappear.
result_columns = ["Mouse ID", "Drug Regimen", "Timepoint", "Tumor Volume (mm3)"]
final_tumor_volume = last_timepoints.merge(
    mice_duplicates_removed[result_columns],
    how="inner",
    on=["Mouse ID", "Timepoint"],
)[result_columns]
final_tumor_volume.head()

In [None]:
# Inspect the column dtypes of the final-tumor-volume table built in the previous cell.
final_tumor_volume.dtypes

In [None]:
# Regimens of interest; reused later as the box-plot labels
drugs = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# One Series of final tumor volumes per regimen, in the same order as `drugs`
tumor_vol_data = []

# For every regimen: collect its final tumor volumes, compute the IQR fences
# (1.5 * IQR beyond the first and third quartiles) and report any values that
# fall outside them.
for drug in drugs:
    on_this_drug = final_tumor_volume['Drug Regimen'] == drug
    volumes = final_tumor_volume.loc[on_this_drug, 'Tumor Volume (mm3)']
    tumor_vol_data.append(volumes)

    # First and third quartiles of this regimen's volumes
    first_quartile, third_quartile = volumes.quantile([0.25, 0.75])
    spread = third_quartile - first_quartile
    low_fence = first_quartile - (1.5 * spread)
    high_fence = third_quartile + (1.5 * spread)

    # Anything beyond either fence is a potential outlier
    beyond_fences = (volumes < low_fence) | (volumes > high_fence)
    outliers = volumes.loc[beyond_fences]

    print(f"{drug}")
    print(f"Outliers for {drug} are {outliers}")

In [None]:
# Box plot of the final tumor volumes: one box per regimen in `drugs`,
# drawn from the per-regimen Series collected in `tumor_vol_data`.
fig1, ax1 = plt.subplots()
ax1.boxplot(tumor_vol_data, labels=drugs)
ax1.set(title="Tumor Volume vs Drug Regimens", ylabel="Tumor Volume (mm3)")
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
capomulin_data = mice_duplicates_removed.loc[mice_duplicates_removed['Drug Regimen'] == 'Capomulin']

# Pick one Capomulin record at random and take its Mouse ID. The draw is
# unseeded, so a different mouse may be plotted on every run; pass
# random_state=<int> to sample() if a reproducible figure is needed.
sample_mouse_id = capomulin_data['Mouse ID'].sample().iloc[0]

# All records for the chosen mouse
cap_mouse_sample = capomulin_data[capomulin_data['Mouse ID'] == sample_mouse_id]

plt.plot(cap_mouse_sample['Timepoint'],
         cap_mouse_sample['Tumor Volume (mm3)'],
         color='blue', label='Capomulin')
plt.title("Tumor Volume over Time with Capomulin Treatment")
plt.xlabel("Timepoint (Days)")
plt.ylabel("Tumor Volume mm3 ")
# Render the figure explicitly, as every other plotting cell does
plt.show()

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
# x: mean weight per mouse, y: mean tumor volume per mouse (both indexed by Mouse ID)
capomulin_by_mouse = capomulin_data.groupby("Mouse ID")
x = capomulin_by_mouse["Weight (g)"].mean()
y = capomulin_by_mouse["Tumor Volume (mm3)"].mean()

# Plot the per-mouse averages computed above. The previous call used
# `x_axis`/`y_axis`: `y_axis` is never defined, and `x_axis` holds the bar
# chart's tick positions rather than mouse weights.
plt.scatter(x, y)
plt.title("Capomulin Regimen Tumor Volume vs. Weight")
plt.xlabel("Weight (g)")
plt.ylabel("Tumor Volume (mm3)")
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen
# (x = per-mouse mean weight, y = per-mouse mean tumor volume from the scatter cell)
correlation = st.pearsonr(x, y)
print(f"The Correlation is {round(correlation[0],2)}")

# Least-squares fit y ~ slope * x + intercept
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x, y)
regress_values = x * slope + intercept
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

plt.scatter(x, y)
plt.plot(x,regress_values,"r-")
# Anchor the equation in axes-fraction coordinates (upper-left corner). A fixed
# data-coordinate anchor such as (20, 20) is only drawn when that point happens
# to lie inside the plotted range, so the label could silently disappear.
plt.annotate(line_eq, (0.05, 0.9), xycoords="axes fraction", fontsize=10, color="red")
plt.title("Capomulin Regimen - Tumor Volume vs. Weight")
plt.xlabel("Weight (g)")
plt.ylabel("Tumor Volume (mm3)")
plt.show()