## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two CSV inputs (relative to the notebook's working directory)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata, study_results = (
    pd.read_csv(csv_path) for csv_path in (mouse_metadata_path, study_results_path)
)

# Join the per-mouse metadata onto the study measurements via the shared
# "Mouse ID" column (default inner join)
combined_data = mouse_metadata.merge(study_results, on="Mouse ID")

# Preview the first rows of the combined table
combined_data.head()

In [None]:
# Checking the number of mice.
# nunique() returns the count of distinct Mouse IDs directly; value_counts()
# would instead list how many rows each mouse has.
combined_data['Mouse ID'].nunique()

In [None]:
# Find the Mouse IDs that have more than one row for the same
# (Mouse ID, Timepoint) pair. duplicated() flags every repeat after the first
# occurrence; unique() collapses the flagged IDs into an array.
duplicate_id = (
    combined_data["Mouse ID"][combined_data.duplicated(subset=["Mouse ID", "Timepoint"])]
    .unique()
)
duplicate_id

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# Use the IDs computed in `duplicate_id` rather than a hard-coded value so the
# cell stays correct if the input data (and therefore the duplicates) change.
combined_data.loc[combined_data['Mouse ID'].isin(duplicate_id)]


In [None]:
# Create a clean DataFrame by dropping every row that belongs to a duplicated mouse ID.
# `~` negates the boolean mask (idiomatic replacement for `== False`).
cleaned_combined_data = combined_data[~combined_data['Mouse ID'].isin(duplicate_id)]
cleaned_combined_data.head()

In [None]:
# Checking the number of mice in the clean DataFrame.
# nunique() returns the count of distinct Mouse IDs directly; value_counts()
# would instead list how many rows each mouse has.
cleaned_combined_data['Mouse ID'].nunique()

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen.

# Select the tumor-volume column *before* aggregating. Aggregating the whole
# frame first would also try to average the string columns, which raises a
# TypeError on pandas >= 2.0, and it computes statistics that are thrown away.
tumor_by_regimen = cleaned_combined_data.groupby('Drug Regimen')['Tumor Volume (mm3)']

# One Series per statistic, each indexed by drug regimen
mean = tumor_by_regimen.mean()
median = tumor_by_regimen.median()
var = tumor_by_regimen.var()
std_dev = tumor_by_regimen.std()
SEM = tumor_by_regimen.sem()

# Assemble the resulting series into a single summary dataframe.
summary_groupby = pd.DataFrame({"Mean": mean,
                                "Median": median,
                                "Variance": var,
                                "Standard Deviation": std_dev,
                                "SEM": SEM})
summary_groupby


In [None]:
# Same summary table as above (mean, median, variance, standard deviation, SEM
# of tumor volume per regimen), produced with a single groupby/agg call.
summary_agg = cleaned_combined_data.groupby("Drug Regimen").agg(
    {"Tumor Volume (mm3)": ["mean", "median", "var", "std", "sem"]}
)
summary_agg

## Bar and Pie Charts

In [None]:
# Bar plot (pandas plotting API) of how many measurement rows exist for each
# drug regimen. value_counts() counts rows per regimen, sorted descending.
unique_mice = cleaned_combined_data["Drug Regimen"].value_counts()
unique_mice.plot(kind="bar")
plt.xlabel("Drug Regimen")
plt.ylabel("Total Number of Measurements")


In [None]:
# Generate a bar plot showing the total number of measurements for each drug regimen using pyplot.
# Take the labels from the same value_counts() Series that supplies the bar
# heights. Series.unique() returns regimens in order of first appearance, which
# does not match value_counts()' descending-count order and would mislabel bars.
x_axis = unique_mice.index
plt.bar(x_axis, unique_mice.values, width=0.5)
plt.xticks(rotation=90)
plt.xlabel("Drug Regimen")
plt.ylabel("Total Number of Measurements")

In [None]:
# Pie chart (pandas plotting API) of the number of rows per sex in the
# cleaned data, with each wedge annotated with its percentage.
gender_data = cleaned_combined_data["Sex"].value_counts()
gender_data.plot(kind="pie", title="Distribution of female versus male", autopct="%1.1f%%")



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# Labels must come from the index of the counts being plotted. Series.unique()
# returns values in order of first appearance, which is not guaranteed to match
# value_counts()' descending-count order and could swap the wedge labels.
labels = list(gender_data.index)
plt.pie(gender_data, labels=labels, autopct='%1.1f%%')
plt.ylabel("Sex")




## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse
# (a Series named "Timepoint", indexed by "Mouse ID")
last_timepoint = cleaned_combined_data.groupby(['Mouse ID'])['Timepoint'].max()

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint.
# The join key must include "Timepoint": joining on "Mouse ID" alone keeps every
# timepoint row for each mouse (and yields Timepoint_x / Timepoint_y columns),
# so the result would not be restricted to the final measurement.
merged_df = pd.merge(last_timepoint.reset_index(), cleaned_combined_data,
                     on=['Mouse ID', 'Timepoint'], how='left')
merged_df.head()


In [None]:
# Put treatments into a list for the for loop (and later for plot labels)
drugs = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Keep only the rows for the regimens of interest. (The previous filter compared
# against `drug`, which is not defined until the loop below and raised NameError.)
merged_df = merged_df.loc[merged_df["Drug Regimen"].isin(drugs)]

# Create empty list to fill with tumor vol data (for plotting)
tumor_vol = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for drug in drugs:
    # Locate the rows which contain mice on each drug and get the tumor volumes
    tumor_vol_value = merged_df.loc[merged_df["Drug Regimen"] == drug, 'Tumor Volume (mm3)']

    # add subset
    tumor_vol.append(tumor_vol_value)

    # Determine outliers using upper and lower bounds (1.5 * IQR rule)
    quartiles = tumor_vol_value.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq
    lower_bound = lowerq - (1.5 * iqr)
    # The upper fence is Q3 *plus* 1.5 * IQR; subtracting would flag ordinary values.
    upper_bound = upperq + (1.5 * iqr)

    # Build the mask from this drug's own values so mask and data share one index
    outliers = tumor_vol_value.loc[(tumor_vol_value < lower_bound) | (tumor_vol_value > upper_bound)]
    print(f"{drug}'s potential outliers: {outliers}")



In [None]:
# Box plot of the tumor-volume series collected in `tumor_vol`, one box per
# entry in `drugs`; outlier markers are drawn as larger red points.
flierprops = {"markerfacecolor": "red", "markersize": 8}
plt.boxplot(tumor_vol, labels=drugs, flierprops=flierprops)


## Line and Scatter Plots

In [None]:
# Line plot of tumor volume against timepoint for one Capomulin-treated mouse.
# First narrow the cleaned data to the Capomulin regimen, then to mouse s185.
capomulin_df = cleaned_combined_data[cleaned_combined_data["Drug Regimen"] == "Capomulin"]
mouse = capomulin_df[capomulin_df["Mouse ID"] == "s185"]

plt.plot(mouse["Timepoint"], mouse["Tumor Volume (mm3)"])
plt.xlabel("Timepoint (days)")
plt.ylabel("Tumor Volume (mm3)")
plt.title("Capomulin treatment of mouse s185")


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
# numeric_only=True restricts the per-mouse mean to numeric columns; without it
# pandas >= 2.0 raises TypeError when it reaches the string columns.
capomulin_average = capomulin_df.groupby(['Mouse ID']).mean(numeric_only=True)
plt.scatter(capomulin_average['Weight (g)'], capomulin_average['Tumor Volume (mm3)'])
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen
capomulin_df = cleaned_combined_data.loc[cleaned_combined_data['Drug Regimen'] == "Capomulin"]

# Per-mouse averages. numeric_only=True skips the string columns, which would
# otherwise make mean() raise TypeError on pandas >= 2.0.
capomulin_average = capomulin_df.groupby(['Mouse ID']).mean(numeric_only=True)
x = capomulin_average['Weight (g)']
y = capomulin_average['Tumor Volume (mm3)']

# Pearson correlation; element 0 of the result is the coefficient r.
# Report it explicitly, since the scatter plot alone does not show the value.
correlation = st.pearsonr(x, y)
print(f"Correlation between mouse weight and average tumor volume: {correlation[0]:.2f}")

# Least-squares fit; use the named slope/intercept fields instead of positional indexes
line = st.linregress(x, y)
regression_line = x * line.slope + line.intercept

# Scatter of the per-mouse averages with the fitted line overlaid in red
plt.scatter(x, y)
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.plot(x, regression_line, color="red")
