# Pymaceuticals Inc.
---
## Analysis of Mouse Study Data

This study analyzes the effects of various drug regimens on tumor growth in mice.

### Dependencies and Setup

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import sem

# Locations of the two CSV inputs (relative to the working directory)
mouse_metadata_path = "Mouse_metadata.csv"
study_results_path = "Study_results.csv"

# Load each CSV into its own DataFrame
study_results = pd.read_csv(study_results_path)
mouse_metadata = pd.read_csv(mouse_metadata_path)

# Left-join the per-mouse metadata onto every study result row, keyed on "Mouse ID",
# so each measurement row also carries that mouse's metadata columns
study_data_complete = study_results.merge(mouse_metadata, how="left", on="Mouse ID")

### Display the number of unique mice.

In [None]:
# Count the distinct Mouse ID values in the merged data
# (dropna=False keeps the result equal to len(unique()), which also counts a missing ID)
number_of_mice = study_data_complete["Mouse ID"].nunique(dropna=False)
print(f"Number of unique mice: {number_of_mice}")

### Check for any mouse ID with duplicate time points

In [None]:
# Flag every row whose (Mouse ID, Timepoint) pair repeats an earlier row;
# with the default keep='first', the first occurrence itself is not flagged
is_repeat = study_data_complete.duplicated(subset=['Mouse ID', 'Timepoint'])
duplicates = study_data_complete.loc[is_repeat]

print("Duplicate mice by ID and Timepoint:")
print(duplicates)

### Create a clean DataFrame by dropping the duplicate mouse by its ID.

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# A mouse that has more than one row for the same timepoint has conflicting
# measurements, so none of its rows can be trusted. Removing only the repeated
# rows (drop_duplicates) would keep that mouse in the analysis with an arbitrary
# choice of measurements, so every row belonging to such a mouse is removed.
duplicate_mouse_ids = study_data_complete.loc[
    # keep=False flags all members of each duplicated (Mouse ID, Timepoint) group
    study_data_complete.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False),
    'Mouse ID',
].unique()
clean_study_data = study_data_complete[
    ~study_data_complete['Mouse ID'].isin(duplicate_mouse_ids)
]

# Checking the number of mice in the clean DataFrame
clean_mice_count = len(clean_study_data["Mouse ID"].unique())
print(f"Number of mice in clean data: {clean_mice_count}")

## Summary Statistics

In [None]:
# Summary statistics of tumor volume per drug regimen:
# mean, median, variance, standard deviation and standard error of the mean
tumor_by_regimen = clean_study_data.groupby('Drug Regimen')['Tumor Volume (mm3)']
stat_names = ['mean', 'median', 'var', 'std', 'sem']
summary_stats = tumor_by_regimen.agg(stat_names)

# Show the resulting table (one row per regimen, one column per statistic)
print(summary_stats)

## Bar and Pie Charts

In [None]:
# Generate bar plots of the number of observations recorded for each treatment regimen.
# value_counts() tallies rows of the long-format table (one row per mouse per
# timepoint), not distinct mice, so the charts are labelled as observation counts.
regimen_counts = clean_study_data['Drug Regimen'].value_counts()

# Create bar chart using Pandas
regimen_counts.plot(kind='bar')
plt.title('Observed Timepoints per Drug Regimen')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Observed Mouse Timepoints')
plt.xticks(rotation=45)
plt.show()

# Create bar chart using pyplot
plt.figure(figsize=(10,6))
plt.bar(regimen_counts.index, regimen_counts.values)
plt.title('Observed Timepoints per Drug Regimen')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Observed Mouse Timepoints')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Generate pie plots showing the distribution of female versus male mice.
# The study table has one row per mouse per timepoint, so counting rows would
# weight each mouse by how many timepoints it has. Keep a single row per
# Mouse ID first so that every mouse contributes exactly once.
unique_mice = clean_study_data.drop_duplicates(subset='Mouse ID')
gender_counts = unique_mice['Sex'].value_counts()

# Pie chart using pandas
gender_counts.plot(kind='pie', autopct='%1.1f%%')
plt.title('Male vs Female Mouse Distribution')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

# Pie chart using pyplot
plt.figure(figsize=(10,6))
plt.pie(gender_counts.values, labels=gender_counts.index, autopct='%1.1f%%')
plt.title('Male vs Female Mouse Distribution')
plt.axis('equal')
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Get the maximum (last recorded) timepoint for each mouse
max_timepoint = clean_study_data.groupby(['Mouse ID'])['Timepoint'].max()

# Merge this per-mouse maximum with the study data so that only the row at each
# mouse's last timepoint is kept, giving the tumor volume at that timepoint
merge_max = pd.merge(max_timepoint, clean_study_data, on=['Mouse ID', 'Timepoint'], how='left')

# Put treatments into a list for the loop (and later for plot labels)
treatments = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Create empty list to fill with tumor volume data (for plotting)
final_tumor_vol = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for drug in treatments:

    # Locate the rows which contain mice on each drug and get the tumor volumes
    vol = merge_max.loc[merge_max['Drug Regimen'] == drug, 'Tumor Volume (mm3)']

    # Add subset to the list
    final_tumor_vol.append(vol)

    # Calculate quartiles, IQR, and the 1.5 * IQR fences used to flag outliers
    quartiles = vol.quantile([0.25, 0.5, 0.75])
    lower_quartile = quartiles[0.25]
    upper_quartile = quartiles[0.75]
    iqr = upper_quartile - lower_quartile
    lower_bound = lower_quartile - (1.5 * iqr)
    upper_bound = upper_quartile + (1.5 * iqr)

    # Print results; the leading "\n" separates each drug's report with a blank line
    # (the string literal must stay on one physical line to be valid Python)
    print(f"\n{drug}")
    print("--------------------")
    print(f"Lower Quartile: {lower_quartile:.2f}")
    print(f"Upper Quartile: {upper_quartile:.2f}")
    print(f"IQR: {iqr:.2f}")
    print(f"Lower Bound: {lower_bound:.2f}")
    print(f"Upper Bound: {upper_bound:.2f}")

    # Determine outliers: values strictly outside the fences
    outliers = vol[(vol < lower_bound) | (vol > upper_bound)]
    if not outliers.empty:
        print('Outliers:')
        print(outliers)

In [None]:
# Generate a box plot that shows the distribution of the tumor volume for each treatment group
plt.figure(figsize=(10,6))
plt.boxplot(final_tumor_vol)
# Label the boxes through xticks rather than boxplot's `labels=` keyword, which
# newer Matplotlib releases deprecate in favour of `tick_labels=`; boxes are drawn
# at positions 1..N by default, so the tick positions are 1..len(treatments).
plt.xticks(range(1, len(treatments) + 1), treatments)
plt.title('Final Tumor Volume by Drug Regimen')
plt.ylabel('Final Tumor Volume (mm3)')
plt.show()

## Line and Scatter Plots

In [None]:
# Line plot of tumor volume over time for one Capomulin-treated mouse
# (the mouse is whichever ID appears first among the Capomulin rows)
is_capomulin = clean_study_data['Drug Regimen'] == 'Capomulin'
mouse_capomulin = clean_study_data[is_capomulin]
mouse_id = mouse_capomulin['Mouse ID'].iloc[0]
single_mouse = mouse_capomulin[mouse_capomulin['Mouse ID'] == mouse_id]

# Blue solid line with circle markers (same as the 'b-o' format string)
plt.plot(
    single_mouse['Timepoint'],
    single_mouse['Tumor Volume (mm3)'],
    color='b',
    linestyle='-',
    marker='o',
)
plt.xlabel('Timepoint (days)')
plt.ylabel('Tumor Volume (mm3)')
plt.title(f'Tumor Volume vs. Timepoint for Mouse {mouse_id}')
plt.grid()
plt.show()

In [None]:
# Scatter plot of mouse weight vs. average observed tumor volume for the Capomulin
# regimen, with the Pearson correlation and a least-squares regression line
capomulin_data = clean_study_data[clean_study_data['Drug Regimen'] == 'Capomulin']

# Per-mouse means of weight and tumor volume (one row per Mouse ID)
avg_tumor_vol = capomulin_data.groupby('Mouse ID')[['Weight (g)', 'Tumor Volume (mm3)']].mean()
x_values = avg_tumor_vol['Weight (g)']
mean_volumes = avg_tumor_vol['Tumor Volume (mm3)']

# Draw the scatter points
plt.scatter(x_values, mean_volumes)
plt.title('Mouse Weight vs. Average Tumor Volume (Capomulin)')
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.grid(True)

# Pearson correlation; element 0 of the result is the coefficient
correlation = st.pearsonr(x_values, mean_volumes)
print(f"The correlation coefficient between mouse weight and average tumor volume is {correlation[0]:.2f}")

# Least-squares fit of mean tumor volume on weight
slope, intercept, r_value, p_value, std_err = st.linregress(x_values, mean_volumes)

# Overlay the fitted line, evaluated at the observed weights
y_values = slope * x_values + intercept
plt.plot(x_values, y_values, 'r--', label=f'Regression line (r = {r_value:.2f})')
plt.legend()
plt.show()

print(f"The r-squared value is: {r_value**2:.4f}")

## Correlation and Regression

In [None]:
# Fit the linear regression of average tumor volume on mouse weight and
# report the fit (r-squared, slope, intercept and the resulting line equation)
weights = avg_tumor_vol['Weight (g)']
volumes = avg_tumor_vol['Tumor Volume (mm3)']
slope, intercept, r_value, p_value, std_err = st.linregress(weights, volumes)

print(f"The r-squared value is: {r_value**2:.4f}")
print(f"The slope is: {slope:.4f}")
print(f"The y-intercept is: {intercept:.4f}")
print(f"Therefore, the line equation is: y = {slope:.4f}x + {intercept:.4f}")