# Observations and Insights

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
combined_mice_data = pd.merge(mouse_metadata, study_results, on="Mouse ID")

# Display the data table for preview
combined_mice_data

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [2]:
# Checking the number of mice.
mice_count = combined_mice_data["Mouse ID"].count()
mice_count

1893

In [3]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
duplicate_mice = combined_mice_data.loc[combined_mice_data.duplicated(subset=["Mouse ID", "Timepoint"]),"Mouse ID"].unique()
duplicate_mice

array(['g989'], dtype=object)

In [4]:
# Optional: Get all the data for the duplicate mouse ID. 
duplicate_mice_id = combined_mice_data.loc[combined_mice_data["Mouse ID"] == "g989"]
duplicate_mice_id                              

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [5]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
mice_duplicates_removed = combined_mice_data[combined_mice_data["Mouse ID"].isin(duplicate_mice)==False]
mice_duplicates_removed

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [14]:
# Checking the number of mice in the clean DataFrame.
final_mice = mice_duplicates_removed["Mouse ID"].count()
final_mice

1880

## Summary Statistics

In [16]:
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen:       
# mean, median, variance, standard deviation, and SEM of the tumor volume. 

tumor_volume = mice_duplicates_removed["Tumor Volume (mm3)"]
tumors = tumor_volume.groupby("Drug Regimen")

mean_numpy = np.mean(tumors)
print(f"The mean tumor volume for each regimen is {mean_numpy}")

median_numpy = np.median(tumors)
print(f"The median tumor volume for each regimen is {median_numpy}")

var_numpy = np.var(tumors,ddof = 0)
print(f"The tumor volume variance for each regimen is {var_numpy}")

sd_numpy = np.std(tumors,ddof = 0)
print(f"The standard deviation of tumor volume for each regimen is {sd_numpy}")

final_mice.sem = (0, True, None, 1, None)
print(f"The SEM value for tumor volume for each regimen is {sem(sample['Tumor Volume (mm3)'])}")

# Assemble the resulting series into a single summary dataframe.
stat_summary = pd.DataFrame({"Mean": mean_numpy,
                            "Median": median_numpy,
                            "Variance": var_numpy,
                            "Standard Deviation": sd_numpy,
                            "SEM": sem})

KeyError: 'Drug Regimen'

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.
bar_chart_one = final_mice[["Drug Regimen","Timepoint"]]
bar_chart_one = bar_chart_one.set_index("Drug Regemin")

bar_chart_one.head()

bar_chart_one.plot(kind="bar", figsize=(20,3.5))

# Set a title for the chart
plt.title("Drug Regimens vs Timepoints")

plt.show()
plt.tight_layout()

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.
x_axis=np.arange(len(final_mice))
tick_locations = [value for value in x_axis]

plt.bar(x_axis, final_mice["Timepoint"], color='r', alpha=0.5, align="center")
plt.xticks(tick_locations, final_mice["Drug Regimen"], rotation="vertical")

plt.title("Drug Regimens vs Timepoints")
plt.xlabel("Drug Regimens")
plt.ylabel ("Timepoints")
plt.xlim (0, 100)
plt.ylim (0, 100)

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas
gender_pie = final_mice.plot(kind="pie", y='Sex', title=("Mice Gender"))
branch_pie.set_ylabel("Gender")

plt.axis("equal")
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
labels = ["Female", "Male"]
colors = ["red", "blue"]
plt.pie(final_mice, explode=explode, labels=pies, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=140)
plt.axis("equal")

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint

In [None]:
# Put treatments into a list for for loop (and later for plot labels)
drug_regimens = ["Capomulin" , "Ramicane", "Infubinol", "Ceftamin"]

# Create empty list to fill with tumor vol data (for plotting)
tumor_vol_data = []

# Calculate the IQR and quantitatively determine if there are any potential outliers. 
for drugs in drug_regimens:
    
    quartiles = volume.quantile([.25,.5,.75])
    lowerq=quartiles[0.25]
    upperq=quartiles[0.75]
    iqr=upperq-lowerq
    
    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    tumor_count = new_df.loc[new_df["Drug Regimen"] == drugs, "Tumor Volume (mm3)"]
    
    # add subset 
    tumor_vol_data.append(tumor_count)
    
    # Determine outliers using upper and lower bounds
    lower_bound = lowerq -(1.5*iqr)
    upper_bound = upperq +(1.5*iqr)
    outliers = tumor_count.loc[tumor_count < lower_bound) | (tumor_count > upper_bound)]
    
    print(f"{drugs}'s potential outliers: {outliers}\n")
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
Capomulin_data = combine_data.loc[combine_data["Drug Regimen"] == "Capomulin",:]
Capomulin_data.head()

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
x_axis = Campomulin_data["Weight (g)"]]
avg_tumor_vol = Campomulin_data.groupby(["Mouse ID"].mean()

fig1, ax1 = plt.subplots(figsize =(15,10))
plt.title("Camopulin Regimen Tumor Volume vs. Weight", fontsize = 14)
plt.scatter(x_axis, avg_tumor_vol)
plt.xlabel("Weight (g)", fontsize = 12)
plt.ylabel ("Tumor Volume (mm3)", fontsize = 12)

plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
fig1, ax1 = plt.subplots(figsize=(15,10))