# Pymaceuticals Inc.
---

### Analysis

- In this analysis we combine two CSV files to study how several drug regimens affect the tumor volume of the mice under treatment. We found that Capomulin and Ramicane were the most effective treatments, while Ketapril and Naftisol were the least effective. Looking at all of the data over the 40 days we analyzed, Capomulin shows good effectiveness; when we analyze Capomulin on its own, we see a significant decrease in tumor volume after 20 days. We also found that the heavier the mouse, the larger its tumor volume tends to be.
 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the mouse metadata and the study results into DataFrames
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Left-join the metadata onto every study-result row, matching on "Mouse ID",
# so each study row carries the columns of its mouse
combined_mouse_study = study_results.merge(
    mouse_metadata,
    how="left",
    on="Mouse ID",
)

# Preview the first rows of the combined table
combined_mouse_study.head()


In [None]:
# Count the distinct Mouse ID values in the combined data.
# dropna=False counts a missing ID (if any) as one value, as len(unique()) would.
number_mice = combined_mouse_study["Mouse ID"].nunique(dropna=False)

# Show the count
number_mice


In [None]:
# Each (Mouse ID, Timepoint) pair is expected to appear only once.
# Flag every row whose pair is repeated (keep=False marks all copies, not
# just the later ones), then pull those rows out.
repeated_pair_mask = combined_mouse_study.duplicated(
    subset=["Mouse ID", "Timepoint"],
    keep=False,
)
mouse_id_timepoint = combined_mouse_study.loc[repeated_pair_mask]


In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# First collect the IDs that own a repeated (Mouse ID, Timepoint) pair, then
# select every row of the combined data belonging to those IDs.
mouse_id_duplicate = mouse_id_timepoint["Mouse ID"].unique()
mouse_duplicate = combined_mouse_study[combined_mouse_study["Mouse ID"].isin(mouse_id_duplicate)]

# Display the full table; head() would stop after five rows and hide the
# rest of the duplicate mouse's records
mouse_duplicate


In [None]:
# Build the clean DataFrame: drop every row that belongs to a Mouse ID
# listed in mouse_id_duplicate (all of that mouse's rows, not only the repeats).
belongs_to_duplicate = combined_mouse_study["Mouse ID"].isin(mouse_id_duplicate)
mouse_no_duplicate = combined_mouse_study.loc[~belongs_to_duplicate]

# Preview the first rows of the clean table
mouse_no_duplicate.head()


In [None]:
# Count the distinct Mouse ID values left in the clean DataFrame.
# dropna=False counts a missing ID (if any) as one value, as len(unique()) would.
number_mice_no_duplicate = mouse_no_duplicate["Mouse ID"].nunique(dropna=False)

# Show the count
number_mice_no_duplicate


## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Group the CLEAN data (duplicate mouse removed) by drug regimen and compute
# all five statistics of the tumor volume in one aggregation, then give the
# columns descriptive names.
summary_statistics = (
    mouse_no_duplicate.groupby("Drug Regimen")["Tumor Volume (mm3)"]
    .agg(['mean', 'median', 'var', 'std', 'sem'])
    .rename(columns={
        'mean': 'Mean Tumor Volume',
        'median': 'Median Tumor Volume',
        'var': 'Tumor Volume Variance',
        'std': 'Tumor Volume Std. Dev',
        'sem': 'Tumor Volume Std. Err.',
    })
)

# Display the whole table: it has one row per regimen, and head() would
# show only the first five
summary_statistics


In [None]:
# A more advanced method to generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen (only one method is required in the solution)

# Using the aggregation method with named outputs, produce the same summary
# statistics in a single statement on the clean data
summary_statistics_advanced = mouse_no_duplicate.groupby("Drug Regimen")["Tumor Volume (mm3)"].agg(
    mean='mean',
    median='median',
    var='var',
    std='std',
    sem='sem',
)

# Display the whole table: it has one row per regimen, and head() would
# show only the first five
summary_statistics_advanced


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using Pandas.
# Count rows per regimen on the clean data so the removed duplicate mouse
# does not inflate its regimen's bar. value_counts() sorts the regimens from
# most to fewest rows.
number_drug = mouse_no_duplicate['Drug Regimen'].value_counts()
number_drug.plot(kind='bar', figsize=(7,4), rot=45, color='#31668a', alpha=1.00)

# Adding axis labels
plt.xlabel('Drug Regimen')
plt.ylabel('# of Observed Mouse Timepoints')

# Display the plot
plt.show()


In [None]:
# Same bar chart of rows (Mouse ID/Timepoints) per drug regimen, this time drawn with pyplot.
# Regimen names come from the index of number_drug, bar heights from its values.
drug_regimens = number_drug.index
counts = number_drug.values

# Draw the bars on a new 7x4 figure
plt.figure(figsize=(7, 4))
plt.bar(x=drug_regimens, height=counts, color='#31668a', alpha=1.00)

# Tilt the regimen names so they do not overlap
plt.xticks(rotation=45)

# Axis labels
plt.xlabel('Drug Regimen')
plt.ylabel('# of Observed Mouse Timepoints')

# Render the figure
plt.show()


In [None]:
# Generate a pie chart, using Pandas, showing the distribution of unique female versus male mice used in the study
# Keep one row per mouse, then count the mice of each sex
mice_gender = mouse_no_duplicate.drop_duplicates(subset="Mouse ID")["Sex"].value_counts()

# Make the pie chart.
# value_counts() orders the sexes by frequency, so take the wedge labels from
# the index instead of hard-coding an order that may not match the counts.
mice_gender.plot(
    kind='pie',
    autopct='%1.1f%%',
    figsize=(5,5),
    startangle=0,
    labels=mice_gender.index.tolist(),
    colors=['#31668a', 'darkorange'],
)

# Display the pie chart
plt.show()


In [None]:
# Generate a pie chart, using pyplot, showing the distribution of unique female versus male mice used in the study
# Reuses mice_gender (count of unique mice per sex).
plt.figure(figsize=(5,5))

# Label each wedge from the index of mice_gender so the label always matches
# its count, whichever sex happens to be more frequent.
plt.pie(
    mice_gender,
    labels=mice_gender.index.tolist(),
    autopct='%1.1f%%',
    startangle=0,
    colors=['#31668a', 'darkorange'],
)

# Display the pie chart
plt.show()


## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
selected_regimens = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
filtered_df = mouse_no_duplicate[mouse_no_duplicate["Drug Regimen"].isin(selected_regimens)]

# Start by getting the last (greatest) timepoint for each mouse
# (computed once; the result is a two-column frame: Mouse ID, Timepoint)
last_timepoints = filtered_df.groupby("Mouse ID")["Timepoint"].max().reset_index()

# Merge this group df with the filtered DataFrame on (Mouse ID, Timepoint)
# to get the tumor volume recorded at each mouse's last timepoint
final_tumor_volume_df = pd.merge(last_timepoints, filtered_df, on=["Mouse ID", "Timepoint"], how="left")

# Display the data table
final_tumor_volume_df.head()


In [None]:
# Regimens to examine (also reused later as plot labels)
treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# One Series of final tumor volumes per regimen is collected here (for plotting)
tumor_vol_data = []

# For each regimen, compute the IQR fences and report any values outside them.
for treatment in treatments:

    # Final tumor volumes of the mice on this regimen
    on_treatment = final_tumor_volume_df["Drug Regimen"] == treatment
    treatment_data = final_tumor_volume_df.loc[on_treatment, "Tumor Volume (mm3)"]

    # Keep the subset for later plotting
    tumor_vol_data.append(treatment_data)

    # First and third quartiles, and the interquartile range between them
    lowerq, upperq = treatment_data.quantile([0.25, 0.75])
    iqr = upperq - lowerq

    # A value is a potential outlier when it lies more than 1.5 * IQR
    # below the first quartile or above the third quartile
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)
    too_low = treatment_data < lower_bound
    too_high = treatment_data > upper_bound
    outliers = treatment_data[too_low | too_high]

    print(f"{treatment}'s potential outliers: {outliers}")


In [None]:
# Box plot of the final tumor volume distribution, one box per treatment group.
selected_regimens = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# One Series of final tumor volumes per regimen, in the same order as the labels
data_plot = [
    final_tumor_volume_df.loc[
        final_tumor_volume_df["Drug Regimen"] == regimen,
        "Tumor Volume (mm3)",
    ]
    for regimen in selected_regimens
]

# Outlier points are drawn as large red circles
flierprops = {'marker': 'o', 'markerfacecolor': 'red', 'markersize': 8}

fig1, ax1 = plt.subplots(figsize=(6, 4))
ax1.boxplot(data_plot, labels=selected_regimens, flierprops=flierprops)
ax1.set_ylabel('Final Tumor Volume (mm3)')

# Render the figure
plt.show()


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin
# Work from the clean data so this section agrees with the rest of the analysis
capomulin_data = mouse_no_duplicate[mouse_no_duplicate["Drug Regimen"] == "Capomulin"]

# Mouse to plot; change this ID to inspect a different Capomulin mouse
mouse_id = "l509"
single_mouse_data = capomulin_data[capomulin_data["Mouse ID"] == mouse_id]

if not single_mouse_data.empty:
    plt.figure(figsize=(6, 5))
    plt.plot(single_mouse_data["Timepoint"], single_mouse_data["Tumor Volume (mm3)"], marker='o', linestyle='-', color='#31668a')

    # Build the title from mouse_id so it always names the mouse actually plotted
    plt.title(f'Capomulin treatment of mouse {mouse_id}')
    plt.xlabel('Time Point (days)')
    plt.ylabel('Tumor Volume (mm3)')

    # Display the plot
    plt.show()

else:
    # Nothing to draw: the chosen ID has no Capomulin rows
    print(f"No data found for Mouse ID {mouse_id} treated with Capomulin.")


In [None]:
# Scatter plot: mouse weight vs. average observed tumor volume across the Capomulin regimen.
# Average both columns per mouse; "Mouse ID" is turned back into a regular column.
per_mouse_columns = ["Tumor Volume (mm3)", "Weight (g)"]
average_tumor_volume = (
    capomulin_data.groupby("Mouse ID")[per_mouse_columns]
    .mean()
    .reset_index()
)

# One point per mouse: weight on x, mean tumor volume on y
plt.figure(figsize=(6, 5))
plt.scatter(
    x=average_tumor_volume["Weight (g)"],
    y=average_tumor_volume["Tumor Volume (mm3)"],
    color='#31668a',
    alpha=1.00,
)
plt.ylabel('Average Tumor Volume (mm3)')
plt.xlabel('Weight (g)')

# Render the figure
plt.show()


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and a linear regression model
# for mouse weight and average observed tumor volume for the entire Capomulin regimen

# Pearson correlation: element 0 of the result is the coefficient
correlation = st.pearsonr(average_tumor_volume["Weight (g)"], average_tumor_volume["Tumor Volume (mm3)"])
correlation_coefficient = correlation[0]

# Report the coefficient; it was previously computed but never shown
print(f"The correlation between mouse weight and the average tumor volume is {correlation_coefficient:.2f}")

# Least-squares line: average tumor volume as a function of weight
slope, intercept, r_value, p_value_regression, std_err = st.linregress(average_tumor_volume["Weight (g)"], average_tumor_volume["Tumor Volume (mm3)"])

# Fitted volume at each observed weight
x_values = average_tumor_volume["Weight (g)"]
y_values = slope * x_values + intercept

# Observed points with the fitted line drawn on top
plt.figure(figsize=(6, 5))
plt.scatter(average_tumor_volume["Weight (g)"], average_tumor_volume["Tumor Volume (mm3)"], color='#31668a', alpha=1.00, label='Observed Data')
plt.plot(x_values, y_values, color='red', linewidth=2, label='Linear Regression Line')
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')

# The label= values above only appear once a legend is drawn
plt.legend()

# Display the plot
plt.show()
