## Observations and Insights 

In [97]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)


In [98]:
study_results.dtypes

Mouse ID               object
Timepoint               int64
Tumor Volume (mm3)    float64
Metastatic Sites        int64
dtype: object

In [99]:
mouse_metadata.dtypes

Mouse ID        object
Drug Regimen    object
Sex             object
Age_months       int64
Weight (g)       int64
dtype: object

In [100]:
# Checking the number of mice.
number_of_mice = len(mouse_metadata['Mouse ID'].unique())
number_of_mice

249

In [101]:
# Check the data for any mouse ID with duplicate time points and remove any data associated with that mouse ID.

sorted_study_results_df = study_results.sort_values(['Mouse ID'], ascending=True)
clean_study_results_df = sorted_study_results_df.reset_index(drop=True)

# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
largo =len(clean_study_results_df)
for i in range (0,largo-1): 
    if clean_study_results_df.loc[i+1][1] == clean_study_results_df.loc[i][1]:
        if (clean_study_results_df.loc[i+1][0] == clean_study_results_df.loc[i][0]):
            NOMBRE =clean_study_results_df.loc[i+1][0]

In [102]:
# Optional: Get all the data for the duplicate mouse ID. 
index_names = clean_study_results_df[clean_study_results_df['Mouse ID'] == NOMBRE].index 
duplicate_mouse_ID_frame = clean_study_results_df.loc[index_names, :]
dup_mouse_ID = duplicate_mouse_ID_frame.sort_values(['Timepoint'], ascending=True)
duplicate_mouse_ID = dup_mouse_ID.reset_index(drop=True)
duplicate_mouse_ID


#falta agregar la info del mouse data


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,g989,0,45.0,0
1,g989,0,45.0,0
2,g989,5,48.786801,0
3,g989,5,47.570392,0
4,g989,10,51.745156,0
5,g989,10,49.880528,0
6,g989,15,51.325852,1
7,g989,15,53.44202,0
8,g989,20,55.326122,1
9,g989,20,54.65765,1


In [103]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
index_names = clean_study_results_df[clean_study_results_df['Mouse ID'] == NOMBRE].index 
index2_names = mouse_metadata[mouse_metadata['Mouse ID'] == NOMBRE].index
clean_study_results_df.drop(index_names, inplace = True) 
mouse_metadata.drop(index2_names, inplace = True) 


In [104]:
# Combine the data into a single dataset
combined_df=pd.merge(mouse_metadata,clean_study_results_df,on='Mouse ID',how ='outer')
combined_df

# Display the data table for preview

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,5,38.825898,0
1,k403,Ramicane,Male,21,16,15,34.223992,1
2,k403,Ramicane,Male,21,16,45,22.050126,1
3,k403,Ramicane,Male,21,16,25,33.464577,1
4,k403,Ramicane,Male,21,16,30,31.099498,1
...,...,...,...,...,...,...,...,...
1875,z969,Naftisol,Male,9,30,25,63.145652,2
1876,z969,Naftisol,Male,9,30,20,57.898778,2
1877,z969,Naftisol,Male,9,30,5,49.332999,0
1878,z969,Naftisol,Male,9,30,35,69.176246,4


In [105]:
# Checking the number of mice in the clean DataFrame.
new_number_of_mice = len(combined_df['Mouse ID'].unique())
new_number_of_mice

248

## Summary Statistics

In [106]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
summary_statistics=combined_df.groupby(['Drug Regimen'])
mean =summary_statistics['Tumor Volume (mm3)'].mean()
median =summary_statistics['Tumor Volume (mm3)'].median()
std =summary_statistics['Tumor Volume (mm3)'].std()
sem = summary_statistics['Tumor Volume (mm3)'].sem()

# This method is the most straighforward, creating multiple series and putting them all together at the end.

tabla = pd.merge(mean,median, on='Drug Regimen')
tabla = tabla.rename(columns={"Tumor Volume (mm3)_x":"Mean", "Tumor Volume (mm3)_y":"Median"})
tabla2 = pd.merge(std,sem, on='Drug Regimen')
tabla2 = tabla2.rename(columns={"Tumor Volume (mm3)_x":"Std dev", "Tumor Volume (mm3)_y":"SEM"})
#tabla3=pd.merge(tabla2,nueva4, on='Drug Regimen')
Summ_statistics=pd.merge(tabla,tabla2,on='Drug Regimen')
Summ_statistics


Unnamed: 0_level_0,Mean,Median,Std dev,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Capomulin,40.675741,41.557809,4.994774,0.329346
Ceftamin,52.591172,51.776157,6.268188,0.469821
Infubinol,52.884795,51.820584,6.567243,0.492236
Ketapril,55.235638,53.698743,8.279709,0.60386
Naftisol,54.331565,52.509285,8.134708,0.596466
Placebo,54.033581,52.288934,7.821003,0.581331
Propriva,52.32093,50.446266,6.622085,0.544332
Ramicane,40.216745,40.673236,4.846308,0.320955
Stelasyn,54.233149,52.431737,7.710419,0.573111
Zoniferol,53.236507,51.818479,6.966589,0.516398


## Bar and Pie Charts

In [161]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 

summary_statistics=combined_df.groupby(['Drug Regimen'])
mouse_count=summary_statistics['Mouse ID'].value_counts() 
columna1=summary_statistics['Drug Regimen'].unique()
num_ratones = len(summary_statistics['Mouse ID'])

tablita = pd.merge(columna1,mouse_count, on='Drug Regimen')
tablita
#unique_players_df =summary_statistics.drop_duplicates(subset=['Member ID'])
#unique_players_df








#x_axis = np.arange(len(Summ_statistics))
#tick_locations = [value+0.4 for value in x_axis]
#plt.figure(figsize=(20,3))
#plt.bar(x_axis, combined_df['Drug Regimen], color='r', alpha=0.5, align="edge")
#plt.xticks(tick_locations, combined_df['Mouse ID'], rotation="vertical")


# Filter the DataFrame down only to those columns to chart
#state_and_inches = rain_df[["State","Inches"]]

# Set the index to be "State" so they will be used as labels
#state_and_inches = state_and_inches.set_index("State")

# Use DataFrame.plot() in order to create a bar chart of the data
#state_and_inches.plot(kind="bar", figsize=(20,3))
#plt.show()
#plt.tight_layout()



ValueError: 'Drug Regimen' is both an index level and a column label, which is ambiguous.

In [10]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
