## Observations and Insights 

In [30]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# check each data set and group to look for duplicates of Mouse ID
mouse_check_df = mouse_metadata.groupby(['Mouse ID'])['Mouse ID'].sum()
#print(mouse_check_df)
mouse_count=len(mouse_check_df)

print(mouse_count)
#no duplicate mouse IDs found in mouse_metadata.  249 unique mouse data entries  

#Total # of study results entries
study_len=len(study_results)

# Combine the data into a single dataset
combined_df=pd.merge(left=mouse_metadata, right=study_results, on="Mouse ID", how="outer")
combined_df


249


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [25]:
# Checking the number of mice in the DataFrame.
mouse_count=combined_df.groupby(['Mouse ID'])
#print(mouse_count)

len(mouse_count.count())
#249 mouse entries(rows) same as mouse_metadata.  We have all mice accounted for and all studies in the same table.


249

In [42]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.  Counting for each timepoint to reveal timepoint duplicates. 
mouse_id_tp=pd.DataFrame(combined_df.groupby(['Mouse ID','Timepoint'])['Timepoint'].count())
mouse_id_tp.head()
#checking for duplicate "timepoint and mouse id entries with for-loop"
for x in mouse_id_tp['Timepoint']:
    if x > 1:
        print(x)
           
# Returns 5 values above 1 so we havea total of 5 duplicate Mouse ID and Timepoints.  Need to figure out which ones

2
2
2
2
2


In [44]:
# Optional: Get all the data for the duplicate mouse ID. 
tp_dup_df=mouse_id_tp
tp_dup_df['TP Counts'] = combined_df.groupby(['Mouse ID','Timepoint'])['Timepoint'].count()
tp_dup_df=tp_dup_df.loc[tp_dup_df['TP Counts'] > 1]

#Will use to double check that all duplicate Mouse ID and Timepoint entries are removed from orginal entry list by count.  
tot_dup_count=len(tp_dup_df)

#Merge back with combined list
dup_combined_df=pd.merge(left=combined_df, right=tp_dup_df, on="Mouse ID", how="inner")
#len(dup_combined_df)
#Resulted with duplicates based on merge above, will remove duplicates to create data frame for duplicate mouse ID
dup_info_df=dup_combined_df.drop_duplicates(subset=['Mouse ID', 'Timepoint_x'], keep='first')


dup_info_df


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint_x,Tumor Volume (mm3),Metastatic Sites,Timepoint_y,TP Counts
0,g989,Propriva,Female,21,26,0,45.0,0,2,2
10,g989,Propriva,Female,21,26,5,48.786801,0,2,2
20,g989,Propriva,Female,21,26,10,51.745156,0,2,2
30,g989,Propriva,Female,21,26,15,51.325852,1,2,2
40,g989,Propriva,Female,21,26,20,55.326122,1,2,2
50,g989,Propriva,Female,21,26,25,56.045564,1,2,2
55,g989,Propriva,Female,21,26,30,59.082294,1,2,2
60,g989,Propriva,Female,21,26,35,62.57088,2,2,2


In [45]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
#Using drop duplicates to drop duplicated Mouse ID and Timepoint_x
cleaned_combined_df=combined_df.drop_duplicates(subset=['Mouse ID', 'Timepoint'], keep='first')
cleaned_combined_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [66]:
# Checking the number of mice in the clean DataFrame.
cleaned_len=len(cleaned_combined_df['Mouse ID'])

#lenght we started from original merge of the lists, with duplicates subtracted
non_dup_len= (study_len) - (tot_dup_count)
assert True cleaned_len = non_dup_len, "Calculation correct!"


SyntaxError: invalid syntax (<ipython-input-66-81866bec2010>, line 6)

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straightforward, creating multiple series and putting them all together at the end.

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function.

## Bar Plots

In [None]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas.

In [None]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pyplot.

## Pie Plots

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen



## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
