## Observations and Insights 

In [None]:
%matplotlib notebook

In [3]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)
#mouse_metadata.head()
# mouse_metadata has Mouse ID, Drug Regimen, Sex, Age_months, Weight (g)
#study_results.head()
# study_results has Mouse ID, Timepoint, Tumor Volume (mm3) Metastatic Sites
# Combine the data into a single dataset
study_data_complete = pd.merge(mouse_metadata, study_results, how="left", on=["Mouse ID", "Mouse ID"])

# Display the data table for preview
study_data_complete.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [8]:
# Checking the number of mice.
#study_data_complete.count()
mice = study_data_complete['Mouse ID'].unique()
len(mice)

249

In [24]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
#array([ 0,  5, 10, 15, 20, 25, 30, 35, 40, 45], dtype=int64)
timept = study_data_complete['Timepoint'].unique() 
#create a column that has a merged value of Mouse Id and Timepoint
study_data_complete['Merged'] = study_data_complete['Mouse ID'] + study_data_complete['Timepoint'].astype(str)
#study_data_complete['Merged'].value_counts()
#From <https://stackoverflow.com/questions/50242968/check-for-duplicate-values-in-pandas-dataframe-column> 
dup_values = study_data_complete['Merged'].duplicated()
for i in range(study_data_complete['Merged'].count()):
    if (dup_values[i] == True):
        dup_index.append(i)
#any(study_data_complete['Merged'].duplicated())
dup_index

# Drop all rows with missing information 
#cleaned_df = study_data_complete.dropna(how='any')




[909, 911, 913, 915, 917]

In [30]:
# Optional: Get all the data for the duplicate mouse ID. 
#print out all the data in study_data_complete for the indexes found in dup_index
for x in range(len(dup_index)):
    study_data_complete.loc[dup_index[x]]


In [27]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Drop all rows with missing information 
cleaned_df = study_data_complete.drop_duplicates(subset=['Merged'])
cleaned_df.count()


Mouse ID              1888
Drug Regimen          1888
Sex                   1888
Age_months            1888
Weight (g)            1888
Timepoint             1888
Tumor Volume (mm3)    1888
Metastatic Sites      1888
Merged                1888
dtype: int64

In [28]:
# Checking the number of mice in the clean DataFrame.
mice = study_data_complete['Mouse ID'].unique()
len(mice)

249

## Summary Statistics

In [37]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.
# Display a statistical overview
# We can infer the maximum allowable individual contribution from 'max'
regimens = cleaned_df['Drug Regimen'].unique()
regimens
#cleaned_df.describe()
#cleaned_df
# Using GroupBy in order to separate the data into fields according to "state" values
grouped_tumor_df = cleaned_df.groupby(['Drug Regimen'])
tumor_mean = grouped_tumor_df['Tumor Volume (mm3)'].mean()
tumor_median = grouped_tumor_df['Tumor Volume (mm3)'].median()
tumor_var = grouped_tumor_df['Tumor Volume (mm3)'].var()
tumor_sd = grouped_tumor_df['Tumor Volume (mm3)'].std()
tumor_sem = grouped_tumor_df['Tumor Volume (mm3)'].sem()
# Creating a new DataFrame using both duration and count
tumor_summary_df = pd.DataFrame({"Mean": tumor_mean, "Median": tumor_median, "Variance": tumor_var,
                                "Standard Deviation": tumor_sd, "SEM": tumor_sem})
tumor_summary_df.head()


Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466


In [40]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# Using the aggregation method, produce the same summary statistics in a single line
cleaned_df['Drug Regimen'].agg({"Tumor Volume (mm3)": ["min", "max", "median", "skew"]})
# Set x axis to numerical value for month
#x_axis = np.arange(1,13,1)
# Create a handle for each plot
#fahrenheit, = plt.plot(x_axis, points_F, marker="+",color="blue", linewidth=1, label="Fahreneit")
#celsius, = plt.plot(x_axis, points_C, marker="s", color="Red", linewidth=1, label="Celsius")
# Set our legend to where the chart thinks is best
#plt.legend(handles=[fahrenheit, celsius], loc="best")
# Create labels for the X and Y axis
#plt.xlabel("Months")
#plt.ylabel("Degrees")

SpecificationError: nested renamer is not supported

## Bar and Pie Charts

In [48]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.
ax = cleaned_df.plot.bar(x='Drug Regimen', y=measure, rot=90)
# Create an array that contains the number of users each language has
#measure = []
#for y in range(len(regimens)):
#    measure.append(cleaned_df.loc[cleaned_df['Drug Regimen'] == regimens(y)].count())
measure = cleaned_df['Drug Regimen'].value_counts()
ax = cleaned_df.plot.bar(x='Drug Regimen', y=measure, rot=90)

IndexError: index 230 is out of bounds for axis 0 with size 9

In [10]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.
x_axis = np.arange(len(measure))
# Tell matplotlib that we will be making a bar chart
# Users is our y axis and x_axis is, of course, our x axis
# We apply align="edge" to ensure our bars line up with our tick marks
plt.bar(x_axis, measure, color='r', alpha=0.5, align="center")
# Tell matplotlib where we would like to place each of our x axis headers
# Tell matplotlib where we would like to place each of our x axis headers
tick_locations = [value for value in x_axis]
#plt.xticks(tick_locations, ["Java", "C++", "Python", "Ruby", "Clojure"])
plt.xticks(tick_locations, regimens)
# Sets the x limits of the current chart
plt.xlim(-0.75, len(x_axis)-0.25)
# Sets the y limits of the current chart
plt.ylim(0, max(measure)+25


In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
 titanic[["Sex", "Age"]].groupby("Sex").mean()
Out[8]: 
              Age
Sex              
female  27.915709
male    30.726645


In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
