# Assignment 3 - Diabetes Sampling


### Import Packages

In [1]:
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.tools as tls
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import plot
!pip install -U kaleido
import kaleido
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


### Create Directory Structure

In [2]:
# Define Directories and Dataset
# NOTE(review): the notebook title says "Assignment 3" while this folder is named
# "Assignment_2_..."; confirm which one is intended before renaming anything.
parent_dir = 'Assignment_2_Diabetes-Sampling'
data_raw_dir = 'data_raw'
data_clean_dir = 'data_clean'
results_dir = 'results'
source_dir = 'src'
dataset = 'diabetes'

# Create Directories
# os.makedirs(..., exist_ok=True) creates the parent folder on demand and is a no-op when a
# folder already exists, so this cell can be re-run (e.g. Restart & Run All) without raising
# FileExistsError the way a bare os.mkdir would.
for sub_dir in (data_raw_dir, data_clean_dir, results_dir, source_dir):
    os.makedirs(os.path.join('.', parent_dir, sub_dir), exist_ok=True)

# Registry of figures (name -> plotly figure); filled by the plotting cells and
# written to disk in the export cell at the end of the notebook.
fig_list = {}

## Data Collection

The dataset was provided in a CSV:

diabetes.csv

At this point, upload or copy diabetes.csv into:

In [3]:
print(f'./{parent_dir}/{data_raw_dir}')

./Assignment_2_Diabetes-Sampling/data_raw


Then, create a README

In [4]:
# Create a (mostly) empty README next to the raw data; it only holds a title line for now
readme_path = f'./{parent_dir}/{data_raw_dir}/README.md'
with open(readme_path, "w") as readme:
    readme.write("Raw Data Metadata")

**The README will need to be manually updated with the appropriate field data.**

### Import the Dataset

In [5]:
# Read the raw CSV from the raw-data directory into a dataframe and preview the first 15 rows
raw_csv_path = f'./{parent_dir}/{data_raw_dir}/{dataset}.csv'
data_raw = pd.read_csv(raw_csv_path)
data_raw.head(15)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


## Data Processing/Cleaning

In [9]:
data_raw.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
# Check for NaNs
data_raw.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
# Check for class imbalance
data_raw['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

There is definitely a class imbalance (500 vs. 268). We won't balance the classes — gathering more data is not an option here, and over/undersampling would throw off the means, standard deviations, etc. — so we'll stratify the samples instead.

In [11]:
# Separate the label column (y) from the remaining feature columns (X)
y = data_raw['Outcome']
X = data_raw.drop(columns=['Outcome'])

In [12]:
X.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
5,5,116,74,0,0,25.6,0.201,30
6,3,78,50,32,88,31.0,0.248,26
7,10,115,0,0,0,35.3,0.134,29
8,2,197,70,45,543,30.5,0.158,53
9,8,125,96,0,0,0.0,0.232,54


In [13]:
y.head(10)

0    1
1    0
2    1
3    0
4    1
5    0
6    1
7    0
8    1
9    1
Name: Outcome, dtype: int64

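Please note that because no NaNs were found and the goal of this assignment is to compare sampling methods (and not to perform any modeling or other statistics), no preprocessing/cleaning/augmenting/scaling was performed, so no new "clean" dataset was created. (Caveat: several columns — e.g. Glucose, BloodPressure, and BMI — have a minimum of 0 in the `describe()` output above, which likely encodes missing measurements; these values were left as-is.)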

## Data Analysis

## 25 Observations, Glucose

First, split train and test:

In [15]:
# Draw a 25-row sample as the "train" split. Passing `stratify=y` keeps the Outcome class
# proportions of the sample in line with the full dataset, and the fixed random_state
# makes the draw reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=25,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [16]:
X_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
161,7,102,74,40,105,37.2,0.204,45
757,0,123,72,0,0,36.3,0.258,52
594,6,123,72,45,230,33.6,0.733,34
186,8,181,68,36,495,30.1,0.615,60
27,1,97,66,15,140,23.2,0.487,22
572,3,111,58,31,44,29.5,0.43,22
110,3,171,72,33,135,33.3,0.199,24
602,1,124,74,36,0,27.8,0.1,30
448,0,104,64,37,64,33.6,0.51,22
686,3,130,64,0,0,23.1,0.314,22


In [17]:
X_train.shape

(25, 8)

In [18]:
y_train

161    0
757    1
594    0
186    1
27     0
572    0
110    1
602    0
448    1
686    0
575    0
568    0
316    0
117    0
196    0
458    1
561    1
251    0
604    1
622    0
165    1
556    0
84     1
106    0
136    0
Name: Outcome, dtype: int64

In [19]:
len(y_train)

25

In [21]:
y_train.value_counts() # To show that the train values are stratified

Outcome
0    16
1     9
Name: count, dtype: int64

Now we calculate the mean and max Glucose values of the sample set and the population:

In [22]:
# Glucose summary statistics: the current `X_train` sample first, then the full dataset
mean_sample_glucose, max_sample_glucose = np.mean(X_train['Glucose']), np.max(X_train['Glucose'])
mean_pop_glucose, max_pop_glucose = np.mean(data_raw['Glucose']), np.max(data_raw['Glucose'])

# Report the sample figures, then the population figures
print(f'Mean sample glucose: {mean_sample_glucose}')
print(f'Max sample glucose: {max_sample_glucose}')
print(f'Mean population glucose: {mean_pop_glucose}')
print(f'Max population glucose: {max_pop_glucose}')

Mean sample glucose: 127.84
Max sample glucose: 198
Mean population glucose: 120.89453125
Max population glucose: 199


Or, even better, we can use a box plot:

In [81]:
# Guard against hidden notebook state: `X_train` is reassigned to a 150-row sample in a later
# cell, so this figure only shows the intended data while the 25-row split is the active one.
assert len(X_train) == 25, "Re-run the 25-observation train/test split before drawing this figure"

# Define the overall subplot figure: sample on the left, population on the right, sharing one y axis
box_fig = make_subplots(rows=1, cols=2, subplot_titles=('Sample Glucose Values','Population Glucose Values'), shared_yaxes='all')

# Build one plotly-express box plot per dataset; their traces are copied into the subplots below
box_figs = [px.box(X_train, y="Glucose"), px.box(data_raw, y="Glucose")]

# Copy every trace of the k-th express figure into column k of the subplot figure
for col, figure in enumerate(box_figs, start=1):
    for trace in figure.data:
        box_fig.add_trace(trace, row=1, col=col) # add_trace replaces the deprecated append_trace

# Register the figure so the export cell can save it
fig_list.update({'box_fig':box_fig})

# Show the figure
box_fig.show()

This shows that the sample and population distributions are relatively close in mean and max, but that the population stretches lower, with outliers.

## 25 Observations, BMI 98th Percentile

Now let's calculate the percentiles:

In [24]:
# BMI percentiles to compare; the last entry (98th) is the one this section focuses on
percentiles = [25,50,75,90,98]
bmi_sample, bmi_population = X_train['BMI'], data_raw['BMI']

sample_percentiles = np.percentile(bmi_sample, percentiles)
population_percentiles = np.percentile(bmi_population, percentiles)

# Index -1 picks the 98th percentile because it is the last requested percentile
print(f'Sample percentiles: {sample_percentiles}')
print(f'Sample 98th percentile: {sample_percentiles[-1]}')
print(f'Population percentiles: {population_percentiles}')
print(f'Population 98th percentile: {population_percentiles[-1]}')

Sample percentiles: [28.   31.3  36.3  39.72 45.2 ]
Sample 98th percentile: 45.199999999999996
Population percentiles: [27.3   32.    36.6   41.5   47.526]
Population 98th percentile: 47.52599999999996


And when displayed as a histogram/distribution:

In [82]:
# Guard against hidden notebook state: `X_train` is reassigned to a 150-row sample in a later
# cell, so this figure only shows the intended data while the 25-row split is the active one.
assert len(X_train) == 25, "Re-run the 25-observation train/test split before drawing this figure"

# Long-format frame: one row per BMI value, labelled with the set it came from
# (population rows first, then sample rows)
bmi_df = pd.concat(
    [
        pd.DataFrame({'series': 'Population', 'BMI': data_raw['BMI']}),
        pd.DataFrame({'series': 'Sample', 'BMI': X_train['BMI']}),
    ],
    ignore_index=True,
)

# Overlaid histograms (one colour per set) with a marginal box plot above them
bmi_fig = px.histogram(bmi_df, x='BMI', color="series", marginal="box", barmode="overlay")

# Mark the 98th percentiles computed in the percentile cell: red = sample, blue = population
bmi_fig.add_vline(x=sample_percentiles[-1], line_color="red", opacity=1, label=dict(text="Sample 98th Percentile",textposition="top left",font=dict(size=9)))
bmi_fig.add_vline(x=population_percentiles[-1], line_color="blue", opacity=1, label=dict(text="Population 98th Percentile",textposition="top left",font=dict(size=9)))

# Register the figure so the export cell can save it
fig_list.update({'bmi_fig':bmi_fig})

bmi_fig.show()

# Bootstrap to 500 Samples

First, let's split out our original 150 observations:

In [26]:
# Draw a 150-row stratified sample to bootstrap from.
# NOTE: this overwrites the X_train / X_test / y_train / y_test produced by the earlier
# 25-row split, so cells that expect the 25-row sample must not be re-run after this one.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=150,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [27]:
X_train['BloodPressure']

625    88
209    84
444    62
728    88
426     0
       ..
251    84
113    62
556    70
280    70
107    58
Name: BloodPressure, Length: 150, dtype: int64

Then, we'll resample:

In [44]:
# Percentiles of BloodPressure recorded for every resample (Q1, median, Q3, 98th percentile)
percentiles = [25,50,75,98]

# One list per statistic; entry k holds the value computed on the k-th bootstrap resample
sampled_means, sampled_sd = [], []
sampled_min, sampled_max = [], []
sampled_q1, sampled_q2, sampled_q3, sampled_q98 = [], [], [], []

# Seeds 1..500 give 500 resamples that differ from each other yet are individually reproducible
for seed in range(1, 501):
    # Draw 150 rows from the current train sample, with replacement
    X_train_resampled, y_train_resampled = resample(X_train, y_train, n_samples=150, random_state=seed, replace=True)
    bp_resampled = X_train_resampled['BloodPressure']

    # Unpack in the same order as `percentiles`
    q1, q2, q3, q98 = np.percentile(bp_resampled, percentiles)

    sampled_means.append(np.mean(bp_resampled))
    sampled_sd.append(np.std(bp_resampled)) # np.std is called with its default ddof=0
    sampled_min.append(np.min(bp_resampled))
    sampled_max.append(np.max(bp_resampled))
    sampled_q1.append(q1)
    sampled_q2.append(q2)
    sampled_q3.append(q3)
    sampled_q98.append(q98)

In [45]:
# Compare each statistic three ways: the original sample, the average over all bootstrap
# resamples, and the full dataset ("population")
bp_sample = X_train['BloodPressure']
bp_population = data_raw['BloodPressure']

print(f"Number of Bootstrapped Samples: {len(sampled_means)}")

# Means
print(f"Original Sample Mean: {np.mean(bp_sample)}")
print(f"Average Sample Mean: {np.mean(sampled_means)}")
print(f"Population Mean: {np.mean(bp_population)}")

# Standard deviations
print(f"Original Sample Standard Deviation: {np.std(bp_sample)}")
print(f"Average Sample Standard Deviation: {np.mean(sampled_sd)}")
print(f"Population Standard Deviation: {np.std(bp_population)}")

# 98th percentiles
print(f"Original Sample 98th Percentile: {np.percentile(bp_sample, 98)}")
print(f"Average Sample 98th Percentile: {np.mean(sampled_q98)}")
print(f"Population 98th Percentile: {np.percentile(bp_population, 98)}")

Number of Bootstrapped Samples: 500
Original Sample Mean: 67.73333333333333
Average Sample Mean: 67.79064
Population Mean: 69.10546875
Original Sample Standard Deviation: 18.447282244878846
Average Sample Standard Deviation: 18.233551939399575
Population Standard Deviation: 19.343201628981696
Original Sample 98th Percentile: 92.04000000000002
Average Sample 98th Percentile: 94.13680000000005
Population 98th Percentile: 99.31999999999994


Then, to show Mean and Standard Deviation, we can use a Bar Chart:

In [83]:
# Mean and standard deviation of BloodPressure for the three observation sets.
# The frame is built in a single constructor call so both columns are numeric from the start,
# instead of enlarging an empty frame row by row.
bp_sample = X_train['BloodPressure']
bp_population = data_raw['BloodPressure']

combined_results = pd.DataFrame(
    {
        'Mean': [np.mean(bp_sample), np.mean(sampled_means), np.mean(bp_population)],
        'Standard Deviation': [np.std(bp_sample), np.mean(sampled_sd), np.std(bp_population)],
    },
    index=['Original Sample', 'Bootstrapped Sample', 'Population'],
)

# Grouped bar chart: one group per observation set, one bar per metric
results_fig = px.bar(combined_results, x=combined_results.index, y=['Mean', 'Standard Deviation'], barmode='group')
results_fig.update_layout(title_text="<b>Bootstrapping Results</b>") # Add a title to the figure
results_fig.update_xaxes(title_text="<b>Observation Set</b>") # Add a title to the x axis
results_fig.update_yaxes(title_text="<b>Result</b>") # Add a title to the y axis
results_fig.update_legends(title_text="Metric") # Add a title to the legend

# Register the figure so the export cell can save it
fig_list.update({'results_fig':results_fig})

results_fig.show()

But since the bar chart doesn't do a fantastic job of showing off the other metrics, we can compute some additional box plots:

In [84]:
# Precomputed box statistics for BloodPressure. Every list holds one value per box, in the
# order: original sample, bootstrapped sample, population. They are supplied manually because
# the bootstrapped values only exist as per-resample statistics, not as raw observations.
bp_sample = X_train['BloodPressure']
bp_population = data_raw['BloodPressure']

quartile1 = [np.percentile(bp_sample, 25), np.mean(sampled_q1), np.percentile(bp_population, 25)] # Quartile 1
quartile2 = [np.percentile(bp_sample, 50), np.mean(sampled_q2), np.percentile(bp_population, 50)] # Quartile 2 (median)
quartile3 = [np.percentile(bp_sample, 75), np.mean(sampled_q3), np.percentile(bp_population, 75)] # Quartile 3
means = [np.mean(bp_sample), np.mean(sampled_means), np.mean(bp_population)] # Means
sds = [np.std(bp_sample), np.mean(sampled_sd), np.std(bp_population)] # Standard deviations
# Unlike the averaged statistics above, the bootstrapped fences use the most extreme value
# seen across all resamples (min of mins / max of maxes).
mins = [np.min(bp_sample), np.min(sampled_min), np.min(bp_population)]
maxs = [np.max(bp_sample), np.max(sampled_max), np.max(bp_population)]

resampled_fig = go.Figure() # Define a blank figure

# `y` carries only the three box labels here; the statistics are attached below.
# NOTE(review): with only 1-D `y` positions plus precomputed quartiles, plotly appears to draw
# the boxes horizontally, which is why the percentile markers below are vertical lines - confirm.
resampled_fig.add_trace(go.Box(y=('Original Sample','Bootstrapped Sample','Population')))

# NOTE(review): the notchspan values look like placeholders rather than computed spans - confirm.
resampled_fig.update_traces(q1=quartile1, median=quartile2,
                  q3=quartile3, lowerfence=mins,
                  upperfence=maxs, mean=means,
                  sd=sds, notchspan=[ 0.2, 0.4, 0.6 ] ) # Update that trace with all the precomputed values

# One vertical marker per observation set at its 98th percentile: (x position, colour, label)
percentile_markers = [
    (np.percentile(bp_sample, 98), "red", "Original Sample 98th Percentile"),
    (np.mean(sampled_q98), "pink", "Bootstrapped Sample 98th Percentile"),
    (np.percentile(bp_population, 98), "blue", "Population 98th Percentile"),
]
for marker_x, marker_color, marker_text in percentile_markers:
    resampled_fig.add_vline(x=marker_x, line_color=marker_color, opacity=1, label=dict(text=marker_text,textposition="top left",font=dict(size=9)))

# Register the figure so the export cell can save it
fig_list.update({'resampled_fig':resampled_fig})

resampled_fig.show()

Finally, let's save all of these.

In [88]:
# Write every registered figure to the results directory as a PNG named after its registry key
for fig_name, fig in fig_list.items():
    fig.write_image(f'./{parent_dir}/{results_dir}/{fig_name}.png', width=1400, height=600)

In [89]:
!zip -r './{parent_dir}.zip' './{parent_dir}' # Zip up the directory to be downloaded

  adding: Assignment_2_Diabetes-Sampling/ (stored 0%)
  adding: Assignment_2_Diabetes-Sampling/results/ (stored 0%)
  adding: Assignment_2_Diabetes-Sampling/results/.ipynb_checkpoints/ (stored 0%)
  adding: Assignment_2_Diabetes-Sampling/results/box_fig.png (deflated 56%)
  adding: Assignment_2_Diabetes-Sampling/results/resampled_fig.png (deflated 28%)
  adding: Assignment_2_Diabetes-Sampling/results/bmi_fig.png (deflated 28%)
  adding: Assignment_2_Diabetes-Sampling/results/results_fig.png (deflated 47%)
  adding: Assignment_2_Diabetes-Sampling/data_clean/ (stored 0%)
  adding: Assignment_2_Diabetes-Sampling/src/ (stored 0%)
  adding: Assignment_2_Diabetes-Sampling/data_raw/ (stored 0%)
  adding: Assignment_2_Diabetes-Sampling/data_raw/diabetes.csv (deflated 63%)
  adding: Assignment_2_Diabetes-Sampling/data_raw/README.md (stored 0%)


Finally, don't forget to save this notebook in:

In [87]:
print(f'./{parent_dir}/{source_dir}')

./Assignment_2_Diabetes-Sampling/src
