<a href="https://colab.research.google.com/github/milicak/Data_Analysis_Course/blob/main/Confidence_Interval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%matplotlib inline
from ipywidgets import interactive                        # widgets and interactivity
from ipywidgets import widgets
from ipywidgets import Layout
from ipywidgets import Label
from ipywidgets import VBox, HBox
import matplotlib.pyplot as plt                           # plotting
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator) # control of axes ticks
plt.rc('axes', axisbelow=True)                            # set axes and grids in the background for all plots
import numpy as np                                        # working with arrays
import pandas as pd                                       # working with DataFrames
import seaborn as sns                                     # for matrix scatter plots
from scipy.stats import triang                            # parametric distributions
from scipy.stats import binom
from scipy.stats import norm
from scipy.stats import uniform
from scipy.stats import triang
from scipy.stats import t
from scipy import stats                                   # statistical calculations
import random                                             # random drawing / bootstrap realizations of the data
from matplotlib.gridspec import GridSpec                  # nonstandard subplots
import math

In [2]:
def add_grid():
    """Add major/minor grid lines and minor tick marks to the current axes."""
    axes = plt.gca()

    # heavier lines on the major grid, faint lines on the minor grid
    for level, width in (('major', 1.0), ('minor', 0.2)):
        axes.grid(True, which=level, linewidth=width)

    # longer tick marks for major ticks than for minor ticks
    for level, tick_length in (('major', 7), ('minor', 4)):
        axes.tick_params(which=level, length=tick_length)

    # turn on automatically placed minor ticks on both axes
    axes.xaxis.set_minor_locator(AutoMinorLocator())
    axes.yaxis.set_minor_locator(AutoMinorLocator())

In [3]:
# bin edges kept from the original notebook; f_make / make_data below do not read them
bins = np.linspace(0,1000,1000)

# widgets for the simple bootstrap demonstration: a title bar plus three sliders
# (number of red balls, number of green balls, number of bootstrap realizations L)
l = widgets.Text(value='                                Simple Bootstrap Demonstration',layout=Layout(width='950px', height='30px'))

a = widgets.IntSlider(min=0, max = 100, value = 2, step = 1, description = '$n_{red}$',orientation='horizontal',layout=Layout(width='400px', height='20px'),continuous_update=False)
a.style.handle_color = 'red'

b = widgets.IntSlider(min=0, max = 100, value = 3, step = 1, description = '$n_{green}$',orientation='horizontal',layout=Layout(width='400px', height='20px'),continuous_update=False)
b.style.handle_color = 'green'

c = widgets.IntSlider(min=1, max = 16, value = 3, step = 1, description = '$L$',orientation='horizontal',layout=Layout(width='400px', height='20px'),continuous_update=False)
c.style.handle_color = 'gray'

ui = widgets.HBox([a,b,c],)                               # sliders side by side
ui2 = widgets.VBox([l,ui],)                               # title stacked above the slider row

def f_make(a, b, c):                                      # function to take parameters, make sample and plot
    """Draw ``c`` bootstrap realizations of the ball counts and plot each one as a bar chart.

    Parameters
    ----------
    a : int
        Number of red balls in the original sample.
    b : int
        Number of green balls in the original sample.
    c : int
        Number of bootstrap realizations; each gets its own subplot.

    Nothing is returned; the figure is shown with ``plt.show()``. When there are
    no balls at all (``a + b == 0``) a short message is printed instead of a plot,
    because the red/green proportions are undefined.
    """
    total = a + b
    if total == 0:                                        # proportions a/(a+b), b/(a+b) are undefined
        print('Set at least one red or green ball to draw bootstrap realizations.')
        return

    red_freq = make_data(a, b, c)

    labels = ['Red', 'Green']
    # near-square panel grid with enough cells to hold all c realizations
    grid_cols = max(1, int(np.ceil(np.sqrt(c))))
    grid_rows = max(1, int(np.ceil(c / grid_cols)))
    plt.clf()

    for i in range(0, c):
        plt.subplot(grid_rows, grid_cols, i + 1)          # pyplot.subplot takes (nrows, ncols, index)
        draw = [red_freq[i], total - red_freq[i]]         # red count and the complementary green count
        plt.grid(zorder=0, color='black', axis = 'y', alpha = 0.2); plt.ylim(0,total)
        plt.ylabel('Frequency'); plt.xlabel('Balls Drawn')
        plt.yticks(np.arange(0,total + 1,max(1,round(total/10))))   # about ten ticks, never a zero step
        # colors are passed to bar() directly: Patch.set_color would also recolor the black edge
        plt.bar(labels,draw,color = ['r','g'],edgecolor = "black",linewidth = 1,alpha = 0.8)
        plt.title('Realization #' + str(i+1),zorder = 1)

    # stretch the figure so every panel is one unit wide and 1.2 units tall
    plt.subplots_adjust(left=0.0, bottom=0.0, right=grid_cols, top=1.2 * grid_rows, wspace=0.2, hspace=0.2)
    plt.show()

def make_data(a, b, c):                                   # function to check parameters and make sample
    """Bootstrap the number of red balls in ``c`` resamples of an ``a`` red / ``b`` green sample.

    Each realization draws ``a + b`` balls with replacement, with probability
    ``a / (a + b)`` of red on every draw.

    Parameters
    ----------
    a : int
        Number of red balls in the original sample.
    b : int
        Number of green balls in the original sample.
    c : int
        Number of bootstrap realizations.

    Returns
    -------
    numpy.ndarray
        Float array of length ``c`` holding the red-ball count of each realization.
        All zeros when ``a + b == 0`` (nothing to draw from).
    """
    total = a + b
    if total == 0:                                        # avoid 0/0 in the proportions below
        return np.zeros(c)
    # one call draws all c realizations; column 0 holds the red counts
    draws = np.random.multinomial(total, [a / total, b / total], size = c)
    return draws[:, 0].astype(float)

# bind the three sliders to f_make's arguments so the figure is regenerated from the slider values
interactive_plot = widgets.interactive_output(f_make, dict(a=a, b=b, c=c))
# wait=True keeps the old figure until the new one is ready, which reduces flicker
interactive_plot.clear_output(wait=True)

In [4]:
# render the title/slider box (ui2) followed by the output area that f_make draws into
display(ui2, interactive_plot)                            # display the interactive plot

VBox(children=(Text(value='                                Simple Boostrap Demonstration', layout=Layout(heigh…

Output()

In [5]:
# bin edges (s_make builds its own local bins and does not read this array)
bins = np.linspace(0,1000,1000)

# title bar for the confidence interval demonstration
l2 = widgets.Text(
    value='     Confidence Interval for Proportions, Analytical and Bootstrap Demonstration, Michael Pyrcz, Associate Professor, The University of Texas at Austin',
    layout=Layout(width='950px', height='30px'),
)

# number of red balls in the sample
a2 = widgets.IntSlider(
    min=0, max=100, value=20, step=1,
    description='$n_{red}$',
    orientation='horizontal',
    layout=Layout(width='400px', height='20px'),
    continuous_update=False,
)
a2.style.handle_color = 'red'

# number of green balls in the sample
b2 = widgets.IntSlider(
    min=0, max=100, value=30, step=1,
    description='$n_{green}$',
    orientation='horizontal',
    layout=Layout(width='400px', height='20px'),
    continuous_update=False,
)
b2.style.handle_color = 'green'

# number of bootstrap realizations
c2 = widgets.IntSlider(
    min=5, max=1000, value=1000, step=1,
    description='$L$',
    orientation='horizontal',
    layout=Layout(width='400px', height='20px'),
    continuous_update=False,
)
c2.style.handle_color = 'gray'

# significance level of the confidence interval
alpha = widgets.FloatSlider(
    min=0.01, max=0.40, value=0.05, step=0.01,
    description=r'$\alpha$',
    orientation='horizontal',
    layout=Layout(width='400px', height='20px'),
    continuous_update=False,
)
alpha.style.handle_color = 'gray'

# sliders in one row, with the title stacked on top
uib = widgets.HBox([a2, b2, c2, alpha],)
uib2 = widgets.VBox([l2, uib],)

def _analytical_proportion_quantiles(prop, se, cumul_prob, alpha, dof, use_t):
    """Return (quantiles at ``cumul_prob``, lower bound, upper bound) in proportion units.

    Standardized quantiles come from Student's t with ``dof`` degrees of freedom
    when ``use_t`` is true, otherwise from the standard normal; they are then
    rescaled by ``se`` and shifted by ``prop``. The bounds are the ``alpha/2`` and
    ``1 - alpha/2`` quantiles.
    """
    if use_t:
        values = t.ppf(cumul_prob, dof)
        lower = t.ppf(alpha / 2, dof); upper = t.ppf(1 - alpha / 2, dof)
    else:
        values = norm.ppf(cumul_prob)
        lower = norm.ppf(alpha / 2); upper = norm.ppf(1 - alpha / 2)
    return values * se + prop, lower * se + prop, upper * se + prop


def _analytical_proportion_density(prop_values, prop, se, dof, use_t):
    """Return the analytical density at ``prop_values`` (t or normal, centered on ``prop``, scaled by ``se``)."""
    if use_t:
        return t.pdf(prop_values, loc = prop, df = dof, scale = se)
    return norm.pdf(prop_values, loc = prop, scale = se)


def _plot_proportion_cdf(ax, samples, color, name, prop, lower, upper, analytical_values, cumul_prob):
    """Draw one cumulative panel: bootstrap histogram, interval bounds, expected value and analytical CDF."""
    ax.hist(samples,cumulative = True, density = True, alpha=0.7,color=color,edgecolor="black",linewidth=2,bins = np.linspace(0,1,50), label = 'Bootstrap')
    ax.plot([lower,lower],[0,1],color='black',linewidth=2,linestyle='--',label='Lower/Upper')
    ax.plot([upper,upper],[0,1],color='black',linewidth=2,linestyle='--')
    ax.plot([prop,prop],[0,1],color='black',linewidth=3,label='Exp.')
    ax.set_title('Uncertainty in Proportion of ' + name + ' Balls'); ax.set_xlabel('Proportion of ' + name + ' Balls'); ax.set_ylabel('Cumulative Probability')
    ax.set_xlim([0,1]); ax.set_ylim([0,1])
    ax.plot(analytical_values, cumul_prob, color = 'black', linewidth = 2, label = 'Analytical')
    ax.legend()


def s_make(a, b, c, alpha):                                      # function to take parameters, make sample and plot
    """Compare bootstrap and analytical confidence intervals for the red/green proportions.

    Parameters
    ----------
    a : int
        Number of red balls in the sample.
    b : int
        Number of green balls in the sample.
    c : int
        Number of bootstrap realizations.
    alpha : float
        Significance level; the interval bounds are the ``alpha/2`` and
        ``1 - alpha/2`` quantiles of the analytical distribution.

    The figure has a box plot of the bootstrap proportions (left), cumulative
    panels for red and green (top and middle right) and a density panel with the
    rejection tails shaded (bottom right). Nothing is returned. When
    ``a + b == 0`` a message is printed instead, since the proportions are
    undefined.
    """
    total = a + b
    if total == 0:                                        # a/(a+b) would divide by zero
        print('Set at least one red or green ball to compute proportions.')
        return

    dof = total - 1
    # one decision for every panel: Student's t when either count is 30 or less, normal otherwise
    use_t = a <= 30 or b <= 30

    # bootstrap realizations of the counts, converted to proportions
    red_freq = make_data(a, b, c)
    green_freq = total - red_freq
    prop_red = red_freq / total
    prop_green = green_freq / total

    # sample proportions and their standard errors, sqrt(p (1 - p) / n)
    red_prop = a / total
    red_SE = math.sqrt((red_prop * (1.0 - red_prop)) / total)
    green_prop = b / total
    green_SE = math.sqrt((green_prop * (1.0 - green_prop)) / total)

    labels = ['Red Balls', 'Green Balls']

    fig = plt.figure(constrained_layout=False)
    gs = GridSpec(3, 2, figure=fig)

    # left column: box plots of the bootstrap proportions
    ax1 = fig.add_subplot(gs[:, 0])
    boxplot = ax1.boxplot([prop_red,prop_green],labels = labels, notch = True, sym = '+',patch_artist=True)
    for patch, color in zip(boxplot['boxes'], ['red','green']):
        patch.set_facecolor(color)
    for median in boxplot['medians']:
        median.set_color('black')

    ax1.set_ylim([0,1])
    ax1.grid(zorder=0, color='black', axis = 'y', alpha = 0.2)
    ax1.set_ylabel('Proportion of Balls'); ax1.set_xlabel('Ball Color');ax1.set_title('Bootstrap Uncertainty - Proportion Distributions')
    ax1.grid(True, which='major',axis='y',linewidth = 1.0); ax1.grid(True, which='minor',axis='y',linewidth = 0.2) # add y grids
    ax1.tick_params(which='major',length=7); ax1.tick_params(which='minor', length=4)
    ax1.xaxis.set_minor_locator(AutoMinorLocator()); ax1.yaxis.set_minor_locator(AutoMinorLocator()) # turn on minor ticks

    # analytical CDFs; the same probabilities are used to compute AND to plot both colors,
    # and they stop short of 0 and 1 where the quantiles are infinite
    cumul_prob = np.linspace(0.01,0.99,100)
    red_prop_values, red_lower, red_upper = _analytical_proportion_quantiles(red_prop, red_SE, cumul_prob, alpha, dof, use_t)
    green_prop_values, green_lower, green_upper = _analytical_proportion_quantiles(green_prop, green_SE, cumul_prob, alpha, dof, use_t)

    # right column, top and middle: cumulative distributions
    ax2 = fig.add_subplot(gs[0, 1])
    _plot_proportion_cdf(ax2, prop_red, "red", 'Red', red_prop, red_lower, red_upper, red_prop_values, cumul_prob)

    ax3 = fig.add_subplot(gs[1, 1])
    _plot_proportion_cdf(ax3, prop_green, "green", 'Green', green_prop, green_lower, green_upper, green_prop_values, cumul_prob)

    # right column, bottom: bootstrap histograms with the analytical densities on top
    ax4 = fig.add_subplot(gs[2, 1])
    ax4.hist(prop_green,cumulative = False, density = True, alpha=0.7,color="green",edgecolor="black",linewidth=2, bins = np.linspace(0,1,50), label = 'Bootstrap Prop. Green')
    ax4.hist(prop_red,cumulative = False, density = True, alpha=0.7,color="red",edgecolor="black",linewidth=2, bins = np.linspace(0,1,50), label = 'Bootstrap Prop. Red')

    ax4.set_title('Confidence Interval in Proportion of Red and Green Balls (Alpha = ' + str(alpha) + ')')
    ax4.set_xlabel('Proportion of Red and Green Balls'); ax4.set_ylabel('Frequency')
    ax4.set_xlim([0,1])

    prop_values = np.linspace(0.0,1.0,100)
    green_density = _analytical_proportion_density(prop_values, green_prop, green_SE, dof, use_t)
    ax4.plot(prop_values, green_density, color = 'black', linewidth = 5,zorder=99)      # black outline under the colored line
    ax4.plot(prop_values, green_density, color = 'green', linewidth = 3, label = 'Analytical Prop. Green',zorder=100)

    red_density = _analytical_proportion_density(prop_values, red_prop, red_SE, dof, use_t)
    ax4.plot(prop_values, red_density, color = 'black', linewidth = 5,zorder=99)        # black outline under the colored line
    ax4.plot(prop_values, red_density, color = 'red', linewidth = 3, label = 'Analytical Prop. Red',zorder=100)

    # shade the tails outside the confidence bounds
    ax4.fill_between(prop_values, 0, green_density, where = prop_values <= green_lower, facecolor='green', interpolate=True, alpha = 0.9,zorder=101)
    ax4.fill_between(prop_values, 0, green_density, where = prop_values >= green_upper, facecolor='green', interpolate=True, alpha = 0.9,zorder=101)
    ax4.fill_between(prop_values, 0, red_density, where = prop_values <= red_lower, facecolor='darkred', interpolate=True, alpha = 0.9,zorder=101)
    ax4.fill_between(prop_values, 0, red_density, where = prop_values >= red_upper, facecolor='darkred', interpolate=True, alpha = 0.9,zorder=101)
    ax4.legend()

    plt.subplots_adjust(left=0.0, bottom=0.0, right=2.5, top=3.0, wspace=0.2, hspace=0.3)
    plt.show()

# bind the four sliders to s_make's arguments so the figure is regenerated from the slider values
# (this rebinds the name interactive_plot that the first demonstration also assigns)
interactive_plot = widgets.interactive_output(s_make, dict(a=a2, b=b2, c=c2, alpha=alpha))
# wait=True keeps the old figure until the new one is ready, which reduces flicker
interactive_plot.clear_output(wait=True)

In [6]:
# render the title/slider box (uib2) followed by the output area that s_make draws into
display(uib2, interactive_plot)                            # display the interactive plot

VBox(children=(Text(value='     Confidence Interval for Proportions, Analytical and Bootstrap Demonstration, M…

Output()