<a href="https://colab.research.google.com/github/milicak/Data_Analysis_Course/blob/main/Interactive_Spurious_Correlations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%matplotlib inline
from ipywidgets import interactive                        # widgets and interactivity
from ipywidgets import widgets
from ipywidgets import Layout
from ipywidgets import Label
from ipywidgets import VBox, HBox
import matplotlib.pyplot as plt                           # plotting
from matplotlib.colors import ListedColormap
import numpy as np                                        # working with arrays
import pandas as pd                                       # working with DataFrames
import seaborn as sns                                     # for matrix scatter plots
from scipy.stats import triang                            # parametric distributions
from scipy.stats import binom
from scipy.stats import norm
from scipy.stats import uniform
from scipy.stats import triang
from scipy.stats import lognorm
from scipy import stats                                   # statistical calculations
import random                                             # random drawing / bootstrap realizations of the data
from matplotlib.gridspec import GridSpec                  # control of subplots
import seaborn as sns

In [7]:
bins = np.linspace(-1,1,100)                              # set histogram bins

# Control panel for the interactive demonstration: a read-only style title, a
# dropdown that picks the source parametric distribution, and vertical sliders
# for the distribution parameters, the number of samples and the number of features.
l = widgets.Text(
    value='                                      Spurious Correlation Demonstration due to a combination of too few samples and skewed distribution',
    layout=Layout(width='950px', height='30px'),
)
dist = widgets.Dropdown(
    options=['Triangular', 'Uniform', 'Gaussian', 'LogNorm'],
    value='Gaussian',
    description='Dataset Distribution:',
    disabled=False,
    layout=Layout(width='200px', height='30px')
)

# a: mean (Gaussian / LogNorm) or mode (Triangular) of the sample distribution
a = widgets.FloatSlider(min=0.0, max = 100.0, value = 0.5, description = 'Sample: Mean/Mode',orientation='vertical',layout=Layout(width='170px', height='200px'))
a.style.handle_color = 'blue'

# d: standard deviation; the minimum is kept above zero so it is always a valid scale
d = widgets.FloatSlider(min=0.01, max = 30.0, value = 5.0, step = 1.0, description = 'Sample: St.Dev.',orientation='vertical',layout=Layout(width='110px', height='200px'))
d.style.handle_color = 'green'

# b / c: lower and upper bounds used by the Uniform and Triangular distributions
b = widgets.FloatSlider(min = 0, max = 100.0, value = 0.5, description = 'Sample: Min.',orientation='vertical',layout=Layout(width='110px', height='200px'))
b.style.handle_color = 'red'
c = widgets.IntSlider(min = 0, max = 100, value = 100, description = 'Sample: Max.',orientation='vertical',layout=Layout(width='110px', height='200px'))
c.style.handle_color = 'orange'

# n: number of samples (rows); m: number of independent features (columns)
n = widgets.IntSlider(min = 2, max = 1000, value = 4, description = 'Number Samples',orientation='vertical',layout=Layout(width='110px', height='200px'))
n.style.handle_color = 'gray'
m = widgets.IntSlider(min = 2, max = 20, value = 10, description = 'Number Features',orientation='vertical',layout=Layout(width='110px', height='200px'))
m.style.handle_color = 'gray'

# Box widgets take their CSS flex settings through `layout`; the former
# `kwargs={...}` argument was not a recognised trait, so the centring was never applied.
uia = widgets.HBox([dist,a,d,b,c,n,m], layout=Layout(justify_content='center'))
ui2 = widgets.VBox([l,uia])

In [3]:
def f_make(dist,a, b, c, d, n, m):                        # function to take parameters, make sample and plot
    """Draw a random sample and plot its lower-triangular correlation heat map.

    All arguments are forwarded unchanged to ``make_data``:

    dist : str   name of the source distribution
    a, b, c, d   distribution parameters (mean/mode, min, max, standard deviation)
    n, m : int   number of samples (rows) and number of features (columns)

    Returns None. The only effect is the figure that is shown. If ``make_data``
    rejects the parameters (returns None) nothing is plotted.
    """
    dataset = make_data(dist,a, b, c, d, n, m)
    if dataset is None:                                   # invalid parameter combination: nothing to correlate
        return

    df = pd.DataFrame(data = dataset)
    corr = df.corr()

    # build a mask to remove the upper triangle (the diagonal is masked as well)
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # make a custom colormap: start from the diverging RdBu_r map and blank out the middle
    my_colormap = plt.get_cmap('RdBu_r', 256)             # plt.cm.get_cmap is deprecated/removed in recent matplotlib
    newcolors = my_colormap(np.linspace(0, 1, 256))
    white = np.array([1.0, 1.0, 1.0, 1.0])
    newcolors[26:230, :] = white                          # mask all correlations less than roughly abs(0.8)
    newcmp = ListedColormap(newcolors)

    # apply the seaborn theme before the figure is created so the first render
    # uses the same styling as every later one
    sns.set(font_scale = 0.8)

    # Draw the heatmap with the mask and correct aspect ratio
    fig, ax1 = plt.subplots(1, 1)
    sns.heatmap(corr, ax = ax1, annot = True, mask=mask, cmap=newcmp, vmin = -1.0, vmax=1.0, center=0,
        square=True, linewidths = 1, linecolor = 'white',  # a single line-width setting; the alias `linewidth` was passed too
        cbar_kws={'shrink': .5, 'label': 'Correlation Coefficents'})
    ax1.set_xlabel('Random Independent Features'); ax1.set_ylabel('Random Independent Features')
    ax1.set_title('Lower Triangular Correlation Matrix Heat Map')

    plt.subplots_adjust(left=0.0, bottom=0.0, right=1.2, top=3.2, wspace=0.2, hspace=0.2)
    plt.show()


In [4]:
def make_data(dist,a, b, c, d, n, m):                     # function to check parameters and make sample
    """Check the parameters and draw an n x m random sample from the chosen distribution.

    dist : str   one of 'Uniform', 'Triangular', 'Gaussian', 'LogNorm'
    a            mean (Gaussian), mode (Triangular) or log-mean (LogNorm)
    b, c         minimum and maximum (Uniform and Triangular only)
    d            standard deviation (Gaussian) or shape parameter ``s`` (LogNorm)
    n, m : int   number of rows (samples) and columns (features)

    Returns a nested list with n rows of m floats, or None when the parameters
    are invalid (a message is printed) or ``dist`` is not recognised.
    The random seed is fixed, so identical arguments give identical samples.
    """
    seed = 73073
    shape = [n, m]

    if dist == 'Uniform':
        if b >= c:
            print('Invalid uniform distribution parameters')
            return None
        # scipy's uniform covers [loc, loc + scale], so the width must be c - b
        # (passing c as the scale produced samples on [b, b + c])
        return uniform.rvs(size=shape, loc = b, scale = c - b, random_state = seed).tolist()

    elif dist == 'Triangular':
        interval = c - b
        if b >= a or a >= c or interval <= 0:
            print('Invalid triangular distribution parameters')
            return None
        # scipy's triang takes the mode as a fraction of the [loc, loc + scale] interval
        return triang.rvs(size=shape, loc = b, c = (a-b)/interval, scale = interval, random_state = seed).tolist()

    elif dist == 'Gaussian':
        return norm.rvs(size=shape, loc = a, scale = d, random_state = seed).tolist()

    elif dist == 'LogNorm':
        # NOTE(review): loc = a shifts the whole distribution by a in addition to
        # a being used as the log-mean through scale = exp(a); confirm this is intended.
        return lognorm.rvs(size=shape, loc = a, scale = np.exp(a), s = d, random_state = seed).tolist()

    return None                                           # unrecognised distribution name

In [8]:
# Bind each control widget to the f_make argument of the same name; the
# resulting Output widget is re-drawn by f_make whenever a control changes.
interactive_plot = widgets.interactive_output(
    f_make,
    dict(dist=dist, a=a, d=d, b=b, c=c, n=n, m=m),
)
# wait=True postpones clearing until new output arrives (reduces flickering)
interactive_plot.clear_output(wait=True)

In [9]:
# Show the control panel (title text above the row of dropdown and sliders)
# followed by the Output widget that holds the linked heat map.
# NOTE(review): `display` is not imported in this file; presumably it is the
# name IPython injects into notebook sessions — confirm if run elsewhere.
display(ui2, interactive_plot)                            # display the interactive plot

VBox(children=(Text(value='                                      Spurious Correlation Demonstration due to a c…

Output()

In [11]:
# Fixed parameter values for a static (non-interactive) example sample.
# NOTE(review): these assignments rebind a, b, c, d, n and m, which earlier in
# this notebook refer to the slider widgets; re-running the interactive_output
# cell after this one would hand it plain numbers instead of widgets.
a, b, c, d = 0.5, 0.5, 100, 5.0                           # mean, min, max, standard deviation (b and c are unused below)
n, m = 4, 10                                              # number of samples (rows), number of features (columns)

# Gaussian sample of shape (n, m) with a fixed seed, converted to a nested list
dataset = norm.rvs(loc=a, scale=d, size=(n, m), random_state=73073).tolist()

In [14]:
# Wrap the sampled rows in a DataFrame: one row per sample, one column per feature.
df = pd.DataFrame(dataset)
df  # last expression of the cell, so the notebook renders the table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-3.353243,4.253002,0.364656,4.139508,5.899718,-1.302844,-6.237996,2.224941,0.719676,-0.719767
1,1.775867,-1.769861,3.800179,-13.104763,3.944602,-3.051538,1.959566,4.432356,-1.523885,7.526376
2,3.17618,5.337339,-2.512431,4.301908,-5.173254,-1.068505,-1.871379,9.983692,-1.21805,0.003936
3,-0.875578,6.919447,2.017792,-0.519818,2.946831,2.319855,-0.095661,-2.822851,-5.76118,0.974424
