|<h2>Substack post:</h2>|<h1><a href="" target="_blank">Two-variable dependence part 1: Covariance</a></h1>|
|-|:-:|
|<h2></h2>|<h2>Scroll down for parts 2 and 3</h2>|
|<h2>Teacher:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the post may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.feature_selection import mutual_info_regression
import pandas as pd
import seaborn as sns

In [None]:
### Run this cell only if you're using "dark mode"

# svg plots (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

plt.rcParams.update({
    'figure.facecolor': '#383838',#'#171717',
    'figure.edgecolor': '#383838',#'#171717',
    'axes.facecolor':   '#383838',#'#171717',
    'axes.edgecolor':   '#DDE2F4',
    'axes.labelcolor':  '#DDE2F4',
    'xtick.color':      '#DDE2F4',
    'ytick.color':      '#DDE2F4',
    'text.color':       '#DDE2F4',
    'axes.spines.right': False,
    'axes.spines.top':   False,
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold'
})

# The code below is for Post 1 ("Two-variable dependence part 1: Covariance")

### Scroll down for Posts 2 and 3.

# Figure of examples

In [None]:
# population correlations
popCors = [-.7,0,.7,  0,0,0]

# sample size
N = 90

# create a figure with three subplots
_,axs = plt.subplots(2,3,figsize=(10,6.7))
axs = axs.flatten()

# loop over a range of r values
for i,popr in enumerate(popCors):

  # linear cases
  x = np.random.randn(N)
  y = x*popr + np.random.randn(N)*np.sqrt(1-popr**2)

  # nonlinear cases
  if i==3:
    x = np.cos(np.linspace(0,2*np.pi-2*np.pi/N,N))
    y = np.sin(np.linspace(0,2*np.pi-2*np.pi/N,N))
  elif i==4:
    x = np.linspace(-2,2,N)
    y = x**2
  elif i==5:
    x = np.linspace(-2,2,N//2)
    y = np.concatenate((x,-x),0)
    x = np.concatenate((x,x),0)

  # scale up so cov!=cor
  x *=2
  y += 5

  # observed covariance
  C = np.cov(x,y)[0,1]

  axs[i].plot(x,y,'ko',markersize=10,markerfacecolor=[.7,.9,.7,.5])
  axs[i].set(xlabel='Data "x"',ylabel='Data "y"',title=f'Covariance = {C:.2f}')

plt.tight_layout()
plt.show()

In [None]:
# example with fake data
samplesize = 40
n_posts_read = np.random.randint(low=0,high=20,size=samplesize)
life_happiness = n_posts_read*3 + np.random.randint(low=0,high=40,size=samplesize)

# plot
_,axs = plt.subplots(1,2,figsize=(10,4))
axs[0].plot(n_posts_read,life_happiness,'wh',markersize=10,markerfacecolor=[.7,.7,.9,.5])
axs[0].set(xlabel="Number of Mike's posts read",xticks=range(0,21,4),
              ylabel='Life happiness',title='Posts read vs. happiness, raw data')

axs[1].plot(n_posts_read-n_posts_read.mean(),life_happiness-life_happiness.mean(),'wh',markersize=10,markerfacecolor=[.7,.7,.9,.5])
axs[1].axhline(0,color='gray',linestyle='--')
axs[1].axvline(0,color='gray',linestyle='--')
axs[1].set(xlabel="Number of Mike's posts read",xticks=range(-10,11,4),
              ylabel='Life happiness',title='Mean-centered data')

plt.tight_layout()
plt.show()

# Demo 1: Covariance in simulated data

In [None]:
# simulate two positively related variables (y is x plus independent noise)
N = 300
x = np.random.normal(loc=10,scale=4,size=N)
noise = np.random.normal(loc=20,scale=11,size=N)
y = x + noise

# covariance "by hand": sum of products of the mean-centered variables,
# divided by N-1
x_centered = x - np.mean(x)
y_centered = y - np.mean(y)
cov = np.sum( x_centered*y_centered ) / (N-1)

# the same quantity from numpy (off-diagonal element of the 2x2 covariance matrix)
cov_np = np.cov(x,y)[0,1]

print(f'Covariance (manual) = {cov:.4f}')
print(f'Covariance (numpy)  = {cov_np:.4f}')

In [None]:
# and visualize
plt.figure(figsize=(7,5))
plt.plot(x,y,'wh',markeredgewidth=.5,markerfacecolor=[.9,.7,.7,.7])
plt.gca().set(xlabel='$x$',ylabel='$y$',title=f'Covariance = {cov:.2f}')
plt.show()

# Demo 2: Covariance to correlation

In [None]:
# same variables, but scaling impacts their covariance
print(f'Covariance unscaled:   {np.cov(x,y)[0,1]:.4f}')
print(f'Covariance x scaled:   {np.cov(x*10,y)[0,1]:.4f}')
print(f'Covariance x,y scaled: {np.cov(x*10,y*10)[0,1]:.4f}')

In [None]:
# translating the math into code
corr = cov / np.sqrt( x.var(ddof=1)*y.var(ddof=1) )

# via numpy
corr_np = np.corrcoef(x,y)[0,1]

print(f'Correlation (manual) = {corr:.4f}')
print(f'Correlation (numpy)  = {corr_np:.4f}')

In [None]:
# scaling has no impact on correlation
print(f'Correlation unscaled:   {np.corrcoef(x,y)[0,1]:.4f}')
print(f'Correlation x scaled:   {np.corrcoef(x*10,y)[0,1]:.4f}')
print(f'Correlation x,y scaled: {np.corrcoef(x*10,y*10)[0,1]:.4f}')

# Demo 3: Statistical significance via permutation testing

In [None]:
# how to use permutation()
np.random.permutation(5)

In [None]:
# one permuted covariance
shuffle_idx = np.random.permutation(N)
np.cov( x[shuffle_idx],y )[0,1]

In [None]:
# number of random shufflings used to build the null-hypothesis (H0) distribution
n_iters = 1000

# each shuffle re-pairs x with y at random, then recomputes the covariance
permuted_covs = np.array([ np.cov( x[np.random.permutation(N)],y )[0,1]
                           for _ in range(n_iters) ])

# z: distance of the observed covariance from the H0 mean, in H0 standard deviations
h0_mean = permuted_covs.mean()
h0_std  = permuted_covs.std(ddof=1)
zval = (cov-h0_mean) / h0_std

# p: proportion of shuffled covariances that exceed the observed one (one-sided)
pval = np.mean( permuted_covs > cov )

In [None]:
# visualize the distribution
plt.figure(figsize=(10,3))

plt.hist(permuted_covs,bins=40,color=[.9,.7,.7],edgecolor=[[.7,.7,.7]],label='Shuffled')
plt.axvline(cov,color='lightblue',linestyle='--',linewidth=2,label='Observed')

plt.gca().set(xlabel='Covariance value',ylabel='Count',
              title=f'Permutation test\nz = {zval:.2f}, p = {pval:.2f}')
plt.legend()
plt.show()

# Demo 4: Impact of outliers

In [None]:
# make copies of the data (x and y themselves stay untouched)
# and create one outlier in each copy by multiplying a single value by 5
x_hasOut = x.copy()
x_hasOut[-1] *= 5   # last observation of x

y_hasOut = y.copy()
y_hasOut[1] *= 5    # second observation of y

# each line isolates one manipulation, so the labels match what is computed
print(f'Covariance (original)  = {np.cov(x,y)[0,1]:.4f}')
print(f'Covariance (x-outlier) = {np.cov(x_hasOut,y)[0,1]:.4f}')
print(f'Covariance (y-outlier) = {np.cov(x,y_hasOut)[0,1]:.4f}')
print(f'Covariance (both)      = {np.cov(x_hasOut,y_hasOut)[0,1]:.4f}')

In [None]:
# and visualize
plt.figure(figsize=(7,5))
plt.plot(x_hasOut,y_hasOut,'wh',markeredgewidth=.5,markerfacecolor=[.9,.7,.7,.7])
plt.gca().set(xlabel='x',ylabel='y',title=f'Covariance = {np.cov(x_hasOut,y_hasOut)[0,1]:.2f}')
plt.show()

In [None]:
# calculate z-scores and significance threshold
x_z = (x_hasOut-x_hasOut.mean()) / x_hasOut.std(ddof=1)
y_z = (y_hasOut-y_hasOut.mean()) / y_hasOut.std(ddof=1)

zThresh = 4

# and visualize
_,axs = plt.subplots(1,2,figsize=(10,4))
axs[0].plot(x_hasOut,y_hasOut,'wh',markeredgewidth=.5,markerfacecolor=[.9,.7,.7,.7])
axs[0].set(xlabel='x',ylabel='y',title=f'Original scale\nCovariance = {np.cov(x_hasOut,y_hasOut)[0,1]:.2f}')

# right panel: the same data after z-scoring each variable
axs[1].plot(x_z,y_z,'wh',markeredgewidth=.5,markerfacecolor=[.9,.7,.7,.7])
axs[1].set(xlabel='$z_x$',ylabel='$z_y$',title=f'Z-scored\nCovariance = {np.cov(x_z,y_z)[0,1]:.2f}')

# outliers are identified from abs(z) further down, so mark the threshold on both sides
for thresh in (-zThresh,zThresh):
  axs[1].axhline(thresh,color='gray',linestyle='--',linewidth=.5)
  axs[1].axvline(thresh,color='gray',linestyle='--',linewidth=.5)

plt.tight_layout()
plt.show()

In [None]:
# identify outliers
outliers = (abs(x_z)>zThresh) | (abs(y_z)>zThresh)

# it's a boolean
outliers

In [None]:
# remove outliers into new variables
x_clean = x_hasOut[~outliers]
y_clean = y_hasOut[~outliers]

# recalculate covariances
orig_cov = np.cov(x,y)[0,1]
hasOut_cov = np.cov(x_hasOut,y_hasOut)[0,1]
clean_cov = np.cov(x_clean,y_clean)[0,1]

print(f'Original covariance: {orig_cov:.2f}')
print(f'With outliers:       {hasOut_cov:.2f}')
print(f'Cleaned covariance:  {clean_cov:.2f}')

|<h2>Substack post:</h2>|<h1><a href="" target="_blank">Two-variable dependence part 2: Mutual information</a></h1>|
|-|:-:|
|<h2></h2>|<h2>Scroll down for part 3</h2>|
|<h2>Teacher:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the post may lead to confusion or errors.</i>

In [None]:
# population correlations
popCors = [-.7,0,.7,0,0,0]

# sample size
N = 90

# create a figure with three subplots
_,axs = plt.subplots(2,3,figsize=(10,6.7))
axs = axs.flatten()

# loop over a range of r values
for i,popr in enumerate(popCors):

  # linear cases
  x = np.random.randn(N)
  y = x*popr + np.random.randn(N)*np.sqrt(1-popr**2)

  # nonlinear cases
  if i==3:
    x = np.cos(np.linspace(0,2*np.pi-2*np.pi/N,N))
    y = np.sin(np.linspace(0,2*np.pi-2*np.pi/N,N))
  elif i==4:
    x = np.linspace(-2,2,N)
    y = x**2
  elif i==5:
    x = np.linspace(-2,2,N//2)
    y = np.concatenate((x,-x),0)
    x = np.concatenate((x,x),0)

  x *=2
  y += 5

  # observed covariance
  C = np.cov(x,y)[0,1]

  # observed mutual information
  mi = mutual_info_regression(x.reshape(-1,1),y)

  axs[i].plot(x,y,'ko',markersize=10,markerfacecolor=[.7,.9,.7,.5])
  axs[i].set(xlabel='Data "x"',ylabel='Data "y"',
             title=f'Covariance = {C:.2f}\nMutual information = {mi[0]:.2f}')

plt.tight_layout()
plt.show()

# Demo 1: Entropy in categorical variables

In [None]:
# squaring uniform numbers skews them toward small values;
# rounding up then turns each number into an integer category label
x = np.random.uniform(low=0,high=3,size=200)**2
x = np.ceil(x).astype(int)

# proportion of the sample carrying each label (array index = label value)
label_counts = np.bincount(x)
p_x = label_counts / x.size

# entropy in bits; eps keeps log2 finite for labels that never occur
eps = 1e-13
entropy_x = -np.sum( p_x * np.log2(p_x+eps) )

# rescale the proportions to [0,1] so they can index a colormap
p4color = (p_x-p_x.min()) / np.ptp(p_x)

# and plot :)
_,axs = plt.subplots(1,2,figsize=(12,3))

# plot each data label according to frequency
for i in range(len(p_x)):
  axs[0].plot(np.where(x==i)[0],x[x==i],'ws',markeredgewidth=.2,markersize=5,markerfacecolor=mpl.cm.plasma(p4color[i]),alpha=.6)
  axs[1].bar(i,p_x[i],color=mpl.cm.plasma(p4color[i]))

axs[0].set(xlabel='Data index',ylabel='Category',title='Scatter plot')
axs[1].set(xlabel='Data values',ylabel='Proportion',xlim=[np.min(x)-.5,np.max(x)+.5],
              title=f'Distribution (entropy = {entropy_x:.3f})')
plt.show()

# Demo 2: Entropy in continuous variables

In [None]:
N = 347
x = np.linspace(.001,2.5,N)
y = np.cos(2*x)*3 + np.log(x) + np.random.normal(0,.5,N)

plt.figure(figsize=(8,4))
plt.plot(x,y,'ko',markersize=10,markerfacecolor=[.9,.7,.7,.5])
plt.gca().set(xlabel='Data $x$',ylabel='Data $y$',
              title=r'$y = 3\cos(2x) + \ln(x) + \mathcal{N}(0,.5)$')
plt.show()

In [None]:
# uh oh...
# np.bincount(y)

In [None]:
# convert to probability
p_y,p_x = np.histogram(y,bins=30,density=True)

# calculate entropy
entropy_y = -np.sum( p_y * np.log2(p_y+eps) )

# min-max scale for coloring
p4color = (p_y-p_y.min()) / (p_y.max()-p_y.min())

# and plot :)
_,axs = plt.subplots(1,2,figsize=(12,4))

# draw each bin's data points and its bar in a color that reflects the bin's height
n_bins = len(p_y)
for i in range(n_bins):

  # find the values within these bin boundaries (p_x holds the bin edges);
  # bins are half-open [left,right) except the last one, which also includes
  # its right edge so that the maximum value is not dropped
  if i==n_bins-1:
    whichvals = (y>=p_x[i]) & (y<=p_x[i+1])
  else:
    whichvals = (y>=p_x[i]) & (y<p_x[i+1])

  # plot the data and probability
  axs[0].plot(np.where(whichvals)[0],y[whichvals],'ws',markeredgewidth=.2,markersize=6,markerfacecolor=mpl.cm.plasma(p4color[i]),alpha=.8)
  axs[1].bar(p_x[i],p_y[i],width=(p_x[1]-p_x[0])*.9,color=mpl.cm.plasma(p4color[i]))

axs[0].set(xlabel='Data index',ylabel='Data value',title='Scatter plot')
# the title reports the entropy of y computed in this cell
# (entropy_x still holds the value from the categorical demo)
axs[1].set(xlabel='Data values',ylabel='Proportion',xlim=[p_x[0]-.5,p_x[-1]+.5],
              title=f'Distribution (entropy = {entropy_y:.3f})')
plt.show()

In [None]:
nbins = np.arange(5,51)
H_by_bins = np.zeros(len(nbins))

for i in range(len(H_by_bins)):
  p_y,p_x = np.histogram(y,bins=nbins[i],density=True)
  H_by_bins[i] = -np.sum( p_y * np.log2(p_y+eps) )


plt.figure(figsize=(8,3))
plt.plot(nbins,H_by_bins,'wh',markersize=10,markerfacecolor=[.7,.7,.9])
plt.gca().set(xlabel='Number of bins',ylabel='Entropy')
plt.show()

# Demo 3: Joint entropy and mutual information

In [None]:
# 2D histogram
Z,xx,yy = np.histogram2d(x,y,bins=8)

_,axs = plt.subplots(1,2,figsize=(12,4))
axs[0].plot(x,y,'ro',markeredgewidth=.3,markerfacecolor=[.7,.7,.9,.7])
axs[0].set(xlabel='x',ylabel='y',title='Full resolution data')

h = axs[1].imshow(Z.T,extent=[xx[0],xx[-1],yy[0],yy[-1]],vmin=0,vmax=Z.max()*.7,origin='lower',aspect='auto',cmap='hot')
axs[1].set(xlabel='x',ylabel='y',title='Discretized (binned) data')
axs[1].plot(x,y,'wo',markerfacecolor=[.4,.4,.4],alpha=.7)
plt.colorbar(h,ax=axs[1],pad=.01,label='Count')
plt.suptitle('Z,xx,yy = np.histogram2d(x,y,bins=8)', fontfamily='monospace')

plt.tight_layout()
plt.show()

In [None]:
# count the observations falling into each cell of an 8x8 grid over (x,y)
Z = np.histogram2d(x,y,bins=8)[0]

# joint proportions: counts scaled so that the whole grid sums to 1
p_Z = Z / np.sum(Z)

# single-variable proportions: collapse the grid over the other variable
p_x = p_Z.sum(axis=1)
p_y = p_Z.sum(axis=0)

# entropies in bits (eps keeps log2 finite for empty cells)
entropy_Z = -np.sum( p_Z * np.log2(p_Z+eps) )
entropy_x = -np.sum( p_x * np.log2(p_x+eps) )
entropy_y = -np.sum( p_y * np.log2(p_y+eps) )

print(f'Entropy of x: {entropy_x:.2f}')
print(f'Entropy of y: {entropy_y:.2f}')
print(f'Entropy of Z: {entropy_Z:.2f}')

In [None]:
# mutual information via direct translation of the formula
miEps = (entropy_x+entropy_y) - entropy_Z

In [None]:
# via sklearn's MI function optimized for continuous variables
miSk = mutual_info_regression(x.reshape(-1,1),y)[0]

print(f'Mutual information (manual) : {miEps:.2f}')
print(f'Mutual information (sklearn): {miSk:.2f}')

In [None]:
# impact of discretization

bincounts = np.arange(4,25)
mi_by_bincount = np.zeros(len(bincounts))

for i in range(len(mi_by_bincount)):

  Z,xx,yy = np.histogram2d(x,y,bins=bincounts[i])

  # proportion via sum-scaling
  p_Z = Z / Z.sum()
  p_x = np.sum(p_Z, axis=1)
  p_y = np.sum(p_Z, axis=0)

  # calculate entropy
  eps = 1e-13
  entropy_x = -np.sum( p_x * np.log2(p_x+eps) )
  entropy_y = -np.sum( p_y * np.log2(p_y+eps) )

  # as difference of entropies
  entropy_Z = -np.sum( p_Z * np.log2(p_Z+eps) )
  mi_by_bincount[i] = (entropy_x+entropy_y) - entropy_Z


plt.figure(figsize=(8,4))
plt.plot(bincounts,mi_by_bincount,'wh',markersize=10,markerfacecolor=[.7,.7,.9])
plt.axhline(miSk,color=[.9,.7,.7],linestyle='--',linewidth=2,label='Scikit-learn')
plt.gca().set(xticks=bincounts[::2],xlabel='Number of bins',ylabel='Mutual information')
plt.legend()

plt.show()

# Demo 4: Statistical significance via permutation testing

In [None]:
# generate a distribution of H0 values
n_iters = 1000
permuted_mis = np.zeros(n_iters)

# loop over the shufflings
for i in range(n_iters):
  shuffle_idx = np.random.permutation(N)
  x_shuffled = x[shuffle_idx].reshape(-1,1)
  permuted_mis[i] = mutual_info_regression(x_shuffled,y)[0]

# get statistical values
zval = (miSk-permuted_mis.mean()) / permuted_mis.std(ddof=1)
pval = np.sum( permuted_mis > miSk ) / n_iters

In [None]:
# visualize the distribution
plt.figure(figsize=(10,3))

plt.hist(permuted_mis,bins=40,color=[.9,.7,.7],edgecolor=[[.7,.7,.7]],label='Shuffled')
plt.axvline(miSk,color='lightblue',linestyle='--',linewidth=2,label='Observed')

plt.gca().set(xlabel='Mutual information value',ylabel='Count (log)',xlim=[-.01,None],yscale='log',
              title=f'Permutation test\nz = {zval:.2f}, p = {pval:.2f}')
plt.legend()
plt.show()

# Demo 5: Impact of outliers

In [None]:
def calculate_mi(x,y,bins=7):
  """Estimate the mutual information between x and y in two ways.

  Parameters
  ----------
  x, y : array-like, 1D, same length
      The paired observations.
  bins : int or any `bins` value accepted by np.histogram2d, default 7
      Discretization used by the histogram-based estimate only.

  Returns
  -------
  miEps : float
      Histogram-based estimate, H(x) + H(y) - H(x,y), in bits (log2).
  miSk : float
      Estimate from sklearn's mutual_info_regression. Note that sklearn
      documents its result in nats, so the two numbers are not on the
      same scale.
  """

  # accept lists / pandas Series as well as numpy arrays
  x = np.asarray(x)
  y = np.asarray(y)

  # 2D histogram of counts (the bin edges are not needed)
  Z = np.histogram2d(x,y,bins=bins)[0]

  # joint proportion via sum-scaling, then marginals by summing out the other variable
  p_Z = Z / Z.sum()
  p_x = np.sum(p_Z, axis=1)
  p_y = np.sum(p_Z, axis=0)

  # entropies (eps keeps log2 finite for empty bins)
  eps = 1e-13
  entropy_x = -np.sum( p_x * np.log2(p_x+eps) )
  entropy_y = -np.sum( p_y * np.log2(p_y+eps) )
  entropy_Z = -np.sum( p_Z * np.log2(p_Z+eps) )

  # mutual information as a difference of entropies
  miEps = (entropy_x+entropy_y) - entropy_Z

  # sklearn expects a 2D feature matrix, hence the reshape to one column
  miSk = mutual_info_regression(x.reshape(-1,1),y)[0]

  return miEps,miSk

In [None]:
# make a copy of the data and create an outlier
x_hasOut = x + 0
x_hasOut[-1] = x_hasOut[-1]*5

y_hasOut = y + 0
y_hasOut[1] = y_hasOut[1]*5

# original
mi1,mi2 = calculate_mi(x,y)
print('NO OUTLIERS:')
print(f'  Manual  = {mi1:.2f}')
print(f'  SKlearn = {mi2:.2f}')

# outlier in x (not in y)
mi1,mi2 = calculate_mi(x_hasOut,y)
print('\nOUTLIER in x:')
print(f'  Manual  = {mi1:.2f}')
print(f'  SKlearn = {mi2:.2f}')

# outlier in y (not in x)
mi1,mi2 = calculate_mi(x,y_hasOut)
print('\nOUTLIER in y:')
print(f'  Manual  = {mi1:.2f}')
print(f'  SKlearn = {mi2:.2f}')

# outliers in x and y
mi1,mi2 = calculate_mi(x_hasOut,y_hasOut)
print('\nOUTLIERS in both:')
print(f'  Manual  = {mi1:.2f}')
print(f'  SKlearn = {mi2:.2f}')

In [None]:
plt.figure(figsize=(8,3))
plt.plot(x_hasOut,y_hasOut,'ko',markersize=10,markerfacecolor=[.9,.7,.9,.5])
plt.gca().set(xlabel='Data $x$',ylabel='Data $y$',title='Data with outliers')
plt.show()

|<h2>Substack post:</h2>|<h1><a href="" target="_blank">Two-variable dependence part 3: Covariance vs. mutual information</a></h1>|
|-|:-:|
|<h2></h2>|<h2> </h2>|
|<h2>Teacher:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the post may lead to confusion or errors.</i>

# Demo 1: Covariance and mutual information in simulated data

In [None]:
# population correlation coefficients
rs = np.linspace(-.9,.9,100)

# sample size
N = 500

# initialize output matrix
covmi = np.zeros((len(rs),2))


# loop over a range of r values
for ri in range(len(rs)):

  ### generate data
  x = np.random.randn(N)
  y = x*rs[ri] + np.random.randn(N)*np.sqrt(1-rs[ri]**2)

  ### compute covariance
  covmi[ri,0] = np.cov(x,y)[0,1]

  ### compute mutual information
  covmi[ri,1] = mutual_info_regression(x.reshape(-1,1),y)[0]


## visualize the results
_,axs = plt.subplots(2,2,figsize=(10,7))

axs[0,0].plot(rs,covmi[:,0],'rs',markersize=8,markerfacecolor=[.9,.7,.7,.5],label='Covariance')
axs[0,0].plot(rs,covmi[:,1],'bo',markersize=8,markerfacecolor=[.7,.7,.9,.5],label='Mutual information')
axs[0,0].legend()
axs[0,0].set(xlabel='Population correlation',ylabel=r'Measured $c$ or MI',title='Covariance and mutual information')

axs[0,1].plot(covmi[:,0],covmi[:,1],'ks',markersize=8,markerfacecolor=[.7,.9,.7,.5])
axs[0,1].axhline(y=0,color='gray',linestyle='--')
axs[0,1].axvline(x=0,color='gray',linestyle='--')
axs[0,1].set(xlabel='Covariance',ylabel='Mutual information',title=f'$r=${np.corrcoef(covmi.T)[1,0]:.2f}')


# bottom row: the same comparison, but with squared covariance
cov_squared = covmi[:,0]**2

axs[1,0].plot(rs,cov_squared,'rs',markersize=8,markerfacecolor=[.9,.7,.7,.5],label='(covariance)$^2$')
axs[1,0].plot(rs,covmi[:,1],'bo',markersize=8,markerfacecolor=[.7,.7,.9,.5],label='Mutual information')
axs[1,0].legend()
axs[1,0].set(xlabel='Population correlation',ylabel=r'Measured $c^2$ or MI',title='Squared covariance and mutual information')

# the correlation in the title is computed on the same quantity shown on the x-axis
axs[1,1].plot(cov_squared,covmi[:,1],'ks',markersize=8,markerfacecolor=[.7,.9,.7,.5])
axs[1,1].set(xlabel='(cov)$^2$',ylabel='Mutual information',title=f'$r=${np.corrcoef(cov_squared,covmi[:,1])[1,0]:.2f}')



plt.tight_layout()
plt.show()

# Demo 2: Download and process a real dataset

In [None]:
# Data citation: Sathishkumar V E, Jangwoo Park, and Yongyun Cho. 'Using data mining techniques for bike sharing demand
#                prediction in metropolitan city.' Computer Communications, Vol.153, pp.353-366, March, 2020
# data source website: https://archive.ics.uci.edu/ml/datasets/Seoul+Bike+Sharing+Demand

# import the data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv"
data = pd.read_csv(url,sep=',',encoding='unicode_escape')

# let's have a look
data

In [None]:
### data processing 1: convert datetime to days since the first day

# new column with different date format
data['date'] = pd.to_datetime(data['Date'],format='%d/%m/%Y')

# get the first date
first_date = data['date'].iloc[0]

# subtract and convert to days
data['days_since_first'] = (data['date'] - first_date).dt.days


### data processing 2: new dataframe with relevant columns
columns2use = ['Rented Bike Count','Temperature(°C)',
               'Wind speed (m/s)','days_since_first',
               'Humidity(%)']
df = data[columns2use]

# dataset size
N,M = df.shape

# summary statistics
df.describe()

In [None]:
h = sns.pairplot(df,height=2,
             plot_kws=dict(s=3,color=[.7,.7,.9,.7]),
             diag_kws=dict(color=[.9,.7,.7]) )
h.fig.set_size_inches(12,6)
plt.show()

# Demo 3: Split-half reliability of covariance and mutual information

In [None]:
# initialize: slice [:,:,0] holds the even-row half, slice [:,:,1] the odd-row half
covariances = np.zeros((M,M,2))
mutualinfos = np.zeros((M,M,2))

for i in range(M):
  for j in range(i,M):

    # half=0 -> rows 0,2,4,... (even);  half=1 -> rows 1,3,5,... (odd)
    for half in range(2):

      # extract data
      x = df[columns2use[i]][half::2]
      y = df[columns2use[j]][half::2]

      # covariance
      covariances[i,j,half] = np.cov(x,y)[0,1]
      # symmetric value can be copied instead of recalculated;
      # it must come from the same half (same third index)
      covariances[j,i,half] = covariances[i,j,half]

      # mutual information (sklearn needs a 2D feature matrix)
      mutualinfos[i,j,half] = mutual_info_regression(x.values.reshape(-1,1),y)[0]
      mutualinfos[j,i,half] = mutualinfos[i,j,half]

In [None]:
fig,axs = plt.subplots(2,3,figsize=(9,6))

shortlabels = ['Bikes','Temp','Wind','Days','Humidity']

h = axs[0,0].imshow(covariances[:,:,0],cmap='RdBu_r',vmin=-1000,vmax=10000)
fig.colorbar(h,ax=axs[0,0],pad=.02,fraction=.046,location='top')
axs[0,0].set(title='Covariance (even)\n\n',xticks=np.arange(M),xticklabels=shortlabels,yticks=np.arange(M),yticklabels=shortlabels)
axs[0,0].tick_params(axis='x',rotation=90)

h = axs[0,1].imshow(covariances[:,:,1],cmap='RdBu_r',vmin=-1000,vmax=10000)
fig.colorbar(h,ax=axs[0,1],pad=.02,fraction=.046,location='top')
axs[0,1].set(title='Covariance (odd)\n\n',xticks=np.arange(M),xticklabels=shortlabels,yticks=np.arange(M),yticklabels=shortlabels)
axs[0,1].tick_params(axis='x',rotation=90)

logcovs = np.log(abs(covariances)) * np.sign(covariances)
axs[0,2].plot(logcovs[:,:,0].flatten(),logcovs[:,:,1].flatten(),'ks',markersize=10,markerfacecolor=[.7,.7,.9,.5])
axs[0,2].set(xlabel='log(cov) (even rows)',ylabel='log(cov) (odd rows)',title='Covariance correspondences')



h = axs[1,0].imshow(mutualinfos[:,:,0],cmap='RdBu_r')
fig.colorbar(h,ax=axs[1,0],pad=.02,fraction=.046,location='top')
axs[1,0].set(title='Mutual info. (even)\n\n',xticks=np.arange(M),xticklabels=shortlabels,yticks=np.arange(M),yticklabels=shortlabels)
axs[1,0].tick_params(axis='x',rotation=90)

h = axs[1,1].imshow(mutualinfos[:,:,1],cmap='RdBu_r')
fig.colorbar(h,ax=axs[1,1],pad=.02,fraction=.046,location='top')
axs[1,1].set(title='Mutual info. (odd)\n\n',xticks=np.arange(M),xticklabels=shortlabels,yticks=np.arange(M),yticklabels=shortlabels)
axs[1,1].tick_params(axis='x',rotation=90)

axs[1,2].plot(np.log(mutualinfos[:,:,0].flatten()),np.log(mutualinfos[:,:,1].flatten()),'ks',markersize=10,markerfacecolor=[.7,.7,.9,.5])
axs[1,2].set(xlabel='log(MI) (even rows)',ylabel='log(MI) (odd rows)',title='MI correspondences')


plt.tight_layout()
plt.show()

In [None]:
# indices of unique elements (upper triangle, excluding the diagonal)
uniqueIdx = np.triu_indices(M,k=1)

# vectors for unique elements: log-transform each half, then average the two halves
# (covariance is squared first, so its sign is discarded)
uniqueCovs = np.log(covariances**2).mean(axis=-1)[uniqueIdx]
uniqueMIs  = np.log(mutualinfos).mean(axis=-1)[uniqueIdx]

# their correlation
r = np.corrcoef(uniqueCovs,uniqueMIs)[0,1]

# visualize (the x-label names the quantity actually plotted: log of squared covariance)
plt.figure(figsize=(8,4))
plt.plot(uniqueCovs,uniqueMIs,'wh',markersize=13,markerfacecolor=[.7,.7,.9,.5])
plt.gca().set(xlabel='log(cov$^2$)',ylabel='log(MI)',title=f'$r$ = {r:.3f}')
plt.show()

# Demo 4: Statistical significances

In [None]:
n_iters = 500 # 5 mins with all rows
permuted_vals = np.zeros(n_iters)

# initialize
zvals = np.zeros((M,M,2))
pvals = np.zeros((M,M,2))

for i in range(M):
  for j in range(i+1,M):

    ### extract data
    x = df[columns2use[i]].values[::55]
    y = df[columns2use[j]].values[::55]
    N = len(x)

    ### COVARIANCE: permutation distribution
    for permi in range(n_iters):
      shuffle_idx = np.random.permutation(N)
      permuted_vals[permi] = np.cov(x[shuffle_idx],y)[0,1]

    # get statistical values
    c = np.cov(x,y)[0,1]
    zvals[i,j,0] = (c-permuted_vals.mean()) / permuted_vals.std(ddof=1)
    pvals[i,j,0] = np.sum( abs(permuted_vals) > abs(c) ) / n_iters


    ### MUTUAL INFORMATION
    for permi in range(n_iters):
      shuffle_idx = np.random.permutation(N)
      x_shuffled = x[shuffle_idx].reshape(-1,1)
      permuted_vals[permi] = mutual_info_regression(x_shuffled,y)[0]

    # get statistical values
    mi = mutual_info_regression(x.reshape(-1,1),y)[0]
    zvals[i,j,1] = (mi-permuted_vals.mean()) / permuted_vals.std(ddof=1)
    pvals[i,j,1] = np.sum( permuted_vals > mi ) / n_iters


In [None]:
fig,axs = plt.subplots(1,2,figsize=(9,4))

# significance threshold, Bonferroni-corrected for the M*(M-1)/2 unique variable pairs
sigthresh = .05 / (M*(M-1)/2)

# loop over the analyses (0 = covariance, 1 = mutual information)
for i in range(2):

  # entries that are exactly 0 were never filled in (diagonal and lower triangle);
  # blank them out in a new array so that zvals itself is not modified
  Z = np.where(zvals[:,:,i]==0,np.nan,zvals[:,:,i])

  # visualize the statistical z-scores
  h = axs[i].imshow(Z,vmin=-3,vmax=3,cmap='RdBu_r')
  fig.colorbar(h,ax=axs[i],pad=.02,fraction=.046)
  axs[i].set(xticks=np.arange(M),xticklabels=shortlabels,yticks=np.arange(M),yticklabels=shortlabels)

  # show significance (imshow puts columns on the x-axis, hence text at (k,j))
  for j in range(M):
    for k in range(j+1,M):
      if pvals[j,k,i] < sigthresh:
        axs[i].text(k,j,'*',fontsize=18,ha='center',va='center')

# N is read from the notebook namespace, i.e. whatever sample size was last assigned
axs[0].set(title=f'Covariance z-scores\n(N={N})')
axs[1].set(title=f'Mutual info. z-scores\n(N={N})')

plt.tight_layout()
plt.show()

In [None]:
# comparison of cov/mi values in the subset vs full dataset

# initialize with NaN so that pairs that are never computed (diagonal and lower
# triangle) stay "missing" instead of turning into zeros
# slice [:,:,0] = small dataset, slice [:,:,1] = full dataset
CovVals = np.full((M,M,2),np.nan)
MI_Vals = np.full((M,M,2),np.nan)

for i in range(M):
  for j in range(i+1,M):

    ### small dataset (every 55th row)
    x = df[columns2use[i]].values[::55]
    y = df[columns2use[j]].values[::55]
    CovVals[i,j,0] = np.cov(x,y)[0,1]
    MI_Vals[i,j,0] = mutual_info_regression(x.reshape(-1,1),y)[0]
    smallN = len(x)

    ### full dataset
    x = df[columns2use[i]].values
    y = df[columns2use[j]].values
    CovVals[i,j,1] = np.cov(x,y)[0,1]
    MI_Vals[i,j,1] = mutual_info_regression(x.reshape(-1,1),y)[0]
    bigN = len(x)

# transform to signed log: log of the magnitude, with the original sign restored
CovVals = np.log(abs(CovVals)) * np.sign(CovVals)

In [None]:
fig,axs = plt.subplots(1,2,figsize=(8,3))

# visual comparison
axs[0].plot(CovVals[:,:,0].flatten(),CovVals[:,:,1].flatten(),'wh',markersize=10,markerfacecolor=[.7,.7,.9,.5])
axs[0].plot([np.nanmin(CovVals),np.nanmax(CovVals)],[np.nanmin(CovVals),np.nanmax(CovVals)],'w--',zorder=-14,linewidth=.3)
axs[0].set(xlabel=f'N = {smallN} rows',ylabel=f'N = {bigN} rows',title='Signed log(cov) correspondences')

axs[1].plot(MI_Vals[:,:,0].flatten(),MI_Vals[:,:,1].flatten(),'wh',markersize=10,markerfacecolor=[.9,.7,.7,.5])
axs[1].plot([np.nanmin(MI_Vals),np.nanmax(MI_Vals)],[np.nanmin(MI_Vals),np.nanmax(MI_Vals)],'w--',zorder=-14,linewidth=.3)
axs[1].set(xlabel=f'N = {smallN} rows',ylabel=f'N = {bigN} rows',title='Mutual info. correspondences')


plt.tight_layout()
plt.show()