# Canonical Correlation Analysis tutorial

Based on Liana Mehrabyan's TDS post:
https://towardsdatascience.com/understanding-how-schools-work-with-canonical-correlation-analysis-4c9a88c6b913

In [126]:
import pandas as pd
import numpy as np
df = pd.read_csv('data-science-for-good/2016 School Explorer.csv')

In [127]:
# choose relevant features
df = df[['Rigorous Instruction %',
         'Collaborative Teachers %',
         'Supportive Environment %',
         'Effective School Leadership %',
         'Strong Family-Community Ties %',
         'Trust %','Average ELA Proficiency',
         'Average Math Proficiency']]

In [128]:
df = df.dropna()

In [129]:
X = df[['Rigorous Instruction %',
        'Collaborative Teachers %',
        'Supportive Environment %',
        'Effective School Leadership %',
        'Strong Family-Community Ties %',
        'Trust %']].copy()

In [130]:
Y = df[['Average ELA Proficiency',
        'Average Math Proficiency']].copy()

In [131]:
X.head()

Unnamed: 0,Rigorous Instruction %,Collaborative Teachers %,Supportive Environment %,Effective School Leadership %,Strong Family-Community Ties %,Trust %
0,89%,94%,86%,91%,85%,94%
1,96%,96%,97%,90%,86%,94%
2,87%,77%,82%,61%,80%,79%
3,85%,78%,82%,73%,89%,88%
4,90%,88%,87%,81%,89%,93%


In [132]:
Y.head()

Unnamed: 0,Average ELA Proficiency,Average Math Proficiency
0,2.14,2.17
1,2.63,2.98
2,2.39,2.54
3,2.48,2.47
4,2.38,2.54


In [134]:
#Convert all variables to numeric
for col in X.columns:
    X.loc[:,col] = X.loc[:,col].str.strip('%')
    X.loc[:,col] = X.loc[:,col].astype('int')

In [135]:
# Standardise the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=True, with_std=True)
X_sc = sc.fit_transform(X)
Y_sc = sc.fit_transform(Y)

### CCA itself!

In [146]:
from pyrcca import rcca

In [148]:
nComponents = 2
cca = rcca.CCA(kernelcca=False, reg=0., numCC=nComponents)

In [150]:
cca.train([X_sc, Y_sc])

Training CCA, kernel = None, regularization = 0.0000, 2 components


<pyrcca.rcca.CCA at 0x1a1fc5f160>

In [151]:
print('CC per component par: {}'.format(cca.cancorrs))

CC per component par: [0.46059902 0.18447786]


In [152]:
print('% Shared variance: {}'.format(cca.cancorrs**2))

% Shared variance: [0.21215146 0.03403208]


In [153]:
cca.ws

[array([[-0.00375779,  0.0078263 ],
        [ 0.00061439, -0.00357358],
        [-0.02054012, -0.0083491 ],
        [-0.01252477,  0.02976148],
        [ 0.00046503, -0.00905069],
        [ 0.01415084, -0.01264106]]),
 array([[ 0.00632283,  0.05721601],
        [-0.02606459, -0.05132531]])]

In [154]:
print('Canonical Loading for Math Score:',
      np.corrcoef(cca.comps[0][:,0],Y_sc[:,0])[0,1])
print('Canonical Loading for ELA Score:',
      np.corrcoef(cca.comps[0][:,0],Y_sc[:,1])[0,1])

Canonical Loading for Math Score: -0.4106778140971078
Canonical Loading for ELA Score: -0.4578120954218727


In [161]:
print('CVX')
print(cca.comps[0], '\n')
print('First CV for X')
print(cca.comps[0][:,0], '\n')
print('Second CV for X')
print(cca.comps[0][:,1], '\n')
print('CVY')
print(cca.comps[1], '\n')
print('First CV for Y')
print(cca.comps[1][:,0], '\n')
print('Second CV for Y')
print(cca.comps[1][:,1], '\n')

CVX
[[ 0.00521903  0.01840999]
 [-0.03173083  0.00669952]
 [ 0.02066659 -0.02319188]
 ...
 [ 0.00029259  0.0018127 ]
 [-0.02341333 -0.00532078]
 [ 0.00647899  0.00178834]] 

First CV for X
[ 0.00521903 -0.03173083  0.02066659 ...  0.00029259 -0.02341333
  0.00647899] 

Second CV for X
[ 0.01840999  0.00669952 -0.02319188 ...  0.0018127  -0.00532078
  0.00178834] 

CVY
[[ 0.02079591 -0.00760554]
 [-0.01557288 -0.01886755]
 [ 0.00463834 -0.00862955]
 ...
 [-0.01923392 -0.00883273]
 [-0.03327623 -0.05742338]
 [-0.02529823 -0.02446945]] 

First CV for Y
[ 0.02079591 -0.01557288  0.00463834 ... -0.01923392 -0.03327623
 -0.02529823] 

Second CV for Y
[-0.00760554 -0.01886755 -0.00862955 ... -0.00883273 -0.05742338
 -0.02446945] 

