# Correlations

**Imports** - *External*

In [1]:
%matplotlib widget
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import common

*Internal*

In [2]:
from Data import Merge

In [3]:
# Silence NumPy floating-point warnings for division by zero and invalid
# operations. The call returns the previous settings, which is the dict
# displayed as this cell's output.
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

**The path used to save the correlation file is:**

In [4]:
# Destination for the saved correlation file. It is left as None here.
# NOTE(review): no cell visible in this notebook reads `path` - confirm
# whether it is still needed.
path = None

**Function used to calculate the correlation between two columns, using only the rows where both columns have a value:**

This function is using Pearson product-moment correlation coefficients:

\begin{equation}
    R_{ij} = \frac{ C_{ij} } { \sqrt{ C_{ii} * C_{jj} } }
\end{equation}


In [5]:
def getCorr(x, y):
    """Return the Pearson correlation coefficient between two columns.

    Only positions where both ``x`` and ``y`` hold a value are used. A
    position is skipped when either entry is the empty string, ``None`` or
    NaN. The remaining entries are converted with ``float``.

    Parameters
    ----------
    x, y : sequence
        Columns to compare, paired by position. Both are expected to have
        the same length.

    Returns
    -------
    float
        The off-diagonal entry of ``np.corrcoef`` for the complete pairs,
        or NaN when fewer than two complete pairs remain.

    Raises
    ------
    ValueError
        If a non-missing entry cannot be converted with ``float``.
    """
    def isMissing(entry):
        # '' is the marker the notebook uses for "no value"; None and NaN are
        # treated the same way so they cannot poison the coefficient.
        if isinstance(entry, str):
            return entry == ''
        return entry is None or pd.isna(entry)

    pairs = [
        (float(xValue), float(yValue))
        for xValue, yValue in zip(x, y)
        if not (isMissing(xValue) or isMissing(yValue))
    ]

    # A correlation needs at least two points; answer NaN directly instead of
    # letting np.corrcoef work on degenerate input.
    if len(pairs) < 2:
        return np.nan

    xData, yData = np.array(pairs).T
    return np.corrcoef(xData, yData)[0, 1]

Load the data:

In [6]:

# Load the merged dataset and leave the VISCODE and RID columns out of the
# correlation matrix.
data = Merge().data
data = data.drop(columns=['VISCODE', 'RID'])

dataLabels = data.columns.values
dataNumpy = data.to_numpy()

# Square, label-indexed frame that receives one coefficient per column pair.
correlations = pd.DataFrame(index=dataLabels, columns=dataLabels)

# RuntimeWarnings raised while computing the coefficients are suppressed for
# the whole fill loop.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)

    for rowPos, rowLabel in enumerate(dataLabels):
        # A column is perfectly correlated with itself.
        correlations.at[rowLabel, rowLabel] = 1

        # Only the upper triangle is computed; each value is mirrored below.
        for colPos in range(rowPos + 1, len(dataLabels)):
            colLabel = dataLabels[colPos]
            corr = getCorr(dataNumpy[:, rowPos], dataNumpy[:, colPos])

            # Undefined coefficients are stored as 0; otherwise keep only the
            # strength (absolute value) of the correlation.
            if np.isnan(corr):
                strength = 0
            else:
                strength = abs(corr)

            correlations.at[rowLabel, colLabel] = strength
            correlations.at[colLabel, rowLabel] = strength
            


before treating 15087
After treating 15087


Here we will proceed to the configuration of the **seaborn** heatmap.

In [7]:
# Colormap passed as `cmap` to the heatmaps below: seaborn's "coolwarm"
# palette, requested as a colormap object (as_cmap=True).
aux = sns.color_palette("coolwarm", as_cmap=True)

In [8]:
# Close the current figure before drawing, then title the new plot.
plt.close()
plt.title('Correlations')

# Annotated heatmap of the correlation matrix, labelled with the column names.
sns.heatmap(
    np.ma.filled(correlations.astype(float), np.nan),
    annot=True,
    annot_kws={"size": 7},
    xticklabels=correlations.columns,
    yticklabels=correlations.index,
    cmap=aux,
)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<AxesSubplot:title={'center':'Correlations'}>

In [9]:
def getCorrP(x, y):
    """Return the Pearson correlation and its p-value for two columns.

    Only positions where both ``x`` and ``y`` hold a value are used. A
    position is skipped when either entry is the empty string, ``None`` or
    NaN. The remaining entries are converted with ``float``.

    Parameters
    ----------
    x, y : sequence
        Columns to compare, paired by position. Both are expected to have
        the same length.

    Returns
    -------
    (r, p)
        The result of ``scipy.stats.pearsonr`` on the complete pairs, which
        unpacks as ``r, p``. When fewer than two complete pairs remain the
        correlation cannot be estimated and ``(0.0, 1.0)`` is returned: no
        correlation and no significance, so the pair can never pass a
        p-value filter.

    Raises
    ------
    ValueError
        If a non-missing entry cannot be converted with ``float``.
    """
    def isMissing(entry):
        # '' is the marker the notebook uses for "no value"; None and NaN are
        # treated the same way so they cannot poison the statistic.
        if isinstance(entry, str):
            return entry == ''
        return entry is None or pd.isna(entry)

    xData = []
    yData = []
    for xValue, yValue in zip(x, y):
        if isMissing(xValue) or isMissing(yValue):
            continue
        xData.append(float(xValue))
        yData.append(float(yValue))

    if len(xData) > 1:
        return pearsonr(xData, yData)

    # Not enough data: a p-value of 0 would wrongly mark the pair as highly
    # significant, so report the least significant value instead.
    return 0.0, 1.0
    

In [10]:
# Reload the merged dataset; this time only VISCODE is dropped.
data = Merge().data
data = data.drop('VISCODE', axis=1)

dataLabels = data.columns.values
dataNumpy = data.to_numpy()

# Square, label-indexed frames for the signed coefficients and their p-values.
correlations = pd.DataFrame(index=dataLabels, columns=dataLabels)
pValues = pd.DataFrame(index=dataLabels, columns=dataLabels)

# One record per column pair. The long-format table is built once after the
# loop: growing a DataFrame row by row is quadratic, and DataFrame.append was
# removed in pandas 2.0.
corrRecords = []

for i in range(len(dataLabels)):
    # Diagonal: a column against itself.
    correlations.at[dataLabels[i], dataLabels[i]] = 1
    pValues.at[dataLabels[i], dataLabels[i]] = 0

    # Only the upper triangle is computed; each value is mirrored below.
    for j in range(i+1, len(dataLabels)):
        p_r, p_p = getCorrP(dataNumpy[:, i], dataNumpy[:, j])
        correlations.at[dataLabels[i], dataLabels[j]] = correlations.at[dataLabels[j], dataLabels[i]] = p_r
        pValues.at[dataLabels[i], dataLabels[j]] = pValues.at[dataLabels[j], dataLabels[i]] = p_p

        corrRecords.append({
            "Labels": f"{dataLabels[i]} -> {dataLabels[j]}",
            "Correlation": p_r,
            "p_Value": p_p,
        })

tableCorrP = pd.DataFrame(corrRecords, columns=["Labels", "Correlation", "p_Value"])
        

In [11]:
# Show the first 100 rows of the pairwise table (label pair, correlation, p-value).
print(tableCorrP.head(100))

                      Labels  Correlation        p_Value
0                 RID -> AGE    -0.191673  9.137021e-125
1            RID -> PTGENDER    -0.086936   1.043889e-26
2            RID -> PTEDUCAT     0.095091   1.194564e-31
3               RID -> CDRSB    -0.128112   6.673856e-41
4              RID -> ADAS11    -0.068106   2.043346e-12
..                       ...          ...            ...
95   PTGENDER -> EcogSPTotal     0.089364   9.477104e-14
96            PTGENDER -> DX     0.081659   3.411491e-17
97    PTGENDER -> mPACCdigit    -0.058814   1.203063e-09
98  PTGENDER -> mPACCtrailsB    -0.050600   1.690541e-07
99         PTEDUCAT -> CDRSB    -0.110210   1.190217e-30

[100 rows x 3 columns]


# Correlations

In [12]:
# Close the current figure before drawing. Every other heatmap in the notebook
# is titled, so this one gets a title too and stays identifiable on its own.
plt.close()
plt.title('Correlations')

# Annotated heatmap of the Pearson coefficients stored in `correlations`.
sns.heatmap(
    np.ma.filled(correlations.astype(float), np.nan),
    annot=True,
    annot_kws={"size": 7},
    xticklabels=correlations.columns,
    yticklabels=correlations.index,
    cmap=aux,
)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<AxesSubplot:>

# P_Values

In [13]:
# Close the current figure before drawing, then title the new plot.
plt.close()
plt.title('p Values')

# Annotated heatmap of the p-value matrix, labelled with the column names.
sns.heatmap(
    np.ma.filled(pValues.astype(float), np.nan),
    annot=True,
    annot_kws={"size": 7},
    xticklabels=pValues.columns,
    yticklabels=pValues.index,
    cmap=aux,
)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<AxesSubplot:title={'center':'p Values'}>

## Correlations filtered by p Value

In [14]:
# Significance level: in the masks built below, a pair whose p-value is greater
# than or equal to this threshold (or to this threshold divided by a correction
# factor) is hidden from the heatmap.
pFilter = 0.05 # Maximum p-value (exclusive) for a pair to stay visible

In [15]:
# Boolean mask for the heatmap: True hides a cell. A pair is hidden when its
# p-value is at or above pFilter. Comparisons against NaN evaluate to False,
# so pairs with an undefined p-value are left unmasked.
# One vectorized comparison replaces the element-by-element loop and yields a
# real boolean array instead of an object-dtype frame.
mask = pValues.to_numpy(dtype=float) >= pFilter

# The diagonal (each column against itself) is always shown.
np.fill_diagonal(mask, False)

In [16]:
# Close the current figure before drawing; the colormap is rebuilt here as well.
plt.close()
aux = sns.color_palette("coolwarm", as_cmap=True)
plt.title('Correlations filtered by p Value')

# Annotated heatmap of the coefficients; cells where `mask` is True are hidden.
sns.heatmap(
    np.ma.filled(correlations.astype(float), np.nan),
    annot=True,
    annot_kws={"size": 7},
    xticklabels=correlations.columns,
    yticklabels=correlations.index,
    cmap=aux,
    mask=mask,
)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<AxesSubplot:title={'center':'Correlations filtered by p Value'}>

## Bonferroni Correction

In [17]:
# Bonferroni correction: the significance level is divided by the number of
# hypothesis tests actually performed. One test is run per unordered pair of
# distinct columns, i.e. n * (n - 1) / 2 tests - not n ** 2, which counts every
# pair twice plus the diagonal and makes the threshold needlessly strict.
numLabels = len(dataLabels)
numTests = max(numLabels * (numLabels - 1) // 2, 1)  # guard against division by zero
bonferroniThreshold = pFilter / numTests

# Boolean mask for the heatmap: True hides a cell. Comparisons against NaN
# evaluate to False, so pairs with an undefined p-value are left unmasked.
mask = pValues.to_numpy(dtype=float) >= bonferroniThreshold

# The diagonal (each column against itself) is always shown.
np.fill_diagonal(mask, False)

In [18]:
# Close the current figure before drawing; the colormap is rebuilt here as well.
plt.close()
aux = sns.color_palette("coolwarm", as_cmap=True)
plt.title('Bonferroni Correction')

# Annotated heatmap of the coefficients; cells where `mask` is True are hidden.
sns.heatmap(
    np.ma.filled(correlations.astype(float), np.nan),
    annot=True,
    annot_kws={"size": 7},
    xticklabels=correlations.columns,
    yticklabels=correlations.index,
    cmap=aux,
    mask=mask,
)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<AxesSubplot:title={'center':'Bonferroni Correction'}>

## Correlations above 60% with the Bonferroni Correction filter

In [19]:
# Minimum absolute correlation: in the mask built below, a pair whose
# |correlation| is below this value is hidden from the heatmap.
corrFilter = 0.6

In [20]:
# Bonferroni correction: the significance level is divided by the number of
# hypothesis tests actually performed. One test is run per unordered pair of
# distinct columns, i.e. n * (n - 1) / 2 tests - not n ** 2, which counts every
# pair twice plus the diagonal and makes the threshold needlessly strict.
numLabels = len(dataLabels)
numTests = max(numLabels * (numLabels - 1) // 2, 1)  # guard against division by zero
bonferroniThreshold = pFilter / numTests

pMatrix = pValues.to_numpy(dtype=float)
corrMatrix = correlations.to_numpy(dtype=float)

# Boolean mask for the heatmap: True hides a cell. A pair is hidden when it is
# not significant after correction OR its absolute correlation is below
# corrFilter. Comparisons against NaN evaluate to False, so pairs with
# undefined statistics are left unmasked.
mask = (pMatrix >= bonferroniThreshold) | (np.abs(corrMatrix) < corrFilter)

# The diagonal (each column against itself) is always shown.
np.fill_diagonal(mask, False)

In [21]:
# Close the current figure before drawing; the colormap is rebuilt here as well.
plt.close()
aux = sns.color_palette("coolwarm", as_cmap=True)
plt.title('Correlation >60% with Bonferroni Correction Filter')

# Annotated heatmap of the coefficients; cells where `mask` is True are hidden,
# and thin gray lines separate the cells.
sns.heatmap(
    np.ma.filled(correlations.astype(float), np.nan),
    annot=True,
    annot_kws={"size": 7},
    xticklabels=correlations.columns,
    yticklabels=correlations.index,
    cmap=aux,
    mask=mask,
    linewidths=0.5,
    linecolor='gray',
)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<AxesSubplot:title={'center':'Correlation >60% with Bonferroni Correction Filter'}>