# Spurious Correlations of Highly Dimensional Big Data

This Notebook aims at showing how PCA and random projection can solve the problem of spurious correlations in Big Data.

In [5]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.utils import shuffle

## DataFrame Creation

In [6]:
# User-defined parameters

# Row count of df1 (the full dataframe)
# NOTE(review): x * y = 1e9 random floats are generated from these values;
# assuming NumPy's default 8-byte float64 that is roughly 8 GB - confirm the
# machine has enough RAM before running.
x = 1_000

# Column count of df1 (the full dataframe)
y = 1_000_000

# Row count of df2 (the reduced dataframe); the correlation loops also use z
# as the number of columns compared against column 0
z = 100

In [7]:
# Seed NumPy's global random generator so repeated runs draw the same numbers
np.random.seed(seed=0)

In [8]:
# Build an x-by-y dataframe of random samples and return its rows in random
# order (frac=1 keeps every row, so this is a pure row shuffle).
df = pd.DataFrame(np.random.random_sample(size=(x, y))).sample(frac=1)

In [9]:
# Display the shuffled dataframe
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,999990,999991,999992,999993,999994,999995,999996,999997,999998,999999
988,0.832249,0.361960,0.676514,0.027373,0.649353,0.952980,0.163837,0.780131,0.448143,0.068555,...,0.659805,0.951651,0.466683,0.457974,0.561280,0.258342,0.215973,0.166107,0.707721,0.229875
151,0.986434,0.581833,0.920863,0.510072,0.054308,0.371047,0.478974,0.066140,0.179229,0.396120,...,0.456604,0.574398,0.584675,0.639665,0.060008,0.055314,0.414867,0.982768,0.584035,0.767159
877,0.052638,0.773671,0.920110,0.439777,0.341468,0.799348,0.050049,0.137289,0.595921,0.864781,...,0.975494,0.183281,0.231427,0.774146,0.882921,0.052360,0.163753,0.689479,0.586898,0.378605
999,0.651154,0.579449,0.807348,0.042459,0.387636,0.541059,0.503211,0.011269,0.669948,0.379149,...,0.309901,0.984276,0.856078,0.807838,0.336806,0.872235,0.325910,0.504379,0.820846,0.157205
793,0.483832,0.754674,0.659368,0.588200,0.219405,0.214822,0.910195,0.451105,0.100575,0.711392,...,0.843676,0.358331,0.430636,0.107079,0.432590,0.347211,0.451412,0.402100,0.198296,0.834905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856,0.580801,0.443627,0.362128,0.218346,0.515298,0.808221,0.265634,0.066611,0.870211,0.990942,...,0.662621,0.497807,0.675745,0.895028,0.319796,0.907200,0.892699,0.345144,0.639260,0.778316
991,0.180223,0.082013,0.675189,0.761537,0.247343,0.636096,0.828976,0.305864,0.663033,0.870398,...,0.002359,0.522936,0.722241,0.772558,0.164086,0.873914,0.677640,0.692677,0.712713,0.527447
763,0.005044,0.662633,0.839997,0.014043,0.834426,0.122258,0.339104,0.628316,0.623754,0.869120,...,0.950375,0.917650,0.884937,0.936627,0.023364,0.950863,0.391776,0.031693,0.438467,0.848248
583,0.668801,0.753164,0.091944,0.573357,0.380170,0.615005,0.866798,0.390522,0.554016,0.485384,...,0.813601,0.739745,0.781143,0.633897,0.939881,0.719016,0.225304,0.731029,0.092984,0.290562


To assess the correlations between the different parameters, the correlation between the column with index 0 and each of the first z remaining columns (z = 100 here) is computed.

In [None]:
# Predictors: every column of df except column 0
X_df = df.drop(columns=0)

# Target: column 0 of df
Y_df = df[0]
print(Y_df)

# Convert the target Series into a flat 1-D NumPy array
Y_df = np.array(Y_df).reshape(-1)
print(X_df.shape, Y_df.shape)

# Absolute correlation between column 0 and each of the first z remaining
# columns. np.corrcoef returns a 2x2 matrix here; entry [0][1] is the
# correlation between its two inputs.
list_titles = X_df.columns
list_corr_df1 = [
    abs(np.corrcoef(Y_df, X_df[col])[0][1]) for col in list_titles[0:z]
]

To see whether the correlations between the different parameters indeed increase with the data size, the original dataframe is compared to a subset dataframe which only takes the first z rows of the original dataframe. If the correlations in the original dataframe are higher than in the smaller dataframe, this would indicate that the bigger the data size, the more frequent spurious correlations are.

In [None]:
# Reduced dataframe: the first z rows of the (already shuffled) original
df2 = df.head(z)
df2

In [None]:
# Predictors: every column of df2 except column 0
X_df2 = df2.drop(columns=0)

# Target: column 0 of df2
Y_df2 = df2[0]
print(Y_df2)

# Convert the target Series into a flat 1-D NumPy array
Y_df2 = np.array(Y_df2).reshape(-1)
print(X_df2.shape, Y_df2.shape)

list_titles = X_df2.columns

# Absolute correlation between column 0 and each of the first z remaining
# columns. np.corrcoef returns a 2x2 matrix here; entry [0][1] is the
# correlation between its two inputs.
list_corr_df2 = [
    abs(np.corrcoef(Y_df2, X_df2[col])[0][1]) for col in list_titles[0:z]
]

We now compare, for each of the two correlation lists, the percentage of correlations whose absolute value exceeds 0.01.

In [None]:
# Number of correlations whose absolute value exceeds the 0.01 threshold,
# for the full dataframe (a) and the reduced dataframe (b)
a = sum(1 for corr in list_corr_df1 if abs(corr) > 0.01)
b = sum(1 for corr in list_corr_df2 if abs(corr) > 0.01)

# Report each count as a share of the correlations computed for that dataframe
print("Percentage of correlations in df1:", (a / len(list_corr_df1) * 100), "%")

print("Percentage of correlations in df2:", (b / len(list_corr_df2) * 100), "%")

Clearly, the lower the number of observations, the higher the frequency of spurious correlations. However, the number of spurious correlations present in the bigger DataFrame is still very significant.

To counter spurious correlations, Principal Component Analysis and Random Projection can be used.

## Principal Component Analysis

In PCA, it is up to the user to choose how many dimensions the final data has. The explained variance tells you how much information (variance) can be attributed to each of the principal components. This is important because, while you can convert a y-dimensional space to a 2-dimensional space, you lose some of the variance (information) when you do so. It is up to the user to decide how much variance they are willing to lose.

In [None]:
# Here we project the dataframe of y columns onto 2 principal components
pca = PCA(n_components=2)
df_pca = pd.DataFrame(pca.fit_transform(df))

In [None]:
# How much of the information (variance) is kept by the new dataframe?
# Prints the explained-variance ratio of the fitted PCA's components.
print(pca.explained_variance_ratio_)

The new dataframe is not useful, as it captures almost none of the variance. Thus we need more dimensions.

In [None]:
# Here we project the dataframe of y columns onto 100 principal components
# and print the explained-variance ratio of the fitted components
pca = PCA(n_components=100)
df_pca = pd.DataFrame(pca.fit_transform(df))
print(pca.explained_variance_ratio_)

In [None]:
# Here we project the dataframe of y columns onto 1000 principal components
# (the same number as the x = 1000 rows configured at the top of the notebook)
# and print the explained-variance ratio of the fitted components.
# df_pca from this cell is the version used by the correlation check below.
pca = PCA(n_components=1000)
df_pca = pd.DataFrame(pca.fit_transform(df))
print(pca.explained_variance_ratio_)

In [None]:
# Work on the PCA-transformed dataframe
df_new = df_pca

# Split it into the target (column 0) and the predictors (all other columns)
Y_df_new = df_new[0]
X_df_new = df_new.drop(columns=0)
print(Y_df_new)

# Flatten the target Series into a 1-D NumPy array
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape, Y_df_new.shape)

list_titles = X_df_new.columns

# Absolute correlation between column 0 and every other column
list_corr_df_new = [
    abs(np.corrcoef(Y_df_new, X_df_new[col])[0][1]) for col in list_titles
]

In [None]:
# Number of correlations whose absolute value exceeds the 0.01 threshold,
# for the original dataframe (a) and the transformed dataframe (b)
a = len([corr for corr in list_corr_df1 if abs(corr) > 0.01])
b = len([corr for corr in list_corr_df_new if abs(corr) > 0.01])

# Report each count as a share of the correlations computed for that dataframe
print("Percentage of correlations in df1:", (a / len(list_corr_df1) * 100), "%")

print("Percentage of correlations in df_new:", (b / len(list_corr_df_new) * 100), "%")

## Random Projection

In Random Projection, the parameter eps defines how much the Random Projection can deviate from the original DataFrame.
The lower the value of the parameter, the higher the fidelity of the transformed data.
However, the higher the fidelity, the smaller the reduction in dimensions.

In [None]:
# Gaussian random projection of df; only eps is set explicitly, every other
# GaussianRandomProjection argument keeps its default.
transformer = random_projection.GaussianRandomProjection(eps = 0.1)
df_rp = pd.DataFrame(transformer.fit_transform(df))
# Display the projected dataframe
df_rp

In [None]:
# Work on the randomly projected dataframe
df_new = df_rp

# Predictors: every column except column 0
X_df_new = df_new.drop(columns=0)

# Target: column 0
Y_df_new = df_new[0]
print(Y_df_new)

# Convert the target Series into a flat 1-D NumPy array
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape, Y_df_new.shape)

list_titles = X_df_new.columns

# Correlation between column 0 and every other column: the absolute value is
# stored in the list, the signed value is printed.
list_corr_df_new = []
for i in list_titles:
    print(i)
    # Compute the coefficient once and reuse it for both the list and the
    # printed line instead of calling np.corrcoef twice per column.
    corr = np.corrcoef(Y_df_new, X_df_new[i])[0][1]
    list_corr_df_new.append(abs(corr))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(corr))

In [None]:
# Number of correlations whose absolute value exceeds the 0.01 threshold,
# for the original dataframe (a) and the randomly projected dataframe (b)
a = sum(abs(corr) > 0.01 for corr in list_corr_df1)
b = sum(abs(corr) > 0.01 for corr in list_corr_df_new)

# Report each count as a share of the correlations computed for that dataframe
print("Percentage of correlations in df1:", (a / len(list_corr_df1) * 100), "%")

print("Percentage of correlations in df_new:", (b / len(list_corr_df_new) * 100), "%")