# Spurious Correlations of Highly Dimensional Big Data

This Notebook aims at showing how PCA and random projection can solve the problem of spurious correlations in Big Data.

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.utils import shuffle

## DataFrame Creation

In [2]:
# Experiment size settings (user-adjustable).
#   x -> number of rows for df1
#   y -> number of columns for df1
#   z -> number of rows for df2
x, y, z = 100000, 10000, 1000

In [3]:
# Base DataFrame: x rows by y columns filled with random floats drawn by
# np.random.random_sample. The trailing bare name displays it in the notebook.
df = pd.DataFrame(data=np.random.random_sample(size=(x, y)))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0.200861,0.447345,0.406915,0.441774,0.200606,0.920589,0.621542,0.253289,0.190018,0.555092,...,0.298089,0.920974,0.755059,0.505978,0.514104,0.934172,0.119559,0.423139,0.646104,0.282789
1,0.171916,0.272941,0.887402,0.093271,0.987592,0.394753,0.410725,0.670960,0.353376,0.324825,...,0.064830,0.034188,0.016570,0.496434,0.326357,0.575975,0.887575,0.349384,0.758442,0.054061
2,0.658898,0.320016,0.157313,0.676873,0.461843,0.639391,0.505990,0.792386,0.208214,0.595487,...,0.832149,0.103013,0.488458,0.061678,0.901183,0.960294,0.277997,0.253806,0.416586,0.564177
3,0.545727,0.609389,0.004953,0.907550,0.943342,0.472856,0.835942,0.142429,0.944917,0.170039,...,0.119195,0.200376,0.066862,0.394784,0.244827,0.855903,0.019195,0.413095,0.273509,0.213919
4,0.828779,0.614228,0.641161,0.869954,0.111897,0.334124,0.292701,0.717720,0.483627,0.087026,...,0.326245,0.493860,0.069565,0.319860,0.857679,0.509462,0.653832,0.609036,0.302119,0.591317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.820523,0.609154,0.618819,0.670078,0.842839,0.822974,0.572641,0.030359,0.346309,0.856519,...,0.387006,0.142172,0.327234,0.242120,0.519938,0.981537,0.122192,0.102802,0.601809,0.064625
99996,0.265806,0.423324,0.760826,0.449370,0.917684,0.497820,0.485549,0.880453,0.444082,0.750693,...,0.781809,0.413794,0.003311,0.355406,0.451845,0.148189,0.691261,0.511892,0.251432,0.030527
99997,0.803242,0.499650,0.176380,0.896739,0.648351,0.003983,0.909239,0.212951,0.238987,0.837061,...,0.189300,0.970325,0.992655,0.552166,0.716320,0.809665,0.583533,0.945580,0.342026,0.959738
99998,0.234264,0.416311,0.446431,0.466118,0.569204,0.213844,0.891233,0.104801,0.369165,0.476053,...,0.858734,0.361672,0.581430,0.489510,0.319429,0.803354,0.603189,0.634078,0.465153,0.381850


In order to assess the correlations between the different parameters, the correlation between the column with index 0 and each of the other columns is computed.

In [None]:
# Correlate column 0 of df against every other column and collect the
# absolute correlation coefficients in list_corr_df1.

# Predictors: all columns except column 0.
X_df = df.drop(columns=0)

# Target: column 0 (printed while it is still a pandas Series).
Y_df = df[0]
print(Y_df)

# Flatten the target into a 1-D NumPy array (not a list) for np.corrcoef.
Y_df = np.array(Y_df).reshape(-1)
print(X_df.shape, Y_df.shape)

# Iterate over the column labels themselves: x is a row count and is
# unrelated to the number of columns, so it must not be used as a bound here.
list_titles = X_df.columns
list_corr_df1 = []
for i in list_titles:
    # np.corrcoef returns a 2x2 matrix; entry [0, 1] is the coefficient
    # between the two inputs. Compute it once and reuse it.
    corr = np.corrcoef(Y_df, X_df[i])[0, 1]
    list_corr_df1.append(abs(corr))
    print('Correlation coefficient for column 0 and column ' + str(i) + ': ' + str(corr))

To check whether the correlations between the different parameters indeed increase with the data size, the original DataFrame is compared to a subset DataFrame containing only the first z rows of the original DataFrame. If the correlations in the original DataFrame are higher than in the smaller DataFrame, this would indicate that the bigger the data size, the more frequent the spurious correlations.

In [None]:
# Smaller comparison frame: the first z rows of df, with every column kept.
# The trailing bare name displays it in the notebook.
df2 = df.iloc[:z, :]
df2

In [None]:
# Correlate column 0 of df2 against every other column and collect the
# absolute correlation coefficients in list_corr_df2.

# Predictors: all columns except column 0.
X_df2 = df2.drop(columns=0)

# Target: column 0 (printed while it is still a pandas Series).
Y_df2 = df2[0]
print(Y_df2)

# Flatten the target into a 1-D NumPy array (not a list) for np.corrcoef.
Y_df2 = np.array(Y_df2).reshape(-1)
print(X_df2.shape, Y_df2.shape)

# Iterate over every predictor column: z is the number of rows of df2 and is
# unrelated to the number of columns, so it must not be used as a column bound
# (doing so restricts this pass to the first z columns only).
list_titles = X_df2.columns
list_corr_df2 = []
for i in list_titles:
    # np.corrcoef returns a 2x2 matrix; entry [0, 1] is the coefficient
    # between the two inputs. Compute it once and reuse it.
    corr = np.corrcoef(Y_df2, X_df2[i])[0, 1]
    list_corr_df2.append(abs(corr))
    print('Correlation coefficient for column 0 and column ' + str(i) + ': ' + str(corr))

We now compare the two correlation lists by computing, for each one, the percentage of coefficients whose absolute value exceeds 0.01.

In [None]:
# Count, for each list, how many coefficients exceed 0.01. Both lists were
# filled with abs(...) values, so no further abs() is needed here.
a = sum(1 for corr in list_corr_df1 if corr > 0.01)
b = sum(1 for corr in list_corr_df2 if corr > 0.01)

# Report each count as a percentage of the number of coefficients in its list.
print("Percentage of correlations in df1:",(a/len(list_corr_df1)*100),"%")

print("Percentage of correlations in df2:",(b/len(list_corr_df2)*100),"%")

The results clearly show that the lower the number of observations, the higher the frequency of spurious correlations. However, the number of spurious correlations present in the bigger DataFrame is still very significant.

To counter spurious correlations, random projection can be used.

## Random Projection eps = 0.1

In [None]:
# Fit a Gaussian random projection (eps = 0.1) on df and wrap the projected
# data in a new DataFrame. The trailing bare name displays it in the notebook.
transformer = random_projection.GaussianRandomProjection(eps=0.1)
df_new = pd.DataFrame(data=transformer.fit_transform(df))
df_new

In [None]:
# Correlate column 0 of the projected DataFrame against every other column
# and collect the absolute correlation coefficients in list_corr_df_new.

# Predictors: all columns except column 0.
X_df_new = df_new.drop(columns=0)

# Target: column 0 (printed while it is still a pandas Series).
Y_df_new = df_new[0]
print(Y_df_new)

# Flatten the target into a 1-D NumPy array (not a list) for np.corrcoef.
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape, Y_df_new.shape)

list_titles = X_df_new.columns
list_corr_df_new = []
for i in list_titles:
    # np.corrcoef returns a 2x2 matrix; entry [0, 1] is the coefficient
    # between the two inputs. Compute it once and reuse it.
    corr = np.corrcoef(Y_df_new, X_df_new[i])[0, 1]
    list_corr_df_new.append(abs(corr))
    print('Correlation coefficient for column 0 and column ' + str(i) + ': ' + str(corr))

In [None]:
# Count, for each list, how many coefficients exceed 0.01. Both lists were
# filled with abs(...) values, so no further abs() is needed here.
a = sum(1 for corr in list_corr_df1 if corr > 0.01)
b = sum(1 for corr in list_corr_df_new if corr > 0.01)

# Report each count as a percentage of the number of coefficients in its list.
print("Percentage of correlations in df1:",(a/len(list_corr_df1)*100),"%")

print("Percentage of correlations in df_new:",(b/len(list_corr_df_new)*100),"%")