# Spurious Correlations of Highly Dimensional Big Data

This Notebook aims at showing how PCA and random projection can solve the problem of spurious correlations in Big Data.

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.utils import shuffle

## DataFrame Creation

In [2]:
# User defined parameters

# Number of rows for df1
x = 10000 

# Number of columns for df1
y = 100000

# Number of rows for df2
z = 100

In [3]:
np.random.seed(0)

In [4]:
# Creating a dataframe with x number of rows and y number of columns
df = pd.DataFrame(np.random.random_sample((x,y)))

# Shuffle rows of dataframe
df = df.sample(frac=1)

In [5]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
5777,0.013107,0.470582,0.782615,0.356561,0.713228,0.001639,0.531202,0.149542,0.238533,0.156730,...,0.014578,0.703602,0.121921,0.809777,0.141529,0.051837,0.615442,0.598923,0.463951,0.278275
3783,0.960434,0.443701,0.571535,0.813105,0.448731,0.292695,0.378250,0.988330,0.737386,0.240126,...,0.865072,0.048426,0.556778,0.939422,0.430007,0.304750,0.629446,0.784474,0.027791,0.411822
1295,0.145563,0.462142,0.170783,0.475111,0.200657,0.647372,0.260204,0.386845,0.420123,0.275254,...,0.038677,0.744098,0.731141,0.329909,0.280043,0.441809,0.997663,0.531609,0.461333,0.930565
838,0.415516,0.027677,0.297519,0.258756,0.415945,0.530397,0.204964,0.330645,0.675209,0.114403,...,0.305023,0.463645,0.621614,0.614746,0.401539,0.840944,0.074929,0.523992,0.074633,0.005774
1786,0.838305,0.187315,0.668331,0.625653,0.694821,0.305795,0.505483,0.220637,0.973320,0.472144,...,0.863375,0.903354,0.090747,0.823878,0.833744,0.254442,0.854606,0.282577,0.124343,0.716000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3408,0.998848,0.085800,0.471002,0.409039,0.700793,0.540194,0.107416,0.102830,0.798725,0.773601,...,0.026062,0.044855,0.712999,0.918720,0.893730,0.286155,0.116346,0.535174,0.483603,0.914082
6336,0.662911,0.213004,0.723441,0.183560,0.722360,0.145670,0.330431,0.135678,0.996597,0.220519,...,0.144963,0.446426,0.148142,0.398983,0.215387,0.081686,0.622973,0.427959,0.442991,0.751608
1880,0.416949,0.271127,0.791953,0.508140,0.039867,0.257467,0.701436,0.249736,0.430972,0.918784,...,0.435077,0.629779,0.088950,0.311274,0.453839,0.558268,0.585202,0.938641,0.548909,0.466086
763,0.647500,0.127408,0.799678,0.626792,0.124441,0.137988,0.041683,0.283041,0.592727,0.516547,...,0.854014,0.113504,0.449887,0.768145,0.033564,0.712753,0.263060,0.514465,0.901452,0.073480


In order to assess the correlations of the different parameters, correlations between the column with index 0 and the 999 other first columns is assessed.

In [6]:
# Assigning X to all columns except 0
X_df = df.drop(columns=0)
X_df.head()

# Assigning Y to column 0
Y_df = df[0]
print(Y_df)

# The following line makes Y become a list
Y_df = np.array(Y_df).reshape(-1)
print(X_df.shape,Y_df.shape)

# Calculate correlations between selected column and column 0
list_titles = X_df.columns
list_corr_df1 = []
for i in list_titles[0:z]:
    list_corr_df1.append(abs(np.corrcoef(Y_df, X_df[i])[0][1]))

5777    0.013107
3783    0.960434
1295    0.145563
838     0.415516
1786    0.838305
          ...   
3408    0.998848
6336    0.662911
1880    0.416949
763     0.647500
2631    0.795114
Name: 0, Length: 10000, dtype: float64
(10000, 99999) (10000,)


To see if indeed the correlations between the different parameters increases with the data size, the original dataframe is compared to a sub-set dataframe which only takes the first z rows of the original dataframe. If the correlations in the original dataframe are higher than in the smaller dataframe this would prove that the bigger the data size the more frequent the number of spurious correlations.

In [7]:
# Creating smaller dataframe taking z number of rows from original dataframe
df2 = df.iloc[:z]
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
5777,0.013107,0.470582,0.782615,0.356561,0.713228,0.001639,0.531202,0.149542,0.238533,0.156730,...,0.014578,0.703602,0.121921,0.809777,0.141529,0.051837,0.615442,0.598923,0.463951,0.278275
3783,0.960434,0.443701,0.571535,0.813105,0.448731,0.292695,0.378250,0.988330,0.737386,0.240126,...,0.865072,0.048426,0.556778,0.939422,0.430007,0.304750,0.629446,0.784474,0.027791,0.411822
1295,0.145563,0.462142,0.170783,0.475111,0.200657,0.647372,0.260204,0.386845,0.420123,0.275254,...,0.038677,0.744098,0.731141,0.329909,0.280043,0.441809,0.997663,0.531609,0.461333,0.930565
838,0.415516,0.027677,0.297519,0.258756,0.415945,0.530397,0.204964,0.330645,0.675209,0.114403,...,0.305023,0.463645,0.621614,0.614746,0.401539,0.840944,0.074929,0.523992,0.074633,0.005774
1786,0.838305,0.187315,0.668331,0.625653,0.694821,0.305795,0.505483,0.220637,0.973320,0.472144,...,0.863375,0.903354,0.090747,0.823878,0.833744,0.254442,0.854606,0.282577,0.124343,0.716000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1206,0.034008,0.384476,0.955847,0.017472,0.794494,0.203768,0.840669,0.235905,0.451264,0.652585,...,0.543415,0.567987,0.648657,0.374993,0.781972,0.816703,0.173999,0.779997,0.079036,0.773980
3166,0.970882,0.217768,0.846741,0.774417,0.425300,0.366696,0.320553,0.725521,0.691172,0.997096,...,0.743677,0.513415,0.185774,0.593170,0.729660,0.288874,0.085996,0.673181,0.571513,0.732731
7007,0.536753,0.235460,0.386641,0.627114,0.757405,0.824510,0.304211,0.478996,0.601915,0.469792,...,0.371762,0.180812,0.551398,0.731789,0.660825,0.301559,0.690867,0.735730,0.899302,0.432162
231,0.453526,0.221772,0.356402,0.693289,0.388062,0.727092,0.152799,0.823869,0.241612,0.032454,...,0.914891,0.290496,0.041678,0.685290,0.540991,0.279267,0.791828,0.520694,0.033079,0.731138


In [8]:
# Assigning X to all columns except 0
X_df2 = df2.drop(columns=0)
X_df2.head()

# Assigning Y to column 0
Y_df2 = df2[0]
print(Y_df2)

# The following line makes Y become a list
Y_df2 = np.array(Y_df2).reshape(-1)
print(X_df2.shape,Y_df2.shape)

list_titles = X_df2.columns

# Calculate correlations between selected column and column 0
list_corr_df2 = []
for i in list_titles[0:z]:
    list_corr_df2.append(abs(np.corrcoef(Y_df2, X_df2[i])[0][1]))

5777    0.013107
3783    0.960434
1295    0.145563
838     0.415516
1786    0.838305
          ...   
1206    0.034008
3166    0.970882
7007    0.536753
231     0.453526
7824    0.467103
Name: 0, Length: 100, dtype: float64
(100, 99999) (100,)


We now compare which of the correlation lists has the highest numbers of every parameter

In [9]:
a = 0
b = 0
for i in range(0,len(list_corr_df1)):
    if abs(list_corr_df1[i]) > 0.01:
        a+=1
    else:
        pass
        
for i in range(0,len(list_corr_df2)):
    if abs(list_corr_df2[i]) > 0.01:
        b+=1
    else:
        pass

print("Percentage of correlations in df1:",(a/len(list_corr_df1)*100),"%")

print("Percentage of correlations in df2:",(b/len(list_corr_df2)*100),"%")

Percentage of correlations in df1: 35.0 %
Percentage of correlations in df2: 95.0 %


Clearly it is seen that the lower the number of observations, the higher the frequency of spurious correlations. However the number of spurious correlations present in the bigger DataFrame is still very significant.

To counter spurious correlations, Principal Component Analysis and Random Projection can be used.

## Principal Component Analysis

In PCA, it is up to the user to choose how many dimension the final data has. This is because the explained variance tells you how much information (variance) can be attributed to each of the principal components. This is important as while you can convert a y dimensional space to 2 dimensional space, you lose some of the variance (information) when you do this. It is up to the user to decide how much variance he is ready to lose. 

In [10]:
# Here we pass the daframe of y columns in 2 dimensions
pca = PCA(n_components=2)
df_pca = pd.DataFrame(pca.fit_transform(df))

In [11]:
# How much of the information is kept by the new dataframe?
print(pca.explained_variance_ratio_)

[0.00015985 0.00015956]


The new dataframe is not useful as it almost does not capture any variance. Thus we need more dimensionss.

In [None]:
# Here we pass the daframe of y columns in 2 dimensions
pca = PCA(n_components=100)
df_pca = pd.DataFrame(pca.fit_transform(df))
print(pca.explained_variance_ratio_)

In [None]:
# Here we pass the daframe of y columns in 2 dimensions
pca = PCA(n_components=1000)
df_pca = pd.DataFrame(pca.fit_transform(df))
print(pca.explained_variance_ratio_)

In [None]:
pca.explained_variance_ratio_[0]

With 1000 Columns, the sum of all the variances is close to 100%, therefore appplying PCA on data with dimension 1000000 and changing it to 1000 dimensions is effective.

In [14]:
df_new = df_pca
# Assigning X to all columns except 0
X_df_new = df_new.drop(columns=0)

# Assigning Y to column 0
Y_df_new = df_new[0]
print(Y_df_new)

# The following line makes Y become a list
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape,Y_df_new.shape)

list_titles = X_df_new.columns

# Calculate correlations between selected column and column 0
list_corr_df_new = []
for i in list_titles[0:len(list_titles)]:
    list_corr_df_new.append(abs(np.corrcoef(Y_df_new, X_df_new[i])[0][1]))

0     -15.324342
1       6.189955
2     -11.839902
3       0.427279
4      -0.099419
         ...    
995    18.040323
996     2.612194
997    -1.075493
998    -7.722772
999    -1.066815
Name: 0, Length: 1000, dtype: float64
(1000, 999) (1000,)


In [15]:
a = 0
b = 0
for i in range(0,len(list_corr_df1)):
    if abs(list_corr_df1[i]) > 0.01:
        a+=1
    else:
        pass
        
for i in range(0,len(list_corr_df_new)):
    if abs(list_corr_df_new[i]) > 0.01:
        b+=1
    else:
        pass

print("Percentage of correlations in df1:",(a/len(list_corr_df1)*100),"%")

print("Percentage of correlations in df_new:",(b/len(list_corr_df_new)*100),"%")

Percentage of correlations in df1: 75.0 %
Percentage of correlations in df_new: 0.10010010010010009 %


The correlation of the original DataFrame close to 0 when transformed through PCA.

## Random Projection

In Random Projection, the parameter eps defines how much the Random Projection can deviate from original DataFrame.
The lower the percentage of the parameter the higher the fidelity of the transformed data.
However the higher the fidelity the less the reduction of dimensions.

In [7]:
transformer = random_projection.GaussianRandomProjection(eps = 0.1)
df_rp = pd.DataFrame(transformer.fit_transform(df))
df_rp

MemoryError: Unable to allocate 44.1 GiB for an array with shape (5920, 1000000) and data type float64

In [None]:
df_new = df_rp
# Assigning X to all columns except 0
X_df_new = df_new.drop(columns=0)

# Assigning Y to column 0
Y_df_new = df_new[0]
print(Y_df_new)

# The following line makes Y become a list
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape,Y_df_new.shape)

list_titles = X_df_new.columns

list_corr_df_new = []
for i in list_titles[0:len(list_titles)]:
    print(i)
    list_corr_df_new.append(abs(np.corrcoef(Y_df_new, X_df_new[i])[0][1]))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(np.corrcoef(Y_df_new, X_df_new[i])[0][1]))

In [None]:
a = 0
b = 0
for i in range(0,len(list_corr_df1)):
    if abs(list_corr_df1[i]) > 0.01:
        a+=1
    else:
        
for i in range(0,len(list_corr_df_new)):
    if abs(list_corr_df_new[i]) > 0.01:
        b+=1
    else:
        pass

print("Percentage of correlations in df1:",(a/len(list_corr_df1)*100),"%")
    
print("Percentage of correlations in df_new:",(b/len(list_corr_df_new)*100),"%")