# Spurious Correlations of Highly Dimensional Big Data

This notebook aims to show how PCA and random projection can address the problem of spurious correlations in high-dimensional big data.

In [1]:
import numpy as np
import pandas as pd
from sklearn import random_projection
from sklearn.decomposition import PCA
from sklearn.utils import shuffle

In [2]:
# User-defined parameters.
# NOTE: the DataFrame built from these holds x * y float64 values (8 bytes
# each), i.e. about 16 GB with the defaults below; lower x or y to run the
# notebook on a smaller machine.

# Seed NumPy's global random generator so that every "Restart & Run All"
# draws the same random data and therefore reports the same correlations.
SEED = 0
np.random.seed(SEED)

# Number of rows for df1
x = 10000
print(x)

# Number of columns for df1
y = 200000

# Number of rows for df2
z = 1000

10000


In [3]:
# Build the base DataFrame: x rows by y columns, each cell drawn by
# NumPy's random_sample
df = pd.DataFrame(data=np.random.random_sample(size=(x, y)))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,199990,199991,199992,199993,199994,199995,199996,199997,199998,199999
0,0.458906,0.166611,0.323397,0.957772,0.220363,0.103676,0.567348,0.861282,0.715613,0.637657,...,0.903059,0.617513,0.692247,0.385723,0.154265,0.037161,0.984675,0.906641,0.346729,0.078499
1,0.300065,0.593224,0.237618,0.593717,0.867807,0.452217,0.117413,0.903606,0.522869,0.672424,...,0.659286,0.876604,0.568471,0.882705,0.380385,0.758663,0.795307,0.928774,0.266886,0.235867
2,0.581309,0.688314,0.712273,0.650404,0.163878,0.699277,0.220455,0.086919,0.982415,0.750624,...,0.283565,0.679379,0.274606,0.738045,0.011229,0.742113,0.365778,0.087292,0.558108,0.543161
3,0.093770,0.295145,0.444983,0.128746,0.588971,0.383497,0.169854,0.870678,0.588955,0.355223,...,0.745982,0.753749,0.631237,0.643488,0.246109,0.503748,0.054027,0.043541,0.227906,0.007422
4,0.592554,0.935474,0.758191,0.775989,0.581051,0.089638,0.061915,0.055130,0.318064,0.687914,...,0.080853,0.797460,0.704905,0.744570,0.805460,0.176091,0.448273,0.954497,0.303338,0.483902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.979845,0.388626,0.097597,0.179480,0.288942,0.379594,0.028072,0.347917,0.585411,0.376731,...,0.211041,0.178730,0.860672,0.423346,0.872039,0.444041,0.907480,0.310552,0.334655,0.165113
9996,0.812211,0.727473,0.484163,0.432827,0.585476,0.472010,0.851534,0.254498,0.590776,0.398893,...,0.655917,0.663677,0.925517,0.081427,0.804885,0.178063,0.880774,0.742075,0.142048,0.197481
9997,0.513036,0.999481,0.996926,0.531102,0.667060,0.424777,0.578542,0.136855,0.618295,0.175844,...,0.138008,0.803614,0.547870,0.795971,0.983628,0.115428,0.777258,0.568196,0.118312,0.351208
9998,0.817162,0.453057,0.644303,0.701634,0.211196,0.926298,0.071059,0.311278,0.325705,0.182426,...,0.683502,0.631228,0.357619,0.919328,0.361264,0.703459,0.361398,0.130522,0.986322,0.908582


To assess how strongly the different parameters are correlated, the correlation between the column with index 0 and each of the next 1000 columns is computed.

In [4]:
# Split df into the predictors (every column except 0) and the target (column 0)
X_df = df.drop(columns=0)

# Target: column 0, printed as a Series before it is converted below
Y_df = df[0]
print(Y_df)

# Flatten the target into a 1-D NumPy array so it can be passed to np.corrcoef
Y_df = np.array(Y_df).reshape(-1)
print(X_df.shape,Y_df.shape)

list_titles = X_df.columns

# Absolute correlation coefficient between column 0 and each of the first
# 1000 predictor columns. np.corrcoef returns a 2x2 matrix; entry [0, 1] is
# the coefficient between the two inputs. It is computed once per column and
# reused for both the stored absolute value and the printed signed value.
list_corr_df1 = []
for i in list_titles[0:1000]:
    corr = np.corrcoef(Y_df, X_df[i])[0, 1]
    list_corr_df1.append(abs(corr))
    print('Correlation between column 0 and column ' + str(i) + ': ' + str(corr))

0       0.458906
1       0.300065
2       0.581309
3       0.093770
4       0.592554
          ...   
9995    0.979845
9996    0.812211
9997    0.513036
9998    0.817162
9999    0.637561
Name: 0, Length: 10000, dtype: float64
(10000, 199999) (10000,)
1
Correlation matrix for column 0 and and column1: 0.009411031195587964
2
Correlation matrix for column 0 and and column2: -0.003625856513468184
3
Correlation matrix for column 0 and and column3: 0.00010479936436612581
4
Correlation matrix for column 0 and and column4: -0.005715044709023195
5
Correlation matrix for column 0 and and column5: 0.0013374683184867674
6
Correlation matrix for column 0 and and column6: -0.02270411552733314
7
Correlation matrix for column 0 and and column7: -0.02402223707073301
8
Correlation matrix for column 0 and and column8: 0.014420987367135757
9
Correlation matrix for column 0 and and column9: -0.003622960381751026
10
Correlation matrix for column 0 and and column10: 0.0013037203309700749
11
Correlation matri

Correlation matrix for column 0 and and column122: 0.0010961749700914517
123
Correlation matrix for column 0 and and column123: -0.011824106678474769
124
Correlation matrix for column 0 and and column124: 0.00986457377995204
125
Correlation matrix for column 0 and and column125: -0.005969207301942767
126
Correlation matrix for column 0 and and column126: -0.012676784289394253
127
Correlation matrix for column 0 and and column127: 0.010825970713342068
128
Correlation matrix for column 0 and and column128: -0.0074005558013221446
129
Correlation matrix for column 0 and and column129: -0.011539176198864921
130
Correlation matrix for column 0 and and column130: -0.0008706965650845534
131
Correlation matrix for column 0 and and column131: -0.017998222039207524
132
Correlation matrix for column 0 and and column132: -0.0021050060320851353
133
Correlation matrix for column 0 and and column133: 0.013007992437384791
134
Correlation matrix for column 0 and and column134: -0.007758075146249645
135


Correlation matrix for column 0 and and column261: -0.005024656468195305
262
Correlation matrix for column 0 and and column262: -0.009619301124014719
263
Correlation matrix for column 0 and and column263: -0.01020820089757308
264
Correlation matrix for column 0 and and column264: -0.0006711734106929152
265
Correlation matrix for column 0 and and column265: -0.015254711462588956
266
Correlation matrix for column 0 and and column266: 0.003688387346625456
267
Correlation matrix for column 0 and and column267: -0.004932735320537935
268
Correlation matrix for column 0 and and column268: 0.0009002964388494469
269
Correlation matrix for column 0 and and column269: 0.01483946707930692
270
Correlation matrix for column 0 and and column270: 0.006111669685127942
271
Correlation matrix for column 0 and and column271: 0.0031420628719352407
272
Correlation matrix for column 0 and and column272: 0.009511200036299364
273
Correlation matrix for column 0 and and column273: -0.004016458480164128
274
Corr

Correlation matrix for column 0 and and column413: 0.006891976252433373
414
Correlation matrix for column 0 and and column414: -0.019186099085167115
415
Correlation matrix for column 0 and and column415: -0.010875366421297395
416
Correlation matrix for column 0 and and column416: -0.021132074938519423
417
Correlation matrix for column 0 and and column417: -0.006337258675400423
418
Correlation matrix for column 0 and and column418: -0.006255631359865459
419
Correlation matrix for column 0 and and column419: 0.0035685653266874255
420
Correlation matrix for column 0 and and column420: -0.008626971358452598
421
Correlation matrix for column 0 and and column421: 0.007144791011305693
422
Correlation matrix for column 0 and and column422: 0.01537728788005279
423
Correlation matrix for column 0 and and column423: 0.001398670447481099
424
Correlation matrix for column 0 and and column424: 0.00372567294324444
425
Correlation matrix for column 0 and and column425: -0.003028026968857454
426
Correl

Correlation matrix for column 0 and and column548: -0.015266100435395634
549
Correlation matrix for column 0 and and column549: -0.0006970101573283117
550
Correlation matrix for column 0 and and column550: 0.018927937137231076
551
Correlation matrix for column 0 and and column551: -0.007264887998528277
552
Correlation matrix for column 0 and and column552: -0.008304799438537062
553
Correlation matrix for column 0 and and column553: -0.005330181098988096
554
Correlation matrix for column 0 and and column554: 0.0030939773189045464
555
Correlation matrix for column 0 and and column555: 0.019280658586288187
556
Correlation matrix for column 0 and and column556: 0.012323165872513685
557
Correlation matrix for column 0 and and column557: 0.0037332321252325873
558
Correlation matrix for column 0 and and column558: -0.007120720721627426
559
Correlation matrix for column 0 and and column559: 0.00236561918507127
560
Correlation matrix for column 0 and and column560: 0.021748568273785223
561
Corr

Correlation matrix for column 0 and and column690: 0.025638094393265743
691
Correlation matrix for column 0 and and column691: 0.005473766477932557
692
Correlation matrix for column 0 and and column692: 0.010392752945107869
693
Correlation matrix for column 0 and and column693: 0.005577722776343945
694
Correlation matrix for column 0 and and column694: -0.005928831154971292
695
Correlation matrix for column 0 and and column695: -0.009357397140626073
696
Correlation matrix for column 0 and and column696: 0.0051690215067315606
697
Correlation matrix for column 0 and and column697: 0.006546441482143588
698
Correlation matrix for column 0 and and column698: 0.004247845406407646
699
Correlation matrix for column 0 and and column699: -0.004364094116715736
700
Correlation matrix for column 0 and and column700: 0.0067639523467862665
701
Correlation matrix for column 0 and and column701: 0.004685138414971068
702
Correlation matrix for column 0 and and column702: -0.002683352798897607
703
Correl

Correlation matrix for column 0 and and column801: 0.007210845576521252
802
Correlation matrix for column 0 and and column802: -0.007297865707777326
803
Correlation matrix for column 0 and and column803: -0.012142226118381155
804
Correlation matrix for column 0 and and column804: 0.00764274504805353
805
Correlation matrix for column 0 and and column805: 0.002867905993019454
806
Correlation matrix for column 0 and and column806: -0.008841374085405543
807
Correlation matrix for column 0 and and column807: 0.0025850621006400613
808
Correlation matrix for column 0 and and column808: 0.01193400462515293
809
Correlation matrix for column 0 and and column809: -0.01715119237368734
810
Correlation matrix for column 0 and and column810: -0.004089435740963757
811
Correlation matrix for column 0 and and column811: -0.009276341809893575
812
Correlation matrix for column 0 and and column812: -0.003251760942709842
813
Correlation matrix for column 0 and and column813: -0.013325226144575876
814
Correl

Correlation matrix for column 0 and and column917: 0.014340216823136735
918
Correlation matrix for column 0 and and column918: -0.0033483291541208183
919
Correlation matrix for column 0 and and column919: -0.0005241435815541518
920
Correlation matrix for column 0 and and column920: 0.0013197249679295737
921
Correlation matrix for column 0 and and column921: 0.019849650674061938
922
Correlation matrix for column 0 and and column922: -0.0008182248644738047
923
Correlation matrix for column 0 and and column923: 0.006231541756002637
924
Correlation matrix for column 0 and and column924: 0.0026825674617073003
925
Correlation matrix for column 0 and and column925: 0.005511346964484861
926
Correlation matrix for column 0 and and column926: 0.011590435730240507
927
Correlation matrix for column 0 and and column927: 0.00018938390903006967
928
Correlation matrix for column 0 and and column928: -0.002964236054094356
929
Correlation matrix for column 0 and and column929: 0.008023196798147215
930
C

In [5]:
# See how many correlations are significant

To see whether the correlations between the different parameters do increase with the data size, the original dataframe is compared with a subset dataframe that contains only its first z rows. If the correlations in the original dataframe are higher than those in the smaller dataframe, this would suggest that the bigger the data size, the more frequent spurious correlations become.

In [6]:
# Smaller comparison DataFrame: only the first z rows of df
df2 = df.head(z)
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,199990,199991,199992,199993,199994,199995,199996,199997,199998,199999
0,0.458906,0.166611,0.323397,0.957772,0.220363,0.103676,0.567348,0.861282,0.715613,0.637657,...,0.903059,0.617513,0.692247,0.385723,0.154265,0.037161,0.984675,0.906641,0.346729,0.078499
1,0.300065,0.593224,0.237618,0.593717,0.867807,0.452217,0.117413,0.903606,0.522869,0.672424,...,0.659286,0.876604,0.568471,0.882705,0.380385,0.758663,0.795307,0.928774,0.266886,0.235867
2,0.581309,0.688314,0.712273,0.650404,0.163878,0.699277,0.220455,0.086919,0.982415,0.750624,...,0.283565,0.679379,0.274606,0.738045,0.011229,0.742113,0.365778,0.087292,0.558108,0.543161
3,0.09377,0.295145,0.444983,0.128746,0.588971,0.383497,0.169854,0.870678,0.588955,0.355223,...,0.745982,0.753749,0.631237,0.643488,0.246109,0.503748,0.054027,0.043541,0.227906,0.007422
4,0.592554,0.935474,0.758191,0.775989,0.581051,0.089638,0.061915,0.05513,0.318064,0.687914,...,0.080853,0.79746,0.704905,0.74457,0.80546,0.176091,0.448273,0.954497,0.303338,0.483902


In [7]:
# Split df2 into the predictors (every column except 0) and the target (column 0)
X_df2 = df2.drop(columns=0)

# Target: column 0, printed as a Series before it is converted below
Y_df2 = df2[0]
print(Y_df2)

# Flatten the target into a 1-D NumPy array so it can be passed to np.corrcoef
Y_df2 = np.array(Y_df2).reshape(-1)
print(X_df2.shape,Y_df2.shape)

list_titles = X_df2.columns

# Absolute correlation coefficient between column 0 and each of the first
# 1000 predictor columns. np.corrcoef returns a 2x2 matrix; entry [0, 1] is
# the coefficient between the two inputs. It is computed once per column and
# reused for both the stored absolute value and the printed signed value.
list_corr_df2 = []
for i in list_titles[0:1000]:
    corr = np.corrcoef(Y_df2, X_df2[i])[0, 1]
    list_corr_df2.append(abs(corr))
    print('Correlation between column 0 and column ' + str(i) + ': ' + str(corr))

0      0.458906
1      0.300065
2      0.581309
3      0.093770
4      0.592554
         ...   
995    0.235707
996    0.506015
997    0.320981
998    0.621073
999    0.325118
Name: 0, Length: 1000, dtype: float64
(1000, 199999) (1000,)
1
Correlation matrix for column 0 and and column1: -0.033734426616779896
2
Correlation matrix for column 0 and and column2: 0.00792315117216639
3
Correlation matrix for column 0 and and column3: 0.010706799982063265
4
Correlation matrix for column 0 and and column4: -0.02236811322412715
5
Correlation matrix for column 0 and and column5: 0.04227347310708775
6
Correlation matrix for column 0 and and column6: -0.003304219867890738
7
Correlation matrix for column 0 and and column7: -0.07156899962080156
8
Correlation matrix for column 0 and and column8: 0.024017820631438935
9
Correlation matrix for column 0 and and column9: -0.014810763109455484
10
Correlation matrix for column 0 and and column10: 0.04229144304614162
11
Correlation matrix for column 0 and an

Correlation matrix for column 0 and and column261: -0.041283907436880325
262
Correlation matrix for column 0 and and column262: -0.012006117682456323
263
Correlation matrix for column 0 and and column263: -0.052399507651058636
264
Correlation matrix for column 0 and and column264: -0.030425458936548976
265
Correlation matrix for column 0 and and column265: -0.010915894773679638
266
Correlation matrix for column 0 and and column266: 0.051839956042736675
267
Correlation matrix for column 0 and and column267: -0.006314221772131813
268
Correlation matrix for column 0 and and column268: 0.04947981660640015
269
Correlation matrix for column 0 and and column269: 0.0063681229005994105
270
Correlation matrix for column 0 and and column270: 0.022289443638481777
271
Correlation matrix for column 0 and and column271: -0.002857449129270539
272
Correlation matrix for column 0 and and column272: -0.04039144985961021
273
Correlation matrix for column 0 and and column273: -0.015743711222274883
274
Corr

Correlation matrix for column 0 and and column570: -0.015341573163003133
571
Correlation matrix for column 0 and and column571: -0.019628681793935735
572
Correlation matrix for column 0 and and column572: -0.035025977675938666
573
Correlation matrix for column 0 and and column573: -0.038135482831098313
574
Correlation matrix for column 0 and and column574: 0.022136087224623714
575
Correlation matrix for column 0 and and column575: 0.018074367323221072
576
Correlation matrix for column 0 and and column576: -0.04437544806012237
577
Correlation matrix for column 0 and and column577: -0.03567279965668737
578
Correlation matrix for column 0 and and column578: 0.08195284080400822
579
Correlation matrix for column 0 and and column579: 0.022691645040952724
580
Correlation matrix for column 0 and and column580: -0.02960473743589131
581
Correlation matrix for column 0 and and column581: 0.007063872242818295
582
Correlation matrix for column 0 and and column582: -0.004339608016685982
583
Correlat

929
Correlation matrix for column 0 and and column929: 0.010610127314146902
930
Correlation matrix for column 0 and and column930: 0.002870783134862256
931
Correlation matrix for column 0 and and column931: 0.008299128923876216
932
Correlation matrix for column 0 and and column932: -0.02655704769569677
933
Correlation matrix for column 0 and and column933: 0.012405144571527206
934
Correlation matrix for column 0 and and column934: -0.01796717369891319
935
Correlation matrix for column 0 and and column935: -0.0031051142422377563
936
Correlation matrix for column 0 and and column936: 0.01987953321846738
937
Correlation matrix for column 0 and and column937: -0.008922088767318334
938
Correlation matrix for column 0 and and column938: 0.04918896107309019
939
Correlation matrix for column 0 and and column939: -0.044463958049865915
940
Correlation matrix for column 0 and and column940: -0.006602271286883118
941
Correlation matrix for column 0 and and column941: 0.03525258221328312
942
Correl

We now compare the two correlation lists column by column and count how often each one holds the larger absolute correlation.

In [8]:
# a: number of columns whose absolute correlation is larger in list_corr_df1
# b: number of columns whose absolute correlation is larger in list_corr_df2
# zip pairs the lists element by element, so the comparison covers exactly the
# correlations that were computed instead of assuming there are 1000 of them.
a = 0
b = 0
for corr_df1, corr_df2 in zip(list_corr_df1, list_corr_df2):
    if corr_df1 > corr_df2:
        a += 1
    elif corr_df1 < corr_df2:
        b += 1
    # equal values count for neither list

print(a)
print(b)

199
801


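The output shows that the full DataFrame (10,000 rows) has the larger absolute correlation for only 199 of the 1,000 columns, whereas the 1,000-row subset has the larger one for 801. On this random data it is therefore the smaller DataFrame, not the bigger one, that shows the stronger spurious correlations.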

To counter spurious correlations, random projection can be used

In [None]:
# Reduce df with a Gaussian random projection and wrap the result in a new
# DataFrame. No n_components is passed, so scikit-learn picks the output
# dimension itself from eps. random_state is fixed so that the random
# projection matrix, and hence df_new, is the same on every run.
transformer = random_projection.GaussianRandomProjection(eps=0.1, random_state=0)
df_new = pd.DataFrame(transformer.fit_transform(df))
df_new

In [None]:
# Split df_new into the predictors (every column except 0) and the target (column 0)
X_df_new = df_new.drop(columns=0)

# Target: column 0, printed as a Series before it is converted below
Y_df_new = df_new[0]
print(Y_df_new)

# Flatten the target into a 1-D NumPy array so it can be passed to np.corrcoef
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape,Y_df_new.shape)

list_titles = X_df_new.columns

# Absolute correlation coefficient between column 0 and each of the first
# 1000 predictor columns. np.corrcoef returns a 2x2 matrix; entry [0, 1] is
# the coefficient between the two inputs. It is computed once per column and
# reused for both the stored absolute value and the printed signed value.
list_corr_df_new = []
for i in list_titles[0:1000]:
    corr = np.corrcoef(Y_df_new, X_df_new[i])[0, 1]
    list_corr_df_new.append(abs(corr))
    print('Correlation between column 0 and column ' + str(i) + ': ' + str(corr))

In [None]:
# Count the correlations whose absolute value exceeds the threshold:
# a for list_corr_df1, b for list_corr_df_new. Each list feeds its own
# counter so the two results can be compared.
threshold = 0.01

a = sum(1 for corr in list_corr_df1 if abs(corr) > threshold)
b = sum(1 for corr in list_corr_df_new if abs(corr) > threshold)

print(a)
print(b)

In [None]:
# Express the count a as a percentage of the number of entries in list_corr_df1
print(a/len(list_corr_df1)*100)

In [None]:
# Express the count b as a percentage of the number of entries in list_corr_df_new
print(b/len(list_corr_df_new)*100)