# Spurious Correlations of Highly Dimensional Big Data

This Notebook aims at showing how PCA and random projection can solve the problem of spurious correlations in Big Data.

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.utils import shuffle

In [2]:
# User defined parameters

# Number of rows for df1
x = 10000
print(x)

# Number of columns for df1
y = 200000

# Number of rows for df2
z = 1000

10000


In [3]:
# Creating a dataframe with x number of rows and y number of columns
df = pd.DataFrame(np.random.random_sample((x,y)))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,199990,199991,199992,199993,199994,199995,199996,199997,199998,199999
0,0.683541,0.018479,0.357882,0.681310,0.447448,0.598510,0.124120,0.026100,0.892880,0.790828,...,0.785743,0.987288,0.587592,0.546459,0.181415,0.818808,0.184176,0.668079,0.347183,0.707497
1,0.827127,0.473744,0.547709,0.097834,0.480502,0.960953,0.667213,0.029142,0.544869,0.029782,...,0.147390,0.316055,0.891719,0.825268,0.782924,0.101897,0.650957,0.081078,0.441026,0.386312
2,0.929321,0.373383,0.937262,0.279188,0.898340,0.396706,0.395249,0.085978,0.535766,0.352000,...,0.372708,0.331071,0.823048,0.698590,0.825034,0.672430,0.670454,0.542668,0.040608,0.398071
3,0.669256,0.761206,0.647999,0.893389,0.087763,0.398732,0.699660,0.116835,0.825984,0.666978,...,0.774387,0.791663,0.092473,0.203856,0.140118,0.459813,0.687499,0.229563,0.706178,0.919847
4,0.153948,0.006564,0.293203,0.002775,0.026068,0.006733,0.999795,0.351303,0.612944,0.640485,...,0.444889,0.689587,0.771937,0.902576,0.301864,0.389987,0.739029,0.607980,0.586038,0.148765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.741071,0.019029,0.726753,0.045537,0.636693,0.836929,0.093999,0.314873,0.654295,0.301487,...,0.478707,0.315784,0.983865,0.113627,0.419265,0.209850,0.296888,0.764054,0.570778,0.936255
9996,0.685906,0.602061,0.585465,0.409683,0.720429,0.850826,0.618792,0.318580,0.570577,0.920668,...,0.213659,0.924116,0.411273,0.638701,0.433407,0.526492,0.612638,0.148246,0.185502,0.203919
9997,0.056428,0.032820,0.501168,0.791639,0.725606,0.565231,0.828117,0.597243,0.007799,0.331962,...,0.216443,0.145054,0.951711,0.482814,0.676325,0.616729,0.444121,0.248416,0.777552,0.403724
9998,0.968585,0.287188,0.518795,0.353105,0.819224,0.899140,0.735084,0.302363,0.593943,0.153128,...,0.630912,0.062861,0.266355,0.701662,0.445077,0.107632,0.429528,0.886163,0.737584,0.839082


In order to assess the correlations of the different parameters, correlations between the column with index 0 and the 99 other first columns is assessed.

In [4]:
# Assigning X to all columns except 0
X_df = df.drop(columns=0)
X_df.head()

# Assigning Y to column 0
Y_df = df[0]
print(Y_df)

# The following line makes Y become a list
Y_df = np.array(Y_df).reshape(-1)
print(X_df.shape,Y_df.shape)

list_titles = X_df.columns
list_corr_df1 = []
for i in list_titles[0:1000]:
    print(i)
    list_corr_df1.append(abs(np.corrcoef(Y_df, X_df[i])[0][1]))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(np.corrcoef(Y_df, X_df[i])[0][1]))

0       0.683541
1       0.827127
2       0.929321
3       0.669256
4       0.153948
          ...   
9995    0.741071
9996    0.685906
9997    0.056428
9998    0.968585
9999    0.959810
Name: 0, Length: 10000, dtype: float64
(10000, 199999) (10000,)
1
Correlation matrix for column 0 and and column1: 0.005204561553717579
2
Correlation matrix for column 0 and and column2: 0.005883334136098834
3
Correlation matrix for column 0 and and column3: -0.0011400525660891913
4
Correlation matrix for column 0 and and column4: -0.013393150349447307
5
Correlation matrix for column 0 and and column5: -0.004535673138091676
6
Correlation matrix for column 0 and and column6: -0.0007679309042347035
7
Correlation matrix for column 0 and and column7: 0.012319691810831751
8
Correlation matrix for column 0 and and column8: 0.02196116383262485
9
Correlation matrix for column 0 and and column9: -0.009096636271847714
10
Correlation matrix for column 0 and and column10: -0.002522116763103136
11
Correlation matri

Correlation matrix for column 0 and and column119: 7.86142363329614e-05
120
Correlation matrix for column 0 and and column120: -0.007723210586679989
121
Correlation matrix for column 0 and and column121: 0.0014309697005428085
122
Correlation matrix for column 0 and and column122: -0.008637684617373973
123
Correlation matrix for column 0 and and column123: -0.022904523300391937
124
Correlation matrix for column 0 and and column124: -0.0006407550209063088
125
Correlation matrix for column 0 and and column125: 0.004735101116302653
126
Correlation matrix for column 0 and and column126: 0.004992089934344456
127
Correlation matrix for column 0 and and column127: 0.001586927367302704
128
Correlation matrix for column 0 and and column128: -0.004595721064631615
129
Correlation matrix for column 0 and and column129: 0.002648138761150019
130
Correlation matrix for column 0 and and column130: 0.012017704451079124
131
Correlation matrix for column 0 and and column131: -0.011085281476286686
132
Corr

Correlation matrix for column 0 and and column232: 0.004000660474909227
233
Correlation matrix for column 0 and and column233: 0.00509127294230283
234
Correlation matrix for column 0 and and column234: -0.005440691531417265
235
Correlation matrix for column 0 and and column235: -0.004361331588515272
236
Correlation matrix for column 0 and and column236: 0.0058424997065688045
237
Correlation matrix for column 0 and and column237: -0.005279954078303295
238
Correlation matrix for column 0 and and column238: 0.003821208706442398
239
Correlation matrix for column 0 and and column239: 0.010824059860641912
240
Correlation matrix for column 0 and and column240: 0.0017039092593223522
241
Correlation matrix for column 0 and and column241: 0.004154205545166176
242
Correlation matrix for column 0 and and column242: 0.01789231595663138
243
Correlation matrix for column 0 and and column243: 0.009909817333817628
244
Correlation matrix for column 0 and and column244: -0.003349178055210816
245
Correlat

Correlation matrix for column 0 and and column344: 0.015590621267163273
345
Correlation matrix for column 0 and and column345: 0.0074312910715804235
346
Correlation matrix for column 0 and and column346: -0.011325343396228416
347
Correlation matrix for column 0 and and column347: 0.017286526812975354
348
Correlation matrix for column 0 and and column348: 0.0017181678305820982
349
Correlation matrix for column 0 and and column349: -0.0010403039996355564
350
Correlation matrix for column 0 and and column350: -0.0014825159920800837
351
Correlation matrix for column 0 and and column351: 0.017309344142099496
352
Correlation matrix for column 0 and and column352: -0.009828066597625576
353
Correlation matrix for column 0 and and column353: 0.012740828742690197
354
Correlation matrix for column 0 and and column354: 0.0036499965505251706
355
Correlation matrix for column 0 and and column355: 0.003620542763047323
356
Correlation matrix for column 0 and and column356: -0.004387198887141541
357
Co

Correlation matrix for column 0 and and column480: -0.0024459373087589303
481
Correlation matrix for column 0 and and column481: -0.002799566799419267
482
Correlation matrix for column 0 and and column482: 0.0009179004373733157
483
Correlation matrix for column 0 and and column483: 0.015793663904988906
484
Correlation matrix for column 0 and and column484: 0.015334789846986052
485
Correlation matrix for column 0 and and column485: -0.002872800812167765
486
Correlation matrix for column 0 and and column486: 0.003781063245376808
487
Correlation matrix for column 0 and and column487: 0.0038833783992493548
488
Correlation matrix for column 0 and and column488: 0.004442240833449362
489
Correlation matrix for column 0 and and column489: -0.014040003245029789
490
Correlation matrix for column 0 and and column490: 0.021993816072847464
491
Correlation matrix for column 0 and and column491: 0.01072288242730271
492
Correlation matrix for column 0 and and column492: -0.0007717683647388617
493
Corr

Correlation matrix for column 0 and and column594: -0.0038935435425540284
595
Correlation matrix for column 0 and and column595: 0.009622613052200328
596
Correlation matrix for column 0 and and column596: -0.002614183607971729
597
Correlation matrix for column 0 and and column597: -0.015923079946107196
598
Correlation matrix for column 0 and and column598: 0.014396881255269634
599
Correlation matrix for column 0 and and column599: 0.0026583900252131133
600
Correlation matrix for column 0 and and column600: 0.030463546871361696
601
Correlation matrix for column 0 and and column601: -0.0008868498884179619
602
Correlation matrix for column 0 and and column602: -0.007521188450062126
603
Correlation matrix for column 0 and and column603: 0.004755321558690016
604
Correlation matrix for column 0 and and column604: 0.006136696318170591
605
Correlation matrix for column 0 and and column605: 0.004012959078991821
606
Correlation matrix for column 0 and and column606: -0.002410007090158896
607
Cor

Correlation matrix for column 0 and and column754: -0.00774213944306665
755
Correlation matrix for column 0 and and column755: 0.001326941738143978
756
Correlation matrix for column 0 and and column756: -0.0004555574821158126
757
Correlation matrix for column 0 and and column757: 0.0003377157724982919
758
Correlation matrix for column 0 and and column758: -0.008406288256314506
759
Correlation matrix for column 0 and and column759: -0.0036204601515371017
760
Correlation matrix for column 0 and and column760: -0.0008064876855471879
761
Correlation matrix for column 0 and and column761: -0.004423963515891383
762
Correlation matrix for column 0 and and column762: 0.0014290408806488964
763
Correlation matrix for column 0 and and column763: -0.017832144895442603
764
Correlation matrix for column 0 and and column764: -0.007136798663920185
765
Correlation matrix for column 0 and and column765: -0.005321565343911679
766
Correlation matrix for column 0 and and column766: 0.0004576827632023595
76

Correlation matrix for column 0 and and column867: -0.005391876492966334
868
Correlation matrix for column 0 and and column868: 0.004906235839509006
869
Correlation matrix for column 0 and and column869: -0.011758007266720742
870
Correlation matrix for column 0 and and column870: -0.007174083267027996
871
Correlation matrix for column 0 and and column871: -0.004204741207659266
872
Correlation matrix for column 0 and and column872: -0.003403981254342799
873
Correlation matrix for column 0 and and column873: -0.0005101218306851052
874
Correlation matrix for column 0 and and column874: 0.0023496776595389227
875
Correlation matrix for column 0 and and column875: -0.010902030518217951
876
Correlation matrix for column 0 and and column876: -0.005971544898014144
877
Correlation matrix for column 0 and and column877: -0.00777267199061405
878
Correlation matrix for column 0 and and column878: 0.005738998952886851
879
Correlation matrix for column 0 and and column879: 0.01084638422790155
880
Cor

Correlation matrix for column 0 and and column978: -0.0004558858465453584
979
Correlation matrix for column 0 and and column979: 0.022354622692316563
980
Correlation matrix for column 0 and and column980: -0.007070130766293175
981
Correlation matrix for column 0 and and column981: -0.01671355631832611
982
Correlation matrix for column 0 and and column982: 0.009694314328468333
983
Correlation matrix for column 0 and and column983: 0.0009375987505307893
984
Correlation matrix for column 0 and and column984: -0.015496801156315803
985
Correlation matrix for column 0 and and column985: -0.015011367403167103
986
Correlation matrix for column 0 and and column986: -0.003969459770892997
987
Correlation matrix for column 0 and and column987: -0.0010545605670274454
988
Correlation matrix for column 0 and and column988: 0.005835074464176421
989
Correlation matrix for column 0 and and column989: -0.012788760560318201
990
Correlation matrix for column 0 and and column990: -0.009293846275775576
991
C

In [5]:
# See how many correlations are significant

To see if indeed the correlations between the different parameters increases with the data size, the original dataframe is compared to a sub-set dataframe which only takes the first z rows of the original dataframe. If the correlations in the original dataframe are higher than in the smaller dataframe this would prove that the bigger the data size the more frequent the number of spurious correlations.

In [6]:
# Creating smaller dataframe taking z number of rows from original dataframe
df2 = df.iloc[:z]
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,199990,199991,199992,199993,199994,199995,199996,199997,199998,199999
0,0.683541,0.018479,0.357882,0.68131,0.447448,0.59851,0.12412,0.0261,0.89288,0.790828,...,0.785743,0.987288,0.587592,0.546459,0.181415,0.818808,0.184176,0.668079,0.347183,0.707497
1,0.827127,0.473744,0.547709,0.097834,0.480502,0.960953,0.667213,0.029142,0.544869,0.029782,...,0.14739,0.316055,0.891719,0.825268,0.782924,0.101897,0.650957,0.081078,0.441026,0.386312
2,0.929321,0.373383,0.937262,0.279188,0.89834,0.396706,0.395249,0.085978,0.535766,0.352,...,0.372708,0.331071,0.823048,0.69859,0.825034,0.67243,0.670454,0.542668,0.040608,0.398071
3,0.669256,0.761206,0.647999,0.893389,0.087763,0.398732,0.69966,0.116835,0.825984,0.666978,...,0.774387,0.791663,0.092473,0.203856,0.140118,0.459813,0.687499,0.229563,0.706178,0.919847
4,0.153948,0.006564,0.293203,0.002775,0.026068,0.006733,0.999795,0.351303,0.612944,0.640485,...,0.444889,0.689587,0.771937,0.902576,0.301864,0.389987,0.739029,0.60798,0.586038,0.148765


In [7]:
# Assigning X to all columns except 0
X_df2 = df2.drop(columns=0)
X_df2.head()

# Assigning Y to column 0
Y_df2 = df2[0]
print(Y_df2)

# The following line makes Y become a list
Y_df2 = np.array(Y_df2).reshape(-1)
print(X_df2.shape,Y_df2.shape)

list_titles = X_df2.columns

list_corr_df2 = []
for i in list_titles[0:1000]:
    print(i)
    list_corr_df2.append(abs(np.corrcoef(Y_df2, X_df2[i])[0][1]))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(np.corrcoef(Y_df2, X_df2[i])[0][1]))

0      0.683541
1      0.827127
2      0.929321
3      0.669256
4      0.153948
         ...   
995    0.808768
996    0.474589
997    0.573353
998    0.644323
999    0.210506
Name: 0, Length: 1000, dtype: float64
(1000, 199999) (1000,)
1
Correlation matrix for column 0 and and column1: 0.003245390535283828
2
Correlation matrix for column 0 and and column2: 0.0219811816003659
3
Correlation matrix for column 0 and and column3: -0.03974781731436399
4
Correlation matrix for column 0 and and column4: -0.05048746628639718
5
Correlation matrix for column 0 and and column5: -0.02842895054170089
6
Correlation matrix for column 0 and and column6: 0.056251294251095964
7
Correlation matrix for column 0 and and column7: -0.008514544834533837
8
Correlation matrix for column 0 and and column8: 0.01041363358686422
9
Correlation matrix for column 0 and and column9: -0.0251834549326992
10
Correlation matrix for column 0 and and column10: -0.0267516070001874
11
Correlation matrix for column 0 and and co

Correlation matrix for column 0 and and column130: 0.018565587809408216
131
Correlation matrix for column 0 and and column131: 0.00522407228809864
132
Correlation matrix for column 0 and and column132: 0.0028516528659240908
133
Correlation matrix for column 0 and and column133: -0.03115741816148659
134
Correlation matrix for column 0 and and column134: -0.0011404844975080322
135
Correlation matrix for column 0 and and column135: 0.05402266926567464
136
Correlation matrix for column 0 and and column136: 0.021974907108506657
137
Correlation matrix for column 0 and and column137: 0.03573489229075013
138
Correlation matrix for column 0 and and column138: 0.024682730420597133
139
Correlation matrix for column 0 and and column139: 0.009830175802579295
140
Correlation matrix for column 0 and and column140: 0.03702944642205062
141
Correlation matrix for column 0 and and column141: -0.00428198710235106
142
Correlation matrix for column 0 and and column142: -0.07705823294839145
143
Correlation m

Correlation matrix for column 0 and and column332: 0.0020739711530354667
333
Correlation matrix for column 0 and and column333: 0.059550668633554044
334
Correlation matrix for column 0 and and column334: -0.019567470819622908
335
Correlation matrix for column 0 and and column335: 0.006182014984133862
336
Correlation matrix for column 0 and and column336: 0.010591685867362945
337
Correlation matrix for column 0 and and column337: -0.0583194423749215
338
Correlation matrix for column 0 and and column338: -0.04348502724365143
339
Correlation matrix for column 0 and and column339: -0.006626231450225576
340
Correlation matrix for column 0 and and column340: -0.07112323714569571
341
Correlation matrix for column 0 and and column341: -0.016186184395072616
342
Correlation matrix for column 0 and and column342: 0.022631647468052195
343
Correlation matrix for column 0 and and column343: -0.055539061280091354
344
Correlation matrix for column 0 and and column344: 0.05706803871817384
345
Correlati

Correlation matrix for column 0 and and column473: -0.041167835573146246
474
Correlation matrix for column 0 and and column474: -0.032423180711366205
475
Correlation matrix for column 0 and and column475: -0.00504886923382497
476
Correlation matrix for column 0 and and column476: -0.01715180236847581
477
Correlation matrix for column 0 and and column477: -0.04488760269987353
478
Correlation matrix for column 0 and and column478: -0.005958579253192275
479
Correlation matrix for column 0 and and column479: 0.03406840052709041
480
Correlation matrix for column 0 and and column480: -0.003289839443957107
481
Correlation matrix for column 0 and and column481: -0.01664982186214105
482
Correlation matrix for column 0 and and column482: 0.031852138187119496
483
Correlation matrix for column 0 and and column483: -0.02170262589856085
484
Correlation matrix for column 0 and and column484: 0.028509559128670817
485
Correlation matrix for column 0 and and column485: 0.01771440375447976
486
Correlatio

678
Correlation matrix for column 0 and and column678: -0.00228557203487596
679
Correlation matrix for column 0 and and column679: 0.03041824140982951
680
Correlation matrix for column 0 and and column680: -0.08127344836383754
681
Correlation matrix for column 0 and and column681: -0.01986888247668908
682
Correlation matrix for column 0 and and column682: 0.016529826898569393
683
Correlation matrix for column 0 and and column683: 0.006474867000719555
684
Correlation matrix for column 0 and and column684: -0.019755909786611868
685
Correlation matrix for column 0 and and column685: -0.0002864224914382146
686
Correlation matrix for column 0 and and column686: 0.017301733570988485
687
Correlation matrix for column 0 and and column687: -0.0009386024664526252
688
Correlation matrix for column 0 and and column688: 0.004841950696484172
689
Correlation matrix for column 0 and and column689: 0.004027602274692036
690
Correlation matrix for column 0 and and column690: -0.032366062570118995
691
Cor

Correlation matrix for column 0 and and column801: 0.00971655672541502
802
Correlation matrix for column 0 and and column802: -0.03568952138134177
803
Correlation matrix for column 0 and and column803: 0.004693490237373167
804
Correlation matrix for column 0 and and column804: 0.0032049547491529275
805
Correlation matrix for column 0 and and column805: 0.010211443446137057
806
Correlation matrix for column 0 and and column806: -0.022467419413865188
807
Correlation matrix for column 0 and and column807: 0.07943885495866178
808
Correlation matrix for column 0 and and column808: -0.040838121133422195
809
Correlation matrix for column 0 and and column809: -0.04659508371512232
810
Correlation matrix for column 0 and and column810: -0.016550472163323025
811
Correlation matrix for column 0 and and column811: -0.015767599162621115
812
Correlation matrix for column 0 and and column812: -0.005069204032041974
813
Correlation matrix for column 0 and and column813: 0.03716596411532302
814
Correlati

Correlation matrix for column 0 and and column964: 0.0055763467622627514
965
Correlation matrix for column 0 and and column965: 0.029223273146962665
966
Correlation matrix for column 0 and and column966: -0.0010032492961380328
967
Correlation matrix for column 0 and and column967: -0.03683803278354412
968
Correlation matrix for column 0 and and column968: 0.006818243842167728
969
Correlation matrix for column 0 and and column969: -0.021687313826491104
970
Correlation matrix for column 0 and and column970: 0.06070085022636527
971
Correlation matrix for column 0 and and column971: -0.007848468466702068
972
Correlation matrix for column 0 and and column972: 0.0008198400901795728
973
Correlation matrix for column 0 and and column973: -0.018312672003012925
974
Correlation matrix for column 0 and and column974: -0.004666284515568901
975
Correlation matrix for column 0 and and column975: -0.03466755227468478
976
Correlation matrix for column 0 and and column976: 0.02260003506250112
977
Correl

We now compare which of the correlation lists has the highest numbers of every parameter

In [8]:
a = 0
b = 0
for i in range(0,1000):
    if list_corr_df1[i] > list_corr_df2[i]:
        a+=1
    elif list_corr_df1[i] < list_corr_df2[i]:
        b+=1
    else:
        print()
        
print(a)
print(b)

190
810


Clearly it is seen that the bigger DataFrame has the highest number of spurious correlations.

To counter spurious correlations, random projection can be used

In [None]:
# Pass df1 in the random projection to create a new reduced DataFrame
transformer = random_projection.GaussianRandomProjection(eps = 0.1)
df_new = pd.DataFrame(transformer.fit_transform(df))
df_new

In [None]:
# Assigning X to all columns except 0
X_df_new = df_new.drop(columns=0)
X_df_new.head()

# Assigning Y to column 0
Y_df_new = df_new[0]
print(Y_df_new)

# The following line makes Y become a list
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape,Y_df_new.shape)

list_titles = X_df_new.columns

list_corr_df_new = []
for i in list_titles[0:1000]:
    print(i)
    list_corr_df_new.append(abs(np.corrcoef(Y_df_new, X_df_new[i])[0][1]))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(np.corrcoef(Y_df_new, X_df_new[i])[0][1]))

In [None]:
a = 0
b = 0
for i in range(0,len(list_corr_df1)):
    if abs(list_corr_df1[i]) > 0.01:
        a+=1
    else:
        pass
        
for i in range(0,len(list_corr_df_new)):
    if abs(list_corr_df_new[i]) > 0.01:
        a+=1
    else:
        pass
print(a)
print(b)

In [None]:
print((a/len(list_corr_df1)*100)

In [None]:
print((b/len(list_corr_df_new)*100)