# Spurious Correlations of Highly Dimensional Big Data

This Notebook aims at showing how PCA and random projection can solve the problem of spurious correlations in Big Data.

In [3]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.utils import shuffle

In [4]:
# User defined parameters

# Number of rows for df1
x = 10000
print(x)

# Number of columns for df1
y = 100000

# Number of rows for df2
z = 1000

10000


In [5]:
# Creating a dataframe with x number of rows and y number of columns
df = pd.DataFrame(np.random.random_sample((x,y)))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0.061298,0.756408,0.955822,0.070020,0.217668,0.393388,0.966329,0.891092,0.579198,0.331265,...,0.778967,0.468210,0.679867,0.729670,0.856490,0.429657,0.296388,0.681342,0.897865,0.635299
1,0.057075,0.137276,0.690617,0.700402,0.325993,0.766677,0.288747,0.774992,0.286148,0.177418,...,0.252897,0.035854,0.934537,0.055186,0.322023,0.246681,0.511080,0.735884,0.838561,0.584525
2,0.984162,0.171919,0.464720,0.553199,0.400680,0.103135,0.817047,0.909135,0.141019,0.806498,...,0.153532,0.337442,0.462770,0.954031,0.676778,0.931126,0.745477,0.756005,0.525756,0.566734
3,0.096138,0.894445,0.430324,0.380440,0.500831,0.930599,0.800609,0.778466,0.686237,0.605540,...,0.950118,0.028114,0.488747,0.066995,0.609090,0.467603,0.076604,0.072263,0.371792,0.658430
4,0.553343,0.994447,0.017715,0.807436,0.216181,0.837522,0.157724,0.566661,0.943171,0.189643,...,0.039185,0.410250,0.547353,0.390316,0.386253,0.362151,0.093699,0.382417,0.690338,0.139009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.792989,0.481503,0.000437,0.557400,0.730382,0.639320,0.469008,0.455646,0.836738,0.612393,...,0.134984,0.344310,0.525505,0.531327,0.239999,0.412274,0.571596,0.045542,0.911634,0.453434
9996,0.645351,0.117989,0.274650,0.843818,0.191691,0.798321,0.230676,0.263166,0.582599,0.682999,...,0.686475,0.305636,0.279913,0.619262,0.853848,0.534704,0.838709,0.578919,0.740227,0.884758
9997,0.067525,0.748941,0.181776,0.962837,0.426376,0.220011,0.126514,0.982534,0.486853,0.141445,...,0.867905,0.233131,0.736405,0.890476,0.314410,0.879346,0.010462,0.818255,0.382479,0.295355
9998,0.851933,0.820017,0.188854,0.163322,0.246018,0.775953,0.177944,0.902797,0.677514,0.723182,...,0.250690,0.840023,0.860393,0.692787,0.814785,0.835220,0.363471,0.259919,0.906705,0.913501


In order to assess the correlations of the different parameters, correlations between the column with index 0 and the 99 other first columns is assessed.

In [4]:
# Assigning X to all columns except 0
X_df = df.drop(columns=0)
X_df.head()

# Assigning Y to column 0
Y_df = df[0]
print(Y_df)

# The following line makes Y become a list
Y_df = np.array(Y_df).reshape(-1)
print(X_df.shape,Y_df.shape)

list_titles = X_df.columns
list_corr_df1 = []
for i in list_titles[0:1000]:
    print(i)
    list_corr_df1.append(abs(np.corrcoef(Y_df, X_df[i])[0][1]))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(np.corrcoef(Y_df, X_df[i])[0][1]))

0        0.303852
1        0.109598
2        0.834885
3        0.480425
4        0.117790
           ...   
19995    0.187236
19996    0.964633
19997    0.122353
19998    0.873922
19999    0.330913
Name: 0, Length: 20000, dtype: float64
(20000, 99999) (20000,)
1
Correlation matrix for column 0 and and column1: -0.003396522977200864
2
Correlation matrix for column 0 and and column2: -0.006979010317726101
3
Correlation matrix for column 0 and and column3: 0.011254365651114592
4
Correlation matrix for column 0 and and column4: -0.009139271059098711
5
Correlation matrix for column 0 and and column5: -0.00534948128156126
6
Correlation matrix for column 0 and and column6: 0.01654278816153284
7
Correlation matrix for column 0 and and column7: 0.008317330740406961
8
Correlation matrix for column 0 and and column8: -0.005202043010063194
9
Correlation matrix for column 0 and and column9: 0.007895479137871633
10
Correlation matrix for column 0 and and column10: 0.0020560316387869585
11
Correlatio

Correlation matrix for column 0 and and column115: 0.0033482985481543756
116
Correlation matrix for column 0 and and column116: -0.002794941723732855
117
Correlation matrix for column 0 and and column117: 0.005872375310205831
118
Correlation matrix for column 0 and and column118: 0.004899976861876078
119
Correlation matrix for column 0 and and column119: 0.007929714537224705
120
Correlation matrix for column 0 and and column120: -0.0060575258495928486
121
Correlation matrix for column 0 and and column121: -0.009366982196977732
122
Correlation matrix for column 0 and and column122: 0.00032770966056170514
123
Correlation matrix for column 0 and and column123: 0.0026786298555103684
124
Correlation matrix for column 0 and and column124: 0.00879330168254894
125
Correlation matrix for column 0 and and column125: -0.0006718742730727252
126
Correlation matrix for column 0 and and column126: -0.001689684670689891
127
Correlation matrix for column 0 and and column127: 0.007497268304295305
128
Co

Correlation matrix for column 0 and and column237: -0.013158728213246587
238
Correlation matrix for column 0 and and column238: 0.00047793034444308685
239
Correlation matrix for column 0 and and column239: -0.0012214478835695427
240
Correlation matrix for column 0 and and column240: -0.009791737870586842
241
Correlation matrix for column 0 and and column241: -0.0019027122631681337
242
Correlation matrix for column 0 and and column242: 0.000601088232069535
243
Correlation matrix for column 0 and and column243: -0.009278578201744374
244
Correlation matrix for column 0 and and column244: 0.010361841722142525
245
Correlation matrix for column 0 and and column245: 0.013333557978235577
246
Correlation matrix for column 0 and and column246: 0.006680926232278086
247
Correlation matrix for column 0 and and column247: 0.005971533572418039
248
Correlation matrix for column 0 and and column248: 0.002785054390365264
249
Correlation matrix for column 0 and and column249: 0.011052782986499303
250
Cor

Correlation matrix for column 0 and and column361: -0.0024937643222617163
362
Correlation matrix for column 0 and and column362: 0.013422879946723486
363
Correlation matrix for column 0 and and column363: 0.00570826233679019
364
Correlation matrix for column 0 and and column364: 0.0034774420383366675
365
Correlation matrix for column 0 and and column365: 0.0010644407605707958
366
Correlation matrix for column 0 and and column366: -0.008467305663891856
367
Correlation matrix for column 0 and and column367: -0.005654624231692833
368
Correlation matrix for column 0 and and column368: -0.006349728192464363
369
Correlation matrix for column 0 and and column369: 0.007133358954971568
370
Correlation matrix for column 0 and and column370: -0.013942758257177935
371
Correlation matrix for column 0 and and column371: -0.0012614018602592964
372
Correlation matrix for column 0 and and column372: -0.004945890258273611
373
Correlation matrix for column 0 and and column373: -0.001334491109337416
374
C

Correlation matrix for column 0 and and column482: 0.0030350070464975005
483
Correlation matrix for column 0 and and column483: -0.007323920189132415
484
Correlation matrix for column 0 and and column484: -0.0001755699354957022
485
Correlation matrix for column 0 and and column485: -0.00547167250305936
486
Correlation matrix for column 0 and and column486: -0.007161019279441674
487
Correlation matrix for column 0 and and column487: -0.0037912916538775294
488
Correlation matrix for column 0 and and column488: 0.00789664495944639
489
Correlation matrix for column 0 and and column489: -0.0017366178621486745
490
Correlation matrix for column 0 and and column490: -5.5935335801330894e-05
491
Correlation matrix for column 0 and and column491: -0.0005856993308833716
492
Correlation matrix for column 0 and and column492: 0.008532364325674583
493
Correlation matrix for column 0 and and column493: -0.004806960986798711
494
Correlation matrix for column 0 and and column494: -0.0101725777223215
495

Correlation matrix for column 0 and and column609: 0.002378574078606475
610
Correlation matrix for column 0 and and column610: -0.0018824393845540955
611
Correlation matrix for column 0 and and column611: -9.558392871210671e-05
612
Correlation matrix for column 0 and and column612: 0.011722524212887664
613
Correlation matrix for column 0 and and column613: 0.0054021715368286925
614
Correlation matrix for column 0 and and column614: 0.0016016866538687282
615
Correlation matrix for column 0 and and column615: -0.00419214242432847
616
Correlation matrix for column 0 and and column616: -0.012197570469096695
617
Correlation matrix for column 0 and and column617: 0.0018405339107927133
618
Correlation matrix for column 0 and and column618: 0.0015876738712042688
619
Correlation matrix for column 0 and and column619: -0.003989878515018886
620
Correlation matrix for column 0 and and column620: 0.0021403223327296286
621
Correlation matrix for column 0 and and column621: 0.010600718009439476
622
C

Correlation matrix for column 0 and and column729: -0.008166450255280087
730
Correlation matrix for column 0 and and column730: -0.013681415495989554
731
Correlation matrix for column 0 and and column731: 0.000340199428100655
732
Correlation matrix for column 0 and and column732: -0.0031197827297952054
733
Correlation matrix for column 0 and and column733: -0.0009991128151762587
734
Correlation matrix for column 0 and and column734: 0.006966498750770751
735
Correlation matrix for column 0 and and column735: 3.595470887072723e-05
736
Correlation matrix for column 0 and and column736: -0.007306649041277611
737
Correlation matrix for column 0 and and column737: 0.0051223490778897944
738
Correlation matrix for column 0 and and column738: -0.0001925643847144236
739
Correlation matrix for column 0 and and column739: -0.01156729737866329
740
Correlation matrix for column 0 and and column740: -0.0021110790159473048
741
Correlation matrix for column 0 and and column741: 0.015686011404771646
742

Correlation matrix for column 0 and and column847: 0.003825427969033489
848
Correlation matrix for column 0 and and column848: 0.00019483132535199672
849
Correlation matrix for column 0 and and column849: 0.002335375326104157
850
Correlation matrix for column 0 and and column850: 0.0010260323017884138
851
Correlation matrix for column 0 and and column851: -0.006823274401181871
852
Correlation matrix for column 0 and and column852: -0.012134705936429487
853
Correlation matrix for column 0 and and column853: -0.001736045688528761
854
Correlation matrix for column 0 and and column854: -0.006060479643645313
855
Correlation matrix for column 0 and and column855: 0.006198631388179295
856
Correlation matrix for column 0 and and column856: -0.010467145620767627
857
Correlation matrix for column 0 and and column857: 0.009450035099723567
858
Correlation matrix for column 0 and and column858: 0.011694998246075132
859
Correlation matrix for column 0 and and column859: -0.0009218993268162487
860
Co

Correlation matrix for column 0 and and column961: -0.009552010744516032
962
Correlation matrix for column 0 and and column962: 0.004719865927362495
963
Correlation matrix for column 0 and and column963: 0.0019998066297406223
964
Correlation matrix for column 0 and and column964: 0.014317773157349188
965
Correlation matrix for column 0 and and column965: 0.008566911215923177
966
Correlation matrix for column 0 and and column966: 0.00723078702744491
967
Correlation matrix for column 0 and and column967: 0.0075644143272277625
968
Correlation matrix for column 0 and and column968: 0.006650850207073706
969
Correlation matrix for column 0 and and column969: 0.0012383545499493071
970
Correlation matrix for column 0 and and column970: 0.002616907970338605
971
Correlation matrix for column 0 and and column971: 0.008458528678047908
972
Correlation matrix for column 0 and and column972: -0.003526102582398916
973
Correlation matrix for column 0 and and column973: 0.008627227454070184
974
Correlat

In [5]:
# See how many correlations are significant

To see if indeed the correlations between the different parameters increases with the data size, the original dataframe is compared to a sub-set dataframe which only takes the first z rows of the original dataframe. If the correlations in the original dataframe are higher than in the smaller dataframe this would prove that the bigger the data size the more frequent the number of spurious correlations.

In [6]:
# Creating smaller dataframe taking z number of rows from original dataframe
df2 = df.iloc[:z]
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0.303852,0.482469,0.224052,0.344633,0.188248,0.105952,0.26962,0.334199,0.476101,0.036924,...,0.53733,0.216358,0.699363,0.471592,0.754962,0.658774,0.912231,0.666855,0.385205,0.457965
1,0.109598,0.519675,0.592146,0.06736,0.822842,0.540424,0.625228,0.021509,0.940852,0.251539,...,0.399282,0.368807,0.140951,0.703834,0.531143,0.976082,0.574081,0.102286,0.762093,0.721805
2,0.834885,0.079509,0.46305,0.529323,0.263668,0.110891,0.475412,0.231435,0.926952,0.381912,...,0.716307,0.812762,0.991564,0.97324,0.495106,0.715242,0.674071,0.510914,0.792048,0.390235
3,0.480425,0.219961,0.709749,0.883555,0.325709,0.832008,0.519205,0.095463,0.860421,0.493793,...,0.028534,0.559854,0.730829,0.979136,0.508722,0.896001,0.870853,0.669506,0.92902,0.386358
4,0.11779,0.811164,0.545468,0.048144,0.474348,0.88023,0.33181,0.636698,0.774046,0.946138,...,0.078699,0.45613,0.044826,0.1739,0.614132,0.078417,0.977479,0.133508,0.557102,0.076976


In [7]:
# Assigning X to all columns except 0
X_df2 = df2.drop(columns=0)
X_df2.head()

# Assigning Y to column 0
Y_df2 = df2[0]
print(Y_df2)

# The following line makes Y become a list
Y_df2 = np.array(Y_df2).reshape(-1)
print(X_df2.shape,Y_df2.shape)

list_titles = X_df2.columns

list_corr_df2 = []
for i in list_titles[0:1000]:
    print(i)
    list_corr_df2.append(abs(np.corrcoef(Y_df2, X_df2[i])[0][1]))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(np.corrcoef(Y_df2, X_df2[i])[0][1]))

0       0.303852
1       0.109598
2       0.834885
3       0.480425
4       0.117790
          ...   
4995    0.808964
4996    0.425854
4997    0.094408
4998    0.437089
4999    0.860908
Name: 0, Length: 5000, dtype: float64
(5000, 99999) (5000,)
1
Correlation matrix for column 0 and and column1: -0.009502765791843597
2
Correlation matrix for column 0 and and column2: -0.004907494856086281
3
Correlation matrix for column 0 and and column3: 0.00679108825012815
4
Correlation matrix for column 0 and and column4: -0.03328017221360417
5
Correlation matrix for column 0 and and column5: -0.017899070583821782
6
Correlation matrix for column 0 and and column6: 0.0253278520013889
7
Correlation matrix for column 0 and and column7: 0.0021226086961311924
8
Correlation matrix for column 0 and and column8: -0.01954791640157403
9
Correlation matrix for column 0 and and column9: 0.02720615611665416
10
Correlation matrix for column 0 and and column10: -0.018999075111300757
11
Correlation matrix for colu

143
Correlation matrix for column 0 and and column143: 0.008524598240079966
144
Correlation matrix for column 0 and and column144: -0.03250753936555639
145
Correlation matrix for column 0 and and column145: -0.0025750611672324046
146
Correlation matrix for column 0 and and column146: 0.024200060721155634
147
Correlation matrix for column 0 and and column147: 0.001575976169715187
148
Correlation matrix for column 0 and and column148: -0.00917004181982708
149
Correlation matrix for column 0 and and column149: -0.009647466987578188
150
Correlation matrix for column 0 and and column150: -0.009116717533879813
151
Correlation matrix for column 0 and and column151: -0.013318877346749414
152
Correlation matrix for column 0 and and column152: -0.02818858729419624
153
Correlation matrix for column 0 and and column153: -0.015527128025831281
154
Correlation matrix for column 0 and and column154: -0.018434446856062565
155
Correlation matrix for column 0 and and column155: -0.004151907338228391
156


Correlation matrix for column 0 and and column324: 0.021729182928476323
325
Correlation matrix for column 0 and and column325: -0.008540574154078998
326
Correlation matrix for column 0 and and column326: -0.012361153588043239
327
Correlation matrix for column 0 and and column327: 0.003924127338168441
328
Correlation matrix for column 0 and and column328: -0.0026850539395201502
329
Correlation matrix for column 0 and and column329: -0.009595595835723643
330
Correlation matrix for column 0 and and column330: 0.013074616247607234
331
Correlation matrix for column 0 and and column331: 0.02616682695037196
332
Correlation matrix for column 0 and and column332: 0.00863152513012527
333
Correlation matrix for column 0 and and column333: -0.006110953870226094
334
Correlation matrix for column 0 and and column334: 0.025972224517568167
335
Correlation matrix for column 0 and and column335: -0.021529291422373652
336
Correlation matrix for column 0 and and column336: -0.00020522974213899492
337
Corr

Correlation matrix for column 0 and and column505: 0.021817685476153616
506
Correlation matrix for column 0 and and column506: -0.012033559308842779
507
Correlation matrix for column 0 and and column507: 0.006883430025021004
508
Correlation matrix for column 0 and and column508: -0.009449715802268758
509
Correlation matrix for column 0 and and column509: -0.002879778278451026
510
Correlation matrix for column 0 and and column510: -0.013824600921938756
511
Correlation matrix for column 0 and and column511: 0.004015121315021733
512
Correlation matrix for column 0 and and column512: 0.003963713821618299
513
Correlation matrix for column 0 and and column513: -0.011654468620413893
514
Correlation matrix for column 0 and and column514: -0.015603496593448337
515
Correlation matrix for column 0 and and column515: -0.006139881995189551
516
Correlation matrix for column 0 and and column516: 0.016989020457659286
517
Correlation matrix for column 0 and and column517: -0.009851272814724213
518
Corr

Correlation matrix for column 0 and and column698: 0.015773047952384534
699
Correlation matrix for column 0 and and column699: 0.0027230717974416115
700
Correlation matrix for column 0 and and column700: 0.008960056520612
701
Correlation matrix for column 0 and and column701: -0.011257160071755114
702
Correlation matrix for column 0 and and column702: 0.01456798177962901
703
Correlation matrix for column 0 and and column703: -0.018457717597143684
704
Correlation matrix for column 0 and and column704: 0.038866996810514094
705
Correlation matrix for column 0 and and column705: -0.008049867705259913
706
Correlation matrix for column 0 and and column706: -0.0013416562511832408
707
Correlation matrix for column 0 and and column707: 0.011541164820778282
708
Correlation matrix for column 0 and and column708: -0.016976716385291347
709
Correlation matrix for column 0 and and column709: -0.011583931683362143
710
Correlation matrix for column 0 and and column710: -0.007684615477164827
711
Correla

Correlation matrix for column 0 and and column897: -0.002328664155078128
898
Correlation matrix for column 0 and and column898: 0.01942533203921596
899
Correlation matrix for column 0 and and column899: -0.02185972380246306
900
Correlation matrix for column 0 and and column900: 0.001282952598261087
901
Correlation matrix for column 0 and and column901: -0.009117090558880676
902
Correlation matrix for column 0 and and column902: -0.01769938078845998
903
Correlation matrix for column 0 and and column903: -0.0072899453451711355
904
Correlation matrix for column 0 and and column904: 0.0007639585462188957
905
Correlation matrix for column 0 and and column905: -0.008244461221131712
906
Correlation matrix for column 0 and and column906: -0.008914802880253033
907
Correlation matrix for column 0 and and column907: 0.007303547757071852
908
Correlation matrix for column 0 and and column908: 0.004829753799585051
909
Correlation matrix for column 0 and and column909: 0.011825948599842817
910
Correl

We now compare which of the correlation lists has the highest numbers of every parameter

In [8]:
a = 0
b = 0
for i in range(0,1000):
    if list_corr_df1[i] > list_corr_df2[i]:
        a+=1
    elif list_corr_df1[i] < list_corr_df2[i]:
        b+=1
    else:
        print()
        
print(a)
print(b)

275
725


Clearly it is seen that the bigger DataFrame has the highest number of spurious correlations.

To counter spurious correlations, random projection can be used

In [9]:
# Pass df1 in the random projection to create a new reduced DataFrame
transformer = random_projection.GaussianRandomProjection(eps = 0.1)
df_new = pd.DataFrame(transformer.fit_transform(df))
df_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8478,8479,8480,8481,8482,8483,8484,8485,8486,8487
0,-1.505776,0.494964,1.454985,-2.409638,0.037033,1.726040,-0.336924,0.828892,-4.078158,-1.600785,...,3.580053,-4.282690,-0.227981,1.356377,1.634153,0.017160,2.432889,0.550471,-2.439324,2.293737
1,-1.977856,-1.692631,1.221273,0.246123,-0.304276,1.448005,1.951724,1.380400,-2.351605,0.712508,...,1.638031,-4.264461,1.251586,-0.124997,2.038625,0.531707,0.213835,1.201850,-3.363666,2.219868
2,-1.790018,-0.419632,1.069008,0.517965,-0.592610,-0.248424,-0.271528,1.501446,-5.498978,-0.979634,...,1.567332,-2.718169,-0.836026,1.225188,0.235857,0.991075,1.786082,0.825860,-1.314704,2.789844
3,-2.368421,-2.601237,0.644601,-1.544683,-0.312503,1.427571,-1.573056,3.583695,-2.982832,-1.294038,...,1.685605,-3.033735,0.225959,0.835180,2.906469,-1.278746,2.033608,0.892400,0.193377,0.506989
4,-0.606204,-2.506238,0.244723,-0.684318,0.543826,-1.066708,-0.038253,2.340018,-1.570519,-0.493867,...,0.955904,-3.269878,1.305561,0.376849,1.617395,-0.091312,2.876207,0.713865,-0.765709,1.906405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,-1.125111,-2.714008,0.293237,-0.895614,0.525664,-0.299298,0.844249,1.379512,-5.831828,-2.440473,...,1.854650,-3.923876,-0.429098,0.489553,1.036466,1.110395,2.455838,1.405563,-1.393829,-0.446387
19996,-0.710740,-2.315788,1.946590,0.736356,-1.287408,0.707104,-0.905361,1.807740,-4.349939,-0.873873,...,-0.467271,-3.482126,-0.131087,-0.251312,1.132335,0.202859,0.810187,-1.884355,-2.551507,-0.954142
19997,-1.222876,-1.616258,-0.160353,-1.512945,1.059584,0.068243,0.208985,0.691161,-4.393229,0.551765,...,0.679544,-3.817209,0.253394,0.592943,2.986748,-0.382308,1.610006,0.070262,-2.665169,2.378818
19998,-2.433725,-3.402373,1.813970,-0.621652,-0.329505,0.124545,-1.045627,1.556182,-4.165089,-1.690462,...,0.532385,-3.247353,0.195401,0.216646,1.982551,0.271126,3.658061,-0.222213,-1.402727,-0.126109


In [10]:
# Assigning X to all columns except 0
X_df_new = df_new.drop(columns=0)
X_df_new.head()

# Assigning Y to column 0
Y_df_new = df_new[0]
print(Y_df_new)

# The following line makes Y become a list
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape,Y_df_new.shape)

list_titles = X_df_new.columns

list_corr_df_new = []
for i in list_titles[0:1000]:
    print(i)
    list_corr_df_new.append(abs(np.corrcoef(Y_df_new, X_df_new[i])[0][1]))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(np.corrcoef(Y_df_new, X_df_new[i])[0][1]))

0       -1.505776
1       -1.977856
2       -1.790018
3       -2.368421
4       -0.606204
           ...   
19995   -1.125111
19996   -0.710740
19997   -1.222876
19998   -2.433725
19999   -1.880259
Name: 0, Length: 20000, dtype: float64
(20000, 8487) (20000,)
1
Correlation matrix for column 0 and and column1: 0.008779665327064158
2
Correlation matrix for column 0 and and column2: 0.009636426940665806
3
Correlation matrix for column 0 and and column3: -0.002791849477536291
4
Correlation matrix for column 0 and and column4: 0.003367036442339243
5
Correlation matrix for column 0 and and column5: -0.01510202162714712
6
Correlation matrix for column 0 and and column6: -0.005191911409021359
7
Correlation matrix for column 0 and and column7: -0.003960571441833615
8
Correlation matrix for column 0 and and column8: -0.001304515743659646
9
Correlation matrix for column 0 and and column9: -0.014830641750613376
10
Correlation matrix for column 0 and and column10: -0.015800835182206593
11
Correlati

Correlation matrix for column 0 and and column132: 0.005364418248008618
133
Correlation matrix for column 0 and and column133: -0.011010352388843923
134
Correlation matrix for column 0 and and column134: 0.008078072061904404
135
Correlation matrix for column 0 and and column135: -0.00783130121004016
136
Correlation matrix for column 0 and and column136: 0.01137485305425162
137
Correlation matrix for column 0 and and column137: 0.007792680148718162
138
Correlation matrix for column 0 and and column138: 0.0026805405461628453
139
Correlation matrix for column 0 and and column139: -0.0047687161020803955
140
Correlation matrix for column 0 and and column140: -0.0033612205985183875
141
Correlation matrix for column 0 and and column141: -0.000681509968009223
142
Correlation matrix for column 0 and and column142: 0.0029030922801184514
143
Correlation matrix for column 0 and and column143: 0.007751171502600823
144
Correlation matrix for column 0 and and column144: 0.003987541557529419
145
Corre

Correlation matrix for column 0 and and column256: 0.006804892383573044
257
Correlation matrix for column 0 and and column257: -0.013719335111009695
258
Correlation matrix for column 0 and and column258: 0.009501331319921947
259
Correlation matrix for column 0 and and column259: -0.004036500601435813
260
Correlation matrix for column 0 and and column260: -0.008376732713700163
261
Correlation matrix for column 0 and and column261: -0.006810539687802454
262
Correlation matrix for column 0 and and column262: -0.005450508688788775
263
Correlation matrix for column 0 and and column263: 0.002397328058899663
264
Correlation matrix for column 0 and and column264: 0.0020160068409667154
265
Correlation matrix for column 0 and and column265: -0.0030297918474414165
266
Correlation matrix for column 0 and and column266: -0.007337661150303468
267
Correlation matrix for column 0 and and column267: 0.011835832998053004
268
Correlation matrix for column 0 and and column268: -0.0086479575937141
269
Corr

Correlation matrix for column 0 and and column414: -0.0014391183725916653
415
Correlation matrix for column 0 and and column415: 0.01943967818938483
416
Correlation matrix for column 0 and and column416: 0.00043241523747762785
417
Correlation matrix for column 0 and and column417: -0.0035874744208933443
418
Correlation matrix for column 0 and and column418: -0.00423337839607137
419
Correlation matrix for column 0 and and column419: -0.017454854339776803
420
Correlation matrix for column 0 and and column420: 0.008115589164944502
421
Correlation matrix for column 0 and and column421: 0.0023617357098590207
422
Correlation matrix for column 0 and and column422: -0.002264002629055862
423
Correlation matrix for column 0 and and column423: -0.001294049718791202
424
Correlation matrix for column 0 and and column424: -0.0013830502564967596
425
Correlation matrix for column 0 and and column425: -0.0031337616533529773
426
Correlation matrix for column 0 and and column426: 0.006845326504136741
427

Correlation matrix for column 0 and and column599: 0.005209867629342986
600
Correlation matrix for column 0 and and column600: -0.0021295940024284813
601
Correlation matrix for column 0 and and column601: -0.012992300627547337
602
Correlation matrix for column 0 and and column602: 0.010081511839833709
603
Correlation matrix for column 0 and and column603: -0.0033133007986770467
604
Correlation matrix for column 0 and and column604: -0.0058057476638832435
605
Correlation matrix for column 0 and and column605: -0.004245709629395608
606
Correlation matrix for column 0 and and column606: 0.006123874478516658
607
Correlation matrix for column 0 and and column607: 0.001962207097719299
608
Correlation matrix for column 0 and and column608: 0.0036816708762541237
609
Correlation matrix for column 0 and and column609: 0.003671989408439695
610
Correlation matrix for column 0 and and column610: 0.015411189558319187
611
Correlation matrix for column 0 and and column611: -0.011359920615304176
612
Co

Correlation matrix for column 0 and and column784: 0.005203884732751876
785
Correlation matrix for column 0 and and column785: -0.0006114032012485514
786
Correlation matrix for column 0 and and column786: -0.008834336916076949
787
Correlation matrix for column 0 and and column787: 0.004743939419335588
788
Correlation matrix for column 0 and and column788: -0.007763074585111173
789
Correlation matrix for column 0 and and column789: 0.003159649412473433
790
Correlation matrix for column 0 and and column790: -0.0001128649607199206
791
Correlation matrix for column 0 and and column791: 0.00327807678773746
792
Correlation matrix for column 0 and and column792: -0.006047901612415119
793
Correlation matrix for column 0 and and column793: 0.0023286242170928056
794
Correlation matrix for column 0 and and column794: 0.003431076140409614
795
Correlation matrix for column 0 and and column795: -0.001712454949616977
796
Correlation matrix for column 0 and and column796: 0.01062528327377366
797
Corre

Correlation matrix for column 0 and and column968: 0.0100350598942873
969
Correlation matrix for column 0 and and column969: -0.009560328705448509
970
Correlation matrix for column 0 and and column970: 0.0008120093768339719
971
Correlation matrix for column 0 and and column971: 0.0012283910463179869
972
Correlation matrix for column 0 and and column972: 0.0017349104025511071
973
Correlation matrix for column 0 and and column973: -0.011479402046851942
974
Correlation matrix for column 0 and and column974: -0.008154433337014447
975
Correlation matrix for column 0 and and column975: -0.00028235192799421144
976
Correlation matrix for column 0 and and column976: 0.00533717349781601
977
Correlation matrix for column 0 and and column977: -0.006634504904919146
978
Correlation matrix for column 0 and and column978: 0.008925114075062716
979
Correlation matrix for column 0 and and column979: -0.0007144594426985082
980
Correlation matrix for column 0 and and column980: 0.012556336487986118
981
Cor

In [11]:
a = 0
b = 0
for i in range(0,len(list_corr_df1)):
    if abs(list_corr_df1[i]) > 0.01:
        a+=1
    else:
        pass
        
for i in range(0,len(list_corr_df_new)):
    if abs(list_corr_df_new[i]) > 0.01:
        b+=1
    else:
        pass
print(a)
print(b)

162
222


In [12]:
print((a/len(list_corr_df1)*100))

16.2


In [13]:
print((b/len(list_corr_df_new)*100))

22.2
