# Spurious Correlations of Highly Dimensional Big Data

This Notebook aims at showing how PCA and random projection can solve the problem of spurious correlations in Big Data.

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.utils import shuffle

In [2]:
# User defined parameters

# Number of rows for df1
x = 10000
print(x)

# Number of columns for df1
y = 200000

# Number of rows for df2
z = 1000

10000


In [3]:
# Creating a dataframe with x number of rows and y number of columns
df = pd.DataFrame(np.random.random_sample((x,y)))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,199990,199991,199992,199993,199994,199995,199996,199997,199998,199999
0,0.452499,0.015201,0.821404,0.212805,0.685945,0.305132,0.424505,0.969277,0.416651,0.644096,...,0.438304,0.881876,0.292105,0.671391,0.005284,0.437007,0.210602,0.985556,0.358968,0.645019
1,0.355750,0.360524,0.514521,0.175276,0.496044,0.790927,0.397017,0.654430,0.896949,0.345304,...,0.689668,0.271047,0.948846,0.820937,0.049355,0.520599,0.717266,0.917065,0.186555,0.546087
2,0.069522,0.101912,0.327868,0.159529,0.962489,0.909987,0.941861,0.427619,0.539128,0.315603,...,0.413128,0.612691,0.938097,0.911261,0.781846,0.421557,0.192917,0.449100,0.114279,0.681607
3,0.680208,0.910606,0.913884,0.863068,0.539352,0.562692,0.123968,0.473372,0.232446,0.812583,...,0.561565,0.993054,0.399094,0.975478,0.561635,0.912421,0.825015,0.758597,0.830240,0.333259
4,0.392236,0.400862,0.125789,0.437953,0.519968,0.437479,0.047480,0.640837,0.264325,0.219463,...,0.548551,0.664787,0.672099,0.042429,0.031315,0.504037,0.362528,0.796567,0.767595,0.185414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.481008,0.739183,0.256689,0.427259,0.200591,0.795033,0.987056,0.405392,0.064573,0.563152,...,0.011348,0.082764,0.870629,0.394139,0.307250,0.643393,0.275145,0.017221,0.137578,0.905565
9996,0.094865,0.374013,0.803594,0.866782,0.548235,0.125134,0.820147,0.603449,0.591685,0.436062,...,0.693236,0.953898,0.563271,0.763430,0.311998,0.388640,0.265597,0.673339,0.826547,0.216650
9997,0.770083,0.774229,0.083592,0.643795,0.505837,0.386739,0.684326,0.745352,0.043293,0.171218,...,0.312898,0.702338,0.558973,0.036452,0.873408,0.864624,0.532652,0.724782,0.592730,0.810464
9998,0.721382,0.877774,0.921654,0.052360,0.745420,0.712550,0.818238,0.420171,0.288095,0.238262,...,0.628822,0.577783,0.461548,0.616376,0.494411,0.735491,0.476356,0.404001,0.091986,0.984021


In order to assess the correlations of the different parameters, correlations between the column with index 0 and the 99 other first columns is assessed.

In [4]:
# Assigning X to all colusmns except 0
X_df = df.drop(columns=0)
X_df.head()

# Assigning Y to column 0
Y_df = df[0]
print(Y_df)

# The following line makes Y become a list
Y_df = np.array(Y_df).reshape(-1)
print(X_df.shape,Y_df.shape)

list_titles = X_df.columns
list_corr_df1 = []
for i in list_titles[0:1000]:
    print(i)
    list_corr_df1.append(abs(np.corrcoef(Y_df, X_df[i])[0][1]))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(np.corrcoef(Y_df, X_df[i])[0][1]))

0       0.452499
1       0.355750
2       0.069522
3       0.680208
4       0.392236
          ...   
9995    0.481008
9996    0.094865
9997    0.770083
9998    0.721382
9999    0.585542
Name: 0, Length: 10000, dtype: float64
(10000, 199999) (10000,)
1
Correlation matrix for column 0 and and column1: 0.0026353420665170607
2
Correlation matrix for column 0 and and column2: 0.011552262907478876
3
Correlation matrix for column 0 and and column3: 0.018147081811682408
4
Correlation matrix for column 0 and and column4: 0.022030453260380386
5
Correlation matrix for column 0 and and column5: 0.005150211050657107
6
Correlation matrix for column 0 and and column6: 0.00033102247060111617
7
Correlation matrix for column 0 and and column7: 0.004139693537590675
8
Correlation matrix for column 0 and and column8: 0.0026441307808484768
9
Correlation matrix for column 0 and and column9: -0.009054408190033258
10
Correlation matrix for column 0 and and column10: -0.0050047499649192384
11
Correlation matri

Correlation matrix for column 0 and and column136: -0.0023550284443501317
137
Correlation matrix for column 0 and and column137: -0.017330043188773625
138
Correlation matrix for column 0 and and column138: -0.00879785229237748
139
Correlation matrix for column 0 and and column139: -0.003835068445296891
140
Correlation matrix for column 0 and and column140: -0.004976768841393993
141
Correlation matrix for column 0 and and column141: 0.0012122921300404086
142
Correlation matrix for column 0 and and column142: -0.0006483937140745596
143
Correlation matrix for column 0 and and column143: -0.00556772162991332
144
Correlation matrix for column 0 and and column144: -0.005388004826533412
145
Correlation matrix for column 0 and and column145: 0.004647414525043903
146
Correlation matrix for column 0 and and column146: -0.008030407789628954
147
Correlation matrix for column 0 and and column147: 0.002003236002627151
148
Correlation matrix for column 0 and and column148: -0.004590484050427332
149
C

Correlation matrix for column 0 and and column293: 0.012800075141025661
294
Correlation matrix for column 0 and and column294: 0.013511942550766399
295
Correlation matrix for column 0 and and column295: 0.007463124653099404
296
Correlation matrix for column 0 and and column296: -0.0018785350372614172
297
Correlation matrix for column 0 and and column297: 0.013446548367435482
298
Correlation matrix for column 0 and and column298: 0.006873723738187833
299
Correlation matrix for column 0 and and column299: 0.006375773625870046
300
Correlation matrix for column 0 and and column300: -0.00410723229848226
301
Correlation matrix for column 0 and and column301: -0.0068270833747684665
302
Correlation matrix for column 0 and and column302: 0.0027677711236328
303
Correlation matrix for column 0 and and column303: 0.001073450767148915
304
Correlation matrix for column 0 and and column304: -0.005076087684689979
305
Correlation matrix for column 0 and and column305: 0.01362393001476217
306
Correlatio

Correlation matrix for column 0 and and column450: 0.003695441985358796
451
Correlation matrix for column 0 and and column451: -0.011494159606345847
452
Correlation matrix for column 0 and and column452: -0.0037910000357662733
453
Correlation matrix for column 0 and and column453: -0.0012965716580340169
454
Correlation matrix for column 0 and and column454: 0.0014969687793666717
455
Correlation matrix for column 0 and and column455: -0.023652389972281092
456
Correlation matrix for column 0 and and column456: -0.00790065719434234
457
Correlation matrix for column 0 and and column457: -0.0012257714704074847
458
Correlation matrix for column 0 and and column458: -0.010970395544179853
459
Correlation matrix for column 0 and and column459: 0.0022944945411236574
460
Correlation matrix for column 0 and and column460: -0.011934972938093955
461
Correlation matrix for column 0 and and column461: -0.010001116851229386
462
Correlation matrix for column 0 and and column462: 0.010857506517071677
463

Correlation matrix for column 0 and and column626: -0.002926950068952106
627
Correlation matrix for column 0 and and column627: 0.004096762901802898
628
Correlation matrix for column 0 and and column628: 0.01683268836740143
629
Correlation matrix for column 0 and and column629: 0.010096436585295005
630
Correlation matrix for column 0 and and column630: -0.006940169784267753
631
Correlation matrix for column 0 and and column631: -0.021988450572397997
632
Correlation matrix for column 0 and and column632: 0.008694880727011326
633
Correlation matrix for column 0 and and column633: -0.004053410134382115
634
Correlation matrix for column 0 and and column634: 0.011476819776808167
635
Correlation matrix for column 0 and and column635: 0.0024225547787674008
636
Correlation matrix for column 0 and and column636: 0.00503457731010009
637
Correlation matrix for column 0 and and column637: 0.012870132616362061
638
Correlation matrix for column 0 and and column638: -0.0135642539780109
639
Correlatio

Correlation matrix for column 0 and and column782: -0.011435114905690682
783
Correlation matrix for column 0 and and column783: 0.016602703920115806
784
Correlation matrix for column 0 and and column784: 0.016811352291778953
785
Correlation matrix for column 0 and and column785: -0.003643431156740555
786
Correlation matrix for column 0 and and column786: 0.008045678548980617
787
Correlation matrix for column 0 and and column787: -0.003924830464865455
788
Correlation matrix for column 0 and and column788: 0.004912170041078544
789
Correlation matrix for column 0 and and column789: 0.008091214595525937
790
Correlation matrix for column 0 and and column790: 0.001683902883845002
791
Correlation matrix for column 0 and and column791: 0.01246940053441964
792
Correlation matrix for column 0 and and column792: 0.000652519593823445
793
Correlation matrix for column 0 and and column793: 0.01126184535107657
794
Correlation matrix for column 0 and and column794: 0.022287618643755676
795
Correlation

Correlation matrix for column 0 and and column933: 0.003278251930204863
934
Correlation matrix for column 0 and and column934: 0.010863198388335443
935
Correlation matrix for column 0 and and column935: -0.00601453147427881
936
Correlation matrix for column 0 and and column936: -0.0038308111229637147
937
Correlation matrix for column 0 and and column937: -0.007604488422971087
938
Correlation matrix for column 0 and and column938: -0.013250913869900726
939
Correlation matrix for column 0 and and column939: 0.005207048745197261
940
Correlation matrix for column 0 and and column940: 0.01129144289102709
941
Correlation matrix for column 0 and and column941: -0.010661133089442578
942
Correlation matrix for column 0 and and column942: -0.0025830385323536817
943
Correlation matrix for column 0 and and column943: -0.002173100949790411
944
Correlation matrix for column 0 and and column944: 0.010694350171006554
945
Correlation matrix for column 0 and and column945: -0.013118797270654263
946
Corr

In [5]:
# See how many correlations are significant

To see if indeed the correlations between the different parameters increases with the data size, the original dataframe is compared to a sub-set dataframe which only takes the first z rows of the original dataframe. If the correlations in the original dataframe are higher than in the smaller dataframe this would prove that the bigger the data size the more frequent the number of spurious correlations.

In [6]:
# Creating smaller dataframe taking z number of rows from original dataframe
df2 = df.iloc[:z]
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,199990,199991,199992,199993,199994,199995,199996,199997,199998,199999
0,0.452499,0.015201,0.821404,0.212805,0.685945,0.305132,0.424505,0.969277,0.416651,0.644096,...,0.438304,0.881876,0.292105,0.671391,0.005284,0.437007,0.210602,0.985556,0.358968,0.645019
1,0.35575,0.360524,0.514521,0.175276,0.496044,0.790927,0.397017,0.65443,0.896949,0.345304,...,0.689668,0.271047,0.948846,0.820937,0.049355,0.520599,0.717266,0.917065,0.186555,0.546087
2,0.069522,0.101912,0.327868,0.159529,0.962489,0.909987,0.941861,0.427619,0.539128,0.315603,...,0.413128,0.612691,0.938097,0.911261,0.781846,0.421557,0.192917,0.4491,0.114279,0.681607
3,0.680208,0.910606,0.913884,0.863068,0.539352,0.562692,0.123968,0.473372,0.232446,0.812583,...,0.561565,0.993054,0.399094,0.975478,0.561635,0.912421,0.825015,0.758597,0.83024,0.333259
4,0.392236,0.400862,0.125789,0.437953,0.519968,0.437479,0.04748,0.640837,0.264325,0.219463,...,0.548551,0.664787,0.672099,0.042429,0.031315,0.504037,0.362528,0.796567,0.767595,0.185414


In [7]:
# Assigning X to all columns except 0
X_df2 = df2.drop(columns=0)
X_df2.head()

# Assigning Y to column 0
Y_df2 = df2[0]
print(Y_df2)

# The following line makes Y become a list
Y_df2 = np.array(Y_df2).reshape(-1)
print(X_df2.shape,Y_df2.shape)

list_titles = X_df2.columns

list_corr_df2 = []
for i in list_titles[0:1000]:
    print(i)
    list_corr_df2.append(abs(np.corrcoef(Y_df2, X_df2[i])[0][1]))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(np.corrcoef(Y_df2, X_df2[i])[0][1]))

0      0.452499
1      0.355750
2      0.069522
3      0.680208
4      0.392236
         ...   
995    0.801062
996    0.627662
997    0.429654
998    0.384096
999    0.170044
Name: 0, Length: 1000, dtype: float64
(1000, 199999) (1000,)
1
Correlation matrix for column 0 and and column1: 0.0028562069836410844
2
Correlation matrix for column 0 and and column2: 0.046065266658427244
3
Correlation matrix for column 0 and and column3: 0.03842359570579697
4
Correlation matrix for column 0 and and column4: 0.018578002300179433
5
Correlation matrix for column 0 and and column5: 0.004214271175782043
6
Correlation matrix for column 0 and and column6: 0.008672108413203634
7
Correlation matrix for column 0 and and column7: -0.019612429300754283
8
Correlation matrix for column 0 and and column8: -0.04949712666727957
9
Correlation matrix for column 0 and and column9: 0.013489712328429024
10
Correlation matrix for column 0 and and column10: -0.017482130519732287
11
Correlation matrix for column 0 and 

Correlation matrix for column 0 and and column236: -0.032163287556776
237
Correlation matrix for column 0 and and column237: -0.013683935918921218
238
Correlation matrix for column 0 and and column238: 0.01388254385038125
239
Correlation matrix for column 0 and and column239: -0.025797261081472744
240
Correlation matrix for column 0 and and column240: -0.006430286365373554
241
Correlation matrix for column 0 and and column241: -0.022810203006571723
242
Correlation matrix for column 0 and and column242: -0.03583605646507682
243
Correlation matrix for column 0 and and column243: -0.04429750301136019
244
Correlation matrix for column 0 and and column244: -0.005421366229369469
245
Correlation matrix for column 0 and and column245: -0.005956650087609033
246
Correlation matrix for column 0 and and column246: -0.005028865230322208
247
Correlation matrix for column 0 and and column247: -0.0379615960565524
248
Correlation matrix for column 0 and and column248: -0.00014830621577057191
249
Correl

Correlation matrix for column 0 and and column501: -0.03266475507547654
502
Correlation matrix for column 0 and and column502: -0.06467189875793798
503
Correlation matrix for column 0 and and column503: 0.015008856347435814
504
Correlation matrix for column 0 and and column504: -0.01204594471439784
505
Correlation matrix for column 0 and and column505: 0.024295147142088736
506
Correlation matrix for column 0 and and column506: -0.052954474018946596
507
Correlation matrix for column 0 and and column507: 0.004270321961316119
508
Correlation matrix for column 0 and and column508: 0.020838881357056825
509
Correlation matrix for column 0 and and column509: 0.02056232741757837
510
Correlation matrix for column 0 and and column510: -0.014016544805685557
511
Correlation matrix for column 0 and and column511: -0.022931348607523085
512
Correlation matrix for column 0 and and column512: 0.014532887621160789
513
Correlation matrix for column 0 and and column513: 0.011550853457991856
514
Correlatio

Correlation matrix for column 0 and and column730: -0.029505074569140335
731
Correlation matrix for column 0 and and column731: 0.05568175724752543
732
Correlation matrix for column 0 and and column732: -0.019642465700465
733
Correlation matrix for column 0 and and column733: -0.016170623214792273
734
Correlation matrix for column 0 and and column734: 0.005104972346578304
735
Correlation matrix for column 0 and and column735: 0.0108244084407609
736
Correlation matrix for column 0 and and column736: -0.03828565834265581
737
Correlation matrix for column 0 and and column737: 0.006238088617432411
738
Correlation matrix for column 0 and and column738: 0.0070950862003548295
739
Correlation matrix for column 0 and and column739: -0.0009730264546489022
740
Correlation matrix for column 0 and and column740: 0.0037621813790582673
741
Correlation matrix for column 0 and and column741: -0.03325762927370817
742
Correlation matrix for column 0 and and column742: 0.0012054759795232471
743
Correlatio

Correlation matrix for column 0 and and column905: 0.049009203511601006
906
Correlation matrix for column 0 and and column906: -0.04329152417398378
907
Correlation matrix for column 0 and and column907: 0.011393548203798036
908
Correlation matrix for column 0 and and column908: 0.021396037400604202
909
Correlation matrix for column 0 and and column909: 0.01218998119518383
910
Correlation matrix for column 0 and and column910: -0.025043615895760197
911
Correlation matrix for column 0 and and column911: 0.016871536131378824
912
Correlation matrix for column 0 and and column912: 0.00879705641698349
913
Correlation matrix for column 0 and and column913: 0.01897879022198978
914
Correlation matrix for column 0 and and column914: -0.0040690484087768635
915
Correlation matrix for column 0 and and column915: -0.026857708913717797
916
Correlation matrix for column 0 and and column916: 0.01728451528593683
917
Correlation matrix for column 0 and and column917: -0.0021850084266228113
918
Correlatio

We now compare which of the correlation lists has the highest numbers of every parameter

In [8]:
a = 0
b = 0
for i in range(0,1000):
    if list_corr_df1[i] > list_corr_df2[i]:
        a+=1
    elif list_corr_df1[i] < list_corr_df2[i]:
        b+=1
    else:
        print()
        
print(a)
print(b)

192
808


Clearly it is seen that the bigger DataFrame has the highest number of spurious correlations.

To counter spurious correlations, random projection can be used

In [9]:
# Pass df1 in the random projection to create a new reduced DataFrame
transformer = random_projection.GaussianRandomProjection(eps = 0.1)
df_new = pd.DataFrame(transformer.fit_transform(df))
df_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7884,7885,7886,7887,7888,7889,7890,7891,7892,7893
0,0.300199,-0.173978,0.078233,-1.726764,-4.585345,-3.033471,5.550452,-0.723122,-6.243887,-2.341382,...,3.009550,-7.127626,2.901055,0.962544,-4.088991,-0.054105,3.798056,-2.588254,-1.655238,-0.303377
1,5.015503,0.020997,3.656333,-2.837659,-2.104038,-3.899778,7.544000,-3.403810,-6.346939,-3.596113,...,-1.326506,-4.874957,2.750304,-0.873909,-1.975044,-2.259007,7.105675,-1.155544,-0.355670,-1.884380
2,-0.239934,-0.738623,3.691516,0.261331,-3.118078,-4.786485,7.148691,-2.758258,-6.686483,-1.679739,...,0.187661,-10.016290,1.113536,1.806542,-5.570791,1.703869,4.716192,-5.414080,-1.700958,4.816827
3,0.624445,-0.048598,1.042270,-1.002414,-4.517967,-5.905475,5.784623,-2.486873,-4.843091,-5.624176,...,-0.339278,-4.210808,0.723555,1.174350,-5.506172,2.074235,5.945460,-4.800700,-0.533448,2.455960
4,-0.397925,-3.155691,3.648532,1.871568,-4.996279,-5.406214,5.561939,-2.586612,-2.645546,-1.577511,...,-2.078683,-5.822710,3.256039,3.051832,-2.301120,-0.470248,5.394928,-4.764884,0.397636,2.417511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,3.839388,-1.676448,1.169817,-1.449885,-1.790990,-3.743954,5.740807,-3.382575,-4.260587,-4.446568,...,-2.778800,-4.301105,1.574508,-1.491172,-4.567912,0.016098,4.408972,-6.505560,-1.031040,-1.630653
9996,0.950041,-1.764177,0.009245,0.109634,-4.005916,-3.850693,6.135153,-1.660782,-5.967097,-3.420071,...,-1.658988,-7.940387,2.452405,-0.489299,-6.098683,0.510000,4.751061,-4.929743,-1.523160,-0.464634
9997,1.752898,0.350061,1.730298,2.082928,-3.755512,-4.276688,3.899094,-3.374565,-5.024225,-2.160153,...,0.826561,-8.844682,1.615204,0.178399,-3.901108,0.827823,4.938783,-4.287445,0.108319,0.163174
9998,3.702259,-0.505050,0.643614,-1.631455,-2.966584,-5.134588,3.876267,-0.726758,-3.279228,-1.113116,...,-0.996003,-5.421841,-1.270141,1.220343,-4.805742,-1.636416,4.104539,-5.273421,1.740816,1.893074


In [10]:
# Assigning X to all columns except 0
X_df_new = df_new.drop(columns=0)
X_df_new.head()

# Assigning Y to column 0
Y_df_new = df_new[0]
print(Y_df_new)

# The following line makes Y become a list
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape,Y_df_new.shape)

list_titles = X_df_new.columns

list_corr_df_new = []
for i in list_titles[0:1000]:
    print(i)
    list_corr_df_new.append(abs(np.corrcoef(Y_df_new, X_df_new[i])[0][1]))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(np.corrcoef(Y_df_new, X_df_new[i])[0][1]))

0       0.300199
1       5.015503
2      -0.239934
3       0.624445
4      -0.397925
          ...   
9995    3.839388
9996    0.950041
9997    1.752898
9998    3.702259
9999    2.562510
Name: 0, Length: 10000, dtype: float64
(10000, 7893) (10000,)
1
Correlation matrix for column 0 and and column1: 0.004356179409255896
2
Correlation matrix for column 0 and and column2: 0.005194777262234194
3
Correlation matrix for column 0 and and column3: -0.02022567494834268
4
Correlation matrix for column 0 and and column4: 0.004106526683322501
5
Correlation matrix for column 0 and and column5: 0.0021698467793640226
6
Correlation matrix for column 0 and and column6: -0.007191648691336352
7
Correlation matrix for column 0 and and column7: 0.01828224363812822
8
Correlation matrix for column 0 and and column8: 0.0006616714014108566
9
Correlation matrix for column 0 and and column9: -0.007645019571954036
10
Correlation matrix for column 0 and and column10: -0.0032812367998512554
11
Correlation matrix fo

Correlation matrix for column 0 and and column136: 0.005594436027706514
137
Correlation matrix for column 0 and and column137: 0.007990202278950743
138
Correlation matrix for column 0 and and column138: -0.005606257795615971
139
Correlation matrix for column 0 and and column139: 0.002151933824961114
140
Correlation matrix for column 0 and and column140: -0.003405325698036947
141
Correlation matrix for column 0 and and column141: 0.006125763851532085
142
Correlation matrix for column 0 and and column142: 0.005430477827782775
143
Correlation matrix for column 0 and and column143: 0.00030725241273043435
144
Correlation matrix for column 0 and and column144: 0.0014433709032909984
145
Correlation matrix for column 0 and and column145: -0.0027370935419859846
146
Correlation matrix for column 0 and and column146: 0.0029323966681381137
147
Correlation matrix for column 0 and and column147: 0.010746072820687805
148
Correlation matrix for column 0 and and column148: 0.011013394116158537
149
Corr

292
Correlation matrix for column 0 and and column292: -0.00020407364118426666
293
Correlation matrix for column 0 and and column293: -0.004135838356990621
294
Correlation matrix for column 0 and and column294: 0.0013081302503139827
295
Correlation matrix for column 0 and and column295: 0.003767835767555572
296
Correlation matrix for column 0 and and column296: 0.00573881897468818
297
Correlation matrix for column 0 and and column297: 0.015260452454290665
298
Correlation matrix for column 0 and and column298: 0.0018245840970920262
299
Correlation matrix for column 0 and and column299: -0.007599845607651544
300
Correlation matrix for column 0 and and column300: 0.004589601715030047
301
Correlation matrix for column 0 and and column301: -0.010498303276265608
302
Correlation matrix for column 0 and and column302: 0.0029519651601025067
303
Correlation matrix for column 0 and and column303: 0.00443744612148611
304
Correlation matrix for column 0 and and column304: -0.0012227110901440541
305

Correlation matrix for column 0 and and column466: 0.012012552598785696
467
Correlation matrix for column 0 and and column467: -0.013903685013578375
468
Correlation matrix for column 0 and and column468: 0.008087512638514536
469
Correlation matrix for column 0 and and column469: 0.006488554608490465
470
Correlation matrix for column 0 and and column470: 0.012164401642733117
471
Correlation matrix for column 0 and and column471: 0.003188419632750119
472
Correlation matrix for column 0 and and column472: -0.0022368123827863507
473
Correlation matrix for column 0 and and column473: 0.0007429186949010799
474
Correlation matrix for column 0 and and column474: 0.008702714145348433
475
Correlation matrix for column 0 and and column475: -0.0006697759976766907
476
Correlation matrix for column 0 and and column476: 0.000713428867477572
477
Correlation matrix for column 0 and and column477: 0.0076148392237353505
478
Correlation matrix for column 0 and and column478: -0.016189286819391146
479
Corr

Correlation matrix for column 0 and and column631: -0.010054723292093976
632
Correlation matrix for column 0 and and column632: 0.005472165825227963
633
Correlation matrix for column 0 and and column633: -0.00379707500870083
634
Correlation matrix for column 0 and and column634: 0.008119713082383902
635
Correlation matrix for column 0 and and column635: -0.0024564440712836564
636
Correlation matrix for column 0 and and column636: -0.013177575984302203
637
Correlation matrix for column 0 and and column637: 0.018534329961151022
638
Correlation matrix for column 0 and and column638: -0.015018489841215371
639
Correlation matrix for column 0 and and column639: -0.0024033245191248313
640
Correlation matrix for column 0 and and column640: 0.020611011211814478
641
Correlation matrix for column 0 and and column641: 0.010829580658544202
642
Correlation matrix for column 0 and and column642: -0.004111350254141302
643
Correlation matrix for column 0 and and column643: -0.006268404426483277
644
Cor

Correlation matrix for column 0 and and column794: -0.015523728094982687
795
Correlation matrix for column 0 and and column795: 0.012188615362803152
796
Correlation matrix for column 0 and and column796: 0.014451893249136911
797
Correlation matrix for column 0 and and column797: -0.009402185705989472
798
Correlation matrix for column 0 and and column798: -0.001558545740230537
799
Correlation matrix for column 0 and and column799: -0.006858487627919643
800
Correlation matrix for column 0 and and column800: 0.02033900373163168
801
Correlation matrix for column 0 and and column801: 0.00986044302719962
802
Correlation matrix for column 0 and and column802: -0.010781714664203254
803
Correlation matrix for column 0 and and column803: 0.008813908590459937
804
Correlation matrix for column 0 and and column804: 0.004994067696851836
805
Correlation matrix for column 0 and and column805: 0.019907854802272532
806
Correlation matrix for column 0 and and column806: -0.009393699059376499
807
Correlat

Correlation matrix for column 0 and and column952: 0.0059034687815462
953
Correlation matrix for column 0 and and column953: -0.0015643735216866847
954
Correlation matrix for column 0 and and column954: -0.010885818861722912
955
Correlation matrix for column 0 and and column955: 0.007306994685910933
956
Correlation matrix for column 0 and and column956: -0.0016036240043232049
957
Correlation matrix for column 0 and and column957: -0.008481338946713636
958
Correlation matrix for column 0 and and column958: 0.014708132761334881
959
Correlation matrix for column 0 and and column959: 0.008785381376226089
960
Correlation matrix for column 0 and and column960: -0.006961881646557237
961
Correlation matrix for column 0 and and column961: 0.014509022786658362
962
Correlation matrix for column 0 and and column962: -0.007278560958217945
963
Correlation matrix for column 0 and and column963: -0.01296163624333049
964
Correlation matrix for column 0 and and column964: -0.014356969700714876
965
Corre

In [11]:
a = 0
b = 0
for i in range(0,len(list_corr_df1)):
    if abs(list_corr_df1[i]) > 0.01:
        a+=1
    else:
        pass
        
for i in range(0,len(list_corr_df_new)):
    if abs(list_corr_df_new[i]) > 0.01:
        b+=1
    else:
        pass
print(a)
print(b)

333
324


In [12]:
print((a/len(list_corr_df1)*100))

33.300000000000004


In [13]:
print((b/len(list_corr_df_new)*100))

32.4
