# Spurious Correlations of Highly Dimensional Big Data

This notebook aims to show how PCA and random projection can address the problem of spurious correlations in Big Data.

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.utils import shuffle

In [2]:
# User-defined parameters, read by the cells below.
#   x -> number of rows for df1 (the full DataFrame)
#   y -> number of columns for df1
#   z -> number of rows for df2 (the reduced DataFrame)
x, y, z = 10000, 100000, 1000
print(x)

10000


In [3]:
# Build the synthetic data set: x rows by y columns of independent draws from
# the uniform distribution on [0, 1). The columns are generated independently,
# so any correlation measured between them later is spurious by construction.
#
# NOTE: with the default sizes (10000 x 100000 float64 values) this frame
# needs roughly 8 GB of memory; lower x or y in the parameter cell if needed.
#
# Seed NumPy's global generator so that the data, and every number derived
# from it, is the same on each Restart & Run All.
np.random.seed(0)
df = pd.DataFrame(np.random.random_sample((x, y)))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0.106080,0.915745,0.147077,0.167503,0.487968,0.226538,0.254787,0.623292,0.470101,0.755654,...,0.666783,0.522985,0.476157,0.773045,0.434901,0.052657,0.037750,0.628907,0.401869,0.665959
1,0.319986,0.894195,0.218637,0.707869,0.719441,0.360796,0.309027,0.827702,0.060604,0.328451,...,0.769788,0.042038,0.452191,0.851151,0.986762,0.593540,0.947720,0.608447,0.622881,0.626858
2,0.376211,0.163538,0.743877,0.511810,0.384389,0.848757,0.219392,0.917807,0.385766,0.191366,...,0.684092,0.191433,0.165400,0.830016,0.330929,0.231106,0.115412,0.976810,0.556879,0.075121
3,0.853908,0.090614,0.091299,0.610245,0.243554,0.035242,0.925989,0.466310,0.573845,0.005383,...,0.673818,0.963165,0.213568,0.949922,0.785582,0.340721,0.378636,0.281106,0.372827,0.324596
4,0.542421,0.730235,0.370256,0.337504,0.850236,0.857652,0.358410,0.944153,0.870036,0.563044,...,0.784753,0.469854,0.175904,0.760820,0.759634,0.764462,0.818192,0.516211,0.540741,0.288769
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.967672,0.675855,0.696169,0.366158,0.744880,0.617432,0.257100,0.561354,0.879241,0.147494,...,0.854400,0.836588,0.527283,0.982147,0.415412,0.067872,0.638334,0.427063,0.694614,0.017652
9996,0.032825,0.541817,0.923347,0.594182,0.836764,0.335170,0.197741,0.082055,0.515414,0.477600,...,0.159158,0.568057,0.307247,0.271126,0.650856,0.771000,0.607291,0.294432,0.887875,0.694161
9997,0.086819,0.337319,0.060720,0.658627,0.152006,0.682497,0.420853,0.437837,0.893868,0.282279,...,0.255598,0.669310,0.810364,0.144628,0.391283,0.945316,0.474387,0.663845,0.505760,0.343621
9998,0.323677,0.711385,0.836919,0.719498,0.560333,0.363645,0.071005,0.763729,0.598458,0.521407,...,0.281114,0.707636,0.409575,0.366253,0.141426,0.416714,0.123108,0.605565,0.228123,0.524266


To assess the correlations between the different parameters, the correlation between the column with index 0 and each of the next 1000 columns (columns 1 to 1000) is computed.

In [4]:
# Predictors: every column of df except column 0.
X_df = df.drop(columns=0)

# Target: column 0.
Y_df = df[0]
print(Y_df)

# Flatten the target into a 1-D NumPy array so it can be passed to np.corrcoef.
Y_df = np.array(Y_df).reshape(-1)
print(X_df.shape, Y_df.shape)

list_titles = X_df.columns

# Absolute Pearson correlation between column 0 and each of the first 1000
# predictor columns; a later cell compares these values with the other lists.
list_corr_df1 = []
for i in list_titles[0:1000]:
    # np.corrcoef returns a 2x2 correlation matrix; [0][1] is the coefficient
    # between the two inputs. Compute it once and reuse it for both the list
    # and the message.
    corr = np.corrcoef(Y_df, X_df[i])[0][1]
    list_corr_df1.append(abs(corr))
    print('Correlation between column 0 and column ' + str(i) + ': ' + str(corr))

0       0.106080
1       0.319986
2       0.376211
3       0.853908
4       0.542421
          ...   
9995    0.967672
9996    0.032825
9997    0.086819
9998    0.323677
9999    0.790703
Name: 0, Length: 10000, dtype: float64
(10000, 99999) (10000,)
1
Correlation matrix for column 0 and and column1: 0.008410080794700778
2
Correlation matrix for column 0 and and column2: 0.011796689361842694
3
Correlation matrix for column 0 and and column3: -0.0022427847885608835
4
Correlation matrix for column 0 and and column4: -0.013131780284264701
5
Correlation matrix for column 0 and and column5: 0.0008577028320867955
6
Correlation matrix for column 0 and and column6: 0.004824884520918836
7
Correlation matrix for column 0 and and column7: 0.004351348981220188
8
Correlation matrix for column 0 and and column8: 0.006261389333225878
9
Correlation matrix for column 0 and and column9: 0.01979980349323801
10
Correlation matrix for column 0 and and column10: -0.0021128928553898743
11
Correlation matrix f

112
Correlation matrix for column 0 and and column112: 0.007589444032416984
113
Correlation matrix for column 0 and and column113: 0.00631071428632858
114
Correlation matrix for column 0 and and column114: -0.011335778635386321
115
Correlation matrix for column 0 and and column115: 0.020347770356824536
116
Correlation matrix for column 0 and and column116: -0.0013416253819515844
117
Correlation matrix for column 0 and and column117: 0.0018599675977342587
118
Correlation matrix for column 0 and and column118: -0.0017400245004382774
119
Correlation matrix for column 0 and and column119: 0.0059033969642908616
120
Correlation matrix for column 0 and and column120: -0.011833502078286606
121
Correlation matrix for column 0 and and column121: -0.007054144832400789
122
Correlation matrix for column 0 and and column122: -0.015435932768433825
123
Correlation matrix for column 0 and and column123: -0.004471301311200906
124
Correlation matrix for column 0 and and column124: 0.02284723877086267
125

246
Correlation matrix for column 0 and and column246: -0.02011769056747025
247
Correlation matrix for column 0 and and column247: 0.0019035025013175875
248
Correlation matrix for column 0 and and column248: 0.02858805769128366
249
Correlation matrix for column 0 and and column249: 0.003789155935127317
250
Correlation matrix for column 0 and and column250: -0.008654299788447487
251
Correlation matrix for column 0 and and column251: -0.013084332669159182
252
Correlation matrix for column 0 and and column252: 0.019151926312808668
253
Correlation matrix for column 0 and and column253: 0.018090627479301916
254
Correlation matrix for column 0 and and column254: -0.0043645859515094275
255
Correlation matrix for column 0 and and column255: 0.01732203263421653
256
Correlation matrix for column 0 and and column256: -0.004067799207404243
257
Correlation matrix for column 0 and and column257: -0.016519521478622012
258
Correlation matrix for column 0 and and column258: 0.015383740019436043
259
Cor

Correlation matrix for column 0 and and column391: -0.0065499045140867875
392
Correlation matrix for column 0 and and column392: 0.00912466324142368
393
Correlation matrix for column 0 and and column393: 0.002468394483766919
394
Correlation matrix for column 0 and and column394: 0.003486660791940275
395
Correlation matrix for column 0 and and column395: 0.0048899569056571335
396
Correlation matrix for column 0 and and column396: 0.0038211227783309792
397
Correlation matrix for column 0 and and column397: -0.015794355404036748
398
Correlation matrix for column 0 and and column398: -0.007902471277884663
399
Correlation matrix for column 0 and and column399: 0.012267472367960998
400
Correlation matrix for column 0 and and column400: -0.019489828429522116
401
Correlation matrix for column 0 and and column401: 0.006002925940971487
402
Correlation matrix for column 0 and and column402: -0.01914456505983028
403
Correlation matrix for column 0 and and column403: 0.008064630277713215
404
Correl

Correlation matrix for column 0 and and column520: -0.011451557336995473
521
Correlation matrix for column 0 and and column521: -0.00013667667977637442
522
Correlation matrix for column 0 and and column522: 0.016969733603467544
523
Correlation matrix for column 0 and and column523: -0.001737639454076102
524
Correlation matrix for column 0 and and column524: -0.0002090829199679311
525
Correlation matrix for column 0 and and column525: 0.015620944384854864
526
Correlation matrix for column 0 and and column526: 0.008556486189399196
527
Correlation matrix for column 0 and and column527: -0.025019128994948375
528
Correlation matrix for column 0 and and column528: -0.00684022589652719
529
Correlation matrix for column 0 and and column529: -0.0038711446267915887
530
Correlation matrix for column 0 and and column530: 0.016901850090548398
531
Correlation matrix for column 0 and and column531: 0.0005681936159771186
532
Correlation matrix for column 0 and and column532: -0.018356204796752946
533


Correlation matrix for column 0 and and column632: 0.0004513931635522035
633
Correlation matrix for column 0 and and column633: -0.0002942859766991348
634
Correlation matrix for column 0 and and column634: 0.000296436123219085
635
Correlation matrix for column 0 and and column635: 0.010378596053504484
636
Correlation matrix for column 0 and and column636: 0.013885980753649323
637
Correlation matrix for column 0 and and column637: -0.000899986979085352
638
Correlation matrix for column 0 and and column638: -0.007618805742861051
639
Correlation matrix for column 0 and and column639: 0.017180238069168387
640
Correlation matrix for column 0 and and column640: -0.015750846156920552
641
Correlation matrix for column 0 and and column641: -0.009330803444041925
642
Correlation matrix for column 0 and and column642: 0.0013344818817050768
643
Correlation matrix for column 0 and and column643: -0.008498330465128299
644
Correlation matrix for column 0 and and column644: -0.022529777301939318
645
Co

Correlation matrix for column 0 and and column776: -0.004403917141282288
777
Correlation matrix for column 0 and and column777: -0.0025647768581208204
778
Correlation matrix for column 0 and and column778: 0.0017172514817186083
779
Correlation matrix for column 0 and and column779: -0.011179318263835437
780
Correlation matrix for column 0 and and column780: -0.0029099563911280436
781
Correlation matrix for column 0 and and column781: -0.01443930240722866
782
Correlation matrix for column 0 and and column782: -0.0009190277240046135
783
Correlation matrix for column 0 and and column783: 0.0032886092523973735
784
Correlation matrix for column 0 and and column784: -0.007839068590364451
785
Correlation matrix for column 0 and and column785: 0.017173424317075435
786
Correlation matrix for column 0 and and column786: -0.004334586967113384
787
Correlation matrix for column 0 and and column787: -0.010383098518951141
788
Correlation matrix for column 0 and and column788: 0.00844508539761477
789


Correlation matrix for column 0 and and column925: -0.006374547367887362
926
Correlation matrix for column 0 and and column926: 0.011636398177349495
927
Correlation matrix for column 0 and and column927: 0.014909241352950672
928
Correlation matrix for column 0 and and column928: -0.018952146159645376
929
Correlation matrix for column 0 and and column929: 0.005611915220867736
930
Correlation matrix for column 0 and and column930: -0.0037020589058698195
931
Correlation matrix for column 0 and and column931: 0.00931411163129096
932
Correlation matrix for column 0 and and column932: -0.022018566806286853
933
Correlation matrix for column 0 and and column933: -0.013725217454896743
934
Correlation matrix for column 0 and and column934: -0.008168913285266415
935
Correlation matrix for column 0 and and column935: 0.006107847422500853
936
Correlation matrix for column 0 and and column936: -0.005423784649596361
937
Correlation matrix for column 0 and and column937: 0.006083054546772936
938
Corre

In [5]:
# See how many correlations are significant

To see whether the correlations between the different parameters do indeed increase with the data size, the original DataFrame is compared to a subset DataFrame that contains only the first z rows of the original. If the correlations in the original DataFrame are higher than in the smaller DataFrame, this would indicate that the bigger the data size, the more frequent the spurious correlations.

In [6]:
# Reduced DataFrame: keep only the first z rows of df (all columns retained).
df2 = df.head(z)
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0.10608,0.915745,0.147077,0.167503,0.487968,0.226538,0.254787,0.623292,0.470101,0.755654,...,0.666783,0.522985,0.476157,0.773045,0.434901,0.052657,0.03775,0.628907,0.401869,0.665959
1,0.319986,0.894195,0.218637,0.707869,0.719441,0.360796,0.309027,0.827702,0.060604,0.328451,...,0.769788,0.042038,0.452191,0.851151,0.986762,0.59354,0.94772,0.608447,0.622881,0.626858
2,0.376211,0.163538,0.743877,0.51181,0.384389,0.848757,0.219392,0.917807,0.385766,0.191366,...,0.684092,0.191433,0.1654,0.830016,0.330929,0.231106,0.115412,0.97681,0.556879,0.075121
3,0.853908,0.090614,0.091299,0.610245,0.243554,0.035242,0.925989,0.46631,0.573845,0.005383,...,0.673818,0.963165,0.213568,0.949922,0.785582,0.340721,0.378636,0.281106,0.372827,0.324596
4,0.542421,0.730235,0.370256,0.337504,0.850236,0.857652,0.35841,0.944153,0.870036,0.563044,...,0.784753,0.469854,0.175904,0.76082,0.759634,0.764462,0.818192,0.516211,0.540741,0.288769


In [7]:
# Predictors: every column of df2 except column 0.
X_df2 = df2.drop(columns=0)

# Target: column 0.
Y_df2 = df2[0]
print(Y_df2)

# Flatten the target into a 1-D NumPy array so it can be passed to np.corrcoef.
Y_df2 = np.array(Y_df2).reshape(-1)
print(X_df2.shape, Y_df2.shape)

list_titles = X_df2.columns

# Absolute Pearson correlation between column 0 and each of the first 1000
# predictor columns of the reduced DataFrame.
list_corr_df2 = []
for i in list_titles[0:1000]:
    # np.corrcoef returns a 2x2 correlation matrix; [0][1] is the coefficient
    # between the two inputs. Compute it once and reuse it for both the list
    # and the message.
    corr = np.corrcoef(Y_df2, X_df2[i])[0][1]
    list_corr_df2.append(abs(corr))
    print('Correlation between column 0 and column ' + str(i) + ': ' + str(corr))

0      0.106080
1      0.319986
2      0.376211
3      0.853908
4      0.542421
         ...   
995    0.641559
996    0.242909
997    0.203350
998    0.368447
999    0.299287
Name: 0, Length: 1000, dtype: float64
(1000, 99999) (1000,)
1
Correlation matrix for column 0 and and column1: -0.0477426241530883
2
Correlation matrix for column 0 and and column2: 0.04434594117709765
3
Correlation matrix for column 0 and and column3: 0.04805079902113313
4
Correlation matrix for column 0 and and column4: -0.02707519350495381
5
Correlation matrix for column 0 and and column5: -0.006342213012390813
6
Correlation matrix for column 0 and and column6: 0.012939329432205134
7
Correlation matrix for column 0 and and column7: -0.0330964761784665
8
Correlation matrix for column 0 and and column8: 0.03585218674529555
9
Correlation matrix for column 0 and and column9: -0.009663411262792269
10
Correlation matrix for column 0 and and column10: 0.004993276417411408
11
Correlation matrix for column 0 and and co

Correlation matrix for column 0 and and column124: 0.03318242346086818
125
Correlation matrix for column 0 and and column125: -0.043832934446400476
126
Correlation matrix for column 0 and and column126: -0.005840219188269412
127
Correlation matrix for column 0 and and column127: 0.006706078937552263
128
Correlation matrix for column 0 and and column128: -0.03062494483726141
129
Correlation matrix for column 0 and and column129: -0.01015750767760574
130
Correlation matrix for column 0 and and column130: 0.05574622691807613
131
Correlation matrix for column 0 and and column131: -0.08176084573003772
132
Correlation matrix for column 0 and and column132: -0.00656511653547868
133
Correlation matrix for column 0 and and column133: -0.002825903748749321
134
Correlation matrix for column 0 and and column134: -0.0031421844664571486
135
Correlation matrix for column 0 and and column135: -0.023031454399690142
136
Correlation matrix for column 0 and and column136: 0.06759381998084266
137
Correlati

Correlation matrix for column 0 and and column267: -0.03656680062574586
268
Correlation matrix for column 0 and and column268: 0.005599818555775767
269
Correlation matrix for column 0 and and column269: -0.06306484788556033
270
Correlation matrix for column 0 and and column270: -0.012257446854683074
271
Correlation matrix for column 0 and and column271: 0.000279638998083468
272
Correlation matrix for column 0 and and column272: -0.028159547957593054
273
Correlation matrix for column 0 and and column273: 0.015968066262744662
274
Correlation matrix for column 0 and and column274: 0.025320069216493577
275
Correlation matrix for column 0 and and column275: -0.022705432460810034
276
Correlation matrix for column 0 and and column276: 0.02673772074954977
277
Correlation matrix for column 0 and and column277: -0.006657301082581819
278
Correlation matrix for column 0 and and column278: 0.004544755798685027
279
Correlation matrix for column 0 and and column279: 0.041099069654355294
280
Correlati

Correlation matrix for column 0 and and column392: 0.042827467523249334
393
Correlation matrix for column 0 and and column393: 0.05359542087022039
394
Correlation matrix for column 0 and and column394: -0.02679503925879085
395
Correlation matrix for column 0 and and column395: 0.0705210043399516
396
Correlation matrix for column 0 and and column396: 0.0223559031461472
397
Correlation matrix for column 0 and and column397: -0.08455244618375303
398
Correlation matrix for column 0 and and column398: -0.01455086391849715
399
Correlation matrix for column 0 and and column399: 0.054670278993187056
400
Correlation matrix for column 0 and and column400: -0.04425667351937459
401
Correlation matrix for column 0 and and column401: 0.04909664618588005
402
Correlation matrix for column 0 and and column402: -0.0023749027174081324
403
Correlation matrix for column 0 and and column403: -0.011315563464338353
404
Correlation matrix for column 0 and and column404: 0.03133453266781659
405
Correlation matr

Correlation matrix for column 0 and and column583: -0.004007801601505427
584
Correlation matrix for column 0 and and column584: 0.05319863070905277
585
Correlation matrix for column 0 and and column585: -0.05999587449309339
586
Correlation matrix for column 0 and and column586: 0.02001973763808169
587
Correlation matrix for column 0 and and column587: -0.018374294931896556
588
Correlation matrix for column 0 and and column588: -0.023851551153347007
589
Correlation matrix for column 0 and and column589: 0.038636323399760704
590
Correlation matrix for column 0 and and column590: -0.0022472460165805876
591
Correlation matrix for column 0 and and column591: 0.0237962445259037
592
Correlation matrix for column 0 and and column592: 0.008624277445462167
593
Correlation matrix for column 0 and and column593: 0.030508804486426358
594
Correlation matrix for column 0 and and column594: -0.04594080781557257
595
Correlation matrix for column 0 and and column595: -0.027432955739486144
596
Correlatio

Correlation matrix for column 0 and and column748: 0.05888450503475155
749
Correlation matrix for column 0 and and column749: -0.007451519804722221
750
Correlation matrix for column 0 and and column750: 0.022594340627599418
751
Correlation matrix for column 0 and and column751: -0.006902178458392991
752
Correlation matrix for column 0 and and column752: 0.028298074616834826
753
Correlation matrix for column 0 and and column753: 0.0002942918012477627
754
Correlation matrix for column 0 and and column754: -0.008169866129044133
755
Correlation matrix for column 0 and and column755: -0.012844405230576535
756
Correlation matrix for column 0 and and column756: 0.03191453674604816
757
Correlation matrix for column 0 and and column757: -0.010389454346836772
758
Correlation matrix for column 0 and and column758: 0.025551222087447294
759
Correlation matrix for column 0 and and column759: 0.008615589005242253
760
Correlation matrix for column 0 and and column760: -0.004859964768151009
761
Correla

945
Correlation matrix for column 0 and and column945: -0.027835012661045235
946
Correlation matrix for column 0 and and column946: -0.022054214166691465
947
Correlation matrix for column 0 and and column947: -0.004961932177071786
948
Correlation matrix for column 0 and and column948: 0.007456494522693268
949
Correlation matrix for column 0 and and column949: 0.021636665811985967
950
Correlation matrix for column 0 and and column950: 0.041196719652590245
951
Correlation matrix for column 0 and and column951: -0.0285928867774957
952
Correlation matrix for column 0 and and column952: -0.019776147688084778
953
Correlation matrix for column 0 and and column953: 0.010047862996875809
954
Correlation matrix for column 0 and and column954: 0.003950498922218449
955
Correlation matrix for column 0 and and column955: 0.03318546328431479
956
Correlation matrix for column 0 and and column956: 0.062226354704302544
957
Correlation matrix for column 0 and and column957: 0.007515147044634959
958
Correl

We now compare, column by column, which of the two correlation lists holds the larger absolute correlation.

In [8]:
# Column-by-column comparison of the absolute correlations:
#   a    -> columns where the full DataFrame (list_corr_df1) has the larger value
#   b    -> columns where the reduced DataFrame (list_corr_df2) has the larger value
#   ties -> columns where neither value is larger (equal, or not comparable
#           because one of them is NaN)
a = 0
b = 0
ties = 0
# zip walks both lists in step, so the comparison follows their actual length
# instead of a hard-coded 1000.
for corr_full, corr_small in zip(list_corr_df1, list_corr_df2):
    if corr_full > corr_small:
        a += 1
    elif corr_full < corr_small:
        b += 1
    else:
        ties += 1

print(a)
print(b)
# Report ties explicitly rather than emitting an unexplained blank line.
if ties:
    print('Ties: ' + str(ties))

179
821


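The counts above show that the bigger DataFrame has the larger absolute correlation for only 179 of the 1000 columns, whereas the smaller DataFrame has the larger one for 821. With the number of columns held fixed, it is therefore the DataFrame with fewer rows that exhibits the stronger spurious correlations, not the bigger one.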

To counter spurious correlations, random projection can be used.

In [9]:
# Reduce df with a Gaussian random projection to create a new, narrower
# DataFrame. n_components is left at its default, so scikit-learn picks the
# target dimension itself from the number of rows and eps (the recorded run
# produced 7894 columns for 10000 rows and eps = 0.1).
# random_state is fixed so the projection matrix, and therefore df_new, is the
# same on every run.
transformer = random_projection.GaussianRandomProjection(eps=0.1, random_state=0)
df_new = pd.DataFrame(transformer.fit_transform(df))
df_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7884,7885,7886,7887,7888,7889,7890,7891,7892,7893
0,5.670862,0.057151,-0.135402,-1.760102,2.884212,-0.093212,1.685615,2.814256,2.700628,-1.347755,...,1.209315,1.390876,1.451976,2.948232,-0.509694,-0.034612,1.378267,0.137740,1.366152,0.946494
1,2.349787,-0.402329,-1.252380,-1.877291,3.687636,2.634576,0.074284,2.934766,0.614080,0.252553,...,2.498750,0.775407,1.800461,2.809300,0.692554,0.587198,1.199599,1.679041,1.074368,0.544879
2,1.713806,-1.715508,-0.970058,-2.137792,1.657335,1.357015,0.471879,0.803816,0.512119,-0.320291,...,1.945260,-0.052550,1.167428,2.982206,-0.043860,0.360122,3.182344,0.211192,-0.386432,1.600708
3,2.882477,0.129041,-0.923119,-2.345714,2.924268,2.261255,1.662121,1.496799,0.678567,-1.153655,...,2.390584,-1.342872,-0.971854,2.580534,0.841322,0.463500,-1.729076,-2.083165,0.700075,0.117966
4,1.767186,1.700076,-2.044642,-1.984174,0.728949,1.024204,-1.893587,2.392170,1.166060,-0.920460,...,1.538565,1.645440,-1.222361,1.823988,1.467061,-0.804428,0.068588,-0.993741,-1.958512,0.449796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,3.888164,1.604034,-0.745974,-2.825862,2.179911,1.981320,1.695968,1.384654,-0.621952,-0.540189,...,2.437488,3.293972,1.744840,3.879388,0.659033,1.590077,-0.334041,0.675320,1.351580,-0.882549
9996,2.039231,0.355057,-0.326451,-2.666728,2.166216,1.267619,0.875377,1.591650,1.392652,0.237891,...,2.596895,-0.109567,0.310242,1.231030,-0.112818,0.961360,0.095505,0.804940,-1.545162,0.630513
9997,1.835096,-0.005389,-0.897589,-2.394124,1.176861,0.183814,-1.292158,1.466139,1.084703,0.108558,...,1.959892,0.748968,-0.990033,4.283234,0.882570,-0.618762,1.663424,-0.549575,-1.246046,0.536350
9998,2.530072,-1.532775,-0.534896,-3.317434,1.829255,2.396135,0.535863,2.170911,0.800595,-0.980433,...,2.960863,0.539518,1.807661,3.204140,1.670447,-0.235949,0.983791,1.125146,-0.578627,-1.025699


In [10]:
# Predictors: every projected column of df_new except column 0.
X_df_new = df_new.drop(columns=0)

# Target: projected column 0.
Y_df_new = df_new[0]
print(Y_df_new)

# Flatten the target into a 1-D NumPy array so it can be passed to np.corrcoef.
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape, Y_df_new.shape)

list_titles = X_df_new.columns

# Absolute Pearson correlation between projected column 0 and each of the
# first 1000 remaining projected columns.
list_corr_df_new = []
for i in list_titles[0:1000]:
    # np.corrcoef returns a 2x2 correlation matrix; [0][1] is the coefficient
    # between the two inputs. Compute it once and reuse it for both the list
    # and the message.
    corr = np.corrcoef(Y_df_new, X_df_new[i])[0][1]
    list_corr_df_new.append(abs(corr))
    print('Correlation between column 0 and column ' + str(i) + ': ' + str(corr))

0       5.670862
1       2.349787
2       1.713806
3       2.882477
4       1.767186
          ...   
9995    3.888164
9996    2.039231
9997    1.835096
9998    2.530072
9999    2.569609
Name: 0, Length: 10000, dtype: float64
(10000, 7893) (10000,)
1
Correlation matrix for column 0 and and column1: -0.0061114474803215454
2
Correlation matrix for column 0 and and column2: -0.016787868634932708
3
Correlation matrix for column 0 and and column3: 0.0039108991671462584
4
Correlation matrix for column 0 and and column4: -0.002491263333554025
5
Correlation matrix for column 0 and and column5: -0.01700317613660019
6
Correlation matrix for column 0 and and column6: 0.008883907327021035
7
Correlation matrix for column 0 and and column7: -0.004128929381324226
8
Correlation matrix for column 0 and and column8: 0.018177863541202974
9
Correlation matrix for column 0 and and column9: 0.005472569314095642
10
Correlation matrix for column 0 and and column10: -0.0018440643286751445
11
Correlation matrix

Correlation matrix for column 0 and and column212: -0.003021316903189185
213
Correlation matrix for column 0 and and column213: -0.008187269417495308
214
Correlation matrix for column 0 and and column214: 0.018495647550215676
215
Correlation matrix for column 0 and and column215: -0.004277185178660367
216
Correlation matrix for column 0 and and column216: -0.0057002435978854415
217
Correlation matrix for column 0 and and column217: -0.0005908295405163509
218
Correlation matrix for column 0 and and column218: -0.002891712481006255
219
Correlation matrix for column 0 and and column219: -0.0180078699060549
220
Correlation matrix for column 0 and and column220: 0.008958025650954382
221
Correlation matrix for column 0 and and column221: -0.024580742246638042
222
Correlation matrix for column 0 and and column222: -0.005291240333939133
223
Correlation matrix for column 0 and and column223: -0.006968708051347492
224
Correlation matrix for column 0 and and column224: 0.006087427484892501
225
Co

Correlation matrix for column 0 and and column329: -0.004702087124769406
330
Correlation matrix for column 0 and and column330: -0.008772565865573236
331
Correlation matrix for column 0 and and column331: -0.01598494610797631
332
Correlation matrix for column 0 and and column332: -0.016181171517536536
333
Correlation matrix for column 0 and and column333: -0.003974220066736992
334
Correlation matrix for column 0 and and column334: -0.005304921181705303
335
Correlation matrix for column 0 and and column335: 0.0007662807473960344
336
Correlation matrix for column 0 and and column336: -0.004070258461658416
337
Correlation matrix for column 0 and and column337: -0.02005135120065564
338
Correlation matrix for column 0 and and column338: 0.012816778120192164
339
Correlation matrix for column 0 and and column339: -0.014781425184066652
340
Correlation matrix for column 0 and and column340: -0.003576382575604375
341
Correlation matrix for column 0 and and column341: -0.013648367046697031
342
Co

Correlation matrix for column 0 and and column446: -0.01631427927692008
447
Correlation matrix for column 0 and and column447: 0.006252380534369855
448
Correlation matrix for column 0 and and column448: -0.01056021678295443
449
Correlation matrix for column 0 and and column449: -0.004478952993470724
450
Correlation matrix for column 0 and and column450: 0.0068568295028087626
451
Correlation matrix for column 0 and and column451: -0.01965011097969346
452
Correlation matrix for column 0 and and column452: -0.011508745635210621
453
Correlation matrix for column 0 and and column453: -0.0007189075818628455
454
Correlation matrix for column 0 and and column454: -0.02615604236535367
455
Correlation matrix for column 0 and and column455: -0.03013923881322245
456
Correlation matrix for column 0 and and column456: -0.0023797695726805444
457
Correlation matrix for column 0 and and column457: 0.007435960967824108
458
Correlation matrix for column 0 and and column458: -0.010849326920835279
459
Corr

Correlation matrix for column 0 and and column559: 0.01628629858586051
560
Correlation matrix for column 0 and and column560: -0.0008621830993100623
561
Correlation matrix for column 0 and and column561: 0.00033086893794231735
562
Correlation matrix for column 0 and and column562: -0.012590873855041207
563
Correlation matrix for column 0 and and column563: -0.0005867986792472518
564
Correlation matrix for column 0 and and column564: 0.005013234819169789
565
Correlation matrix for column 0 and and column565: -0.009391494781872407
566
Correlation matrix for column 0 and and column566: 0.0040324038395743575
567
Correlation matrix for column 0 and and column567: -0.018425591314133145
568
Correlation matrix for column 0 and and column568: 0.010631983860326176
569
Correlation matrix for column 0 and and column569: -0.004600491323769583
570
Correlation matrix for column 0 and and column570: -0.007805597398626539
571
Correlation matrix for column 0 and and column571: -0.014539009092489162
572


Correlation matrix for column 0 and and column778: -0.003887917987155067
779
Correlation matrix for column 0 and and column779: -0.0068250529143480225
780
Correlation matrix for column 0 and and column780: 0.009679577429124153
781
Correlation matrix for column 0 and and column781: -0.01567855387637372
782
Correlation matrix for column 0 and and column782: 0.007826148079707036
783
Correlation matrix for column 0 and and column783: -0.005788583006595976
784
Correlation matrix for column 0 and and column784: 0.004001113966040925
785
Correlation matrix for column 0 and and column785: 0.00561840767864584
786
Correlation matrix for column 0 and and column786: 0.022565932103604105
787
Correlation matrix for column 0 and and column787: 0.005360342635639809
788
Correlation matrix for column 0 and and column788: 0.009711098556096186
789
Correlation matrix for column 0 and and column789: 0.016510214212456145
790
Correlation matrix for column 0 and and column790: -0.0013007352461355637
791
Correla

896
Correlation matrix for column 0 and and column896: -0.0031697671643407233
897
Correlation matrix for column 0 and and column897: 0.004266928160509329
898
Correlation matrix for column 0 and and column898: -0.0020584088291278235
899
Correlation matrix for column 0 and and column899: -0.0013854470532934805
900
Correlation matrix for column 0 and and column900: -0.019821008242446675
901
Correlation matrix for column 0 and and column901: 0.00949131684155884
902
Correlation matrix for column 0 and and column902: -0.014547210617499678
903
Correlation matrix for column 0 and and column903: 0.0022980796429672465
904
Correlation matrix for column 0 and and column904: 0.008714836107337962
905
Correlation matrix for column 0 and and column905: 0.0007514191833999454
906
Correlation matrix for column 0 and and column906: 0.007747799549591329
907
Correlation matrix for column 0 and and column907: 0.0006844335188360046
908
Correlation matrix for column 0 and and column908: -0.010657607595578635
9

In [11]:
# Count how many stored correlations exceed 0.01 in absolute value:
#   a -> in the original data (list_corr_df1)
#   b -> in the randomly projected data (list_corr_df_new)
a = sum(1 for value in list_corr_df1 if abs(value) > 0.01)
b = sum(1 for value in list_corr_df_new if abs(value) > 0.01)
print(a)
print(b)

336
349


In [12]:
# Share (in percent) of the original-data correlations counted in a.
print(a / len(list_corr_df1) * 100)

33.6


In [13]:
# Share (in percent) of the projected-data correlations counted in b.
print(b / len(list_corr_df_new) * 100)

34.9
