# Spurious Correlations of Highly Dimensional Big Data

This notebook aims to show how PCA and random projection can solve the problem of spurious correlations in Big Data.

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.utils import shuffle

## DataFrame Creation

In [2]:
# User-defined size parameters

# Row count (x) and column count (y) of the full dataframe df1
x, y = 10000, 100000

# Row count of the reduced dataframe df2
z = 1000

In [3]:
# Create a dataframe with x rows and y columns of independent uniform [0, 1) samples.
# A seeded generator is used so that Restart & Run All reproduces the same data
# (and therefore the same correlations) on every run.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.random((x, y)))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0.719829,0.462003,0.081165,0.698193,0.255332,0.155932,0.249774,0.252748,0.733747,0.102541,...,0.497148,0.283155,0.654431,0.226188,0.501927,0.197329,0.057901,0.649044,0.033201,0.007397
1,0.709905,0.966296,0.731566,0.669224,0.134816,0.572518,0.026870,0.423052,0.204950,0.105009,...,0.751547,0.358415,0.025055,0.323651,0.051560,0.682866,0.522802,0.872769,0.248739,0.812329
2,0.744070,0.194531,0.227904,0.124831,0.701871,0.985827,0.908152,0.042201,0.780544,0.538415,...,0.868260,0.368283,0.638358,0.405806,0.958656,0.185602,0.891345,0.288123,0.240144,0.299516
3,0.859256,0.051323,0.393889,0.390453,0.520116,0.843590,0.287601,0.209255,0.855863,0.267118,...,0.184361,0.340770,0.139397,0.668547,0.894468,0.321328,0.266349,0.424386,0.884104,0.017647
4,0.295052,0.513433,0.920665,0.246196,0.756871,0.320587,0.303282,0.425225,0.957904,0.105989,...,0.600181,0.312879,0.113000,0.319973,0.764525,0.401135,0.101091,0.911585,0.190624,0.765528
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.571081,0.487237,0.758406,0.119115,0.861536,0.125699,0.117643,0.423059,0.549159,0.208591,...,0.014271,0.132231,0.213918,0.688567,0.781721,0.154324,0.437190,0.095790,0.780704,0.392751
9996,0.981745,0.547775,0.942482,0.765274,0.412681,0.279292,0.602263,0.773271,0.841478,0.263435,...,0.151594,0.470156,0.751226,0.982421,0.961168,0.388828,0.460306,0.813378,0.907603,0.514374
9997,0.523137,0.064083,0.489168,0.826083,0.348416,0.629712,0.176498,0.317711,0.504286,0.111038,...,0.723349,0.864000,0.816017,0.902807,0.499842,0.513102,0.486274,0.171698,0.660239,0.209246
9998,0.792555,0.118775,0.768572,0.659464,0.485785,0.484552,0.206746,0.817139,0.070220,0.513239,...,0.454292,0.467809,0.211402,0.447427,0.666081,0.210682,0.482833,0.860486,0.536626,0.618696


To assess the correlations between the different parameters, the correlation between column 0 and each of the next 1,000 columns (columns 1 to 1000) is computed.

In [4]:
# Assigning X to all columns except 0
# NOTE(review): drop() returns a new DataFrame, which may duplicate most of df
# in memory - confirm this is acceptable at the configured size.
X_df = df.drop(columns=0)

# Assigning Y to column 0
Y_df = df[0]
print(Y_df)

# Flatten Y into a 1-D NumPy array (not a Python list) for np.corrcoef
Y_df = np.array(Y_df).reshape(-1)
print(X_df.shape,Y_df.shape)

list_titles = X_df.columns

# Absolute correlation between column 0 and each of the first 1000 other columns
list_corr_df1 = []
for i in list_titles[0:1000]:
    # Compute the coefficient once per column; [0][1] is the off-diagonal
    # entry of the 2x2 matrix returned by np.corrcoef.
    corr = np.corrcoef(Y_df, X_df[i])[0][1]
    list_corr_df1.append(abs(corr))
    print('Correlation coefficient for column 0 and column ' + str(i) + ': ' + str(corr))

0       0.719829
1       0.709905
2       0.744070
3       0.859256
4       0.295052
          ...   
9995    0.571081
9996    0.981745
9997    0.523137
9998    0.792555
9999    0.891010
Name: 0, Length: 10000, dtype: float64
(10000, 99999) (10000,)
1
Correlation matrix for column 0 and and column1: 0.00878774869902865
2
Correlation matrix for column 0 and and column2: -0.004377273146763183
3
Correlation matrix for column 0 and and column3: -0.004375743106629405
4
Correlation matrix for column 0 and and column4: -0.0007744643087520813
5
Correlation matrix for column 0 and and column5: 0.008508545428741783
6
Correlation matrix for column 0 and and column6: -0.003240309059438602
7
Correlation matrix for column 0 and and column7: 0.0014300941949683585
8
Correlation matrix for column 0 and and column8: -0.004237918690511354
9
Correlation matrix for column 0 and and column9: -0.008779802282128902
10
Correlation matrix for column 0 and and column10: -0.00108283873412623
11
Correlation matrix

Correlation matrix for column 0 and and column118: -0.015080095454994388
119
Correlation matrix for column 0 and and column119: -0.0003541281591486478
120
Correlation matrix for column 0 and and column120: 0.008096285773556049
121
Correlation matrix for column 0 and and column121: -0.007619402085270549
122
Correlation matrix for column 0 and and column122: 0.004167597443589617
123
Correlation matrix for column 0 and and column123: 0.0049089655088146444
124
Correlation matrix for column 0 and and column124: -0.011372321197127873
125
Correlation matrix for column 0 and and column125: 0.004011155192541974
126
Correlation matrix for column 0 and and column126: -0.017087881095942538
127
Correlation matrix for column 0 and and column127: -0.009896497237090488
128
Correlation matrix for column 0 and and column128: 0.003630597113940047
129
Correlation matrix for column 0 and and column129: 0.009063198185047862
130
Correlation matrix for column 0 and and column130: 0.0073421694153993295
131
Cor

Correlation matrix for column 0 and and column240: 0.013090346797642795
241
Correlation matrix for column 0 and and column241: 0.011961815641272413
242
Correlation matrix for column 0 and and column242: -0.005377547516721666
243
Correlation matrix for column 0 and and column243: -0.003564298268094033
244
Correlation matrix for column 0 and and column244: -0.00025412353538416856
245
Correlation matrix for column 0 and and column245: 0.001470871255226352
246
Correlation matrix for column 0 and and column246: 0.0036152924757046653
247
Correlation matrix for column 0 and and column247: -0.0025984825975268154
248
Correlation matrix for column 0 and and column248: 0.006785401274212861
249
Correlation matrix for column 0 and and column249: 0.003499674352428564
250
Correlation matrix for column 0 and and column250: -0.002279360742148717
251
Correlation matrix for column 0 and and column251: 0.0031541626126476456
252
Correlation matrix for column 0 and and column252: 0.002804804246743771
253
Co

396
Correlation matrix for column 0 and and column396: -0.003086506969270775
397
Correlation matrix for column 0 and and column397: 0.004568950413773698
398
Correlation matrix for column 0 and and column398: -0.006297107703378275
399
Correlation matrix for column 0 and and column399: -0.0019640367511066944
400
Correlation matrix for column 0 and and column400: 0.012470835804704844
401
Correlation matrix for column 0 and and column401: 0.0026264121253730525
402
Correlation matrix for column 0 and and column402: -0.0029734037194437454
403
Correlation matrix for column 0 and and column403: -0.003070819712492153
404
Correlation matrix for column 0 and and column404: 0.011010800286985709
405
Correlation matrix for column 0 and and column405: -0.000701318300417776
406
Correlation matrix for column 0 and and column406: 0.005165254785939364
407
Correlation matrix for column 0 and and column407: -0.0065696554985807225
408
Correlation matrix for column 0 and and column408: -0.003248821143736477


Correlation matrix for column 0 and and column530: -0.007005869101344543
531
Correlation matrix for column 0 and and column531: -0.0033956877528015287
532
Correlation matrix for column 0 and and column532: 0.010267361234309451
533
Correlation matrix for column 0 and and column533: 0.026035028671142834
534
Correlation matrix for column 0 and and column534: -0.004060462965174009
535
Correlation matrix for column 0 and and column535: -0.0032954350316612593
536
Correlation matrix for column 0 and and column536: -0.018277180678534698
537
Correlation matrix for column 0 and and column537: 0.0054119881577494675
538
Correlation matrix for column 0 and and column538: 0.012740523692205075
539
Correlation matrix for column 0 and and column539: 0.0054914509680329794
540
Correlation matrix for column 0 and and column540: -0.00028313974099638395
541
Correlation matrix for column 0 and and column541: 0.0004429537352880834
542
Correlation matrix for column 0 and and column542: 0.008384393575560112
543

Correlation matrix for column 0 and and column690: 0.01679326751298403
691
Correlation matrix for column 0 and and column691: 0.006484418746527429
692
Correlation matrix for column 0 and and column692: -0.004019831766687907
693
Correlation matrix for column 0 and and column693: -0.006614823814025867
694
Correlation matrix for column 0 and and column694: 0.0076495983909158575
695
Correlation matrix for column 0 and and column695: 0.003955922663028285
696
Correlation matrix for column 0 and and column696: -0.010256659063601719
697
Correlation matrix for column 0 and and column697: -0.01982917444636959
698
Correlation matrix for column 0 and and column698: 0.0022048305765769048
699
Correlation matrix for column 0 and and column699: -0.011048456576100795
700
Correlation matrix for column 0 and and column700: -0.00045069851025507726
701
Correlation matrix for column 0 and and column701: -0.004848497108003785
702
Correlation matrix for column 0 and and column702: -0.0043303304803515436
703
C

Correlation matrix for column 0 and and column816: 0.008086281282381893
817
Correlation matrix for column 0 and and column817: 0.01587639150902741
818
Correlation matrix for column 0 and and column818: -0.0083056261219259
819
Correlation matrix for column 0 and and column819: -0.0001444871134727124
820
Correlation matrix for column 0 and and column820: -0.00692491191021976
821
Correlation matrix for column 0 and and column821: -0.0004386807642914367
822
Correlation matrix for column 0 and and column822: 0.0010215549821607104
823
Correlation matrix for column 0 and and column823: 0.0013607637617154727
824
Correlation matrix for column 0 and and column824: 0.00324802490653126
825
Correlation matrix for column 0 and and column825: 0.002004875831515691
826
Correlation matrix for column 0 and and column826: 0.018335348322407734
827
Correlation matrix for column 0 and and column827: 0.016830459006553722
828
Correlation matrix for column 0 and and column828: 0.008379293435988738
829
Correlati

Correlation matrix for column 0 and and column961: 0.015891767856822888
962
Correlation matrix for column 0 and and column962: -0.012277407019214731
963
Correlation matrix for column 0 and and column963: 0.0039105739418402894
964
Correlation matrix for column 0 and and column964: 0.010919160680879154
965
Correlation matrix for column 0 and and column965: 0.01585770963608188
966
Correlation matrix for column 0 and and column966: -0.007820519358571495
967
Correlation matrix for column 0 and and column967: -0.00872252042589658
968
Correlation matrix for column 0 and and column968: -0.0073161517971370845
969
Correlation matrix for column 0 and and column969: -0.007105890309282302
970
Correlation matrix for column 0 and and column970: 0.0001991153538823826
971
Correlation matrix for column 0 and and column971: 0.009014451266114113
972
Correlation matrix for column 0 and and column972: 0.01564835069243717
973
Correlation matrix for column 0 and and column973: -0.010972749057833528
974
Correl

To check whether the correlations between the different parameters do increase with data size, the original dataframe is compared with a subset dataframe that keeps only the first z rows of the original. If the correlations in the original dataframe were higher than in the smaller dataframe, this would prove that the bigger the data size, the more frequent the spurious correlations.

In [5]:
# Keep only the first z rows of the original dataframe as the smaller comparison set
df2 = df.head(z)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0.719829,0.462003,0.081165,0.698193,0.255332,0.155932,0.249774,0.252748,0.733747,0.102541,...,0.497148,0.283155,0.654431,0.226188,0.501927,0.197329,0.057901,0.649044,0.033201,0.007397
1,0.709905,0.966296,0.731566,0.669224,0.134816,0.572518,0.026870,0.423052,0.204950,0.105009,...,0.751547,0.358415,0.025055,0.323651,0.051560,0.682866,0.522802,0.872769,0.248739,0.812329
2,0.744070,0.194531,0.227904,0.124831,0.701871,0.985827,0.908152,0.042201,0.780544,0.538415,...,0.868260,0.368283,0.638358,0.405806,0.958656,0.185602,0.891345,0.288123,0.240144,0.299516
3,0.859256,0.051323,0.393889,0.390453,0.520116,0.843590,0.287601,0.209255,0.855863,0.267118,...,0.184361,0.340770,0.139397,0.668547,0.894468,0.321328,0.266349,0.424386,0.884104,0.017647
4,0.295052,0.513433,0.920665,0.246196,0.756871,0.320587,0.303282,0.425225,0.957904,0.105989,...,0.600181,0.312879,0.113000,0.319973,0.764525,0.401135,0.101091,0.911585,0.190624,0.765528
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.475734,0.197568,0.342474,0.607344,0.088745,0.261387,0.329680,0.040369,0.345740,0.026497,...,0.312242,0.145031,0.440357,0.391295,0.381609,0.630485,0.065148,0.035008,0.077511,0.359699
996,0.979696,0.543375,0.378511,0.552622,0.225528,0.296851,0.987186,0.186232,0.483569,0.146092,...,0.999511,0.976450,0.989609,0.404554,0.659847,0.282518,0.349211,0.412232,0.969186,0.482887
997,0.331396,0.947620,0.314432,0.675827,0.771553,0.495141,0.474989,0.714843,0.326292,0.627399,...,0.972455,0.594838,0.850237,0.734964,0.332858,0.724317,0.170480,0.180732,0.367235,0.989698
998,0.811667,0.864191,0.670762,0.405521,0.926258,0.212404,0.338725,0.032479,0.432321,0.565063,...,0.945478,0.305691,0.449997,0.859140,0.871295,0.677072,0.230960,0.247891,0.392236,0.322405


In [6]:
# Assigning X to all columns except 0
# NOTE(review): drop() returns a new DataFrame, which may duplicate most of df2
# in memory - confirm this is acceptable at the configured size.
X_df2 = df2.drop(columns=0)

# Assigning Y to column 0
Y_df2 = df2[0]
print(Y_df2)

# Flatten Y into a 1-D NumPy array (not a Python list) for np.corrcoef
Y_df2 = np.array(Y_df2).reshape(-1)
print(X_df2.shape,Y_df2.shape)

list_titles = X_df2.columns

# Absolute correlation between column 0 and each of the first 1000 other columns
list_corr_df2 = []
for i in list_titles[0:1000]:
    # Compute the coefficient once per column; [0][1] is the off-diagonal
    # entry of the 2x2 matrix returned by np.corrcoef.
    corr = np.corrcoef(Y_df2, X_df2[i])[0][1]
    list_corr_df2.append(abs(corr))
    print('Correlation coefficient for column 0 and column ' + str(i) + ': ' + str(corr))

0      0.719829
1      0.709905
2      0.744070
3      0.859256
4      0.295052
         ...   
995    0.475734
996    0.979696
997    0.331396
998    0.811667
999    0.633195
Name: 0, Length: 1000, dtype: float64
(1000, 99999) (1000,)
1
Correlation matrix for column 0 and and column1: -0.04279218412167287
2
Correlation matrix for column 0 and and column2: 0.01916400639687699
3
Correlation matrix for column 0 and and column3: -0.0456468385697526
4
Correlation matrix for column 0 and and column4: -0.03832387633578141
5
Correlation matrix for column 0 and and column5: -0.025710826497435415
6
Correlation matrix for column 0 and and column6: -0.01900551649230253
7
Correlation matrix for column 0 and and column7: 0.018825487579633076
8
Correlation matrix for column 0 and and column8: -0.005081415471813811
9
Correlation matrix for column 0 and and column9: 0.01655500748551623
10
Correlation matrix for column 0 and and column10: -0.02239534793078227
11
Correlation matrix for column 0 and and 

Correlation matrix for column 0 and and column236: 0.0031757766624125757
237
Correlation matrix for column 0 and and column237: -0.0028632561698742217
238
Correlation matrix for column 0 and and column238: -0.02493412998068336
239
Correlation matrix for column 0 and and column239: 0.0037361195756692336
240
Correlation matrix for column 0 and and column240: 0.057604753762421114
241
Correlation matrix for column 0 and and column241: 0.011631606654496267
242
Correlation matrix for column 0 and and column242: -0.05521012328751815
243
Correlation matrix for column 0 and and column243: -0.03790245942599595
244
Correlation matrix for column 0 and and column244: 0.007280012685199165
245
Correlation matrix for column 0 and and column245: -0.022742655125585962
246
Correlation matrix for column 0 and and column246: 0.01752989992345153
247
Correlation matrix for column 0 and and column247: -0.025095602440525947
248
Correlation matrix for column 0 and and column248: 0.021441332526509097
249
Correla

Correlation matrix for column 0 and and column371: 0.03966411660179003
372
Correlation matrix for column 0 and and column372: 0.015213520388685224
373
Correlation matrix for column 0 and and column373: 0.0027858737869313624
374
Correlation matrix for column 0 and and column374: 0.01475298815809689
375
Correlation matrix for column 0 and and column375: -0.018790840605134727
376
Correlation matrix for column 0 and and column376: 0.019839929204134098
377
Correlation matrix for column 0 and and column377: -0.03160579081010633
378
Correlation matrix for column 0 and and column378: 0.024266425357930575
379
Correlation matrix for column 0 and and column379: 0.03724517031977056
380
Correlation matrix for column 0 and and column380: -0.03265777423859397
381
Correlation matrix for column 0 and and column381: 0.007118137676351979
382
Correlation matrix for column 0 and and column382: 0.061460617563158314
383
Correlation matrix for column 0 and and column383: 0.03048371112097424
384
Correlation ma

Correlation matrix for column 0 and and column492: -0.026650550419144017
493
Correlation matrix for column 0 and and column493: 0.04440791623353246
494
Correlation matrix for column 0 and and column494: 0.07393763292668924
495
Correlation matrix for column 0 and and column495: -0.01277085891605039
496
Correlation matrix for column 0 and and column496: 0.03754919858570218
497
Correlation matrix for column 0 and and column497: -0.011243778461837474
498
Correlation matrix for column 0 and and column498: -0.03584642144087534
499
Correlation matrix for column 0 and and column499: 0.060490934220694424
500
Correlation matrix for column 0 and and column500: 0.05074295441894697
501
Correlation matrix for column 0 and and column501: 0.04988185705500255
502
Correlation matrix for column 0 and and column502: -0.011570496323944128
503
Correlation matrix for column 0 and and column503: -0.036261583732427834
504
Correlation matrix for column 0 and and column504: 0.012474776685799324
505
Correlation m

Correlation matrix for column 0 and and column610: 0.001035573224625963
611
Correlation matrix for column 0 and and column611: 0.04896996554349866
612
Correlation matrix for column 0 and and column612: 0.03940557264409528
613
Correlation matrix for column 0 and and column613: -0.028060863558906586
614
Correlation matrix for column 0 and and column614: -0.015170513305248977
615
Correlation matrix for column 0 and and column615: -0.02371022716144888
616
Correlation matrix for column 0 and and column616: 0.018421659829682708
617
Correlation matrix for column 0 and and column617: -0.003005418586508161
618
Correlation matrix for column 0 and and column618: 0.019885201614033222
619
Correlation matrix for column 0 and and column619: -0.036150192215761254
620
Correlation matrix for column 0 and and column620: -0.017753735934537402
621
Correlation matrix for column 0 and and column621: 0.037380149831960724
622
Correlation matrix for column 0 and and column622: 0.005673293831320163
623
Correlati

Correlation matrix for column 0 and and column725: 0.024696485256739606
726
Correlation matrix for column 0 and and column726: 0.0173009392052181
727
Correlation matrix for column 0 and and column727: 0.0327241478327306
728
Correlation matrix for column 0 and and column728: -0.04147787268619617
729
Correlation matrix for column 0 and and column729: -0.022256579636239858
730
Correlation matrix for column 0 and and column730: -0.019095037760575594
731
Correlation matrix for column 0 and and column731: 0.022718680983408384
732
Correlation matrix for column 0 and and column732: 0.010123772121697088
733
Correlation matrix for column 0 and and column733: -0.035928466385254265
734
Correlation matrix for column 0 and and column734: -0.013641121811091702
735
Correlation matrix for column 0 and and column735: -0.03706100489827455
736
Correlation matrix for column 0 and and column736: -0.01466434378541428
737
Correlation matrix for column 0 and and column737: -0.018475481590973543
738
Correlation

Correlation matrix for column 0 and and column878: 0.03696717002140671
879
Correlation matrix for column 0 and and column879: 0.002177061803436777
880
Correlation matrix for column 0 and and column880: 0.017965480576651233
881
Correlation matrix for column 0 and and column881: 0.02893731617811406
882
Correlation matrix for column 0 and and column882: -0.014108080732543037
883
Correlation matrix for column 0 and and column883: -0.011961486831130639
884
Correlation matrix for column 0 and and column884: 0.016168919681790038
885
Correlation matrix for column 0 and and column885: -0.007910481776207985
886
Correlation matrix for column 0 and and column886: -0.017954143488455365
887
Correlation matrix for column 0 and and column887: -0.029719138547353665
888
Correlation matrix for column 0 and and column888: 0.009610665891329976
889
Correlation matrix for column 0 and and column889: -0.02022441877021743
890
Correlation matrix for column 0 and and column890: -0.07462593984652244
891
Correlati

We now compare, column by column, which of the two correlation lists holds the larger absolute correlation.

In [7]:
# Count, column by column, which dataframe shows the larger absolute correlation
# with column 0.
#   a: number of columns where list_corr_df1 holds the larger value
#   b: number of columns where list_corr_df2 holds the larger value
a = 0
b = 0
# zip() pairs the entries by position and stops at the shorter list, so the
# comparison no longer assumes that exactly 1000 correlations were computed.
for corr_df1, corr_df2 in zip(list_corr_df1, list_corr_df2):
    if abs(corr_df1) > abs(corr_df2):
        a += 1
    elif abs(corr_df1) < abs(corr_df2):
        b += 1
    # Pairs that are exactly equal (or not comparable) count toward neither total.

print(a)
print(b)

189
811


In [8]:
# Number of entries in each correlation list whose absolute value exceeds 0.01
a = sum(1 for corr in list_corr_df1 if abs(corr) > 0.01)
b = sum(1 for corr in list_corr_df2 if abs(corr) > 0.01)

# Express each count as a percentage of the corresponding list length
pct_df1 = (a/len(list_corr_df1)*100)
pct_df2 = (b/len(list_corr_df2)*100)

print("Percentage of correlations in df1:", pct_df1, "%")

print("Percentage of correlations in df2:", pct_df2, "%")

Percentage of correlations in df1: 30.099999999999998 %
Percentage of correlations in df2: 76.3 %


The results clearly show that the fewer the observations, the more frequent the spurious correlations. However, the number of spurious correlations present in the bigger DataFrame is still very significant.

To counter spurious correlations, random projection can be used.

## Random Projection eps = 0.1

In [None]:
# Pass df through a Gaussian random projection to create a new reduced DataFrame.
# n_components is left at its default, so eps governs the quality (and hence the
# size) of the embedding. random_state fixes the random projection matrix so the
# cell gives the same result on every run.
transformer = random_projection.GaussianRandomProjection(eps = 0.1, random_state = 42)
df_new = pd.DataFrame(transformer.fit_transform(df))
df_new

In [None]:
# Assigning X to all columns except 0
X_df_new = df_new.drop(columns=0)

# Assigning Y to column 0
Y_df_new = df_new[0]
print(Y_df_new)

# Flatten Y into a 1-D NumPy array (not a Python list) for np.corrcoef
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape,Y_df_new.shape)

list_titles = X_df_new.columns

# Absolute correlation between column 0 and every other projected column
list_corr_df_new = []
for i in list_titles:
    # Compute the coefficient once per column; [0][1] is the off-diagonal
    # entry of the 2x2 matrix returned by np.corrcoef.
    corr = np.corrcoef(Y_df_new, X_df_new[i])[0][1]
    list_corr_df_new.append(abs(corr))
    print('Correlation coefficient for column 0 and column ' + str(i) + ': ' + str(corr))

In [None]:
# Number of entries in each correlation list whose absolute value exceeds 0.01
a = sum(1 for corr in list_corr_df1 if abs(corr) > 0.01)
b = sum(1 for corr in list_corr_df_new if abs(corr) > 0.01)

# Express each count as a percentage of the corresponding list length
pct_df1 = (a/len(list_corr_df1)*100)
pct_df_new = (b/len(list_corr_df_new)*100)

print("Percentage of correlations in df1:", pct_df1, "%")

print("Percentage of correlations in df_new:", pct_df_new, "%")

## Random Projection eps = 0.5

In [None]:
# Pass df through a Gaussian random projection to create a new reduced DataFrame.
# n_components is left at its default, so eps governs the quality (and hence the
# size) of the embedding. random_state fixes the random projection matrix so the
# cell gives the same result on every run.
transformer = random_projection.GaussianRandomProjection(eps = 0.5, random_state = 42)
df_new = pd.DataFrame(transformer.fit_transform(df))
df_new

In [None]:
# Assigning X to all columns except 0
X_df_new = df_new.drop(columns=0)

# Assigning Y to column 0
Y_df_new = df_new[0]
print(Y_df_new)

# Flatten Y into a 1-D NumPy array (not a Python list) for np.corrcoef
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape,Y_df_new.shape)

list_titles = X_df_new.columns

# Absolute correlation between column 0 and each of the first 1000 other
# projected columns (slicing simply yields fewer if fewer exist)
list_corr_df_new = []
for i in list_titles[0:1000]:
    # Compute the coefficient once per column; [0][1] is the off-diagonal
    # entry of the 2x2 matrix returned by np.corrcoef.
    corr = np.corrcoef(Y_df_new, X_df_new[i])[0][1]
    list_corr_df_new.append(abs(corr))
    print('Correlation coefficient for column 0 and column ' + str(i) + ': ' + str(corr))

In [None]:
# Number of entries in each correlation list whose absolute value exceeds 0.01
a = sum(1 for corr in list_corr_df1 if abs(corr) > 0.01)
b = sum(1 for corr in list_corr_df_new if abs(corr) > 0.01)

# Express each count as a percentage of the corresponding list length
pct_df1 = (a/len(list_corr_df1)*100)
pct_df_new = (b/len(list_corr_df_new)*100)

print("Percentage of correlations in df1:", pct_df1, "%")

print("Percentage of correlations in df_new:", pct_df_new, "%")