# Spurious Correlations of Highly Dimensional Big Data

This Notebook aims at showing how PCA and random projection can solve the problem of spurious correlations in Big Data.

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.utils import shuffle

## DataFrame Creation

In [2]:
# User defined parameters

# Number of rows for df1
x = 10000
print(x)

# Number of columns for df1
y = 100000

# Number of rows for df2
z = 1000

10000


In [3]:
# Creating a dataframe with x number of rows and y number of columns
df = pd.DataFrame(np.random.random_sample((x,y)))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0.432658,0.542699,0.701828,0.752799,0.084320,0.181851,0.435853,0.428314,0.735621,0.035503,...,0.702495,0.540373,0.288365,0.613886,0.278045,0.513059,0.816923,0.571857,0.931124,0.149962
1,0.861965,0.015572,0.531486,0.950157,0.333998,0.407491,0.784808,0.944718,0.032295,0.508345,...,0.798755,0.717060,0.984885,0.729571,0.700661,0.341816,0.762865,0.588008,0.435988,0.565671
2,0.537376,0.922757,0.623719,0.433085,0.307835,0.981855,0.577426,0.960850,0.566917,0.641841,...,0.240074,0.073767,0.341991,0.781537,0.866267,0.972218,0.643555,0.933557,0.774322,0.985500
3,0.370857,0.789797,0.366539,0.660219,0.898803,0.818456,0.406590,0.157035,0.827784,0.866977,...,0.626063,0.314577,0.434237,0.156274,0.301250,0.956873,0.605684,0.392845,0.375512,0.580585
4,0.211964,0.576742,0.821346,0.703811,0.823517,0.833011,0.705805,0.073956,0.286782,0.367882,...,0.633150,0.500588,0.629900,0.208067,0.584094,0.266629,0.635911,0.487324,0.556880,0.283315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.398674,0.089090,0.539369,0.907472,0.735692,0.303876,0.751676,0.403484,0.447551,0.900031,...,0.096743,0.839717,0.903867,0.842435,0.072567,0.418437,0.903756,0.321422,0.704029,0.060462
9996,0.488569,0.554871,0.033123,0.297833,0.626508,0.239693,0.905135,0.224265,0.099503,0.042473,...,0.858701,0.422357,0.765951,0.067505,0.916515,0.762937,0.268308,0.980624,0.379942,0.676284
9997,0.106419,0.099702,0.765909,0.644508,0.514297,0.160067,0.151494,0.378121,0.084789,0.650732,...,0.461926,0.297105,0.525139,0.585168,0.687204,0.998276,0.560255,0.405642,0.796491,0.303413
9998,0.216807,0.708445,0.418530,0.545919,0.091939,0.286811,0.695540,0.989298,0.610778,0.717046,...,0.384268,0.238792,0.230001,0.098568,0.526395,0.173791,0.475571,0.914406,0.726831,0.036829


In order to assess the correlations of the different parameters, correlations between the column with index 0 and the 99 other first columns is assessed.

In [4]:
# Assigning X to all columns except 0
X_df = df.drop(columns=0)
X_df.head()

# Assigning Y to column 0
Y_df = df[0]
print(Y_df)

# The following line makes Y become a list
Y_df = np.array(Y_df).reshape(-1)
print(X_df.shape,Y_df.shape)

list_titles = X_df.columns
list_corr_df1 = []
for i in list_titles[0:1000]:
    print(i)
    list_corr_df1.append(abs(np.corrcoef(Y_df, X_df[i])[0][1]))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(np.corrcoef(Y_df, X_df[i])[0][1]))

0       0.432658
1       0.861965
2       0.537376
3       0.370857
4       0.211964
          ...   
9995    0.398674
9996    0.488569
9997    0.106419
9998    0.216807
9999    0.649189
Name: 0, Length: 10000, dtype: float64
(10000, 99999) (10000,)
1
Correlation matrix for column 0 and and column1: 0.0022149490637002095
2
Correlation matrix for column 0 and and column2: 0.020534350321741533
3
Correlation matrix for column 0 and and column3: 0.0072119578187907025
4
Correlation matrix for column 0 and and column4: 0.016208972299325736
5
Correlation matrix for column 0 and and column5: -0.028917066115991857
6
Correlation matrix for column 0 and and column6: 0.004015515347408022
7
Correlation matrix for column 0 and and column7: -0.0026961258093711775
8
Correlation matrix for column 0 and and column8: -0.01430599498160686
9
Correlation matrix for column 0 and and column9: 0.009789420733391873
10
Correlation matrix for column 0 and and column10: -0.009967666929082025
11
Correlation matrix 

Correlation matrix for column 0 and and column128: -0.00936822132033165
129
Correlation matrix for column 0 and and column129: -0.0033107272700837027
130
Correlation matrix for column 0 and and column130: -0.0013008300271157725
131
Correlation matrix for column 0 and and column131: 0.0035345420018601964
132
Correlation matrix for column 0 and and column132: 0.004587754739841332
133
Correlation matrix for column 0 and and column133: 0.0018081210705127467
134
Correlation matrix for column 0 and and column134: 0.014513392357815685
135
Correlation matrix for column 0 and and column135: -0.010281152733471178
136
Correlation matrix for column 0 and and column136: -0.005915806167103551
137
Correlation matrix for column 0 and and column137: 0.004748147224506085
138
Correlation matrix for column 0 and and column138: 0.009675638171706455
139
Correlation matrix for column 0 and and column139: -0.010222167482634542
140
Correlation matrix for column 0 and and column140: 0.0034267674340995296
141
Co

Correlation matrix for column 0 and and column245: -0.00841227291568955
246
Correlation matrix for column 0 and and column246: 0.002073338717553649
247
Correlation matrix for column 0 and and column247: -0.001277324975352435
248
Correlation matrix for column 0 and and column248: 0.011992547142357224
249
Correlation matrix for column 0 and and column249: 0.010958090505589416
250
Correlation matrix for column 0 and and column250: 0.012839932878822366
251
Correlation matrix for column 0 and and column251: 0.018226577192354085
252
Correlation matrix for column 0 and and column252: -0.004819946234327377
253
Correlation matrix for column 0 and and column253: -0.01264932127162981
254
Correlation matrix for column 0 and and column254: -0.004387364143382554
255
Correlation matrix for column 0 and and column255: -0.011323263449974127
256
Correlation matrix for column 0 and and column256: -0.010856143127887357
257
Correlation matrix for column 0 and and column257: 0.00043547564496251346
258
Corre

Correlation matrix for column 0 and and column366: 0.005960429694293639
367
Correlation matrix for column 0 and and column367: 0.003936728456007163
368
Correlation matrix for column 0 and and column368: -0.013825502310124891
369
Correlation matrix for column 0 and and column369: -0.0033682760593557004
370
Correlation matrix for column 0 and and column370: 0.008211273282265188
371
Correlation matrix for column 0 and and column371: -0.0029029953603020254
372
Correlation matrix for column 0 and and column372: 0.019379642330422556
373
Correlation matrix for column 0 and and column373: -0.004648185653873376
374
Correlation matrix for column 0 and and column374: 0.014307331902839978
375
Correlation matrix for column 0 and and column375: -0.006940645852796699
376
Correlation matrix for column 0 and and column376: 0.003203513956229563
377
Correlation matrix for column 0 and and column377: -0.017591334913650065
378
Correlation matrix for column 0 and and column378: -0.009225738067044157
379
Cor

Correlation matrix for column 0 and and column491: -0.006316507301578264
492
Correlation matrix for column 0 and and column492: 0.002142511457314178
493
Correlation matrix for column 0 and and column493: -0.00441940024694101
494
Correlation matrix for column 0 and and column494: 0.0006735591208107562
495
Correlation matrix for column 0 and and column495: -0.006332138262356634
496
Correlation matrix for column 0 and and column496: 0.0021758636737344204
497
Correlation matrix for column 0 and and column497: -0.008664705120362158
498
Correlation matrix for column 0 and and column498: 0.009689333917821712
499
Correlation matrix for column 0 and and column499: -0.011002295715436473
500
Correlation matrix for column 0 and and column500: -0.0017399748112746265
501
Correlation matrix for column 0 and and column501: 0.013262788506727572
502
Correlation matrix for column 0 and and column502: -0.018638528720795394
503
Correlation matrix for column 0 and and column503: -0.006788002707584213
504
Co

Correlation matrix for column 0 and and column603: 0.003976964179694233
604
Correlation matrix for column 0 and and column604: 0.0009522536487825649
605
Correlation matrix for column 0 and and column605: 0.005169447130945831
606
Correlation matrix for column 0 and and column606: -0.008508482212062398
607
Correlation matrix for column 0 and and column607: -0.010694269918066053
608
Correlation matrix for column 0 and and column608: 0.0057789904775227266
609
Correlation matrix for column 0 and and column609: 0.012352629710342388
610
Correlation matrix for column 0 and and column610: 0.021115358282294976
611
Correlation matrix for column 0 and and column611: 0.00485079516967278
612
Correlation matrix for column 0 and and column612: 0.015182053921923474
613
Correlation matrix for column 0 and and column613: -0.004722373690799709
614
Correlation matrix for column 0 and and column614: 0.0018967958935880444
615
Correlation matrix for column 0 and and column615: 0.0012347496082188698
616
Correl

Correlation matrix for column 0 and and column722: 0.002762924238834611
723
Correlation matrix for column 0 and and column723: -0.010485254216577132
724
Correlation matrix for column 0 and and column724: 0.0041533287626969615
725
Correlation matrix for column 0 and and column725: -0.0015576734875435267
726
Correlation matrix for column 0 and and column726: 0.019023384460231916
727
Correlation matrix for column 0 and and column727: -0.008460870235900617
728
Correlation matrix for column 0 and and column728: -0.001446802883280942
729
Correlation matrix for column 0 and and column729: 0.01595890859558219
730
Correlation matrix for column 0 and and column730: -0.0004115499944638123
731
Correlation matrix for column 0 and and column731: 0.007244584843617764
732
Correlation matrix for column 0 and and column732: 0.008282671616622139
733
Correlation matrix for column 0 and and column733: -0.007909322046541234
734
Correlation matrix for column 0 and and column734: -0.00035798990310929705
735
C

Correlation matrix for column 0 and and column829: -0.008276893886763264
830
Correlation matrix for column 0 and and column830: 0.020590525654252147
831
Correlation matrix for column 0 and and column831: 0.008311766106719763
832
Correlation matrix for column 0 and and column832: -0.006879474754202891
833
Correlation matrix for column 0 and and column833: 0.0008385598350246811
834
Correlation matrix for column 0 and and column834: 0.014570218985470064
835
Correlation matrix for column 0 and and column835: -0.013716459400858036
836
Correlation matrix for column 0 and and column836: -0.011668682143131258
837
Correlation matrix for column 0 and and column837: 0.0005145256500889375
838
Correlation matrix for column 0 and and column838: -0.0013435565342844642
839
Correlation matrix for column 0 and and column839: 0.00457041083286845
840
Correlation matrix for column 0 and and column840: 0.0016828851340733562
841
Correlation matrix for column 0 and and column841: 0.007578550294842847
842
Corr

Correlation matrix for column 0 and and column950: -0.0012478094254913432
951
Correlation matrix for column 0 and and column951: -0.0055702655069665066
952
Correlation matrix for column 0 and and column952: -0.016492188335864792
953
Correlation matrix for column 0 and and column953: 0.027516156914241894
954
Correlation matrix for column 0 and and column954: -0.00291523397705891
955
Correlation matrix for column 0 and and column955: -0.011852321581297415
956
Correlation matrix for column 0 and and column956: -0.010137140895218583
957
Correlation matrix for column 0 and and column957: -0.0004973562512440936
958
Correlation matrix for column 0 and and column958: -0.005065442509432373
959
Correlation matrix for column 0 and and column959: -0.010938801919494886
960
Correlation matrix for column 0 and and column960: -0.007269859254675258
961
Correlation matrix for column 0 and and column961: -0.0186869571556779
962
Correlation matrix for column 0 and and column962: 0.010249521187076107
963
C

To see if indeed the correlations between the different parameters increases with the data size, the original dataframe is compared to a sub-set dataframe which only takes the first z rows of the original dataframe. If the correlations in the original dataframe are higher than in the smaller dataframe this would prove that the bigger the data size the more frequent the number of spurious correlations.

In [5]:
# Creating smaller dataframe taking z number of rows from original dataframe
df2 = df.iloc[:z]
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0.432658,0.542699,0.701828,0.752799,0.08432,0.181851,0.435853,0.428314,0.735621,0.035503,...,0.702495,0.540373,0.288365,0.613886,0.278045,0.513059,0.816923,0.571857,0.931124,0.149962
1,0.861965,0.015572,0.531486,0.950157,0.333998,0.407491,0.784808,0.944718,0.032295,0.508345,...,0.798755,0.71706,0.984885,0.729571,0.700661,0.341816,0.762865,0.588008,0.435988,0.565671
2,0.537376,0.922757,0.623719,0.433085,0.307835,0.981855,0.577426,0.96085,0.566917,0.641841,...,0.240074,0.073767,0.341991,0.781537,0.866267,0.972218,0.643555,0.933557,0.774322,0.9855
3,0.370857,0.789797,0.366539,0.660219,0.898803,0.818456,0.40659,0.157035,0.827784,0.866977,...,0.626063,0.314577,0.434237,0.156274,0.30125,0.956873,0.605684,0.392845,0.375512,0.580585
4,0.211964,0.576742,0.821346,0.703811,0.823517,0.833011,0.705805,0.073956,0.286782,0.367882,...,0.63315,0.500588,0.6299,0.208067,0.584094,0.266629,0.635911,0.487324,0.55688,0.283315


In [6]:
# Assigning X to all columns except 0
X_df2 = df2.drop(columns=0)
X_df2.head()

# Assigning Y to column 0
Y_df2 = df2[0]
print(Y_df2)

# The following line makes Y become a list
Y_df2 = np.array(Y_df2).reshape(-1)
print(X_df2.shape,Y_df2.shape)

list_titles = X_df2.columns

list_corr_df2 = []
for i in list_titles[0:1000]:
    print(i)
    list_corr_df2.append(abs(np.corrcoef(Y_df2, X_df2[i])[0][1]))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(np.corrcoef(Y_df2, X_df2[i])[0][1]))

0      0.432658
1      0.861965
2      0.537376
3      0.370857
4      0.211964
         ...   
995    0.091245
996    0.171568
997    0.628507
998    0.244570
999    0.001009
Name: 0, Length: 1000, dtype: float64
(1000, 99999) (1000,)
1
Correlation matrix for column 0 and and column1: 0.0007737823161191175
2
Correlation matrix for column 0 and and column2: 0.04367653295879157
3
Correlation matrix for column 0 and and column3: -0.004814213910948161
4
Correlation matrix for column 0 and and column4: 0.04977627270261354
5
Correlation matrix for column 0 and and column5: -0.028399105218716875
6
Correlation matrix for column 0 and and column6: -0.013271257500304143
7
Correlation matrix for column 0 and and column7: 0.0048656656471897835
8
Correlation matrix for column 0 and and column8: 0.010023208872414051
9
Correlation matrix for column 0 and and column9: 0.015959564734841718
10
Correlation matrix for column 0 and and column10: 0.01701983370045406
11
Correlation matrix for column 0 and a

113
Correlation matrix for column 0 and and column113: 0.02019086077644912
114
Correlation matrix for column 0 and and column114: 0.051000198920764184
115
Correlation matrix for column 0 and and column115: 0.042805992727030444
116
Correlation matrix for column 0 and and column116: -0.04905301395202767
117
Correlation matrix for column 0 and and column117: 0.040347059778645396
118
Correlation matrix for column 0 and and column118: 0.03765857698605716
119
Correlation matrix for column 0 and and column119: 0.004427399315839868
120
Correlation matrix for column 0 and and column120: -0.001203651117460665
121
Correlation matrix for column 0 and and column121: -0.009459337791486952
122
Correlation matrix for column 0 and and column122: 0.027017071477696376
123
Correlation matrix for column 0 and and column123: 0.006370097671930172
124
Correlation matrix for column 0 and and column124: 0.02054839975792281
125
Correlation matrix for column 0 and and column125: -0.0488488332329166
126
Correlatio

231
Correlation matrix for column 0 and and column231: 0.0210009295313144
232
Correlation matrix for column 0 and and column232: -0.07205140787166622
233
Correlation matrix for column 0 and and column233: -0.056061479819712
234
Correlation matrix for column 0 and and column234: -0.0038535452119611386
235
Correlation matrix for column 0 and and column235: 0.0029091260320992336
236
Correlation matrix for column 0 and and column236: -0.0036222683078156116
237
Correlation matrix for column 0 and and column237: 0.03819506093952265
238
Correlation matrix for column 0 and and column238: 0.05851783379932521
239
Correlation matrix for column 0 and and column239: -0.026281308655665737
240
Correlation matrix for column 0 and and column240: -0.06192583448587771
241
Correlation matrix for column 0 and and column241: -0.01991221716802607
242
Correlation matrix for column 0 and and column242: 0.011310578172689257
243
Correlation matrix for column 0 and and column243: 0.025420685977083666
244
Correlat

Correlation matrix for column 0 and and column358: -0.02943902745731233
359
Correlation matrix for column 0 and and column359: -0.0029478435055780112
360
Correlation matrix for column 0 and and column360: -0.0205984498392427
361
Correlation matrix for column 0 and and column361: 0.0030462999339504523
362
Correlation matrix for column 0 and and column362: -0.0150671357867647
363
Correlation matrix for column 0 and and column363: 0.027413391049468563
364
Correlation matrix for column 0 and and column364: -0.0036706155056570352
365
Correlation matrix for column 0 and and column365: -0.015335157719749542
366
Correlation matrix for column 0 and and column366: 0.07439345035484461
367
Correlation matrix for column 0 and and column367: -0.022500817593082274
368
Correlation matrix for column 0 and and column368: 0.005343606320830109
369
Correlation matrix for column 0 and and column369: -0.028747198058027987
370
Correlation matrix for column 0 and and column370: 0.010935395354909271
371
Correla

Correlation matrix for column 0 and and column490: 0.04701663362962791
491
Correlation matrix for column 0 and and column491: -0.000918805806338529
492
Correlation matrix for column 0 and and column492: -0.004720758661898049
493
Correlation matrix for column 0 and and column493: -0.0853527527946181
494
Correlation matrix for column 0 and and column494: 0.014742728502418785
495
Correlation matrix for column 0 and and column495: 0.008082827923229605
496
Correlation matrix for column 0 and and column496: -0.026633943371428416
497
Correlation matrix for column 0 and and column497: 0.014747386826552453
498
Correlation matrix for column 0 and and column498: 0.015760884955619423
499
Correlation matrix for column 0 and and column499: -0.040009113552542705
500
Correlation matrix for column 0 and and column500: 0.021499441446824086
501
Correlation matrix for column 0 and and column501: 0.02806226710620797
502
Correlation matrix for column 0 and and column502: -0.04675352628401729
503
Correlation

Correlation matrix for column 0 and and column645: -0.034033494705918
646
Correlation matrix for column 0 and and column646: 0.022170563385631104
647
Correlation matrix for column 0 and and column647: -0.034119617853425876
648
Correlation matrix for column 0 and and column648: -0.017836270685998194
649
Correlation matrix for column 0 and and column649: 0.017882880916955106
650
Correlation matrix for column 0 and and column650: 0.004666308261476524
651
Correlation matrix for column 0 and and column651: -0.027813561216161575
652
Correlation matrix for column 0 and and column652: 0.019641099030132503
653
Correlation matrix for column 0 and and column653: -0.02353446283201116
654
Correlation matrix for column 0 and and column654: 0.01899954280733322
655
Correlation matrix for column 0 and and column655: 0.015599551661885489
656
Correlation matrix for column 0 and and column656: 0.016244725946217543
657
Correlation matrix for column 0 and and column657: 0.018119224824210012
658
Correlation 

Correlation matrix for column 0 and and column800: 0.03121659116645986
801
Correlation matrix for column 0 and and column801: -0.020515009955661263
802
Correlation matrix for column 0 and and column802: 0.02604013981423975
803
Correlation matrix for column 0 and and column803: -0.0038031028173573797
804
Correlation matrix for column 0 and and column804: -0.019913800903094706
805
Correlation matrix for column 0 and and column805: -0.0047795649202618485
806
Correlation matrix for column 0 and and column806: 0.0001862266411680096
807
Correlation matrix for column 0 and and column807: -0.013867644207764705
808
Correlation matrix for column 0 and and column808: -0.022140202019945616
809
Correlation matrix for column 0 and and column809: -0.05695538841110379
810
Correlation matrix for column 0 and and column810: 0.013211597558538986
811
Correlation matrix for column 0 and and column811: 0.005758804389124518
812
Correlation matrix for column 0 and and column812: -0.0013326706768789394
813
Cor

Correlation matrix for column 0 and and column965: -0.019559258589455227
966
Correlation matrix for column 0 and and column966: 0.018093517147414293
967
Correlation matrix for column 0 and and column967: -0.028836818205891127
968
Correlation matrix for column 0 and and column968: 0.0528636428028675
969
Correlation matrix for column 0 and and column969: 0.008266079508165432
970
Correlation matrix for column 0 and and column970: -0.044383105487690505
971
Correlation matrix for column 0 and and column971: -0.02101261851156224
972
Correlation matrix for column 0 and and column972: -0.03484408324290664
973
Correlation matrix for column 0 and and column973: -0.0036486369918226386
974
Correlation matrix for column 0 and and column974: 0.02192049268837458
975
Correlation matrix for column 0 and and column975: -0.002001365200963796
976
Correlation matrix for column 0 and and column976: -0.04392475238058586
977
Correlation matrix for column 0 and and column977: -0.018580280704191366
978
Correlat

We now compare which of the correlation lists has the highest numbers of every parameter

In [7]:
a = 0
b = 0
for i in range(0,1000):
    if list_corr_df1[i] > list_corr_df2[i]:
        a+=1
    elif list_corr_df1[i] < list_corr_df2[i]:
        b+=1
    else:
        print()
        
print(a)
print(b)

187
813


Clearly it is seen that the bigger DataFrame has the highest number of spurious correlations.

To counter spurious correlations, random projection can be used.

## Random Projection eps = 0.1

In [8]:
# Pass df1 in the random projection to create a new reduced DataFrame
transformer = random_projection.GaussianRandomProjection(eps = 0.1)
df_new = pd.DataFrame(transformer.fit_transform(df))
df_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7884,7885,7886,7887,7888,7889,7890,7891,7892,7893
0,-0.024848,-1.541833,1.158278,-2.494000,-0.784556,-0.092561,1.156911,0.711691,-1.733940,0.156639,...,2.582521,1.447590,-3.838699,0.087213,1.943857,-2.894286,-2.604449,0.694813,-1.507814,-1.073297
1,-0.884938,-2.664448,-1.432937,-2.472946,2.105344,0.174430,1.034394,1.240309,-2.572270,-2.609293,...,0.102818,1.559074,-0.613311,1.085195,3.185510,-1.898160,3.103780,-1.032152,-1.151855,-1.448240
2,-1.485197,-1.626260,-0.587923,-3.692758,0.402206,-1.328756,0.497601,4.019487,-1.694004,0.171575,...,-1.645473,-0.519057,-0.626842,0.024329,3.880690,-5.012962,3.075813,-1.806596,-3.313398,-2.252613
3,-1.140183,2.731584,-0.391255,-1.748100,-0.067548,0.243200,1.828247,3.983188,-0.791356,0.603110,...,0.789125,-0.394378,-1.115470,1.929399,2.458281,-3.927000,1.471075,1.029046,-3.833982,-1.143423
4,2.136889,-0.611482,-0.593634,-2.105980,2.890562,0.144562,-0.340611,3.865007,0.198156,0.504869,...,3.380793,1.243600,-2.225338,-0.004270,2.154269,-4.222592,1.081235,-0.309233,-1.535562,-2.689225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.201035,-1.127075,-1.801791,-2.067705,0.830829,-0.325193,0.562213,0.955960,-2.666419,-0.197345,...,0.019721,-0.865824,-0.677424,-0.291485,1.789928,-4.035212,-0.576941,1.002883,0.029540,0.078999
9996,-2.371264,-0.150553,-1.111286,-2.651551,0.559114,2.369030,-0.078540,4.174423,-2.905156,0.161672,...,1.311222,-0.016404,-0.954754,0.623269,2.146492,-4.597346,1.371856,0.524513,-1.339727,-2.460890
9997,-0.260565,-1.331190,0.860766,-0.636158,0.564495,-5.045219,-0.994985,2.427470,-2.166854,-0.021458,...,2.006439,0.485925,-1.447619,0.237774,-0.133147,-5.805767,3.003317,0.708121,-0.172259,-2.247800
9998,0.667385,-2.251110,-0.527862,-3.269131,-1.030063,2.049867,1.697210,2.428938,-2.036551,0.117435,...,1.441346,1.122541,-0.367701,-0.543903,3.477436,-4.677674,2.337859,-1.475752,-1.729650,-0.066612


In [9]:
# Assigning X to all columns except 0
X_df_new = df_new.drop(columns=0)

# Assigning Y to column 0
Y_df_new = df_new[0]
print(Y_df_new)

# The following line makes Y become a list
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape,Y_df_new.shape)

list_titles = X_df_new.columns

list_corr_df_new = []
for i in list_titles[0:1000]:
    print(i)
    list_corr_df_new.append(abs(np.corrcoef(Y_df_new, X_df_new[i])[0][1]))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(np.corrcoef(Y_df_new, X_df_new[i])[0][1]))

0      -0.024848
1      -0.884938
2      -1.485197
3      -1.140183
4       2.136889
          ...   
9995    0.201035
9996   -2.371264
9997   -0.260565
9998    0.667385
9999    0.042417
Name: 0, Length: 10000, dtype: float64
(10000, 7893) (10000,)
1
Correlation matrix for column 0 and and column1: 0.015135204526740963
2
Correlation matrix for column 0 and and column2: 0.009983798225747142
3
Correlation matrix for column 0 and and column3: -0.0040578647227410575
4
Correlation matrix for column 0 and and column4: -0.013425213789533198
5
Correlation matrix for column 0 and and column5: -0.017728182445957
6
Correlation matrix for column 0 and and column6: 0.0009538599694183782
7
Correlation matrix for column 0 and and column7: -0.003913962442527566
8
Correlation matrix for column 0 and and column8: 0.012621405849879532
9
Correlation matrix for column 0 and and column9: 0.011060226726413255
10
Correlation matrix for column 0 and and column10: 0.0016855767981158488
11
Correlation matrix for

Correlation matrix for column 0 and and column159: -0.007703696210956819
160
Correlation matrix for column 0 and and column160: -0.007527236165043589
161
Correlation matrix for column 0 and and column161: 0.011708951156150478
162
Correlation matrix for column 0 and and column162: -0.0018310034110935235
163
Correlation matrix for column 0 and and column163: 0.01324539002506616
164
Correlation matrix for column 0 and and column164: -0.003289738734862902
165
Correlation matrix for column 0 and and column165: 3.7853686158946466e-05
166
Correlation matrix for column 0 and and column166: -0.0002806194117214276
167
Correlation matrix for column 0 and and column167: -0.0006158601401498484
168
Correlation matrix for column 0 and and column168: -0.010372676116761131
169
Correlation matrix for column 0 and and column169: -0.005393311879269913
170
Correlation matrix for column 0 and and column170: 0.005387222035893227
171
Correlation matrix for column 0 and and column171: 0.007742649471125975
172


Correlation matrix for column 0 and and column336: -0.0007428944513327965
337
Correlation matrix for column 0 and and column337: 0.014167458711862384
338
Correlation matrix for column 0 and and column338: -0.0039695926712210025
339
Correlation matrix for column 0 and and column339: 0.00295161810472186
340
Correlation matrix for column 0 and and column340: -0.012064270685304667
341
Correlation matrix for column 0 and and column341: -0.004009637821176848
342
Correlation matrix for column 0 and and column342: -0.003497021671842044
343
Correlation matrix for column 0 and and column343: -0.006652505471745727
344
Correlation matrix for column 0 and and column344: 0.024619481524468412
345
Correlation matrix for column 0 and and column345: -0.002097802306737544
346
Correlation matrix for column 0 and and column346: 0.014524191581820495
347
Correlation matrix for column 0 and and column347: -0.005440445156917759
348
Correlation matrix for column 0 and and column348: 0.0063996928691775245
349
Co

Correlation matrix for column 0 and and column527: -0.010169327059428565
528
Correlation matrix for column 0 and and column528: 0.013183604848766529
529
Correlation matrix for column 0 and and column529: -0.005435515342059758
530
Correlation matrix for column 0 and and column530: -0.010901037066773903
531
Correlation matrix for column 0 and and column531: -0.0020502723356837288
532
Correlation matrix for column 0 and and column532: 0.005592950903757214
533
Correlation matrix for column 0 and and column533: 0.012765742431337046
534
Correlation matrix for column 0 and and column534: 0.0023971311752808084
535
Correlation matrix for column 0 and and column535: 0.022202910695635503
536
Correlation matrix for column 0 and and column536: -0.009853091129599085
537
Correlation matrix for column 0 and and column537: -0.013041067350834067
538
Correlation matrix for column 0 and and column538: 0.005765886869915497
539
Correlation matrix for column 0 and and column539: -0.0010660519043368569
540
Co

Correlation matrix for column 0 and and column705: 0.0039535663146190265
706
Correlation matrix for column 0 and and column706: -0.022810603725727133
707
Correlation matrix for column 0 and and column707: 0.008099326451314885
708
Correlation matrix for column 0 and and column708: 0.003787819615276108
709
Correlation matrix for column 0 and and column709: -9.74679412060873e-05
710
Correlation matrix for column 0 and and column710: 0.02107155007820405
711
Correlation matrix for column 0 and and column711: 0.0009769021063823324
712
Correlation matrix for column 0 and and column712: 0.012244782718713066
713
Correlation matrix for column 0 and and column713: -0.007344724313823351
714
Correlation matrix for column 0 and and column714: -0.002010927831975869
715
Correlation matrix for column 0 and and column715: 0.0030513867101813774
716
Correlation matrix for column 0 and and column716: -0.012752461751784515
717
Correlation matrix for column 0 and and column717: -0.007308714797187786
718
Corr

Correlation matrix for column 0 and and column882: -0.003118159817427044
883
Correlation matrix for column 0 and and column883: 0.01896066301343916
884
Correlation matrix for column 0 and and column884: -0.0009785107580332982
885
Correlation matrix for column 0 and and column885: 0.006994894083909601
886
Correlation matrix for column 0 and and column886: -0.00836002754972296
887
Correlation matrix for column 0 and and column887: 0.0021072802874199602
888
Correlation matrix for column 0 and and column888: 0.0016387291571838389
889
Correlation matrix for column 0 and and column889: -0.0014667972464831288
890
Correlation matrix for column 0 and and column890: 0.017521261868870198
891
Correlation matrix for column 0 and and column891: 0.0019209592607908447
892
Correlation matrix for column 0 and and column892: -0.007236020675579696
893
Correlation matrix for column 0 and and column893: -0.00731319855751391
894
Correlation matrix for column 0 and and column894: -0.003309555253868129
895
Cor

In [10]:
a = 0
b = 0
for i in range(0,len(list_corr_df1)):
    if abs(list_corr_df1[i]) > 0.01:
        a+=1
    else:
        pass
        
for i in range(0,len(list_corr_df_new)):
    if abs(list_corr_df_new[i]) > 0.01:
        b+=1
    else:
        pass

print("Percentage of correlations in df1:",(a/len(list_corr_df1)*100),"%")

print("Percentage of correlations in df_new:",(b/len(list_corr_df_new)*100),"%")

Percentage of correlations in df1: 28.1 %
Percentage of correlations in df_new: 32.6 %


## Random Projection eps = 0.2

In [11]:
# Pass df1 in the random projection to create a new reduced DataFrame
transformer = random_projection.GaussianRandomProjection(eps = 0.2)
df_new = pd.DataFrame(transformer.fit_transform(df))
df_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2115,2116,2117,2118,2119,2120,2121,2122,2123,2124
0,1.400261,-2.697208,2.976999,-0.028454,-2.159186,1.145168,6.226056,6.760540,2.254834,-0.249832,...,4.332836,-0.432345,-1.848661,2.031912,-5.284094,-7.761841,-1.289820,-2.882298,2.560363,1.439307
1,-1.266678,-6.641611,2.322961,-1.313711,-1.762931,-1.954320,4.261518,4.255512,2.786756,2.314821,...,1.960228,-2.241690,-2.134887,1.746039,-2.468914,-5.804006,1.959734,-1.083333,6.797357,-0.382386
2,0.522648,-4.164001,0.133523,-0.007696,-0.717442,-2.506595,4.259381,3.501018,-0.779768,1.560663,...,3.610286,1.775304,-4.427600,4.242165,-3.678783,-5.813493,-1.676332,-3.300646,1.066685,-0.168002
3,1.382663,-4.132501,1.121052,-3.527834,-0.179706,-1.304581,8.027375,10.255411,1.780139,1.856727,...,3.195051,-2.022231,-3.360928,-4.158774,1.216447,-4.367921,-0.033589,-5.865837,2.189380,5.892513
4,1.958302,-1.545434,3.946175,-1.158203,0.929567,-2.069969,5.812903,3.394086,-3.758553,3.314987,...,2.029560,-2.285675,-3.273725,1.698072,-2.617459,-5.037106,1.844597,-1.868615,4.983936,2.642507
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.908461,-5.323456,0.217761,-1.949222,-4.537959,-1.839237,7.093584,9.511215,3.202938,0.976556,...,2.925265,-0.586547,-5.745664,1.162478,0.452313,-10.100080,0.396682,-0.735845,4.962962,1.979264
9996,3.311561,-3.071614,1.111646,-2.219349,-4.017468,0.767845,2.473656,7.324158,-0.470124,0.373517,...,2.724547,-1.955512,0.756041,0.163172,-1.862569,-9.028602,-1.246523,-4.668762,2.619771,2.532071
9997,0.227375,-1.437460,3.714871,-1.022440,0.590845,-1.008299,2.794275,4.665550,1.772706,1.262765,...,1.851120,2.838050,-5.202238,2.748149,-5.478244,-6.464959,0.630945,1.027234,6.491129,-1.217329
9998,4.417335,-5.272479,-0.060801,-0.949153,-2.553583,-0.124669,7.489194,7.031136,1.940635,7.019902,...,1.272111,-2.165237,-2.791854,2.593969,-2.040552,-3.432216,1.284302,-4.966845,4.587922,-0.877870


In [12]:
# Assigning X to all columns except 0
X_df_new = df_new.drop(columns=0)

# Assigning Y to column 0
Y_df_new = df_new[0]
print(Y_df_new)

# The following line makes Y become a list
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape,Y_df_new.shape)

list_titles = X_df_new.columns

list_corr_df_new = []
for i in list_titles[0:1000]:
    print(i)
    list_corr_df_new.append(abs(np.corrcoef(Y_df_new, X_df_new[i])[0][1]))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(np.corrcoef(Y_df_new, X_df_new[i])[0][1]))

0       1.400261
1      -1.266678
2       0.522648
3       1.382663
4       1.958302
          ...   
9995    2.908461
9996    3.311561
9997    0.227375
9998    4.417335
9999    1.656946
Name: 0, Length: 10000, dtype: float64
(10000, 2124) (10000,)
1
Correlation matrix for column 0 and and column1: -0.022870562282351698
2
Correlation matrix for column 0 and and column2: -0.006894108523904016
3
Correlation matrix for column 0 and and column3: -0.01003718393001632
4
Correlation matrix for column 0 and and column4: 0.014057392204764246
5
Correlation matrix for column 0 and and column5: -0.025322201111198404
6
Correlation matrix for column 0 and and column6: 0.002008418805360341
7
Correlation matrix for column 0 and and column7: 0.017954862790835597
8
Correlation matrix for column 0 and and column8: 0.01347230991438877
9
Correlation matrix for column 0 and and column9: 0.016607441530813956
10
Correlation matrix for column 0 and and column10: 0.00655572852344904
11
Correlation matrix for co

Correlation matrix for column 0 and and column126: 0.01730147958502977
127
Correlation matrix for column 0 and and column127: 0.008017300867539594
128
Correlation matrix for column 0 and and column128: -0.00800496950093584
129
Correlation matrix for column 0 and and column129: 0.005078781538814239
130
Correlation matrix for column 0 and and column130: -0.006655125559871783
131
Correlation matrix for column 0 and and column131: 0.0187951754224879
132
Correlation matrix for column 0 and and column132: 0.0030062629860453686
133
Correlation matrix for column 0 and and column133: -0.00326226611226527
134
Correlation matrix for column 0 and and column134: 0.012398531246303643
135
Correlation matrix for column 0 and and column135: -0.0011122173200292817
136
Correlation matrix for column 0 and and column136: -0.006821544028028757
137
Correlation matrix for column 0 and and column137: 0.004221969162728113
138
Correlation matrix for column 0 and and column138: -0.010980825491602519
139
Correlati

Correlation matrix for column 0 and and column297: -0.0032872952829058554
298
Correlation matrix for column 0 and and column298: -0.006055049852222291
299
Correlation matrix for column 0 and and column299: 0.0006509637685304564
300
Correlation matrix for column 0 and and column300: -0.013548983329087386
301
Correlation matrix for column 0 and and column301: 0.018026409187127827
302
Correlation matrix for column 0 and and column302: -0.003872015859621774
303
Correlation matrix for column 0 and and column303: 0.014662100683126048
304
Correlation matrix for column 0 and and column304: -0.004330287183691041
305
Correlation matrix for column 0 and and column305: 0.0068618782809153015
306
Correlation matrix for column 0 and and column306: 0.0020146893793344433
307
Correlation matrix for column 0 and and column307: -0.01432409969924737
308
Correlation matrix for column 0 and and column308: -0.0022118927246020433
309
Correlation matrix for column 0 and and column309: -0.021267391404402838
310


Correlation matrix for column 0 and and column429: -0.019913089483289632
430
Correlation matrix for column 0 and and column430: -0.01752502295627647
431
Correlation matrix for column 0 and and column431: -0.0013387214808394889
432
Correlation matrix for column 0 and and column432: -0.010720625690933147
433
Correlation matrix for column 0 and and column433: 0.013972534427613089
434
Correlation matrix for column 0 and and column434: -0.0026857533719553527
435
Correlation matrix for column 0 and and column435: -0.0016632769522591589
436
Correlation matrix for column 0 and and column436: 0.007681260257320054
437
Correlation matrix for column 0 and and column437: 0.00489402391191802
438
Correlation matrix for column 0 and and column438: -0.0021786660142260068
439
Correlation matrix for column 0 and and column439: -0.007582059612121566
440
Correlation matrix for column 0 and and column440: 0.009865777938143706
441
Correlation matrix for column 0 and and column441: -0.005921736088004019
442
C

Correlation matrix for column 0 and and column566: -0.003121698130799259
567
Correlation matrix for column 0 and and column567: -0.008207287572447252
568
Correlation matrix for column 0 and and column568: 0.0062090376033744525
569
Correlation matrix for column 0 and and column569: 0.00333692932186715
570
Correlation matrix for column 0 and and column570: -0.01296657411101311
571
Correlation matrix for column 0 and and column571: 0.012771125190303668
572
Correlation matrix for column 0 and and column572: -0.00658336790369949
573
Correlation matrix for column 0 and and column573: -0.0037223829419156746
574
Correlation matrix for column 0 and and column574: -0.0030111052654017943
575
Correlation matrix for column 0 and and column575: -0.0010364186736536983
576
Correlation matrix for column 0 and and column576: 0.0017210685206675367
577
Correlation matrix for column 0 and and column577: -0.004698498312636824
578
Correlation matrix for column 0 and and column578: -0.0025791655457196045
579


706
Correlation matrix for column 0 and and column706: -0.02131867632964193
707
Correlation matrix for column 0 and and column707: -0.007135913755763041
708
Correlation matrix for column 0 and and column708: 0.009307905983847026
709
Correlation matrix for column 0 and and column709: 0.011587412711433221
710
Correlation matrix for column 0 and and column710: -0.022025105831168634
711
Correlation matrix for column 0 and and column711: 0.00847011443520664
712
Correlation matrix for column 0 and and column712: -0.004007690589017402
713
Correlation matrix for column 0 and and column713: 0.00900456896227391
714
Correlation matrix for column 0 and and column714: -0.0047946545571664265
715
Correlation matrix for column 0 and and column715: -0.0022062436448876733
716
Correlation matrix for column 0 and and column716: -0.013333664130727577
717
Correlation matrix for column 0 and and column717: 0.011120492196592753
718
Correlation matrix for column 0 and and column718: 0.0030988861336514826
719
C

Correlation matrix for column 0 and and column847: -0.023845809911218666
848
Correlation matrix for column 0 and and column848: -0.006873962844549802
849
Correlation matrix for column 0 and and column849: -0.0058765523964916236
850
Correlation matrix for column 0 and and column850: 0.0069326574593738186
851
Correlation matrix for column 0 and and column851: -0.011737132351209273
852
Correlation matrix for column 0 and and column852: -0.009973105259390538
853
Correlation matrix for column 0 and and column853: 0.004835796916822165
854
Correlation matrix for column 0 and and column854: 0.002603032928786324
855
Correlation matrix for column 0 and and column855: 0.0001196295126555568
856
Correlation matrix for column 0 and and column856: 0.0027616725258076875
857
Correlation matrix for column 0 and and column857: -0.01043255411023372
858
Correlation matrix for column 0 and and column858: -0.011483476295215699
859
Correlation matrix for column 0 and and column859: -0.004390646651137052
860
C

986
Correlation matrix for column 0 and and column986: -0.004977918481834745
987
Correlation matrix for column 0 and and column987: 0.007800239326408671
988
Correlation matrix for column 0 and and column988: -0.0006943857447563185
989
Correlation matrix for column 0 and and column989: 0.004591254018933708
990
Correlation matrix for column 0 and and column990: 0.0073934980959825276
991
Correlation matrix for column 0 and and column991: -0.005846534916996527
992
Correlation matrix for column 0 and and column992: -0.0031944900961845455
993
Correlation matrix for column 0 and and column993: 0.011093844866036904
994
Correlation matrix for column 0 and and column994: -0.0035891031855906684
995
Correlation matrix for column 0 and and column995: -0.003886014971176405
996
Correlation matrix for column 0 and and column996: 0.0008269913549101555
997
Correlation matrix for column 0 and and column997: 0.005046149677757614
998
Correlation matrix for column 0 and and column998: 0.00842182957917232
99

In [13]:
a = 0
b = 0
for i in range(0,len(list_corr_df1)):
    if abs(list_corr_df1[i]) > 0.01:
        a+=1
    else:
        pass
        
for i in range(0,len(list_corr_df_new)):
    if abs(list_corr_df_new[i]) > 0.01:
        b+=1
    else:
        pass

print("Percentage of correlations in df1:",(a/len(list_corr_df1)*100),"%")

print("Percentage of correlations in df_new:",(b/len(list_corr_df_new)*100),"%")

Percentage of correlations in df1: 28.1 %
Percentage of correlations in df_new: 32.9 %


## Random Projection eps = 0.5

In [14]:
# Pass df1 in the random projection to create a new reduced DataFrame
transformer = random_projection.GaussianRandomProjection(eps = 0.5)
df_new = pd.DataFrame(transformer.fit_transform(df))
df_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,432,433,434,435,436,437,438,439,440,441
0,3.910740,-11.297563,0.465880,0.336086,15.209279,-18.129274,4.794098,-3.141212,-0.047488,4.097987,...,-3.862185,-7.004162,-6.691073,-1.114256,-0.018733,6.596251,7.585845,-12.780151,4.181551,5.909107
1,2.127109,-4.263569,4.637384,6.875545,11.962525,-13.954461,-5.705361,-7.139280,-5.401913,0.611141,...,1.985745,-17.713577,0.276844,-0.086037,-6.957865,4.004928,1.424982,-8.533044,-4.993959,9.320091
2,-2.866881,-3.387237,1.158405,15.391080,14.118838,-20.272257,-3.414473,-3.409575,3.697080,-11.560904,...,6.416805,-11.527023,-2.418444,3.021329,-0.165989,2.253043,12.982221,-10.147266,3.352710,-0.770035
3,8.910116,-2.243584,2.263264,6.652770,13.533051,-21.760599,-1.924505,-4.203062,-1.452210,2.555772,...,1.568451,-17.407814,2.892227,1.328669,-7.398956,3.815490,3.672422,-11.748847,0.721904,19.528209
4,-4.438700,-8.100698,-0.019782,2.788382,17.103064,-17.270605,-3.696524,-0.515903,10.690265,-9.153298,...,5.155136,-13.012681,4.744619,1.378720,-7.171127,4.795577,4.606484,-12.901502,-0.846118,6.510367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-2.502477,-2.306237,4.562694,7.107292,23.167772,-22.702644,6.709305,3.076675,5.649558,-3.095081,...,-0.164621,-5.166770,-13.545210,-3.593507,-1.013143,7.649689,3.781937,-8.453578,2.357672,6.765363
9996,-2.727555,-6.452160,6.365482,2.588194,5.461602,-11.035559,-2.591498,-2.572092,4.910568,-2.680340,...,1.525594,-7.717392,-5.816222,-4.125643,-2.071676,7.780524,2.190628,-10.678227,-3.443896,2.524542
9997,6.802373,1.816245,-2.830349,10.897223,13.607299,-17.937482,3.076013,-0.630966,10.700979,1.662325,...,0.976878,-10.532427,-0.945868,-6.619204,2.848947,3.059518,3.534127,4.659640,3.774488,1.087394
9998,1.148428,-1.340830,6.765730,0.476471,13.050031,-14.474849,3.580745,-2.859333,3.879723,1.202848,...,0.907447,-6.914410,1.137363,-4.394476,-1.194265,5.313765,4.140011,-8.186666,0.874517,4.741044


In [15]:
# Assigning X to all columns except 0
X_df_new = df_new.drop(columns=0)

# Assigning Y to column 0
Y_df_new = df_new[0]
print(Y_df_new)

# The following line makes Y become a list
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape,Y_df_new.shape)

list_titles = X_df_new.columns

list_corr_df_new = []
for i in list_titles[0:1000]:
    print(i)
    list_corr_df_new.append(abs(np.corrcoef(Y_df_new, X_df_new[i])[0][1]))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(np.corrcoef(Y_df_new, X_df_new[i])[0][1]))

0       3.910740
1       2.127109
2      -2.866881
3       8.910116
4      -4.438700
          ...   
9995   -2.502477
9996   -2.727555
9997    6.802373
9998    1.148428
9999   -3.274034
Name: 0, Length: 10000, dtype: float64
(10000, 441) (10000,)
1
Correlation matrix for column 0 and and column1: -0.0049550615588960355
2
Correlation matrix for column 0 and and column2: -0.004558937446045807
3
Correlation matrix for column 0 and and column3: 0.001448573455274988
4
Correlation matrix for column 0 and and column4: -0.002771506099696734
5
Correlation matrix for column 0 and and column5: 0.019216969506963416
6
Correlation matrix for column 0 and and column6: 0.0010042730065135225
7
Correlation matrix for column 0 and and column7: -0.0005833841703520084
8
Correlation matrix for column 0 and and column8: -0.00953659926832223
9
Correlation matrix for column 0 and and column9: 0.007694383238899984
10
Correlation matrix for column 0 and and column10: 0.005806848856120032
11
Correlation matrix f

Correlation matrix for column 0 and and column131: 0.002598248285284308
132
Correlation matrix for column 0 and and column132: -0.001985691461850764
133
Correlation matrix for column 0 and and column133: -0.0030484348225210314
134
Correlation matrix for column 0 and and column134: 0.004111862153691496
135
Correlation matrix for column 0 and and column135: 0.016398704087556337
136
Correlation matrix for column 0 and and column136: -0.005971334805611825
137
Correlation matrix for column 0 and and column137: -0.005644184995863034
138
Correlation matrix for column 0 and and column138: -0.0009190303094766494
139
Correlation matrix for column 0 and and column139: 0.014182762325006254
140
Correlation matrix for column 0 and and column140: -0.0026035619595071975
141
Correlation matrix for column 0 and and column141: 0.004450309913120609
142
Correlation matrix for column 0 and and column142: 0.011327685804707926
143
Correlation matrix for column 0 and and column143: 0.01204195233663607
144
Corr

Correlation matrix for column 0 and and column241: -0.0030861663886452913
242
Correlation matrix for column 0 and and column242: 0.007106158256322412
243
Correlation matrix for column 0 and and column243: -0.02159801384846157
244
Correlation matrix for column 0 and and column244: 0.008873992168431285
245
Correlation matrix for column 0 and and column245: -0.024194851885780046
246
Correlation matrix for column 0 and and column246: 0.003496688165803964
247
Correlation matrix for column 0 and and column247: -0.004132958297094985
248
Correlation matrix for column 0 and and column248: -0.002067328592360838
249
Correlation matrix for column 0 and and column249: 0.006444810618240447
250
Correlation matrix for column 0 and and column250: 0.014674361673287246
251
Correlation matrix for column 0 and and column251: 0.014972704385385036
252
Correlation matrix for column 0 and and column252: 0.0008729609086613909
253
Correlation matrix for column 0 and and column253: -0.0011255497000450764
254
Corr

Correlation matrix for column 0 and and column352: 0.006919437999066353
353
Correlation matrix for column 0 and and column353: 0.0005180523445701223
354
Correlation matrix for column 0 and and column354: -0.008869112853490327
355
Correlation matrix for column 0 and and column355: -0.015313015343136106
356
Correlation matrix for column 0 and and column356: -0.011614998515575702
357
Correlation matrix for column 0 and and column357: -0.0017329748194604498
358
Correlation matrix for column 0 and and column358: -0.0007216661553748053
359
Correlation matrix for column 0 and and column359: 0.01095365615756722
360
Correlation matrix for column 0 and and column360: -0.0008447123780501728
361
Correlation matrix for column 0 and and column361: -0.006134241654850679
362
Correlation matrix for column 0 and and column362: -0.012704015532642096
363
Correlation matrix for column 0 and and column363: 0.005649387682563063
364
Correlation matrix for column 0 and and column364: -0.022885779748602002
365


In [16]:
a = 0
b = 0
for i in range(0,len(list_corr_df1)):
    if abs(list_corr_df1[i]) > 0.01:
        a+=1
    else:
        pass
        
for i in range(0,len(list_corr_df_new)):
    if abs(list_corr_df_new[i]) > 0.01:
        b+=1
    else:
        pass

print("Percentage of correlations in df1:",(a/len(list_corr_df1)*100),"%")

print("Percentage of correlations in df_new:",(b/len(list_corr_df_new)*100),"%")

Percentage of correlations in df1: 28.1 %
Percentage of correlations in df_new: 34.01360544217687 %
