# Spurious Correlations of Highly Dimensional Big Data

This notebook aims to show how PCA and random projection can address the problem of spurious correlations in Big Data.

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.utils import shuffle

## DataFrame Creation

In [2]:
# User defined parameters
#   x: number of rows for df1
#   y: number of columns for df1
#   z: number of rows for df2
x, y, z = 10000, 100000, 1000

In [3]:
# Seed NumPy's global generator so that Restart & Run All rebuilds the same
# random DataFrame (and therefore the same correlations) every time.
np.random.seed(42)

# Creating a dataframe with x rows and y columns of random integers in [0, 10000)
# NOTE(review): the saved output below only shows values under 100, so it may
# have been produced with a different upper bound -- confirm which is intended.
df = pd.DataFrame(np.random.randint(10000, size = (x,y)))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0,81,71,73,49,69,89,57,56,16,...,93,88,93,12,64,59,75,27,45,90
1,36,53,25,60,23,56,9,22,26,60,...,50,50,6,45,14,50,67,90,8,42
2,85,73,61,97,26,42,31,7,4,62,...,84,27,67,42,6,13,64,27,23,33
3,68,30,53,33,47,12,9,90,44,87,...,49,61,89,66,55,58,5,75,86,6
4,46,36,66,74,70,35,90,5,54,0,...,68,21,13,25,45,13,97,49,9,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,76,46,88,70,0,75,6,60,24,73,...,9,93,95,47,24,4,32,95,35,8
9996,47,8,91,41,29,64,23,67,63,0,...,73,18,76,89,38,64,87,38,35,45
9997,4,48,88,69,66,79,31,13,40,61,...,23,14,89,35,12,19,34,14,29,11
9998,12,61,47,22,82,35,43,58,81,29,...,30,35,81,29,98,98,21,23,38,38


To assess the correlations between the different parameters, the correlation between the column with index 0 and each of the first 1,000 other columns is computed.

In [4]:
# Predictors: every column except column 0
X_df = df.drop(columns=0)

# Target: column 0
Y_df = df[0]
print(Y_df)

# Flatten the target into a 1-D NumPy array (an ndarray, not a Python list)
Y_df = np.array(Y_df).reshape(-1)
print(X_df.shape,Y_df.shape)

list_titles = X_df.columns

# Absolute Pearson correlation between column 0 and each of the first 1000
# other columns; later cells compare these values against a threshold.
list_corr_df1 = []
for i in list_titles[0:1000]:
    # Compute the coefficient once and reuse it for the list and the log line
    # (the column index is part of the message, so it is not printed separately).
    corr = np.corrcoef(Y_df, X_df[i])[0][1]
    list_corr_df1.append(abs(corr))
    print('Correlation between column 0 and column ' + str(i) + ': ' + str(corr))

0        0
1       36
2       85
3       68
4       46
        ..
9995    76
9996    47
9997     4
9998    12
9999    99
Name: 0, Length: 10000, dtype: int32
(10000, 99999) (10000,)
1
Correlation matrix for column 0 and and column1: -0.008761624733657856
2
Correlation matrix for column 0 and and column2: -0.00803416543579156
3
Correlation matrix for column 0 and and column3: 0.0018722384541600373
4
Correlation matrix for column 0 and and column4: -0.01030725701208035
5
Correlation matrix for column 0 and and column5: 0.0008987788190079406
6
Correlation matrix for column 0 and and column6: -0.03358528373035794
7
Correlation matrix for column 0 and and column7: 0.0002155141199845559
8
Correlation matrix for column 0 and and column8: 0.003297225728281968
9
Correlation matrix for column 0 and and column9: 0.001230536728268818
10
Correlation matrix for column 0 and and column10: -0.009565295860097286
11
Correlation matrix for column 0 and and column11: 0.013095351573846052
12
Correlation ma

214
Correlation matrix for column 0 and and column214: 0.0001269648468156349
215
Correlation matrix for column 0 and and column215: 0.013635182110576438
216
Correlation matrix for column 0 and and column216: -0.001575047001643536
217
Correlation matrix for column 0 and and column217: 0.0011617202862283188
218
Correlation matrix for column 0 and and column218: -0.012989927061575757
219
Correlation matrix for column 0 and and column219: 0.012757273026772355
220
Correlation matrix for column 0 and and column220: 0.0005916235134134344
221
Correlation matrix for column 0 and and column221: -0.028522621197938598
222
Correlation matrix for column 0 and and column222: 0.005377593969696469
223
Correlation matrix for column 0 and and column223: 0.004520104777902705
224
Correlation matrix for column 0 and and column224: -0.019273149516754698
225
Correlation matrix for column 0 and and column225: -0.016511538447334308
226
Correlation matrix for column 0 and and column226: -0.0045735222008488315
22

Correlation matrix for column 0 and and column452: 0.01198582581930646
453
Correlation matrix for column 0 and and column453: -0.006228368889216646
454
Correlation matrix for column 0 and and column454: 0.0037241620792729394
455
Correlation matrix for column 0 and and column455: -9.105509402496163e-05
456
Correlation matrix for column 0 and and column456: -0.006528262662078706
457
Correlation matrix for column 0 and and column457: 0.0010920805414586592
458
Correlation matrix for column 0 and and column458: 0.00467306934915353
459
Correlation matrix for column 0 and and column459: 0.00440088698598696
460
Correlation matrix for column 0 and and column460: -0.00962317317474416
461
Correlation matrix for column 0 and and column461: -0.0011255254024105698
462
Correlation matrix for column 0 and and column462: 0.005422138666653112
463
Correlation matrix for column 0 and and column463: 0.003987120967320262
464
Correlation matrix for column 0 and and column464: 0.003793440952383197
465
Correla

Correlation matrix for column 0 and and column709: -0.0023708933658155417
710
Correlation matrix for column 0 and and column710: 0.006374241866995585
711
Correlation matrix for column 0 and and column711: 0.0005911140068384814
712
Correlation matrix for column 0 and and column712: 0.0001405414950580112
713
Correlation matrix for column 0 and and column713: 0.002500110559334848
714
Correlation matrix for column 0 and and column714: 0.00237390379662379
715
Correlation matrix for column 0 and and column715: 0.01420229364478285
716
Correlation matrix for column 0 and and column716: -0.002037417971286101
717
Correlation matrix for column 0 and and column717: 0.007637806111032486
718
Correlation matrix for column 0 and and column718: -0.01683911698300044
719
Correlation matrix for column 0 and and column719: -0.021482215484999564
720
Correlation matrix for column 0 and and column720: -0.004496082414104056
721
Correlation matrix for column 0 and and column721: 0.02500288428275222
722
Correlat

Correlation matrix for column 0 and and column956: 0.012669653634942879
957
Correlation matrix for column 0 and and column957: -0.006172206086698093
958
Correlation matrix for column 0 and and column958: 0.00012982479549138078
959
Correlation matrix for column 0 and and column959: 0.006165821639213785
960
Correlation matrix for column 0 and and column960: -0.016685806572410915
961
Correlation matrix for column 0 and and column961: 0.002303283799829813
962
Correlation matrix for column 0 and and column962: -0.01166601858605935
963
Correlation matrix for column 0 and and column963: -0.011658928217509744
964
Correlation matrix for column 0 and and column964: -0.0017559622419909737
965
Correlation matrix for column 0 and and column965: 0.017595218721455697
966
Correlation matrix for column 0 and and column966: 0.001856738744346936
967
Correlation matrix for column 0 and and column967: 0.0012361743556334314
968
Correlation matrix for column 0 and and column968: 0.006305821757934007
969
Corr

To see whether the correlations between the different parameters indeed increase with the data size, the original DataFrame is compared with a subset DataFrame that contains only its first z rows. If the correlations in the original DataFrame are higher than in the smaller one, this would indicate that the bigger the data size, the more frequent the spurious correlations.

In [18]:
# Subset of the original dataframe: the first z rows, all columns
df2 = df.iloc[:z, :]
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0,81,71,73,49,69,89,57,56,16,...,93,88,93,12,64,59,75,27,45,90
1,36,53,25,60,23,56,9,22,26,60,...,50,50,6,45,14,50,67,90,8,42
2,85,73,61,97,26,42,31,7,4,62,...,84,27,67,42,6,13,64,27,23,33
3,68,30,53,33,47,12,9,90,44,87,...,49,61,89,66,55,58,5,75,86,6
4,46,36,66,74,70,35,90,5,54,0,...,68,21,13,25,45,13,97,49,9,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,92,59,89,89,28,87,87,98,66,14,...,6,75,4,5,58,24,23,58,87,52
996,46,83,36,45,53,12,4,5,6,16,...,73,50,22,76,87,74,20,68,92,46
997,34,57,13,75,22,58,55,88,27,86,...,62,28,69,49,2,71,10,54,54,96
998,99,16,98,4,63,78,2,23,47,4,...,51,36,43,93,78,65,10,35,86,46


In [6]:
# Predictors: every column of the subset except column 0
X_df2 = df2.drop(columns=0)

# Target: column 0 of the subset
Y_df2 = df2[0]
print(Y_df2)

# Flatten the target into a 1-D NumPy array (an ndarray, not a Python list)
Y_df2 = np.array(Y_df2).reshape(-1)
print(X_df2.shape,Y_df2.shape)

list_titles = X_df2.columns

# Absolute Pearson correlation between column 0 and each of the first 1000
# other columns of the subset; compared against the full-frame values later.
list_corr_df2 = []
for i in list_titles[0:1000]:
    # Compute the coefficient once and reuse it for the list and the log line
    # (the column index is part of the message, so it is not printed separately).
    corr = np.corrcoef(Y_df2, X_df2[i])[0][1]
    list_corr_df2.append(abs(corr))
    print('Correlation between column 0 and column ' + str(i) + ': ' + str(corr))

0       0
1      36
2      85
3      68
4      46
       ..
995    92
996    46
997    34
998    99
999    17
Name: 0, Length: 1000, dtype: int32
(1000, 99999) (1000,)
1
Correlation matrix for column 0 and and column1: -0.011599105445696048
2
Correlation matrix for column 0 and and column2: -0.03998041540467367
3
Correlation matrix for column 0 and and column3: 0.036389449924772305
4
Correlation matrix for column 0 and and column4: -0.024317976169136554
5
Correlation matrix for column 0 and and column5: -0.013126801393527197
6
Correlation matrix for column 0 and and column6: -0.03340095509214133
7
Correlation matrix for column 0 and and column7: 0.03924563583046502
8
Correlation matrix for column 0 and and column8: 0.021466393833891144
9
Correlation matrix for column 0 and and column9: 0.042313618692402674
10
Correlation matrix for column 0 and and column10: -0.027964993881709196
11
Correlation matrix for column 0 and and column11: 0.03586758577642002
12
Correlation matrix for column 0

Correlation matrix for column 0 and and column365: 0.027529713934620424
366
Correlation matrix for column 0 and and column366: -0.013145405456881048
367
Correlation matrix for column 0 and and column367: -0.03440310535827038
368
Correlation matrix for column 0 and and column368: 0.015835486458771125
369
Correlation matrix for column 0 and and column369: -0.015717640394498914
370
Correlation matrix for column 0 and and column370: -0.0034484293455213882
371
Correlation matrix for column 0 and and column371: -0.02791492413478826
372
Correlation matrix for column 0 and and column372: -0.032408527321870086
373
Correlation matrix for column 0 and and column373: 0.030012066004726667
374
Correlation matrix for column 0 and and column374: 0.0045284935300198125
375
Correlation matrix for column 0 and and column375: 0.037509847159268445
376
Correlation matrix for column 0 and and column376: 0.02213843947612247
377
Correlation matrix for column 0 and and column377: -0.0013256442890657225
378
Corre

Correlation matrix for column 0 and and column719: -0.052235162621991044
720
Correlation matrix for column 0 and and column720: 0.058204073954815014
721
Correlation matrix for column 0 and and column721: 0.029776494771289366
722
Correlation matrix for column 0 and and column722: -0.025009487364063356
723
Correlation matrix for column 0 and and column723: 0.01582054840702794
724
Correlation matrix for column 0 and and column724: -0.022520414601911374
725
Correlation matrix for column 0 and and column725: -0.02446596080618506
726
Correlation matrix for column 0 and and column726: -0.04213881176449175
727
Correlation matrix for column 0 and and column727: 0.061345198511558355
728
Correlation matrix for column 0 and and column728: 0.02675304808173655
729
Correlation matrix for column 0 and and column729: -0.025928232072618938
730
Correlation matrix for column 0 and and column730: 0.01025438065375579
731
Correlation matrix for column 0 and and column731: -0.010583190772387197
732
Correlatio

We now compare, column by column, which of the two correlation lists holds the larger absolute correlation.

In [7]:
# Column-by-column comparison of the absolute correlations:
#   a -> number of columns where the full frame (list_corr_df1) is larger
#   b -> number of columns where the subset (list_corr_df2) is larger
# Iterating over the paired lists (instead of a hard-coded range of 1000)
# avoids an IndexError when fewer than 1000 columns were correlated.
a = 0
b = 0
for corr_full, corr_subset in zip(list_corr_df1, list_corr_df2):
    if abs(corr_full) > abs(corr_subset):
        a+=1
    elif abs(corr_full) < abs(corr_subset):
        b+=1
    # Exact ties (or NaN values) count for neither side and print nothing.

print(a)
print(b)

179
821


In [8]:
# Number of absolute correlations above the 0.01 threshold in each list
a = sum(1 for corr in list_corr_df1 if abs(corr) > 0.01)
b = sum(1 for corr in list_corr_df2 if abs(corr) > 0.01)

print("Percentage of correlations in df1:",(a/len(list_corr_df1)*100),"%")

# The second list was computed from df2 (the first z rows of df), so it is
# labelled df2 here; df_new does not exist yet at this point of the notebook.
print("Percentage of correlations in df2:",(b/len(list_corr_df2)*100),"%")

Percentage of correlations in df1: 31.7 %
Percentage of correlations in df_new: 75.8 %


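Contrary to the hypothesis above, the results show the opposite: the smaller DataFrame has the larger absolute correlation for 821 of the 1,000 columns compared, and 75.8 % of its correlations exceed 0.01, against 31.7 % for the full DataFrame. With independent random columns, fewer rows give noisier correlation estimates, so spurious correlations are stronger when there are fewer observations per variable.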

To counter spurious correlations, random projection can be used.

## Random Projection eps = 0.1

In [9]:
# Pass df through a Gaussian random projection to create a new, reduced DataFrame.
# random_state is fixed so that re-running the cell yields the same projection
# and therefore the same correlations downstream.
transformer = random_projection.GaussianRandomProjection(eps = 0.1, random_state = 42)
df_new = pd.DataFrame(transformer.fit_transform(df))
df_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7884,7885,7886,7887,7888,7889,7890,7891,7892,7893
0,-4.217059,199.841799,160.995027,-2.528952,-153.102867,-24.818381,-151.478439,-104.119758,-89.317268,21.402924,...,228.468532,-158.629225,111.255751,-157.100049,12.471367,-34.938189,-53.896495,-170.160444,-46.581163,329.699365
1,-77.417202,163.011116,30.015336,87.036582,17.218652,-212.732758,-204.907845,43.120616,-54.934532,137.755100,...,312.889684,81.300021,320.976928,-188.562079,-142.024611,-47.691927,95.605697,-355.607783,-9.165586,475.328944
2,179.784765,93.374683,296.708148,147.675099,-65.230062,-369.723552,-200.536508,-56.654974,11.853017,-55.915945,...,446.943358,108.842079,162.641988,-322.692126,-174.269726,-31.491693,-149.909751,-288.054459,39.752893,348.168822
3,127.104664,234.585010,167.434019,175.242012,-91.033472,-364.339100,-283.462486,-108.766973,-102.132902,-17.546429,...,230.743417,130.197838,130.030754,-245.057494,17.671657,9.571481,-55.346326,-203.642292,4.063760,446.091118
4,-32.811741,28.478391,163.075387,190.705412,-197.617820,-217.615671,-454.559556,-67.408973,-221.418277,136.806691,...,251.546675,-37.552280,298.404041,-108.828583,-103.937278,-48.216367,-166.720833,-281.288226,-95.541463,307.070093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,73.509171,-18.895512,245.767511,291.747406,-164.535482,-191.863011,-308.936881,99.197628,-59.259893,-93.146444,...,135.394760,43.074643,103.269845,-116.987871,111.434589,-56.263786,-171.369076,-268.967801,-121.620024,380.706475
9996,121.289944,121.424053,260.153638,-64.641557,-229.923911,-135.685810,-243.186040,-148.551511,-184.852302,255.470002,...,307.574265,29.789456,45.062045,-180.952626,-74.993267,106.982309,7.120259,-343.266913,-75.057764,380.708003
9997,87.889054,194.015313,248.996294,376.200997,-175.039895,-138.620090,-467.423808,-117.444155,-106.922765,-40.877243,...,307.581416,24.176626,173.248106,-205.710384,-191.049747,-185.743509,-4.677016,-153.403211,-8.514796,282.626312
9998,208.866278,22.168187,164.325649,40.042685,-113.302844,-8.828689,-502.662802,-61.469009,121.499620,-20.804286,...,216.594142,102.198243,292.959472,-234.541839,267.338616,53.291812,31.640098,-168.611971,-51.190693,229.371298


In [10]:
# Predictors: every projected column except column 0
X_df_new = df_new.drop(columns=0)

# Target: projected column 0
Y_df_new = df_new[0]
print(Y_df_new)

# Flatten the target into a 1-D NumPy array (an ndarray, not a Python list)
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape,Y_df_new.shape)

list_titles = X_df_new.columns

# Absolute Pearson correlation between projected column 0 and every other
# projected column (this cell uses all of them, not only the first 1000).
list_corr_df_new = []
for i in list_titles:
    # Compute the coefficient once and reuse it for the list and the log line
    # (the column index is part of the message, so it is not printed separately).
    corr = np.corrcoef(Y_df_new, X_df_new[i])[0][1]
    list_corr_df_new.append(abs(corr))
    print('Correlation between column 0 and column ' + str(i) + ': ' + str(corr))

0        -4.217059
1       -77.417202
2       179.784765
3       127.104664
4       -32.811741
           ...    
9995     73.509171
9996    121.289944
9997     87.889054
9998    208.866278
9999    -35.042447
Name: 0, Length: 10000, dtype: float64
(10000, 7893) (10000,)
1
Correlation matrix for column 0 and and column1: 0.0004802714695351937
2
Correlation matrix for column 0 and and column2: 0.006957897595866699
3
Correlation matrix for column 0 and and column3: -0.014632464624914214
4
Correlation matrix for column 0 and and column4: -0.007638135305634175
5
Correlation matrix for column 0 and and column5: -0.0038710470822653983
6
Correlation matrix for column 0 and and column6: -0.008435686392955511
7
Correlation matrix for column 0 and and column7: -0.01799578800196431
8
Correlation matrix for column 0 and and column8: -0.0161673516171508
9
Correlation matrix for column 0 and and column9: 0.004069240244647963
10
Correlation matrix for column 0 and and column10: -0.013154122105681206
1

Correlation matrix for column 0 and and column146: 0.022644131585503522
147
Correlation matrix for column 0 and and column147: 0.0075872559675971415
148
Correlation matrix for column 0 and and column148: 0.014257479798815954
149
Correlation matrix for column 0 and and column149: 0.002222198434671294
150
Correlation matrix for column 0 and and column150: 0.0097483885153946
151
Correlation matrix for column 0 and and column151: -0.008028423704304406
152
Correlation matrix for column 0 and and column152: 0.0032128249636176787
153
Correlation matrix for column 0 and and column153: 0.003572759639505053
154
Correlation matrix for column 0 and and column154: -0.010778871211335004
155
Correlation matrix for column 0 and and column155: 0.011040945119218465
156
Correlation matrix for column 0 and and column156: 0.029254575397635298
157
Correlation matrix for column 0 and and column157: 0.0009072499598781838
158
Correlation matrix for column 0 and and column158: 0.010940071543175403
159
Correlati

Correlation matrix for column 0 and and column299: 0.013931856252332172
300
Correlation matrix for column 0 and and column300: 0.006017941688924191
301
Correlation matrix for column 0 and and column301: 0.00141370510289216
302
Correlation matrix for column 0 and and column302: -0.0009734689585127773
303
Correlation matrix for column 0 and and column303: 0.0027266731181026377
304
Correlation matrix for column 0 and and column304: 0.0007584688438608443
305
Correlation matrix for column 0 and and column305: -0.0009037907398567465
306
Correlation matrix for column 0 and and column306: -0.0014604088974343593
307
Correlation matrix for column 0 and and column307: -0.010495956883318277
308
Correlation matrix for column 0 and and column308: -0.013146095383347893
309
Correlation matrix for column 0 and and column309: -0.001549033848475128
310
Correlation matrix for column 0 and and column310: 0.0002131551151891996
311
Correlation matrix for column 0 and and column311: 0.0029309680067943463
312


Correlation matrix for column 0 and and column452: 0.0074332041902270005
453
Correlation matrix for column 0 and and column453: 0.0016514107065372678
454
Correlation matrix for column 0 and and column454: -0.004712425065571463
455
Correlation matrix for column 0 and and column455: -0.004884720139360042
456
Correlation matrix for column 0 and and column456: 0.014269016410362498
457
Correlation matrix for column 0 and and column457: -0.00037241399873170854
458
Correlation matrix for column 0 and and column458: -0.0003988302441866684
459
Correlation matrix for column 0 and and column459: 0.015124639062101496
460
Correlation matrix for column 0 and and column460: 0.007085167000715548
461
Correlation matrix for column 0 and and column461: 0.008775586087666352
462
Correlation matrix for column 0 and and column462: 0.017097451054067014
463
Correlation matrix for column 0 and and column463: 0.0108290431445961
464
Correlation matrix for column 0 and and column464: -0.0020952515500871163
465
Cor

Correlation matrix for column 0 and and column610: 0.012343553612356745
611
Correlation matrix for column 0 and and column611: -0.003980787555389276
612
Correlation matrix for column 0 and and column612: -0.0036599863256784547
613
Correlation matrix for column 0 and and column613: 0.0048150344854006105
614
Correlation matrix for column 0 and and column614: 0.0020272930871604905
615
Correlation matrix for column 0 and and column615: -0.005569820898539069
616
Correlation matrix for column 0 and and column616: 0.00844055968717503
617
Correlation matrix for column 0 and and column617: 0.0116697025655453
618
Correlation matrix for column 0 and and column618: 0.0015817219146964736
619
Correlation matrix for column 0 and and column619: -0.022691595825225987
620
Correlation matrix for column 0 and and column620: -0.010741318562059359
621
Correlation matrix for column 0 and and column621: -0.007157631874601878
622
Correlation matrix for column 0 and and column622: -0.015092434926780024
623
Corr

Correlation matrix for column 0 and and column777: -0.011622382584141354
778
Correlation matrix for column 0 and and column778: 0.002367022398631055
779
Correlation matrix for column 0 and and column779: -0.005618580383941319
780
Correlation matrix for column 0 and and column780: 0.012204573648466976
781
Correlation matrix for column 0 and and column781: -0.0009235886705188984
782
Correlation matrix for column 0 and and column782: -0.019527568275677917
783
Correlation matrix for column 0 and and column783: 0.005729024391156975
784
Correlation matrix for column 0 and and column784: -0.01648518752327228
785
Correlation matrix for column 0 and and column785: -0.0026909944332839776
786
Correlation matrix for column 0 and and column786: 0.01658888523179322
787
Correlation matrix for column 0 and and column787: 0.012672134113864544
788
Correlation matrix for column 0 and and column788: 0.0030231345274232204
789
Correlation matrix for column 0 and and column789: 0.012825067972060443
790
Corre

Correlation matrix for column 0 and and column945: 0.007265303755708302
946
Correlation matrix for column 0 and and column946: 0.0015711421710467833
947
Correlation matrix for column 0 and and column947: -0.003189603718394668
948
Correlation matrix for column 0 and and column948: -0.02084191614642358
949
Correlation matrix for column 0 and and column949: -0.0006668350851365975
950
Correlation matrix for column 0 and and column950: 0.00039194979417687774
951
Correlation matrix for column 0 and and column951: 0.010496648086929508
952
Correlation matrix for column 0 and and column952: -0.013758122572518966
953
Correlation matrix for column 0 and and column953: -0.007940699106087284
954
Correlation matrix for column 0 and and column954: 0.007874844125296206
955
Correlation matrix for column 0 and and column955: -0.025263051631451393
956
Correlation matrix for column 0 and and column956: 0.009126724429818881
957
Correlation matrix for column 0 and and column957: 0.00316763524401998
958
Corr

In [11]:
# Tally how many absolute correlations exceed 0.01 in the original frame (a)
# and in the randomly projected frame (b), then report both as percentages.
a = sum(1 for corr in list_corr_df1 if abs(corr) > 0.01)
b = sum(1 for corr in list_corr_df_new if abs(corr) > 0.01)

print("Percentage of correlations in df1:",(a/len(list_corr_df1)*100),"%")

print("Percentage of correlations in df_new:",(b/len(list_corr_df_new)*100),"%")

Percentage of correlations in df1: 31.7 %
Percentage of correlations in df_new: 33.1 %


## Random Projection eps = 0.2

In [12]:
# Pass df through a Gaussian random projection to create a new, reduced DataFrame.
# random_state is fixed so that re-running the cell yields the same projection
# and therefore the same correlations downstream.
transformer = random_projection.GaussianRandomProjection(eps = 0.2, random_state = 42)
df_new = pd.DataFrame(transformer.fit_transform(df))
df_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2115,2116,2117,2118,2119,2120,2121,2122,2123,2124
0,156.592492,-349.840590,-132.268554,647.455512,-242.881879,-57.429986,667.605274,-286.381296,-531.626696,592.273060,...,174.869434,296.773306,370.394076,-683.608000,340.844176,-188.150574,-104.071969,946.159980,-73.908264,167.630747
1,286.093038,-431.371483,122.279260,579.067279,-352.968920,-191.591108,887.416738,-96.264162,-394.877822,808.451602,...,525.030068,68.047896,242.846780,-201.255685,78.230868,40.962204,211.756742,559.692202,-19.950915,352.433270
2,265.774605,-247.566825,-76.857236,173.247263,-44.644822,-118.649510,825.289021,62.992760,-229.539295,737.143890,...,294.360647,-195.634482,480.008065,-420.305256,186.748883,177.539483,67.208886,385.288503,-80.741421,149.780015
3,205.070328,-288.429775,255.480492,232.841823,-66.578465,-489.465223,1215.862804,362.261005,-306.095186,553.035661,...,57.101923,151.484472,89.169330,-136.325169,-245.584517,94.795037,-66.874548,267.566974,318.503547,-7.671701
4,347.393585,-649.724970,-131.167129,347.590854,-483.916184,-267.657619,932.187689,173.069340,-374.887699,828.362213,...,-181.081221,-138.192338,83.807231,-169.249544,-72.626561,49.620465,251.078678,692.242085,-144.627222,5.592163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,345.154489,-720.544882,-228.499408,484.011550,-188.035070,-261.672095,464.165766,-131.634186,-521.333323,623.559750,...,118.661272,345.874914,270.393865,-385.803334,-114.341766,28.722206,62.498938,515.716577,-0.068771,247.652365
9996,29.119657,-543.127239,-134.384101,394.816786,-393.486143,-329.257507,667.879575,-34.112995,-435.674045,707.712442,...,-70.198375,-115.783359,745.207817,-408.154562,-45.303171,10.052038,289.480712,402.045525,-68.089048,223.320892
9997,69.747126,-76.283853,11.597260,432.978527,-296.181790,-190.513500,653.193212,51.624295,-127.404411,450.944907,...,338.419769,390.528770,398.878245,-399.394352,262.661055,506.990221,-51.084796,290.946123,-261.597200,743.836645
9998,193.054444,-955.689731,43.039076,890.797349,-34.687003,-158.914996,791.410549,131.007560,-450.627983,492.239475,...,24.736254,-116.787902,473.963612,-713.068918,86.627685,-59.543844,251.978963,431.409038,-7.944481,279.561102


In [13]:
# Predictors: every projected column except column 0
X_df_new = df_new.drop(columns=0)

# Target: projected column 0
Y_df_new = df_new[0]
print(Y_df_new)

# Flatten the target into a 1-D NumPy array (an ndarray, not a Python list)
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape,Y_df_new.shape)

list_titles = X_df_new.columns

# Absolute Pearson correlation between projected column 0 and each of the
# first 1000 other projected columns.
list_corr_df_new = []
for i in list_titles[0:1000]:
    # Compute the coefficient once and reuse it for the list and the log line
    # (the column index is part of the message, so it is not printed separately).
    corr = np.corrcoef(Y_df_new, X_df_new[i])[0][1]
    list_corr_df_new.append(abs(corr))
    print('Correlation between column 0 and column ' + str(i) + ': ' + str(corr))

0       156.592492
1       286.093038
2       265.774605
3       205.070328
4       347.393585
           ...    
9995    345.154489
9996     29.119657
9997     69.747126
9998    193.054444
9999    314.215189
Name: 0, Length: 10000, dtype: float64
(10000, 2124) (10000,)
1
Correlation matrix for column 0 and and column1: -0.003533493689693437
2
Correlation matrix for column 0 and and column2: -0.0031804454489624987
3
Correlation matrix for column 0 and and column3: -0.007290092627356977
4
Correlation matrix for column 0 and and column4: 0.016749319768669408
5
Correlation matrix for column 0 and and column5: -0.001973060965365003
6
Correlation matrix for column 0 and and column6: -0.0022691294117757705
7
Correlation matrix for column 0 and and column7: -0.010205928035068002
8
Correlation matrix for column 0 and and column8: 0.0024284591001354295
9
Correlation matrix for column 0 and and column9: 0.00042774386333106084
10
Correlation matrix for column 0 and and column10: -0.01385525733705

Correlation matrix for column 0 and and column171: 0.0007095892878439913
172
Correlation matrix for column 0 and and column172: 0.008414731430072202
173
Correlation matrix for column 0 and and column173: -0.01820693899037434
174
Correlation matrix for column 0 and and column174: -0.0037559458545648977
175
Correlation matrix for column 0 and and column175: -0.014806169472958127
176
Correlation matrix for column 0 and and column176: -0.0005222730041094518
177
Correlation matrix for column 0 and and column177: 0.008059897157914364
178
Correlation matrix for column 0 and and column178: 0.008065526862064491
179
Correlation matrix for column 0 and and column179: 0.010971579946848917
180
Correlation matrix for column 0 and and column180: 0.0119063205409863
181
Correlation matrix for column 0 and and column181: 0.003629220019526324
182
Correlation matrix for column 0 and and column182: -0.026159673034840642
183
Correlation matrix for column 0 and and column183: 0.003194985848472665
184
Correla

Correlation matrix for column 0 and and column343: -0.006217986163935251
344
Correlation matrix for column 0 and and column344: -0.010375150825377726
345
Correlation matrix for column 0 and and column345: -0.003892287181732972
346
Correlation matrix for column 0 and and column346: 0.020434188211631055
347
Correlation matrix for column 0 and and column347: -0.0025375435192620285
348
Correlation matrix for column 0 and and column348: 0.007164056480629706
349
Correlation matrix for column 0 and and column349: -0.016011162689151648
350
Correlation matrix for column 0 and and column350: 0.004477682827540502
351
Correlation matrix for column 0 and and column351: 0.002719417198547389
352
Correlation matrix for column 0 and and column352: 0.007984869913504182
353
Correlation matrix for column 0 and and column353: -0.01164187197347713
354
Correlation matrix for column 0 and and column354: -0.0003619925433394826
355
Correlation matrix for column 0 and and column355: -0.008089260634247367
356
Cor

527
Correlation matrix for column 0 and and column527: -0.016928489436685405
528
Correlation matrix for column 0 and and column528: 0.009639767104936239
529
Correlation matrix for column 0 and and column529: 0.003856230443563508
530
Correlation matrix for column 0 and and column530: -0.0023085441319514143
531
Correlation matrix for column 0 and and column531: -0.005481408754979174
532
Correlation matrix for column 0 and and column532: -0.008513294291831579
533
Correlation matrix for column 0 and and column533: 0.006505567271983475
534
Correlation matrix for column 0 and and column534: -0.001532219261808355
535
Correlation matrix for column 0 and and column535: 0.0034556770052694163
536
Correlation matrix for column 0 and and column536: 0.005079490289332754
537
Correlation matrix for column 0 and and column537: -0.005806614641318585
538
Correlation matrix for column 0 and and column538: 0.024631935459668235
539
Correlation matrix for column 0 and and column539: -0.0018508449942422945
54

Correlation matrix for column 0 and and column733: 0.015338903708010346
734
Correlation matrix for column 0 and and column734: 0.001359547369360764
735
Correlation matrix for column 0 and and column735: 0.016813616186968
736
Correlation matrix for column 0 and and column736: 0.010806221318466806
737
Correlation matrix for column 0 and and column737: -0.01758216231128096
738
Correlation matrix for column 0 and and column738: 0.016277837951163214
739
Correlation matrix for column 0 and and column739: -0.01290491455345156
740
Correlation matrix for column 0 and and column740: 0.0018970648965619405
741
Correlation matrix for column 0 and and column741: -0.012124066859751523
742
Correlation matrix for column 0 and and column742: 0.024071235816227075
743
Correlation matrix for column 0 and and column743: -0.0007002936872216298
744
Correlation matrix for column 0 and and column744: 0.00983663558355177
745
Correlation matrix for column 0 and and column745: 0.0050572369930821
746
Correlation ma

Correlation matrix for column 0 and and column946: 0.007082502124946787
947
Correlation matrix for column 0 and and column947: 0.008041824047901944
948
Correlation matrix for column 0 and and column948: -0.021282845300246954
949
Correlation matrix for column 0 and and column949: 0.005132481915957876
950
Correlation matrix for column 0 and and column950: -0.008673982655994122
951
Correlation matrix for column 0 and and column951: -0.0009118772167626864
952
Correlation matrix for column 0 and and column952: 0.006371931920498189
953
Correlation matrix for column 0 and and column953: 0.009740742660158133
954
Correlation matrix for column 0 and and column954: 0.012305634437373378
955
Correlation matrix for column 0 and and column955: 0.009834273432899037
956
Correlation matrix for column 0 and and column956: 0.01765112916109094
957
Correlation matrix for column 0 and and column957: 0.012592654285397964
958
Correlation matrix for column 0 and and column958: -0.00042811550656013663
959
Correl

In [14]:
# Count, for each list of absolute correlations, how many entries exceed 0.01.
# `a` is the count for the original frame, `b` the count for the reduced frame.
a = sum(1 for corr in list_corr_df1 if abs(corr) > 0.01)
b = sum(1 for corr in list_corr_df_new if abs(corr) > 0.01)

# Report each count as a share of the number of correlations that were computed.
print("Percentage of correlations in df1:", a / len(list_corr_df1) * 100, "%")

print("Percentage of correlations in df_new:", b / len(list_corr_df_new) * 100, "%")

Percentage of correlations in df1: 31.7 %
Percentage of correlations in df_new: 32.6 %


## Random Projection eps = 0.5

In [15]:
# Pass df through a Gaussian random projection to create a new, reduced DataFrame.
# No n_components is given, so scikit-learn chooses the output width itself;
# a larger eps tolerates more distortion and therefore yields fewer columns.
# random_state is fixed because the projection matrix is drawn at random:
# without a seed, df_new (and every correlation computed from it below)
# would change on each run of the notebook.
transformer = random_projection.GaussianRandomProjection(eps = 0.5, random_state = 42)
df_new = pd.DataFrame(transformer.fit_transform(df))
df_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,432,433,434,435,436,437,438,439,440,441
0,124.813898,-77.062034,1028.711768,1241.781127,1414.969347,373.729019,600.543184,-198.945551,-174.884503,403.608534,...,-1010.688622,608.518498,311.191329,-1758.007622,-758.862976,398.838897,-1390.472223,-136.772879,71.347151,1014.607342
1,418.766121,227.033525,479.483236,968.188954,1713.623245,463.906041,443.394693,697.579767,-102.549741,-445.013160,...,-1914.251156,237.357206,-757.919782,-1498.980189,-229.309622,742.769650,-856.168598,-406.863973,-363.379399,289.569611
2,-287.595514,1305.522782,891.937076,108.672619,1262.362157,763.379691,687.674580,1278.239764,66.749717,463.203579,...,-831.878056,1158.287179,-592.481247,-1338.775018,-1116.775658,331.200977,-633.747011,-174.506789,-61.435405,-139.832681
3,579.368302,1246.617330,1119.152304,691.824120,1229.786483,163.305810,961.481169,714.011166,543.452601,-70.765770,...,-1899.047253,559.903713,-202.701659,-839.533335,-909.299150,269.919311,-609.846911,-277.069024,-1174.517449,491.012534
4,47.608396,638.886379,1028.260062,1301.643296,1519.807878,313.108469,0.936830,132.960820,-701.579870,952.920662,...,-1066.372263,620.065443,-69.450308,-1411.783838,-1163.503924,696.214705,-443.986161,80.754473,-462.713720,759.432423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-313.987322,625.254891,38.650039,870.652573,1372.389163,270.440624,944.257623,618.924414,465.250422,-763.246220,...,-1241.163987,165.419883,-628.784610,-1587.797399,-1077.183157,627.672811,-823.842017,-701.149400,-585.102884,-75.815818
9996,841.590441,227.685654,766.684707,619.703611,797.761216,633.641032,400.211812,740.278161,-281.013819,-523.565316,...,-832.104646,302.989714,-137.704940,-962.476422,-1104.080213,1741.380619,-1233.218197,-299.162175,-248.402580,509.533531
9997,119.855555,371.397069,160.872070,1311.845742,1671.565629,-338.074010,538.515152,192.279089,288.815507,-610.311267,...,-1348.086721,270.932457,-356.623012,-1615.729988,-621.435789,385.996574,-1141.932790,-545.219344,-1282.514926,992.688622
9998,818.448151,-175.015753,367.743852,-593.880692,982.232684,-616.312886,1064.070871,172.403689,-84.664273,-861.876671,...,-463.504042,1096.968829,-942.104912,-1000.120799,-1195.682454,746.817487,-602.128279,-470.124459,-417.063286,59.374353


In [16]:
# Assigning X to all columns except 0
X_df_new = df_new.drop(columns=0)

# Assigning Y to column 0
Y_df_new = df_new[0]
print(Y_df_new)

# Flatten Y into a 1-D NumPy array (not a Python list) for np.corrcoef
Y_df_new = np.array(Y_df_new).reshape(-1)
print(X_df_new.shape,Y_df_new.shape)

list_titles = X_df_new.columns

# Absolute correlation between column 0 and each of (at most) the first 1000
# remaining columns; the slice simply returns every column when there are fewer.
list_corr_df_new = []
for i in list_titles[0:1000]:
    print(i)
    # Compute the coefficient once and reuse it for both the list and the log
    # line; np.corrcoef returns a 2x2 matrix whose off-diagonal entry [0][1]
    # is the correlation between the two inputs.
    corr = np.corrcoef(Y_df_new, X_df_new[i])[0][1]
    list_corr_df_new.append(abs(corr))
    print('Correlation matrix for column 0 and and column' + str(i) + ': ' + str(corr))

0        124.813898
1        418.766121
2       -287.595514
3        579.368302
4         47.608396
           ...     
9995    -313.987322
9996     841.590441
9997     119.855555
9998     818.448151
9999    1436.121850
Name: 0, Length: 10000, dtype: float64
(10000, 441) (10000,)
1
Correlation matrix for column 0 and and column1: 0.004878477996461379
2
Correlation matrix for column 0 and and column2: -0.0004659120228969102
3
Correlation matrix for column 0 and and column3: -0.008028825462568868
4
Correlation matrix for column 0 and and column4: 0.025203445708758283
5
Correlation matrix for column 0 and and column5: 0.012319348849483006
6
Correlation matrix for column 0 and and column6: 0.015835261833887164
7
Correlation matrix for column 0 and and column7: 0.001475080498439664
8
Correlation matrix for column 0 and and column8: -0.0027197361053428685
9
Correlation matrix for column 0 and and column9: -0.003737709778494682
10
Correlation matrix for column 0 and and column10: 0.0168399267

Correlation matrix for column 0 and and column207: 0.004027256500001956
208
Correlation matrix for column 0 and and column208: -0.002822840571250798
209
Correlation matrix for column 0 and and column209: 0.0005002673959242869
210
Correlation matrix for column 0 and and column210: 0.01857968954293253
211
Correlation matrix for column 0 and and column211: 0.0065839299669945795
212
Correlation matrix for column 0 and and column212: 0.01242118225417362
213
Correlation matrix for column 0 and and column213: 0.004682710327568398
214
Correlation matrix for column 0 and and column214: -0.017696562839076772
215
Correlation matrix for column 0 and and column215: 0.007257965935245272
216
Correlation matrix for column 0 and and column216: -0.0023380229729881995
217
Correlation matrix for column 0 and and column217: -0.0048194766094375105
218
Correlation matrix for column 0 and and column218: -0.0033386733084786488
219
Correlation matrix for column 0 and and column219: -0.0070449433578014045
220
Co

Correlation matrix for column 0 and and column406: 0.006749137211032894
407
Correlation matrix for column 0 and and column407: -0.005733513370668201
408
Correlation matrix for column 0 and and column408: -0.01842510606840736
409
Correlation matrix for column 0 and and column409: -0.014173570910048791
410
Correlation matrix for column 0 and and column410: 0.00872112107634756
411
Correlation matrix for column 0 and and column411: 0.019129019378496124
412
Correlation matrix for column 0 and and column412: 0.010072978922185472
413
Correlation matrix for column 0 and and column413: 0.0035630959865401734
414
Correlation matrix for column 0 and and column414: -0.01215069149240006
415
Correlation matrix for column 0 and and column415: -0.008231117022014706
416
Correlation matrix for column 0 and and column416: -0.020688410422960712
417
Correlation matrix for column 0 and and column417: -0.013396644574405005
418
Correlation matrix for column 0 and and column418: -0.014241964879047034
419
Correl

In [17]:
# Tally the absolute correlations above 0.01 in the original frame (a)
# and in the randomly projected frame (b).
a = 0
for corr_value in list_corr_df1:
    if abs(corr_value) > 0.01:
        a += 1

b = 0
for corr_value in list_corr_df_new:
    if abs(corr_value) > 0.01:
        b += 1

# Express both tallies as percentages of the number of correlations examined.
share_df1 = a / len(list_corr_df1) * 100
print("Percentage of correlations in df1:", share_df1, "%")

share_df_new = b / len(list_corr_df_new) * 100
print("Percentage of correlations in df_new:", share_df_new, "%")

Percentage of correlations in df1: 31.7 %
Percentage of correlations in df_new: 35.147392290249435 %
