In [1]:
import csv
import numpy as np
import time
import random

In [2]:
filename='featuresKrM'
data_file_name=filename+".csv" # diversity selection filename

 
diverse_ratio=0.8 # training set ratio, 
remaining_ratio=1-diverse_ratio # test set ratio

In [3]:
with open(data_file_name) as f:
        data_file = csv.reader(f)
        temp = next(data_file)
        n_samples = int(temp[0])
        N_features = int(temp[1])
        data = np.empty((n_samples, N_features))
        temp = next(data_file)
        feature_names = np.array(temp)
        
        for i, d in enumerate(data_file):
            data[i] = np.asarray(d[2:2+N_features], dtype=np.float64)

print("Total number of materials : ", data.shape[0])
N_materials = data.shape[0]
N_features = 6
print("Number of features: ", N_features)

feature_0 = data.T[0]
feature_1 = data.T[1]
feature_2 = data.T[2]
feature_3 = data.T[3]
feature_4 = data.T[4]
feature_5 = data.T[5]

# rescale
feature_0 = (feature_0 - np.min(feature_0))/(np.max(feature_0) - np.min(feature_0))
feature_1 = (feature_1 - np.min(feature_1))/(np.max(feature_1) - np.min(feature_1))
feature_2 = (feature_2 - np.min(feature_2))/(np.max(feature_2) - np.min(feature_2))
feature_3 = (feature_3 - np.min(feature_3))/(np.max(feature_3) - np.min(feature_3))
feature_4 = (feature_4 - np.min(feature_4))/(np.max(feature_4) - np.min(feature_4))
feature_5 = (feature_5 - np.min(feature_5))/(np.max(feature_5) - np.min(feature_5))

if N_features == 6:
    x = np.concatenate((feature_0.reshape(1,N_materials),feature_1.reshape(1,N_materials),feature_2.reshape(1,N_materials),
                        feature_3.reshape(1,N_materials),feature_4.reshape(1,N_materials),feature_5.reshape(1,N_materials),))
    
    
print("Shape of feature x: ", np.shape(x))
print("Example feature vector = " , x[:,0])
N_sample = int(N_materials * diverse_ratio)-1

print("Sampling %d diverse structures out of %d" % (N_sample+1,N_materials))
print("total accessible materials considered: ", N_materials)



print("Starting diversity selection. Seeking %d points" % (N_sample+1))
time.sleep(1)
# store indices of x here for the diverse and non-diverse sets.
diverse_set = []
remaining_set = list(range(N_materials))
### INITIALIZE WITH RANDOMLY SELECTED POINT
idx_init = random.sample(list(np.arange(N_materials)),1)[0]
diverse_set.append(idx_init)
remaining_set.remove(idx_init)
N_diverse = 1
while N_diverse <= N_sample:
        print("Selecting point ", N_diverse)
        min_d_to_diverse_set = np.zeros((N_materials-N_diverse,))
        # for every candidate point not in diverse set...
        for i in range(N_materials - N_diverse):
            # get the distance of this point to each point in the diverse set
            d_from_each_diverse_pt = np.linalg.norm(x[:,diverse_set] - x[:,remaining_set[i]].reshape(N_features,1),axis=0)
            # get the closest distance that this point is to the diverse set
            min_d_to_diverse_set[i] = np.min(d_from_each_diverse_pt)
        # select point that has the largest distance from the diverse set
        idx_select = remaining_set[np.argmax(min_d_to_diverse_set)]
        assert (len(remaining_set) == np.size(min_d_to_diverse_set))
        print("\tSelected point " , idx_select)
        # add point to diverse set; remove it from remaining set
        diverse_set.append(idx_select)
        remaining_set.remove(idx_select)
        print("\tPts in diverse set: ", len(diverse_set))
        print("\tPts in remaining set: ", len(remaining_set))
        print(diverse_set[N_diverse-1])
        N_diverse += 1

with open("divided_set_"+str(diverse_ratio)+"_"+str("%.1f"%remaining_ratio)+"_"+"Kr_M.txt", "w") as f:
    f.write(str(diverse_set)+" "+str(remaining_set))
    print("Save file name : divided_set_"+str(diverse_ratio)+"_"+str("%.1f"%remaining_ratio)+"_"+"Kr_M.txt")

Total number of materials :  1081
Number of features:  6
Shape of feature x:  (6, 1081)
Example feature vector =  [0.00328791 0.00934358 0.4236057  0.24965406 0.25052706 0.49097302]
Sampling 864 diverse structures out of 1081
total accessible materials considered:  1081
Starting diversity selection. Seeking 864 points
Selecting point  1
	Selected point  728
	Pts in diverse set:  2
	Pts in remaining set:  1079
247
Selecting point  2
	Selected point  837
	Pts in diverse set:  3
	Pts in remaining set:  1078
728
Selecting point  3
	Selected point  985
	Pts in diverse set:  4
	Pts in remaining set:  1077
837
Selecting point  4
	Selected point  596
	Pts in diverse set:  5
	Pts in remaining set:  1076
985
Selecting point  5
	Selected point  435
	Pts in diverse set:  6
	Pts in remaining set:  1075
596
Selecting point  6
	Selected point  248
	Pts in diverse set:  7
	Pts in remaining set:  1074
435
Selecting point  7
	Selected point  515
	Pts in diverse set:  8
	Pts in remaining set:  1073
248
S

	Selected point  935
	Pts in diverse set:  85
	Pts in remaining set:  996
548
Selecting point  85
	Selected point  564
	Pts in diverse set:  86
	Pts in remaining set:  995
935
Selecting point  86
	Selected point  530
	Pts in diverse set:  87
	Pts in remaining set:  994
564
Selecting point  87
	Selected point  1027
	Pts in diverse set:  88
	Pts in remaining set:  993
530
Selecting point  88
	Selected point  819
	Pts in diverse set:  89
	Pts in remaining set:  992
1027
Selecting point  89
	Selected point  651
	Pts in diverse set:  90
	Pts in remaining set:  991
819
Selecting point  90
	Selected point  962
	Pts in diverse set:  91
	Pts in remaining set:  990
651
Selecting point  91
	Selected point  92
	Pts in diverse set:  92
	Pts in remaining set:  989
962
Selecting point  92
	Selected point  1046
	Pts in diverse set:  93
	Pts in remaining set:  988
92
Selecting point  93
	Selected point  978
	Pts in diverse set:  94
	Pts in remaining set:  987
1046
Selecting point  94
	Selected point  7

	Selected point  327
	Pts in diverse set:  169
	Pts in remaining set:  912
203
Selecting point  169
	Selected point  84
	Pts in diverse set:  170
	Pts in remaining set:  911
327
Selecting point  170
	Selected point  154
	Pts in diverse set:  171
	Pts in remaining set:  910
84
Selecting point  171
	Selected point  619
	Pts in diverse set:  172
	Pts in remaining set:  909
154
Selecting point  172
	Selected point  249
	Pts in diverse set:  173
	Pts in remaining set:  908
619
Selecting point  173
	Selected point  382
	Pts in diverse set:  174
	Pts in remaining set:  907
249
Selecting point  174
	Selected point  119
	Pts in diverse set:  175
	Pts in remaining set:  906
382
Selecting point  175
	Selected point  107
	Pts in diverse set:  176
	Pts in remaining set:  905
119
Selecting point  176
	Selected point  1005
	Pts in diverse set:  177
	Pts in remaining set:  904
107
Selecting point  177
	Selected point  981
	Pts in diverse set:  178
	Pts in remaining set:  903
1005
Selecting point  178


	Selected point  452
	Pts in diverse set:  253
	Pts in remaining set:  828
278
Selecting point  253
	Selected point  344
	Pts in diverse set:  254
	Pts in remaining set:  827
452
Selecting point  254
	Selected point  222
	Pts in diverse set:  255
	Pts in remaining set:  826
344
Selecting point  255
	Selected point  965
	Pts in diverse set:  256
	Pts in remaining set:  825
222
Selecting point  256
	Selected point  713
	Pts in diverse set:  257
	Pts in remaining set:  824
965
Selecting point  257
	Selected point  161
	Pts in diverse set:  258
	Pts in remaining set:  823
713
Selecting point  258
	Selected point  724
	Pts in diverse set:  259
	Pts in remaining set:  822
161
Selecting point  259
	Selected point  38
	Pts in diverse set:  260
	Pts in remaining set:  821
724
Selecting point  260
	Selected point  737
	Pts in diverse set:  261
	Pts in remaining set:  820
38
Selecting point  261
	Selected point  338
	Pts in diverse set:  262
	Pts in remaining set:  819
737
Selecting point  262
	S

	Selected point  920
	Pts in diverse set:  338
	Pts in remaining set:  743
597
Selecting point  338
	Selected point  97
	Pts in diverse set:  339
	Pts in remaining set:  742
920
Selecting point  339
	Selected point  754
	Pts in diverse set:  340
	Pts in remaining set:  741
97
Selecting point  340
	Selected point  104
	Pts in diverse set:  341
	Pts in remaining set:  740
754
Selecting point  341
	Selected point  205
	Pts in diverse set:  342
	Pts in remaining set:  739
104
Selecting point  342
	Selected point  796
	Pts in diverse set:  343
	Pts in remaining set:  738
205
Selecting point  343
	Selected point  408
	Pts in diverse set:  344
	Pts in remaining set:  737
796
Selecting point  344
	Selected point  989
	Pts in diverse set:  345
	Pts in remaining set:  736
408
Selecting point  345
	Selected point  73
	Pts in diverse set:  346
	Pts in remaining set:  735
989
Selecting point  346
	Selected point  72
	Pts in diverse set:  347
	Pts in remaining set:  734
73
Selecting point  347
	Sele

	Selected point  27
	Pts in diverse set:  425
	Pts in remaining set:  656
120
Selecting point  425
	Selected point  738
	Pts in diverse set:  426
	Pts in remaining set:  655
27
Selecting point  426
	Selected point  82
	Pts in diverse set:  427
	Pts in remaining set:  654
738
Selecting point  427
	Selected point  326
	Pts in diverse set:  428
	Pts in remaining set:  653
82
Selecting point  428
	Selected point  36
	Pts in diverse set:  429
	Pts in remaining set:  652
326
Selecting point  429
	Selected point  158
	Pts in diverse set:  430
	Pts in remaining set:  651
36
Selecting point  430
	Selected point  808
	Pts in diverse set:  431
	Pts in remaining set:  650
158
Selecting point  431
	Selected point  464
	Pts in diverse set:  432
	Pts in remaining set:  649
808
Selecting point  432
	Selected point  325
	Pts in diverse set:  433
	Pts in remaining set:  648
464
Selecting point  433
	Selected point  416
	Pts in diverse set:  434
	Pts in remaining set:  647
325
Selecting point  434
	Selec

	Selected point  110
	Pts in diverse set:  510
	Pts in remaining set:  571
14
Selecting point  510
	Selected point  296
	Pts in diverse set:  511
	Pts in remaining set:  570
110
Selecting point  511
	Selected point  575
	Pts in diverse set:  512
	Pts in remaining set:  569
296
Selecting point  512
	Selected point  364
	Pts in diverse set:  513
	Pts in remaining set:  568
575
Selecting point  513
	Selected point  1032
	Pts in diverse set:  514
	Pts in remaining set:  567
364
Selecting point  514
	Selected point  209
	Pts in diverse set:  515
	Pts in remaining set:  566
1032
Selecting point  515
	Selected point  1064
	Pts in diverse set:  516
	Pts in remaining set:  565
209
Selecting point  516
	Selected point  768
	Pts in diverse set:  517
	Pts in remaining set:  564
1064
Selecting point  517
	Selected point  332
	Pts in diverse set:  518
	Pts in remaining set:  563
768
Selecting point  518
	Selected point  147
	Pts in diverse set:  519
	Pts in remaining set:  562
332
Selecting point  5

	Selected point  947
	Pts in diverse set:  598
	Pts in remaining set:  483
565
Selecting point  598
	Selected point  306
	Pts in diverse set:  599
	Pts in remaining set:  482
947
Selecting point  599
	Selected point  1072
	Pts in diverse set:  600
	Pts in remaining set:  481
306
Selecting point  600
	Selected point  329
	Pts in diverse set:  601
	Pts in remaining set:  480
1072
Selecting point  601
	Selected point  556
	Pts in diverse set:  602
	Pts in remaining set:  479
329
Selecting point  602
	Selected point  884
	Pts in diverse set:  603
	Pts in remaining set:  478
556
Selecting point  603
	Selected point  276
	Pts in diverse set:  604
	Pts in remaining set:  477
884
Selecting point  604
	Selected point  314
	Pts in diverse set:  605
	Pts in remaining set:  476
276
Selecting point  605
	Selected point  517
	Pts in diverse set:  606
	Pts in remaining set:  475
314
Selecting point  606
	Selected point  422
	Pts in diverse set:  607
	Pts in remaining set:  474
517
Selecting point  60

	Selected point  945
	Pts in diverse set:  683
	Pts in remaining set:  398
75
Selecting point  683
	Selected point  449
	Pts in diverse set:  684
	Pts in remaining set:  397
945
Selecting point  684
	Selected point  43
	Pts in diverse set:  685
	Pts in remaining set:  396
449
Selecting point  685
	Selected point  85
	Pts in diverse set:  686
	Pts in remaining set:  395
43
Selecting point  686
	Selected point  498
	Pts in diverse set:  687
	Pts in remaining set:  394
85
Selecting point  687
	Selected point  574
	Pts in diverse set:  688
	Pts in remaining set:  393
498
Selecting point  688
	Selected point  330
	Pts in diverse set:  689
	Pts in remaining set:  392
574
Selecting point  689
	Selected point  162
	Pts in diverse set:  690
	Pts in remaining set:  391
330
Selecting point  690
	Selected point  476
	Pts in diverse set:  691
	Pts in remaining set:  390
162
Selecting point  691
	Selected point  360
	Pts in diverse set:  692
	Pts in remaining set:  389
476
Selecting point  692
	Sele

	Selected point  606
	Pts in diverse set:  773
	Pts in remaining set:  308
83
Selecting point  773
	Selected point  541
	Pts in diverse set:  774
	Pts in remaining set:  307
606
Selecting point  774
	Selected point  543
	Pts in diverse set:  775
	Pts in remaining set:  306
541
Selecting point  775
	Selected point  936
	Pts in diverse set:  776
	Pts in remaining set:  305
543
Selecting point  776
	Selected point  959
	Pts in diverse set:  777
	Pts in remaining set:  304
936
Selecting point  777
	Selected point  256
	Pts in diverse set:  778
	Pts in remaining set:  303
959
Selecting point  778
	Selected point  159
	Pts in diverse set:  779
	Pts in remaining set:  302
256
Selecting point  779
	Selected point  403
	Pts in diverse set:  780
	Pts in remaining set:  301
159
Selecting point  780
	Selected point  96
	Pts in diverse set:  781
	Pts in remaining set:  300
403
Selecting point  781
	Selected point  1065
	Pts in diverse set:  782
	Pts in remaining set:  299
96
Selecting point  782
	S

	Selected point  45
	Pts in diverse set:  861
	Pts in remaining set:  220
202
Selecting point  861
	Selected point  121
	Pts in diverse set:  862
	Pts in remaining set:  219
45
Selecting point  862
	Selected point  376
	Pts in diverse set:  863
	Pts in remaining set:  218
121
Selecting point  863
	Selected point  1009
	Pts in diverse set:  864
	Pts in remaining set:  217
376
Save file name : divided_set_0.8_0.2_Kr_M.txt
