This notebook generates the test set file that will be used to test the final, tuned models that use the 'Stoich45 Intersection' feature set to predict the HSE06 bandgap.

In [2]:
import pandas as pd
import numpy as np

In [23]:
# Column layout of the training version of the Stoich45 intersection feature
# set; the test set is restricted to exactly these columns so that both files
# share the same schema.
columns = pd.read_csv('../data/datasets_main/Stoich45_FeatureSelected_dataset.csv').columns

# Full Stoich45 fingerprint table (rows are identified by the 'MOF' column).
df_stoich45_full = pd.read_csv('../data/QMOF/qmof_database/qmof_database/relaxed_structures_fingerprints/stoich45_fingerprints.csv')

# MOFs held out for testing (rows are identified by the 'qmof_id' column).
df_hse06_test = pd.read_csv('../data/datasets_main/test_hse06.csv')

# A left merge keeps every test-set row and attaches its fingerprint where one
# exists; the merged frame is then cut down to the training columns.
df_stoich45Int_test = df_hse06_test.merge(
    df_stoich45_full,
    left_on='qmof_id',
    right_on='MOF',
    how='left',
)[columns]

# Report the per-column NaN counts before any rows are removed.
print(df_stoich45Int_test.isna().sum())

# Drop rows with NaN values by rebinding the name to a new frame instead of
# mutating in place (pandas discourages inplace=True; it gives no performance
# benefit and prevents method chaining).
df_stoich45Int_test = df_stoich45Int_test.dropna()
df_stoich45Int_test

atomic_num_standard_deviation            0
atomic_num_max                           0
group_num_mean                           0
period_num_mean                          0
electronegativity_min                   41
electron_affinity_mean                  26
electron_affinity_geometric_mean        26
electron_affinity_standard_deviation    26
melting_mean                             0
melting_geometric_mean                   0
melting_standard_deviation               0
boiling_mean                             0
boiling_geometric_mean                   0
boiling_standard_deviation               0
density_mean                             0
density_geometric_mean                   0
density_standard_deviation               0
density_max                              0
ionization_energy_mean                   0
ionization_energy_geometric_mean         0
ionization_energy_standard_deviation     0
ionization_energy_max                    0
ionization_energy_geometric_min          0
outputs.hse

Unnamed: 0,atomic_num_standard_deviation,atomic_num_max,group_num_mean,period_num_mean,electronegativity_min,electron_affinity_mean,electron_affinity_geometric_mean,electron_affinity_standard_deviation,melting_mean,melting_geometric_mean,...,density_geometric_mean,density_standard_deviation,density_max,ionization_energy_mean,ionization_energy_geometric_mean,ionization_energy_standard_deviation,ionization_energy_max,ionization_energy_geometric_min,outputs.hse06.bandgap,MOF
0,4.717126,29.0,9.972973,1.729730,1.90,102.118919,72.280456,54.720389,1707.683514,246.316950,...,17.461229,1700.185134,8960.0,1213.921622,1203.934034,149.552611,1402.3,745.5,3.040455,qmof-9b6dfb6
1,4.654337,19.0,10.580645,1.903226,0.82,123.074194,115.108503,39.909882,1167.264516,158.342497,...,11.328725,1026.280317,2260.0,1179.503226,1144.387247,227.658555,1313.9,418.8,4.396124,qmof-1b6e4f0
2,6.322367,30.0,11.294118,1.882353,1.65,89.682353,0.000000,62.562395,1414.677647,213.446303,...,16.048708,1821.439157,7140.0,1230.023529,1220.265223,151.073420,1402.3,906.4,4.324525,qmof-a560beb
3,9.172250,48.0,8.461538,1.692308,1.69,109.346154,0.000000,45.625661,939.180769,91.931127,...,4.026897,1827.387355,8650.0,1231.300000,1223.857715,128.707453,1313.9,867.8,5.560470,qmof-1bc82b5
4,8.177816,48.0,10.433333,1.800000,1.69,103.133333,0.000000,55.541740,1568.990333,219.388124,...,15.582738,1764.808334,8650.0,1219.286667,1210.950704,139.316734,1402.3,867.8,4.378257,qmof-42f7d2d
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2158,2.764706,8.0,8.176471,1.588235,0.98,113.061765,105.571085,39.850485,1502.315294,179.412757,...,10.837738,1080.380436,2260.0,1179.482353,1156.316803,196.879649,1313.9,520.2,3.653921,qmof-da6d484
2159,4.126971,29.0,11.067797,1.762712,1.90,125.983051,94.730789,72.824214,1471.732712,197.157775,...,12.180584,1508.323682,8960.0,1252.835593,1240.440614,175.957776,1681.0,745.5,3.701753,qmof-855d1ed
2160,4.124784,19.0,9.647059,1.735294,0.82,107.679412,90.954336,45.032810,1054.414706,116.693383,...,5.591195,1006.123537,2260.0,1223.023529,1202.860391,179.275304,1402.3,418.8,4.872454,qmof-11627b4
2161,4.201239,28.0,9.320755,1.660377,1.91,99.358491,71.861425,53.011068,1494.248113,177.202934,...,9.371275,1543.272307,8908.0,1229.832075,1220.991739,141.082679,1402.3,737.1,3.866952,qmof-08b67ba


In [24]:
# Persist the cleaned test-set frame as CSV; index=False keeps the DataFrame
# index out of the file so only the selected columns are written.
df_stoich45Int_test.to_csv(
    path_or_buf='../data/datasets_main/Stoich45_FeatureSelected_test_set.csv',
    index=False,
)