# Build features

In this notebook we will go through how we can build features with the python library matminer [citation]. Some always, we start off with some imports. 

In [1]:
# Optional: Load the "autoreload" extension so that code can change
%load_ext autoreload

#OPTIONAL: Always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [48]:
import sys 
sys.path.insert(0, "../")

import os

import pandas as pd
import numpy as np
from tqdm import tqdm

from src.data.get_data_MP import data_MP
from src.features.build_features import featurize_by_material_id
from src.features.featurizeAll import FeaturizeAll
# Ignore warnings from nan-values in  
np.warnings.filterwarnings('ignore')

# Find and store all API-keys that are stored as environment variables .env in root folder
from dotenv import find_dotenv, load_dotenv
key_status = load_dotenv(find_dotenv())

# Private keys. If not present, add your own secret keys here
if (key_status):
    MAPI_KEY = os.getenv("MAPI_KEY")
    CAPI_KEY = os.getenv("CAPI_KEY")
else: 
    MAPI_KEY = None
    CAPI_KEY = None

from pathlib import Path
data_dir = Path.cwd().parent / "data"
print("Current data directory {}".format(data_dir))

Current data directory /home/oliver/Dokumenter/masterprosjekt/predicting-solid-state-qubit-candidates-v2/data


In [3]:
MP = data_MP(API_KEY=MAPI_KEY)
entries = MP.get_dataframe()
print("Number of entries after query: {}".format(len(entries)))

Data path /home/oliver/Dokumenter/masterprosjekt/predicting-solid-state-qubit-candidates-v2/data/raw/MP/MP.pkl detected. Reading now...
Done
Number of entries after query: 25212


In [4]:
#featurizerObject = FeaturizeAll()
#df = featurize_by_material_id(entries["material_id"].iloc[:2], featurizerObject, "b7RtVfJTsUg6TK8E")

In [5]:
def sortByMPID(df):
    mpid_num = []
    for i in df["material_id"]:
        mpid_num.append(int(i[3:]))
    df["mpid_num"] = mpid_num
    df = df.sort_values(by="mpid_num").reset_index(drop=True)
    df = df.drop(columns=["mpid_num"])
    #df = df.set_index("material_id")
    return df

# Oxidation

In [6]:
MP_oxidation_featurized = pd.read_csv(data_dir / "interim" / "Featurized" / "MP_oxidationFeaturized.csv", sep=",")
print(MP_oxidation_featurized.shape)
#MP_oxidation_featurized = findIdenticalIDs(MP_oxidation_featurized)
#MP_oxidation_featurized

(25270, 113)


In [7]:
def findIdenticalIDs(someEntries):
    dropIds = np.zeros(len(someEntries))
    for i, mpid in tqdm(enumerate(someEntries["material_id"])):
        for j in entries["material_id"]:
            if mpid==j:
                dropIds[i] += 1
    return someEntries.drop(np.where(dropIds==0)[0]).reset_index(drop=True)

In [8]:
MP_oxidation_featurized = findIdenticalIDs(MP_oxidation_featurized)
MP_oxidation_featurized.shape

25270it [00:44, 565.59it/s]


(25201, 113)

In [9]:
dropIds1 = np.zeros(len(entries))
for i, mpid in tqdm(enumerate(entries["material_id"])):
    for j in MP_oxidation_featurized["material_id"]:
        if mpid==j:
            dropIds1[i] += 1

25212it [01:00, 420.05it/s]


In [10]:
entries["material_id"][np.where(dropIds1 == 0)[0]].values

array(['mp-20946', 'mp-31624', 'mp-565970', 'mp-568700', 'mp-644925',
       'mp-1101391', 'mp-1101820', 'mp-1172939', 'mp-1179149',
       'mp-1180710', 'mp-1293833'], dtype=object)

In [11]:
drop_columns = ["composition", "composition_oxid"]
MP_oxidation_featurized = MP_oxidation_featurized.drop(drop_columns, axis=1)

# Electronic

In [12]:
MP_electronic_featurized = pd.read_csv(data_dir / "interim" / "Featurized" / "MP_electronicFeaturized.csv", sep=",", index_col=0)
print(MP_electronic_featurized.shape)
MP_electronic_featurized = findIdenticalIDs(MP_electronic_featurized)
MP_electronic_featurized.shape

55it [00:00, 544.84it/s]

(25271, 19)


25271it [00:44, 565.10it/s]


(25201, 19)

In [13]:
drop_columns = ["full_formula"]
MP_electronic_featurized = MP_electronic_featurized.drop(drop_columns, axis=1)

MP_electronic_featurized

dropIds2 = np.zeros(len(entries))
for i, mpid in tqdm(enumerate(entries["material_id"])):
    for j in MP_electronic_featurized["material_id"]:
        if mpid==j:
            dropIds2[i] += 1

25212it [01:00, 415.51it/s]


In [14]:
#MP_electronic_featurized = MP_electronic_featurized.join(MP_oxidation_featurized.set_index('material_id'), on='material_id')
#MP_electronic_featurized = sortByMPID(MP_electronic_featurized)
#bandGaps["featurized_Eg"] = MP_electronic_featurized.pop("band_gap")
#MP_electronic_featurized
entries["material_id"][np.where(dropIds2 == 0)[0]].values

array(['mp-20946', 'mp-31624', 'mp-565970', 'mp-568700', 'mp-644925',
       'mp-1101391', 'mp-1101820', 'mp-1172939', 'mp-1179149',
       'mp-1180710', 'mp-1293833'], dtype=object)

# The rest

In [15]:
MP_rest_of_featurizers = pd.read_pickle(data_dir / "interim" / "Featurized" / "MP_featurizedAll.pkl")
MP_rest_of_featurizers.shape

(25212, 4700)

In [16]:
MP_rest_of_featurizers = findIdenticalIDs(MP_rest_of_featurizers)

25212it [00:44, 568.55it/s]


In [17]:
MP_rest_of_featurizers.shape

(25212, 4700)

In [18]:
#MP_electronic_featurized = MP_electronic_featurized.join(MP_rest_of_featurizers.set_index('material_id'), on='material_id')

In [19]:
dropIds3 = np.zeros(len(entries))
for i, mpid in tqdm(enumerate(entries["material_id"])):
    for j in MP_rest_of_featurizers["material_id"]:
        if mpid==j:
            dropIds3[i] += 1

25212it [01:06, 380.06it/s]


In [20]:
entries["material_id"][np.where(dropIds3 == 0)[0]].values

array([], dtype=object)

In [21]:
slettAlleDisse = list(entries["material_id"][np.where(dropIds1 == 0)[0]].values)\
               + list(entries["material_id"][np.where(dropIds2 == 0)[0]].values)\
               + list(entries["material_id"][np.where(dropIds3 == 0)[0]].values)
slettAlleDisse = list(set(slettAlleDisse))
print(len(slettAlleDisse))
slettAlleDisse

11


['mp-1101820',
 'mp-1180710',
 'mp-20946',
 'mp-1293833',
 'mp-1179149',
 'mp-1172939',
 'mp-644925',
 'mp-565970',
 'mp-31624',
 'mp-568700',
 'mp-1101391']

## slette

In [22]:
for i in slettAlleDisse:
    try:
        MP_rest_of_featurizers   = MP_rest_of_featurizers  [MP_rest_of_featurizers  ["material_id"] != i]
    except: 
        continue
for i in slettAlleDisse:
    try:
        MP_oxidation_featurized  = MP_oxidation_featurized [MP_oxidation_featurized ["material_id"] != i]
    except: 
        continue
for i in slettAlleDisse:
    try:
        MP_electronic_featurized = MP_electronic_featurized[MP_electronic_featurized["material_id"] != i]
    except: 
        continue

In [23]:
MP_rest_of_featurizers.shape

(25201, 4700)

In [24]:
MP_electronic_featurized.shape

(25201, 18)

In [25]:
MP_oxidation_featurized.shape

(25201, 111)

# Featurize the rest

In [26]:
leggtiligjen = slettAlleDisse
featurizerObject = FeaturizeAll()
df = featurize_by_material_id(leggtiligjen, featurizerObject, MAPI_KEY)

100%|██████████| 1/1 [00:00<00:00, 9962.72it/s]


Downloading dos and bandstructure objects..
                  full_formula  \
material_id                      
mp-1101391   Sr4Re8H32C6N12O42   
mp-1101820            Mg1Ti3H8   
mp-1172939       Na2Mg4H6S4O16   
mp-1179149            Sr2Mo2O8   
mp-1180710          Na24B52O90   
mp-1293833             Y2Fe4O8   
mp-20946            Ba1Y1Fe2O5   
mp-31624          Sr8Ta4Cr4O24   
mp-565970              Ba2W2O8   
mp-568700             Na5Li1N2   
mp-644925        Cs10Bi2Mo8O32   

                                                 bandstructure  \
material_id                                                      
mp-1101391                                                None   
mp-1101820                                                None   
mp-1172939                                                None   
mp-1179149                                                None   
mp-1180710                                                None   
mp-1293833                                         

MultipleFeaturizer:   0%|          | 0/11 [00:00<?, ?it/s]

Applying oxidation state featurizers...


CompositionToOxidComposition:   0%|          | 0/11 [00:00<?, ?it/s]

Applying featurizers (ElectronegativityDiff(stats=['minimum', 'maximum', 'range', 'mean', 'std_dev']), OxidationStates(stats=['minimum', 'maximum', 'range', 'std_dev'])) to column 'composition_oxid'.


MultipleFeaturizer:   0%|          | 0/11 [00:00<?, ?it/s]

Applying structure featurizers...
Applying featurizers (DensityFeatures(), GlobalSymmetryFeatures(), RadialDistributionFunction(), CoulombMatrix(), PartialRadialDistributionFunction(exclude_elems=[], include_elems=[]), SineCoulombMatrix(), EwaldEnergy(), BondFractions(), StructuralHeterogeneity(), MaximumPackingEfficiency(), ChemicalOrdering(), XRDPowderPattern(pattern_length=128)) to column 'structure'.


MultipleFeaturizer:   0%|          | 0/11 [00:00<?, ?it/s]

Applying site featurizers...


SiteStatsFingerprint:   0%|          | 0/11 [00:00<?, ?it/s]

SiteStatsFingerprint:   0%|          | 0/11 [00:00<?, ?it/s]

SiteStatsFingerprint:   0%|          | 0/11 [00:00<?, ?it/s]

SiteStatsFingerprint:   0%|          | 0/11 [00:00<?, ?it/s]

SiteStatsFingerprint:   0%|          | 0/11 [00:00<?, ?it/s]

SiteStatsFingerprint:   0%|          | 0/11 [00:00<?, ?it/s]

SiteStatsFingerprint:   0%|          | 0/11 [00:00<?, ?it/s]

SiteStatsFingerprint:   0%|          | 0/11 [00:00<?, ?it/s]

SiteStatsFingerprint:   0%|          | 0/11 [00:00<?, ?it/s]

SiteStatsFingerprint:   0%|          | 0/11 [00:00<?, ?it/s]

SiteStatsFingerprint:   0%|          | 0/11 [00:00<?, ?it/s]

SiteStatsFingerprint:   0%|          | 0/11 [00:00<?, ?it/s]

Applying dos featurizers...
DOSFeaturizer()
<class 'matminer.featurizers.dos.DOSFeaturizer'>
Applying featurizers DOSFeaturizer() to column 'dos'.


DOSFeaturizer:   0%|          | 0/11 [00:00<?, ?it/s]

DOSFeaturizer|cbm_location_1
material_id
mp-1101391              0
mp-1101820              0
mp-1172939              0
mp-1179149              0
mp-1180710              0
mp-1293833              0
mp-20946                0
mp-31624      0.0;0.0;0.0
mp-565970               0
mp-568700               0
mp-644925               0
Name: DOSFeaturizer|cbm_location_1, dtype: object
DOSFeaturizer|vbm_location_1
material_id
mp-1101391              0
mp-1101820              0
mp-1172939              0
mp-1179149              0
mp-1180710              0
mp-1293833              0
mp-20946                0
mp-31624      0.0;0.0;0.0
mp-565970               0
mp-568700               0
mp-644925               0
Name: DOSFeaturizer|vbm_location_1, dtype: object
Applying bandstructure featurizers...
Applying featurizers BandFeaturizer() to column 'bandstructure'.


BandFeaturizer:   0%|          | 0/11 [00:00<?, ?it/s]

In [27]:
df

Unnamed: 0_level_0,DOSFeaturizer|cbm_hybridization,DOSFeaturizer|cbm_character_1,DOSFeaturizer|cbm_score_1,DOSFeaturizer|vbm_hybridization,DOSFeaturizer|vbm_character_1,DOSFeaturizer|vbm_score_1,DOSFeaturizer|vbm_specie_1_0,DOSFeaturizer|vbm_specie_1_Cr,DOSFeaturizer|cbm_specie_1_0,DOSFeaturizer|cbm_specie_1_Cr,...,VoronoiFingerprint|mean Voro_area_maximum,VoronoiFingerprint|std_dev Voro_area_maximum,VoronoiFingerprint|mean Voro_dist_mean,VoronoiFingerprint|std_dev Voro_dist_mean,VoronoiFingerprint|mean Voro_dist_std_dev,VoronoiFingerprint|std_dev Voro_dist_std_dev,VoronoiFingerprint|mean Voro_dist_minimum,VoronoiFingerprint|std_dev Voro_dist_minimum,VoronoiFingerprint|mean Voro_dist_maximum,VoronoiFingerprint|std_dev Voro_dist_maximum
material_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mp-1101391,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,0,...,7.023839,0.976089,3.035521,0.301694,0.787671,0.158788,1.385053,0.412354,4.156663,0.333327
mp-1101820,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,0,...,2.47759,0.045546,2.051067,0.085124,0.099675,0.070481,1.924555,0.011716,2.155308,0.158526
mp-1172939,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,0,...,6.24085,0.863647,2.889165,0.248961,0.70978,0.147942,1.495602,0.426623,3.888789,0.314001
mp-1179149,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,0,...,5.755912,0.483486,3.075222,0.274846,0.617884,0.025133,1.94115,0.304674,4.078832,0.420009
mp-1180710,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
mp-1293833,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,0,...,6.75263,0.83645,3.038188,0.233231,0.644195,0.115239,2.032979,0.101709,4.042751,0.263548
mp-20946,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,0,...,5.01993,0.567587,2.896172,0.185482,0.463701,0.122813,2.159141,0.267067,3.473805,0.360242
mp-31624,1.282979,3.0,0.459716,1.898573,3.0,0.346259,0,1,0,1,...,3.777399,0.477091,2.759744,0.334006,0.347495,0.181992,2.159428,0.333376,3.046604,0.579343
mp-565970,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,0,...,6.121136,0.406202,3.21801,0.302527,0.664342,0.033245,1.98431,0.362062,4.265212,0.433473
mp-568700,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,0,...,3.201073,0.025254,2.840116,0.045773,0.262754,0.053513,2.517868,0.111403,3.377436,0.092917


In [28]:
df = df.reset_index()
df

Unnamed: 0,material_id,DOSFeaturizer|cbm_hybridization,DOSFeaturizer|cbm_character_1,DOSFeaturizer|cbm_score_1,DOSFeaturizer|vbm_hybridization,DOSFeaturizer|vbm_character_1,DOSFeaturizer|vbm_score_1,DOSFeaturizer|vbm_specie_1_0,DOSFeaturizer|vbm_specie_1_Cr,DOSFeaturizer|cbm_specie_1_0,...,VoronoiFingerprint|mean Voro_area_maximum,VoronoiFingerprint|std_dev Voro_area_maximum,VoronoiFingerprint|mean Voro_dist_mean,VoronoiFingerprint|std_dev Voro_dist_mean,VoronoiFingerprint|mean Voro_dist_std_dev,VoronoiFingerprint|std_dev Voro_dist_std_dev,VoronoiFingerprint|mean Voro_dist_minimum,VoronoiFingerprint|std_dev Voro_dist_minimum,VoronoiFingerprint|mean Voro_dist_maximum,VoronoiFingerprint|std_dev Voro_dist_maximum
0,mp-1101391,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,...,7.023839,0.976089,3.035521,0.301694,0.787671,0.158788,1.385053,0.412354,4.156663,0.333327
1,mp-1101820,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,...,2.47759,0.045546,2.051067,0.085124,0.099675,0.070481,1.924555,0.011716,2.155308,0.158526
2,mp-1172939,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,...,6.24085,0.863647,2.889165,0.248961,0.70978,0.147942,1.495602,0.426623,3.888789,0.314001
3,mp-1179149,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,...,5.755912,0.483486,3.075222,0.274846,0.617884,0.025133,1.94115,0.304674,4.078832,0.420009
4,mp-1180710,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,mp-1293833,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,...,6.75263,0.83645,3.038188,0.233231,0.644195,0.115239,2.032979,0.101709,4.042751,0.263548
6,mp-20946,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,...,5.01993,0.567587,2.896172,0.185482,0.463701,0.122813,2.159141,0.267067,3.473805,0.360242
7,mp-31624,1.282979,3.0,0.459716,1.898573,3.0,0.346259,0,1,0,...,3.777399,0.477091,2.759744,0.334006,0.347495,0.181992,2.159428,0.333376,3.046604,0.579343
8,mp-565970,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,...,6.121136,0.406202,3.21801,0.302527,0.664342,0.033245,1.98431,0.362062,4.265212,0.433473
9,mp-568700,0.0,-1.0,0.0,0.0,-1.0,0.0,1,0,1,...,3.201073,0.025254,2.840116,0.045773,0.262754,0.053513,2.517868,0.111403,3.377436,0.092917


# Add together all contributions

In [29]:
MP_electronic_featurized = MP_electronic_featurized.join(MP_oxidation_featurized.set_index('material_id'), on='material_id')

In [30]:
MP_ALL_featurized = MP_rest_of_featurizers.join(MP_oxidation_featurized.set_index('material_id'), on='material_id')

In [31]:
MP_ALL_featurized

Unnamed: 0,material_id,AtomicOrbitals|HOMO_character,AtomicOrbitals|HOMO_element,AtomicOrbitals|HOMO_energy,AtomicOrbitals|LUMO_character,AtomicOrbitals|LUMO_element,AtomicOrbitals|LUMO_energy,AtomicOrbitals|gap_AO,AtomicPackingEfficiency|mean simul. packing efficiency,AtomicPackingEfficiency|mean abs simul. packing efficiency,...,Es,Fm,Md,No,Lr,minimum EN difference,maximum EN difference,range EN difference,mean EN difference,std_dev EN difference
0,mvc-12905,3.0,26,-0.295049,3.0,26,-0.295049,0.000000,-0.037170,0.037170,...,0,0,0,0,0,,,,,
1,mp-7,2.0,16,-0.261676,2.0,16,-0.261676,0.000000,0.023994,0.023994,...,0,0,0,0,0,,,,,
2,mp-14,2.0,34,-0.245806,2.0,34,-0.245806,0.000000,0.023994,0.023994,...,0,0,0,0,0,,,,,
3,mp-19,2.0,52,-0.226594,2.0,52,-0.226594,0.000000,0.023994,0.023994,...,0,0,0,0,0,,,,,
4,mp-24,2.0,6,-0.199186,2.0,6,-0.199186,0.000000,0.023994,0.023994,...,0,0,0,0,0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25207,mp-1539137,3.0,24,-0.118123,3.0,24,-0.118123,0.000000,0.043418,0.043418,...,0,0,0,0,0,0.92,1.76,0.84,1.0600,0.593970
25208,mp-1541522,2.0,8,-0.338381,2.0,8,-0.338381,0.000000,0.043618,0.045390,...,0,0,0,0,0,1.25,1.42,0.17,1.3350,0.120208
25209,mp-1541714,2.0,17,-0.320380,2.0,33,-0.197497,0.122883,0.000000,0.000000,...,0,0,0,0,0,0.98,2.37,1.39,1.8140,0.982878
25210,mp-1542038,2.0,34,-0.245806,2.0,34,-0.245806,0.000000,-0.017041,0.039003,...,0,0,0,0,0,0.59,1.76,1.17,1.1750,0.827315


In [32]:
ALL = pd.concat([MP_ALL_featurized,df])

In [33]:
ALL = sortByMPID(ALL)

In [54]:
ALL

Unnamed: 0,material_id,AtomicOrbitals|HOMO_character,AtomicOrbitals|HOMO_element,AtomicOrbitals|HOMO_energy,AtomicOrbitals|LUMO_character,AtomicOrbitals|LUMO_element,AtomicOrbitals|LUMO_energy,AtomicOrbitals|gap_AO,AtomicPackingEfficiency|mean simul. packing efficiency,AtomicPackingEfficiency|mean abs simul. packing efficiency,...,BandFeaturizer|n_ex1_degen,ElectronegativityDiff|minimum EN difference,ElectronegativityDiff|maximum EN difference,ElectronegativityDiff|range EN difference,ElectronegativityDiff|mean EN difference,ElectronegativityDiff|std_dev EN difference,OxidationStates|minimum oxidation state,OxidationStates|maximum oxidation state,OxidationStates|range oxidation state,OxidationStates|std_dev oxidation state
0,mvc-12905,3.0,26,-0.295049,3.0,26,-0.295049,0.000000,-0.037170,0.037170,...,,,,,,,,,,
1,mp-7,2.0,16,-0.261676,2.0,16,-0.261676,0.000000,0.023994,0.023994,...,,,,,,,,,,
2,mp-14,2.0,34,-0.245806,2.0,34,-0.245806,0.000000,0.023994,0.023994,...,,,,,,,,,,
3,mp-19,2.0,52,-0.226594,2.0,52,-0.226594,0.000000,0.023994,0.023994,...,,,,,,,,,,
4,mp-24,2.0,6,-0.199186,2.0,6,-0.199186,0.000000,0.023994,0.023994,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25207,mp-1539137,3.0,24,-0.118123,3.0,24,-0.118123,0.000000,0.043418,0.043418,...,,,,,,,,,,
25208,mp-1541522,2.0,8,-0.338381,2.0,8,-0.338381,0.000000,0.043618,0.045390,...,,,,,,,,,,
25209,mp-1541714,2.0,17,-0.320380,2.0,33,-0.197497,0.122883,0.000000,0.000000,...,,,,,,,,,,
25210,mp-1542038,2.0,34,-0.245806,2.0,34,-0.245806,0.000000,-0.017041,0.039003,...,,,,,,,,,,


## Sette på ny som er bra for alle. Nå er det godt system.

# Add band gap from other databases

In [52]:
bandgaps = pd.read_pickle(data_dir / "interim" / "bandgaps.pkl")
bandgaps

Unnamed: 0,material_id,MP_Eg,OQMD_Eg,AFLOW_Eg,AFLOW-fitted_Eg,AFLOWML_Eg,JARVIS-TBMBJ_Eg,JARVIS-OPT_Eg,Exp_Eg,spillage
0,mvc-12905,1.2690,,,,1.047,,,,
1,mp-7,2.4881,2.085,2.5251,4.31683,2.490,3.0448,1.9604,,
2,mp-14,1.0119,,0.9784,2.23188,0.997,2.2888,0.8982,,
3,mp-19,0.5752,,0.1534,1.11978,,0.6148,0.1655,,1.318
4,mp-24,2.7785,,2.4528,4.21937,3.355,3.3186,2.7427,,
...,...,...,...,...,...,...,...,...,...,...
25209,mp-1539137,0.3523,0.508,,,0.584,,,,
25210,mp-1541522,4.0039,,,,3.445,,,,
25211,mp-1541714,2.6408,,,,2.857,,,,
25212,mp-1542038,1.2725,,,,1.094,,,,


In [58]:
ALL = ALL.join(bandgaps.set_index('material_id'), on='material_id')
ALL

Unnamed: 0,material_id,AtomicOrbitals|HOMO_character,AtomicOrbitals|HOMO_element,AtomicOrbitals|HOMO_energy,AtomicOrbitals|LUMO_character,AtomicOrbitals|LUMO_element,AtomicOrbitals|LUMO_energy,AtomicOrbitals|gap_AO,AtomicPackingEfficiency|mean simul. packing efficiency,AtomicPackingEfficiency|mean abs simul. packing efficiency,...,OxidationStates|std_dev oxidation state,MP_Eg,OQMD_Eg,AFLOW_Eg,AFLOW-fitted_Eg,AFLOWML_Eg,JARVIS-TBMBJ_Eg,JARVIS-OPT_Eg,Exp_Eg,spillage
0,mvc-12905,3.0,26,-0.295049,3.0,26,-0.295049,0.000000,-0.037170,0.037170,...,,1.2690,,,,1.047,,,,
1,mp-7,2.0,16,-0.261676,2.0,16,-0.261676,0.000000,0.023994,0.023994,...,,2.4881,2.085,2.5251,4.31683,2.490,3.0448,1.9604,,
2,mp-14,2.0,34,-0.245806,2.0,34,-0.245806,0.000000,0.023994,0.023994,...,,1.0119,,0.9784,2.23188,0.997,2.2888,0.8982,,
3,mp-19,2.0,52,-0.226594,2.0,52,-0.226594,0.000000,0.023994,0.023994,...,,0.5752,,0.1534,1.11978,,0.6148,0.1655,,1.318
4,mp-24,2.0,6,-0.199186,2.0,6,-0.199186,0.000000,0.023994,0.023994,...,,2.7785,,2.4528,4.21937,3.355,3.3186,2.7427,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25207,mp-1539137,3.0,24,-0.118123,3.0,24,-0.118123,0.000000,0.043418,0.043418,...,,0.3523,0.508,,,0.584,,,,
25208,mp-1541522,2.0,8,-0.338381,2.0,8,-0.338381,0.000000,0.043618,0.045390,...,,4.0039,,,,3.445,,,,
25209,mp-1541714,2.0,17,-0.320380,2.0,33,-0.197497,0.122883,0.000000,0.000000,...,,2.6408,,,,2.857,,,,
25210,mp-1542038,2.0,34,-0.245806,2.0,34,-0.245806,0.000000,-0.017041,0.039003,...,,1.2725,,,,1.094,,,,


In [59]:
ALL.to_pickle(data_dir / "interim" / "featurized" / "featurizedData.pkl")