# Build features

In this notebook we will go through how we can build features with the python library matminer [citation]. Some always, we start off with some imports. 

In [1]:
# Optional: Load the "autoreload" extension so that code can change
%load_ext autoreload

#OPTIONAL: Always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [2]:
import sys 
sys.path.insert(0, "../")

import os

import pandas as pd
import numpy as np
from tqdm import tqdm

from src.data.get_data_MP import data_MP
from src.features.build_features import featurize_by_material_id
from src.features.preset import PRESET_HEBNES_2021
# Ignore warnings from nan-values in  
np.warnings.filterwarnings('ignore')

# Find and store all API-keys that are stored as environment variables .env in root folder
from dotenv import find_dotenv, load_dotenv
key_status = load_dotenv(find_dotenv())

# Private keys. If not present, add your own secret keys here
if (key_status):
    MAPI_KEY = os.getenv("MAPI_KEY")
    CAPI_KEY = os.getenv("CAPI_KEY")
else: 
    MAPI_KEY = None
    CAPI_KEY = None

from pathlib import Path
data_dir = Path.cwd().parent / "data"
print("Current data directory {}".format(data_dir))


If you use the ChemEnv tool for your research, please consider citing the following reference(s) :
David Waroquiers, Xavier Gonze, Gian-Marco Rignanese, Cathrin Welker-Nieuwoudt, Frank Rosowski,
Michael Goebel, Stephan Schenk, Peter Degelmann, Rute Andre, Robert Glaum, and Geoffroy Hautier,
"Statistical analysis of coordination environments in oxides",
Chem. Mater., 2017, 29 (19), pp 8346-8360,
DOI: 10.1021/acs.chemmater.7b02766

Current data directory /home/oliver/Dokumenter/masterprosjekt/predicting-solid-state-qubit-candidates/data


In [3]:
MP = data_MP(API_KEY=MAPI_KEY)
entries = MP.get_dataframe()
print("Number of entries after query: {}".format(len(entries)))

Data path /home/oliver/Dokumenter/masterprosjekt/predicting-solid-state-qubit-candidates/data/raw/MP/MP.pkl detected. Reading now...
Done
Number of entries after query: 25212


In [4]:
#featurizerObject = PRESET_HEBNES_2021()
#df = featurize_by_material_id(entries["material_id"].iloc[:2], featurizerObject, "b7RtVfJTsUg6TK8E")

In [5]:
def sortByMPID(df):
    mpid_num = []
    for i in df["material_id"]:
        mpid_num.append(int(i[3:]))
    df["mpid_num"] = mpid_num
    df = df.sort_values(by="mpid_num").reset_index(drop=True)
    df = df.drop(columns=["mpid_num"])
    #df = df.set_index("material_id")
    return df

# Oxidation

In [6]:
MP_oxidation_featurized = pd.read_csv(data_dir / "interim" / "featurized" / "MP_oxidationFeaturized.csv", sep=",")
print(MP_oxidation_featurized.shape)
#MP_oxidation_featurized = findIdenticalIDs(MP_oxidation_featurized)
#MP_oxidation_featurized

(25270, 113)


In [7]:
def findIdenticalIDs(someEntries):
    dropIds = np.zeros(len(someEntries))
    for i, mpid in tqdm(enumerate(someEntries["material_id"])):
        for j in entries["material_id"]:
            if mpid==j:
                dropIds[i] += 1
    return someEntries.drop(np.where(dropIds==0)[0]).reset_index(drop=True)

In [8]:
MP_oxidation_featurized = findIdenticalIDs(MP_oxidation_featurized)
MP_oxidation_featurized.shape

25270it [00:43, 578.82it/s]


(25201, 113)

In [9]:
dropIds1 = np.zeros(len(entries))
for i, mpid in tqdm(enumerate(entries["material_id"])):
    for j in MP_oxidation_featurized["material_id"]:
        if mpid==j:
            dropIds1[i] += 1

25212it [00:57, 435.75it/s]


In [10]:
entries["material_id"][np.where(dropIds1 == 0)[0]].values

array(['mp-20946', 'mp-31624', 'mp-565970', 'mp-568700', 'mp-644925',
       'mp-1101391', 'mp-1101820', 'mp-1172939', 'mp-1179149',
       'mp-1180710', 'mp-1293833'], dtype=object)

In [11]:
drop_columns = ["composition", "composition_oxid"]
MP_oxidation_featurized = MP_oxidation_featurized.drop(drop_columns, axis=1)

# Electronic

In [12]:
MP_electronic_featurized = pd.read_csv(data_dir / "interim" / "featurized" / "MP_electronicFeaturized.csv", sep=",", index_col=0)
print(MP_electronic_featurized.shape)
MP_electronic_featurized = findIdenticalIDs(MP_electronic_featurized)
MP_electronic_featurized.shape

56it [00:00, 558.36it/s]

(25271, 19)


25271it [00:43, 580.57it/s]


(25201, 19)

In [13]:
drop_columns = ["full_formula"]
MP_electronic_featurized = MP_electronic_featurized.drop(drop_columns, axis=1)

MP_electronic_featurized

dropIds2 = np.zeros(len(entries))
for i, mpid in tqdm(enumerate(entries["material_id"])):
    for j in MP_electronic_featurized["material_id"]:
        if mpid==j:
            dropIds2[i] += 1

25212it [00:57, 435.22it/s]


In [14]:
entries["material_id"][np.where(dropIds2 == 0)[0]].values

array(['mp-20946', 'mp-31624', 'mp-565970', 'mp-568700', 'mp-644925',
       'mp-1101391', 'mp-1101820', 'mp-1172939', 'mp-1179149',
       'mp-1180710', 'mp-1293833'], dtype=object)

# The rest

In [15]:
MP_rest_of_featurizers = pd.read_pickle(data_dir / "interim" / "featurized" / "MP_featurizedAll.pkl")
MP_rest_of_featurizers.shape

(25212, 4700)

In [16]:
MP_rest_of_featurizers = findIdenticalIDs(MP_rest_of_featurizers)

25212it [00:43, 579.34it/s]


In [17]:
MP_rest_of_featurizers.shape

(25212, 4700)

In [18]:
#MP_electronic_featurized = MP_electronic_featurized.join(MP_rest_of_featurizers.set_index('material_id'), on='material_id')

In [19]:
dropIds3 = np.zeros(len(entries))
for i, mpid in tqdm(enumerate(entries["material_id"])):
    for j in MP_rest_of_featurizers["material_id"]:
        if mpid==j:
            dropIds3[i] += 1

25212it [01:00, 413.53it/s]


In [20]:
entries["material_id"][np.where(dropIds3 == 0)[0]].values

array([], dtype=object)

In [21]:
slettAlleDisse = list(entries["material_id"][np.where(dropIds1 == 0)[0]].values)\
               + list(entries["material_id"][np.where(dropIds2 == 0)[0]].values)\
               + list(entries["material_id"][np.where(dropIds3 == 0)[0]].values)
slettAlleDisse = list(set(slettAlleDisse))
print(len(slettAlleDisse))
slettAlleDisse

11


['mp-1179149',
 'mp-1101391',
 'mp-565970',
 'mp-1172939',
 'mp-1180710',
 'mp-20946',
 'mp-31624',
 'mp-1101820',
 'mp-568700',
 'mp-1293833',
 'mp-644925']

## slette

In [22]:
for i in slettAlleDisse:
    try:
        MP_rest_of_featurizers   = MP_rest_of_featurizers  [MP_rest_of_featurizers  ["material_id"] != i]
    except: 
        continue
for i in slettAlleDisse:
    try:
        MP_oxidation_featurized  = MP_oxidation_featurized [MP_oxidation_featurized ["material_id"] != i]
    except: 
        continue
for i in slettAlleDisse:
    try:
        MP_electronic_featurized = MP_electronic_featurized[MP_electronic_featurized["material_id"] != i]
    except: 
        continue

In [23]:
MP_rest_of_featurizers.shape

(25201, 4700)

In [24]:
MP_electronic_featurized.shape

(25201, 18)

In [25]:
MP_oxidation_featurized.shape

(25201, 111)

['mp-1179149',
 'mp-1101391',
 'mp-565970',
 'mp-1172939',
 'mp-1180710',
 'mp-20946',
 'mp-31624',
 'mp-1101820',
 'mp-568700',
 'mp-1293833',
 'mp-644925']

# Featurize the rest

In [26]:
leggtiligjen = slettAlleDisse
featurizerObject = PRESET_HEBNES_2021()
df = featurize_by_material_id(leggtiligjen, featurizerObject, MAPI_KEY)

100%|██████████| 1/1 [00:00<00:00, 14563.56it/s]


Downloading dos and bandstructure objects..
                  full_formula  \
material_id                      
mp-1101391   Sr4Re8H32C6N12O42   
mp-1101820            Mg1Ti3H8   
mp-1172939       Na2Mg4H6S4O16   
mp-1179149            Sr2Mo2O8   
mp-1180710          Na24B52O90   
mp-1293833             Y2Fe4O8   
mp-20946            Ba1Y1Fe2O5   
mp-31624          Sr8Ta4Cr4O24   
mp-565970              Ba2W2O8   
mp-568700             Na5Li1N2   
mp-644925        Cs10Bi2Mo8O32   

                                                 bandstructure  \
material_id                                                      
mp-1101391                                                None   
mp-1101820                                                None   
mp-1172939                                                None   
mp-1179149                                                None   
mp-1180710                                                None   
mp-1293833                                         

DOSFeaturizer:   0%|          | 0/11 [00:00<?, ?it/s]

TypeError: Cannot compare types 'ndarray(dtype=object)' and 'float'

In [None]:
df

In [None]:
df = df.reset_index()
df

# Add together all contributions

In [None]:
MP_electronic_featurized = MP_electronic_featurized.join(MP_oxidation_featurized.set_index('material_id'), on='material_id')

In [None]:
MP_ALL_featurized = MP_rest_of_featurizers.join(MP_oxidation_featurized.set_index('material_id'), on='material_id')

In [None]:
MP_ALL_featurized

In [None]:
ALL = pd.concat([MP_ALL_featurized,df])

In [None]:
ALL = sortByMPID(ALL)

In [None]:
ALL

## Sette på ny som er bra for alle. Nå er det godt system.

# Add band gap from other databases

In [None]:
bandgaps = pd.read_pickle(data_dir / "interim" / "bandgaps.pkl")
bandgaps

In [None]:
ALL = ALL.join(bandgaps.set_index('material_id'), on='material_id')
ALL

In [None]:
ALL.to_pickle(data_dir / "interim" / "featurized" / "featurizedData.pkl")