# Preprocessing

In this Jupyter notebook we investigate what the data from dataMining.ipynb looks like and try to understand how it is put together.

In [1]:
import os
# Make sure the output directory for this notebook exists.
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs('data/preprocessing', exist_ok=True)

# Tools for query of data
#from pymatgen 

from matminer.featurizers.composition import ElementFraction
from pymatgen import Composition

# Load featurizers and conversion functions
from matminer.featurizers.composition import ElementProperty, OxidationStates
from matminer.featurizers.structure import DensityFeatures
from matminer.featurizers.dos import DOSFeaturizer
from matminer.featurizers.conversions import CompositionToOxidComposition, StrToComposition

# pandas
import pandas as pd
import numpy as np

from tqdm import tqdm

# plotting 
import plotly.graph_objects as go

# Ignore warnings from nan-values in  
np.warnings.filterwarnings('ignore')

In [2]:
# Load the stage-4 table (presumably written by dataMining.ipynb -- confirm)
# through the project's own CSV reader, then display it.
from helperFunctions import read_csv

stage_4_csv = "data/stage_4/MP_data_stage_4.csv"
data = read_csv(stage_4_csv)
data

Unnamed: 0,material_id,full_formula,icsd_ids,spacegroup.number,band_gap,run_type,cif,e_above_hull,elements,dos,structure,col_id
0,mp-1000,Ba1Te1,"[616165, 616163, 29152, 43656]",225,1.8555,GGA,# generated using pymatgen\ndata_BaTe\n_symmet...,0.000000,"['Ba', 'Te']",{'GGA': {'task_id': 'mp-1686464'}},Full Formula (Ba1 Te1)\nReduced Formula: BaTe\...,
1,mp-1006878,Ba1O2,[180398],65,2.3433,GGA,# generated using pymatgen\ndata_BaO2\n_symmet...,0.120015,"['Ba', 'O']",{'GGA': {'task_id': 'mp-1667967'}},Full Formula (Ba1 O2)\nReduced Formula: BaO2\n...,
2,mp-10074,Ge2Se4,"[50761, 637859, 90957]",122,1.6631,GGA,# generated using pymatgen\ndata_GeSe2\n_symme...,0.000000,"['Ge', 'Se']",{'GGA': {'task_id': 'mp-1701570'}},Full Formula (Ge2 Se4)\nReduced Formula: GeSe2...,
3,mp-1008500,Ba2O2,[173921],194,2.4251,GGA,# generated using pymatgen\ndata_BaO\n_symmetr...,0.025751,"['Ba', 'O']",{'GGA': {'task_id': 'mp-1695159'}},Full Formula (Ba2 O2)\nReduced Formula: BaO\na...,
4,mp-1008680,Ti1Ge1Pt1,"[188964, 188965]",216,0.9376,GGA,# generated using pymatgen\ndata_TiGePt\n_symm...,0.000000,"['Ti', 'Ge', 'Pt']",{'GGA': {'task_id': 'mp-1687503'}},Full Formula (Ti1 Ge1 Pt1)\nReduced Formula: T...,
...,...,...,...,...,...,...,...,...,...,...,...,...
1061,mp-984729,Ba1Sr3O4,[180202],221,2.5199,GGA,# generated using pymatgen\ndata_BaSr3O4\n_sym...,0.044250,"['Ba', 'Sr', 'O']",{'GGA': {'task_id': 'mp-1730558'}},Full Formula (Ba1 Sr3 O4)\nReduced Formula: Ba...,
1062,mp-985829,Hf1S2,"[638851, 601164, 603757, 182677, 638847]",164,1.2325,GGA,# generated using pymatgen\ndata_HfS2\n_symmet...,0.000000,"['Hf', 'S']",{'GGA': {'task_id': 'mp-1686545'}},Full Formula (Hf1 S2)\nReduced Formula: HfS2\n...,
1063,mp-985831,Hf1Se2,"[195308, 638902, 182678, 638899, 603743]",164,0.5549,GGA,# generated using pymatgen\ndata_HfSe2\n_symme...,0.000000,"['Hf', 'Se']",{'GGA': {'task_id': 'mp-1686558'}},Full Formula (Hf1 Se2)\nReduced Formula: HfSe2...,
1064,mp-9921,Zr2S6,"[42073, 651463, 651485]",11,1.0948,GGA,# generated using pymatgen\ndata_ZrS3\n_symmet...,0.000000,"['Zr', 'S']",{'GGA': {'task_id': 'mp-1697936'}},Full Formula (Zr2 S6)\nReduced Formula: ZrS3\n...,


In [3]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns of the loaded table.
data.describe()

Unnamed: 0,spacegroup.number,band_gap,e_above_hull,col_id
count,1066.0,1066.0,1066.0,0.0
mean,88.555347,3.044849,0.030098,
std,74.382284,1.52229,0.048863,
min,2.0,0.5072,0.0,
25%,15.0,1.8561,0.0,
50%,62.0,2.90325,0.00832,
75%,148.0,4.1374,0.036963,
max,230.0,6.7907,0.19999,


In [18]:
# Featurization pipeline, one matminer featurizer per stage:
#   1. parse the "full_formula" strings into a "composition" column,
#   2. expand each composition into per-element fraction columns,
#   3. derive band-edge (cbm_*/vbm_*) features from the "dos" column.
# NOTE(review): in the displayed rows every cbm_*/vbm_* value is NaN, and the
# "dos" column shows task-id dicts rather than DOS objects. Presumably
# DOSFeaturizer fails on each row and ignore_errors=True hides it -- confirm.
featurizer_input = data[["full_formula", "dos", "structure"]]
with_composition = StrToComposition().featurize_dataframe(featurizer_input, "full_formula")
with_element_fractions = ElementFraction().featurize_dataframe(with_composition, "composition")
df = DOSFeaturizer().featurize_dataframe(with_element_fractions, "dos", ignore_errors=True)

HBox(children=(HTML(value='StrToComposition'), FloatProgress(value=0.0, max=1066.0), HTML(value='')))




HBox(children=(HTML(value='ElementFraction'), FloatProgress(value=0.0, max=1066.0), HTML(value='')))




HBox(children=(HTML(value='DOSFeaturizer'), FloatProgress(value=0.0, max=1066.0), HTML(value='')))




In [19]:
# Display the featurized frame: the selected input columns plus the composition,
# element-fraction and cbm_*/vbm_* feature columns.
df

Unnamed: 0,full_formula,dos,structure,composition,H,He,Li,Be,B,C,...,cbm_hybridization,cbm_character_1,cbm_specie_1,cbm_location_1,cbm_score_1,vbm_hybridization,vbm_character_1,vbm_specie_1,vbm_location_1,vbm_score_1
0,Ba1Te1,{'GGA': {'task_id': 'mp-1686464'}},Full Formula (Ba1 Te1)\nReduced Formula: BaTe\...,"(Ba, Te)",0,0,0,0,0,0.0,...,,,,,,,,,,
1,Ba1O2,{'GGA': {'task_id': 'mp-1667967'}},Full Formula (Ba1 O2)\nReduced Formula: BaO2\n...,"(Ba, O)",0,0,0,0,0,0.0,...,,,,,,,,,,
2,Ge2Se4,{'GGA': {'task_id': 'mp-1701570'}},Full Formula (Ge2 Se4)\nReduced Formula: GeSe2...,"(Ge, Se)",0,0,0,0,0,0.0,...,,,,,,,,,,
3,Ba2O2,{'GGA': {'task_id': 'mp-1695159'}},Full Formula (Ba2 O2)\nReduced Formula: BaO\na...,"(Ba, O)",0,0,0,0,0,0.0,...,,,,,,,,,,
4,Ti1Ge1Pt1,{'GGA': {'task_id': 'mp-1687503'}},Full Formula (Ti1 Ge1 Pt1)\nReduced Formula: T...,"(Ti, Ge, Pt)",0,0,0,0,0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1061,Ba1Sr3O4,{'GGA': {'task_id': 'mp-1730558'}},Full Formula (Ba1 Sr3 O4)\nReduced Formula: Ba...,"(Ba, Sr, O)",0,0,0,0,0,0.0,...,,,,,,,,,,
1062,Hf1S2,{'GGA': {'task_id': 'mp-1686545'}},Full Formula (Hf1 S2)\nReduced Formula: HfS2\n...,"(Hf, S)",0,0,0,0,0,0.0,...,,,,,,,,,,
1063,Hf1Se2,{'GGA': {'task_id': 'mp-1686558'}},Full Formula (Hf1 Se2)\nReduced Formula: HfSe2...,"(Hf, Se)",0,0,0,0,0,0.0,...,,,,,,,,,,
1064,Zr2S6,{'GGA': {'task_id': 'mp-1697936'}},Full Formula (Zr2 S6)\nReduced Formula: ZrS3\n...,"(Zr, S)",0,0,0,0,0,0.0,...,,,,,,,,,,


## Featurize with matminer 

