In [1]:
import os
import re
import time
from collections import OrderedDict

import numpy as np
import pandas as pd
import pymatgen.analysis.find_dimension
import pymatgen.io.cif as pcif
from IPython.display import clear_output
from mendeleev import element #mendeleev is a package with elemental informations like periodic table 
#Another package could be usful as well "periodictable"
from pymatgen import MPRester, Composition
from pymatgen.electronic_structure.plotter import BSPlotter
from tqdm import tqdm

In [10]:
def get_valence(group_id):
    """Map a periodic-table group number to the valence-electron count used as a feature.

    Parameters
    ----------
    group_id : int or None
        Group number of the element, or None when no group is available.

    Returns
    -------
    int or None
        * None when ``group_id`` is None,
        * ``group_id`` itself for groups 1-12,
        * ``group_id - 10`` for groups 13-18,
        * 8 for group 0 (presumably an alternative label for the noble gases -- confirm).

    Raises
    ------
    ValueError
        If ``group_id`` is outside the ranges listed above.
    """
    # Identity check: None is a singleton, and "is" cannot be hijacked by a custom __eq__.
    if group_id is None:
        return None
    if 1 <= group_id <= 12:
        return group_id
    if 13 <= group_id <= 18:
        return group_id - 10
    if group_id == 0:
        return 8
    raise ValueError('The group ID is out of range')

In [11]:
def NumberOfElement(unit_cell_formula,element):
    """Return the count stored for ``element`` in a unit-cell formula mapping.

    ``unit_cell_formula`` maps element symbols to their counts,
    e.g. ``{'C': 2.0, 'P': 2.0, 'V': 4.0}``.

    Raises ValueError when ``element`` is not a key of the mapping.
    """
    # Guard clause: the happy path returns first, the failure falls through.
    if element in unit_cell_formula:
        return unit_cell_formula[element]
    raise ValueError('Element can not be found in pretty_formula')

In [38]:
# Quick check of NumberOfElement: the mapping lists 2.0 carbon atoms, so the cell shows 3.0.
NumberOfElement({'C': 2.0, 'P': 2.0, 'V': 4.0},'C')+1

3.0

In [12]:
#def find_number(text, c):
#    return re.findall(r'%s(\d+)' % c, text)

In [13]:
def getOneElementFeature(feature, Element):
    """Return a single elemental property of ``Element`` looked up through mendeleev.

    Parameters
    ----------
    feature : str
        Name of the requested property. Supported names are the keys of the
        table below plus "valence_electrons",
        "number_of_outer_shell_electrons" and "ionenergies".
    Element : str
        Element identifier, passed unchanged to ``mendeleev.element``.

    Returns
    -------
    The value reported for the element. None is returned when the first
    ionization energy is not available; other properties are returned exactly
    as the element object reports them (which may also be None -- callers in
    this notebook test for that).

    Raises
    ------
    ValueError
        If ``feature`` is not one of the supported names.
    """
    currentElem = element(Element)

    # Features that are a plain attribute read: feature name -> attribute name.
    plain_attributes = {
        "atomic_number": "atomic_number",
        "atomic_weight": "atomic_weight",
        "period": "period",
        "group_id": "group_id",
        "atomic_radius": "atomic_radius",
        "covalent_radius": "covalent_radius",
        "dipole_polarizability": "dipole_polarizability",
        "melting_point": "melting_point",
        "boiling_point": "boiling_point",
        "atomic_density": "density",
        "specific_heat": "specific_heat",
        "fusion_heat": "fusion_heat",
        "evaporation_heat": "evaporation_heat",
        "thermal_conductivity": "thermal_conductivity",
    }
    if feature in plain_attributes:
        # getattr keeps the access lazy: only the requested attribute is read.
        return getattr(currentElem, plain_attributes[feature])

    if feature == "valence_electrons":
        return get_valence(currentElem.group_id)

    if feature == "number_of_outer_shell_electrons":
        # The last entry of the per-shell listing is taken as the outer shell.
        shells = OrderedDict(currentElem.ec.electrons_per_shell())
        return list(shells.items())[-1][1]

    if feature == "ionenergies":
        # Entry 1 is the first ionization energy. When the element has no such
        # entry, report it as missing (None) like every other unavailable
        # property instead of aborting the caller with a lookup error.
        try:
            return currentElem.ionenergies[1]
        except (KeyError, IndexError):
            return None

    raise ValueError('This feature dose not exsits')

In [14]:
def createElementsDict(unit_cell_formula):
    """Return the element -> count mapping for a unit cell.

    The unit-cell formula is already such a mapping, so the very same object
    is handed back (no copy is made).
    """
    return unit_cell_formula

In [15]:
def natoms(unit_cell_formula,elements):
    """Total number of atoms in the unit cell, counting only the listed ``elements``.

    Each symbol in ``elements`` is looked up with NumberOfElement, so a symbol
    missing from ``unit_cell_formula`` raises ValueError. An empty ``elements``
    gives 0.
    """
    return sum(NumberOfElement(unit_cell_formula, symbol) for symbol in elements)

In [16]:
def featureSum(feature, unit_cell_formula, elements):
    """Sum of an elemental feature over the unit cell, weighted by atom count.

    For every symbol in ``elements`` the feature value is multiplied by that
    symbol's count in ``unit_cell_formula``; the products are added up.
    """
    counts = createElementsDict(unit_cell_formula)
    return sum(
        getOneElementFeature(feature, symbol) * counts[symbol]
        for symbol in elements
    )

In [33]:
def featureAvg(feature, unit_cell_formula, elements):
    """Per-atom average of a feature: featureSum divided by natoms."""
    weighted_total = featureSum(feature, unit_cell_formula, elements)
    atom_count = natoms(unit_cell_formula, elements)
    return weighted_total / atom_count

In [18]:
def featureDifference(feature, unit_cell_formula, elements):
    """Spread of a feature around its per-atom average.

    Computes ``sqrt(sum_e (f_e - avg)**2 / N)`` where ``f_e`` is the feature of
    each symbol in ``elements``, ``avg`` is featureAvg and ``N`` is natoms.
    """
    mean_value = featureAvg(feature, unit_cell_formula, elements)
    # NOTE(review): every element contributes its squared deviation once
    # (not multiplied by its atom count), while both the mean and the divisor
    # are atom-count weighted -- confirm this definition is the intended one.
    squared_deviation = sum(
        (getOneElementFeature(feature, symbol) - mean_value) ** 2
        for symbol in elements
    )
    return np.sqrt(squared_deviation / natoms(unit_cell_formula, elements))

In [19]:
def featureLargest(feature, unit_cell_formula, elements):
    """Largest value of ``feature`` among the symbols in ``elements``.

    ``unit_cell_formula`` is accepted for signature parity with the other
    aggregates but is not used.
    """
    return max([getOneElementFeature(feature, symbol) for symbol in elements])

In [20]:
def featureSmallest(feature, unit_cell_formula, elements):
    """Smallest value of ``feature`` among the symbols in ``elements``.

    ``unit_cell_formula`` is accepted for signature parity with the other
    aggregates but is not used.
    """
    return min([getOneElementFeature(feature, symbol) for symbol in elements])

In [22]:
# Sanity check of featureSum: atom-count-weighted sum of atomic weights
# for a cell holding two atoms each of Rb, Te and Au.
featureSum("atomic_weight",{'Rb': 2.0, 'Te': 2.0, 'Au': 2.0}
,['Rb','Te','Au'])

820.0687379999999

In [31]:
# Names of the elemental features understood by getOneElementFeature,
# in the order in which the feature columns are generated.
atomicFeatures = [
    "atomic_number",
    "atomic_weight",
    "period",
    "group_id",
    "atomic_radius",
    "covalent_radius",
    "valence_electrons",
    "number_of_outer_shell_electrons",
    "ionenergies",
    "dipole_polarizability",
    "melting_point",
    "boiling_point",
    "atomic_density",
    "specific_heat",
    "fusion_heat",
    "evaporation_heat",
    "thermal_conductivity",
]

In [15]:
# Number of elemental features; each one yields five aggregate columns later on.
len(atomicFeatures)

17

In [7]:
# The Materials Project API key is read from the PMG_MAPI_KEY environment variable so
# that it is never stored in the notebook (keys: https://materialsproject.org/dashboard).
# When the variable is unset None is passed; per pymatgen's documentation MPRester then
# looks the key up in its own settings -- confirm for the installed version.
# NOTE: the key that used to be committed in this cell should be treated as leaked
# and regenerated.
mpr = MPRester(os.environ.get("PMG_MAPI_KEY"))

# Fetch every entry whose band_gap is greater than -0.1, with the listed properties.
data = mpr.query(
    criteria={"band_gap": {"$gt": -0.1}},
    properties=[
        "cif",
        "pretty_formula",
        "unit_cell_formula",
        "band_gap",
        "volume",
        "spacegroup",
        "density",
        "elements",
        "nelements",
    ],
)

HBox(children=(IntProgress(value=0, max=120612), HTML(value='')))

In [8]:
# Persist the raw query result as an Excel workbook in the working directory.
pd.DataFrame(data).to_excel('MPwithMag.xlsx')

In [4]:
# Tabular view of the query result: one row per returned entry.
pd_all = pd.DataFrame(data)

In [6]:
# Preview the first 20 rows.
pd_all.head(20)

Unnamed: 0,cif,pretty_formula,unit_cell_formula,band_gap,volume,spacegroup,density,elements,nelements
0,# generated using pymatgen\ndata_V2PC\n_symmet...,V2PC,"{'C': 2.0, 'P': 2.0, 'V': 4.0}",0.0,89.180553,"{'source': 'spglib', 'symbol': 'P6_3/mmc', 'nu...",5.394855,"[V, P, C]",3
1,# generated using pymatgen\ndata_V5P3N\n_symme...,V5P3N,"{'N': 2.0, 'P': 6.0, 'V': 10.0}",0.0,208.249629,"{'source': 'spglib', 'symbol': 'P6_3/mcm', 'nu...",5.767212,"[V, P, N]",3
2,# generated using pymatgen\ndata_Ta4N5\n_symme...,Ta4N5,"{'N': 5.0, 'Ta': 4.0}",0.0,102.185062,"{'source': 'spglib', 'symbol': 'I4/m', 'number...",12.899903,"[Ta, N]",2
3,# generated using pymatgen\ndata_NdNi4B\n_symm...,NdNi4B,"{'B': 2.0, 'Ni': 8.0, 'Nd': 2.0}",0.0,154.385284,"{'source': 'spglib', 'symbol': 'P6/mmm', 'numb...",8.385803,"[Nd, Ni, B]",3
4,# generated using pymatgen\ndata_YCo\n_symmetr...,YCo,"{'Co': 2.0, 'Y': 2.0}",0.0,81.284252,"{'source': 'spglib', 'symbol': 'Cmcm', 'number...",6.040346,"[Y, Co]",2
5,# generated using pymatgen\ndata_Li(CoO2)4\n_s...,Li(CoO2)4,"{'Li': 1.0, 'Co': 4.0, 'O': 8.0}",0.0,129.200112,"{'source': 'spglib', 'symbol': 'R-3m', 'number...",4.764008,"[Co, Li, O]",3
6,# generated using pymatgen\ndata_Sm\n_symmetry...,Sm,{'Sm': 4.0},0.0,135.538653,"{'source': 'spglib', 'symbol': 'P6_3/mmc', 'nu...",7.368486,[Sm],1
7,# generated using pymatgen\ndata_YbGa2\n_symme...,YbGa2,"{'Ga': 2.0, 'Yb': 1.0}",0.0,63.577503,"{'source': 'spglib', 'symbol': 'P6/mmm', 'numb...",8.161617,"[Yb, Ga]",2
8,# generated using pymatgen\ndata_ErHg\n_symmet...,ErHg,"{'Er': 1.0, 'Hg': 1.0}",0.0,50.556323,"{'source': 'spglib', 'symbol': 'Pm-3m', 'numbe...",12.082121,"[Er, Hg]",2
9,# generated using pymatgen\ndata_Li(CuN)3\n_sy...,Li(CuN)3,"{'Li': 1.0, 'Cu': 3.0, 'N': 3.0}",0.0,65.704825,"{'source': 'spglib', 'symbol': 'Pm-3m', 'numbe...",6.055319,"[Cu, Li, N]",3


In [23]:
# Each spacegroup cell is a mapping; look at its 'crystal_system' entry for row label 4.
pd_all['spacegroup'][4]['crystal_system']

'orthorhombic'

In [24]:
# (rows, columns) of the entries whose band gap is exactly zero.
pd_all.query('band_gap == 0').shape

(55101, 9)

In [25]:
# Small working sample: the first 10 zero-gap rows followed by the last 10 rows
# with a non-zero gap, both taken by position.
pd_subset = pd.concat(
    [
        pd_all.query('band_gap == 0').iloc[:10],
        pd_all.query('band_gap != 0').iloc[-10:],
    ]
)

In [26]:
# Confirm the sample size (rows, columns).
pd_subset.shape

(20, 9)

In [27]:
# Preview the first rows of the sample.
pd_subset.head()

Unnamed: 0,cif,pretty_formula,unit_cell_formula,band_gap,volume,spacegroup,density,elements,nelements
0,# generated using pymatgen\ndata_V2PC\n_symmet...,V2PC,"{'C': 2.0, 'P': 2.0, 'V': 4.0}",0.0,89.180553,"{'source': 'spglib', 'symbol': 'P6_3/mmc', 'nu...",5.394855,"[V, P, C]",3
1,# generated using pymatgen\ndata_V5P3N\n_symme...,V5P3N,"{'N': 2.0, 'P': 6.0, 'V': 10.0}",0.0,208.249629,"{'source': 'spglib', 'symbol': 'P6_3/mcm', 'nu...",5.767212,"[V, P, N]",3
2,# generated using pymatgen\ndata_Ta4N5\n_symme...,Ta4N5,"{'N': 5.0, 'Ta': 4.0}",0.0,102.185062,"{'source': 'spglib', 'symbol': 'I4/m', 'number...",12.899903,"[Ta, N]",2
3,# generated using pymatgen\ndata_NdNi4B\n_symm...,NdNi4B,"{'B': 2.0, 'Ni': 8.0, 'Nd': 2.0}",0.0,154.385284,"{'source': 'spglib', 'symbol': 'P6/mmm', 'numb...",8.385803,"[Nd, Ni, B]",3
4,# generated using pymatgen\ndata_YCo\n_symmetr...,YCo,"{'Co': 2.0, 'Y': 2.0}",0.0,81.284252,"{'source': 'spglib', 'symbol': 'Cmcm', 'number...",6.040346,"[Y, Co]",2


In [28]:
# Count missing values per column of the sample.
pd_subset.isnull().sum()

cif                  0
pretty_formula       0
unit_cell_formula    0
band_gap             0
volume               0
spacegroup           0
density              0
elements             0
nelements            0
dtype: int64

In [29]:
# Renumber the rows 0..n-1 so the loops below can address them as pd_subset[col][j].
# NOTE(review): this mutates pd_subset in place and is not idempotent -- without
# drop=True the previous index is inserted as an extra column on every run, so
# re-executing this cell changes the frame again. Confirm whether that extra
# column is wanted.
pd_subset.reset_index(inplace=True)

In [34]:
# Build the aggregate feature columns: for every elemental feature, five lists
# (sum/avg/diff/max/min) with one entry per row of pd_subset.

# Column prefix -> aggregation function, in the order the columns are created.
aggregates = (
    ("sum_", featureSum),
    ("avg_", featureAvg),
    ("diff_", featureDifference),
    ("max_", featureLargest),
    ("min_", featureSmallest),
)

dict_all = {}
for i, feature_name in enumerate(atomicFeatures):
    for prefix, _ in aggregates:
        dict_all[prefix + feature_name] = []

    for j in range(pd_subset.shape[0]):
        formula = pd_subset['unit_cell_formula'][j]
        symbols = pd_subset['elements'][j]

        # Test whether any element lacks this feature (reported as None).
        feature_tester = [getOneElementFeature(feature_name, Element) for Element in symbols]

        if None not in feature_tester:
            # wait=True defers the clear until the next print, avoiding flicker.
            clear_output(wait=True)
            print("row "+ str(j) + " and feature " + str(i))
            for prefix, aggregate in aggregates:
                dict_all[prefix + feature_name].append(aggregate(feature_name, formula, symbols))
        else:
            # Missing values are recorded as the *string* 'None'; the cell that
            # counts them afterwards compares against that exact string.
            for prefix, _ in aggregates:
                dict_all[prefix + feature_name].append('None')

row 19 and feature 16


In [35]:
# Per column, count the rows that hold the 'None' placeholder string.
(pd.DataFrame(dict_all) == 'None').sum()

  result = method(y)


sum_atomic_number             0
avg_atomic_number             0
diff_atomic_number            0
max_atomic_number             0
min_atomic_number             0
                             ..
sum_thermal_conductivity     12
avg_thermal_conductivity     12
diff_thermal_conductivity    12
max_thermal_conductivity     12
min_thermal_conductivity     12
Length: 85, dtype: int64

In [39]:
# Add the per-material columns to dict_all, one entry per row of pd_subset.
# The tuple fixes the order in which the columns are created.
material_columns = (
    "band_gap",
    "material_density",
    "material_volume",
    "space_group",
    "volume_per_atom",
    "pretty_formula",
    "unit_cell_formula",
    "cif",
)
for column in material_columns:
    dict_all[column] = []

for i in range(pd_subset.shape[0]):
    # wait=True defers the clear until the print below, avoiding flicker.
    clear_output(wait=True)
    formula = pd_subset['unit_cell_formula'][i]
    # Atom count of the cell, via the same helper the feature averages use.
    natom = natoms(formula, pd_subset['elements'][i])

    dict_all["band_gap"].append(pd_subset['band_gap'][i])
    dict_all["material_density"].append(pd_subset['density'][i])
    dict_all["material_volume"].append(pd_subset['volume'][i])
    # Despite the column name, the value stored is the crystal system string.
    dict_all["space_group"].append(pd_subset["spacegroup"][i]['crystal_system'])
    dict_all["volume_per_atom"].append(pd_subset["volume"][i]/natom)
    dict_all["pretty_formula"].append(pd_subset["pretty_formula"][i])
    dict_all['unit_cell_formula'].append(formula)
    dict_all["cif"].append(pd_subset["cif"][i])
    print("row "+ str(i))

row 19


In [40]:
# Assemble all feature and material columns into one table (one row per material).
ML_data = pd.DataFrame(dict_all)

In [41]:
# Write the assembled table to an Excel workbook in the working directory.
ML_data.to_excel("ML_data_20.xlsx")

## mendeleev
#### atomic number: 
    C.atomic_number
#### atomic weight: 
    C.atomic_weight
#### period number: 
    C.period
#### group number:
    C.group_id
#### family number:
#### L quantum number:
#### Mendeleev number:
#### Atomic radius: 
    C.atomic_radius
#### covalent radius: 
    C.covalent_radius
#### ionic radius: 
    C.ionic_radii ?
#### crystal radius: 
    C.ionic_radii ?
#### Pauling EN:
#### Martynov-Batsanov EN:
#### Gordy EN:
#### Mulliken EN:
#### Allen EN:
#### Metallic valence:
#### Number of valence electrons:
    get_valence(C.group_id)
#### Number of s electrons:
#### Number of p electrons:
#### Number of d electrons:
#### Number of outer shell electrons:
    from collections import OrderedDict
    Si = element('Si')
    d = OrderedDict(Si.ec.electrons_per_shell())
    els = list(d.items())
    els[-1][1]
#### First ionization energy (kJ/mol): <br/>
    C.ionenergies[1] (note: mendeleev reports ionization energies in eV, not kJ/mol) <br/>
#### Polarizability: <br/>
    C.dipole_polarizability <br/>
#### Melting point (K): <br/>
    C.melting_point <br/>
#### Boiling point (K): <br/>
    C.boiling_point <br/>
#### Density (g/mL): <br/>
    C.density <br/>
#### Specific heat (J/g•K): <br/>
    C.specific_heat <br/>
#### Heat of fusion (kJ/mol): <br/>
    C.fusion_heat <br/>
#### Heat of vaporization (kJ/mol): <br/>
    C.evaporation_heat <br/>
#### Thermal conductivity (W/m•K): <br/>
    C.thermal_conductivity <br/>
#### Heat atomization (kJ/mol): <br/>
#### Cohesive energy (eV): <br/>