# Storing results from the pymatgen API

In [1]:
import os
import pprint
import re

import numpy as np
from pandas import DataFrame
from pymatgen import MPRester, Composition

The Materials Project offers a variety of properties of hundreds of thousands of inorganic crystalline materials. Almost all Materials Project data is calculated using a theoretical technique called Density Functional Theory.
A few notes about DFT: 
 - DFT is a first-principles method. This means that it uses a minimum of empirical information, so it can handle unusual systems well, including materials that have never been synthesized! It scales well to several hundred atoms, but beyond that other methods need to be used.

 - However, DFT does still need some form of correction. The particular type of DFT used in Materials Project (GGA/PBE) systematically under-binds materials, meaning that bond lengths (and hence lattice parameters) are systematically larger than expected by 1-2%. This also results in a systematic error in our formation energies, but we can fix this systematic error by fitting our calculated data to experimental formation enthalpies.

 - DFT is a ground-state, 0 K method. It can calculate ground state properties well such as bulk modulus, along with electronic structure information (the shape of your band structures, for example) but it is notably bad at calculating excited states including band gaps, and systematically under-estimates band gaps by a large margin. For this reason, any screening based on band gap has to include a large safety margin of ~0.5 eV.

Taken from https://workshop.materialsproject.org/lessons/01_website_walkthrough/website_walkthrough/ (20.08.2020)

Another tool built on top of the Materials Project is the Python library matminer. It works well with pandas and is used to extract features from materials for use with machine-learning libraries such as scikit-learn. 
https://hackingmaterials.lbl.gov/matminer/ (20.08.2020)
Examples and Jupyter notebooks using matminer are found here: 
https://nbviewer.jupyter.org/github/hackingmaterials/matminer_examples/blob/master/matminer_examples/index.ipynb

In [2]:
# Materials Project REST client used by the queries in this notebook.
# The API key is read from the PMG_MAPI_KEY environment variable instead of
# being hard-coded, so the secret is never saved or committed with the notebook.
# A missing variable raises KeyError here rather than failing later mid-query.
mpr = MPRester(os.environ["PMG_MAPI_KEY"])

In [3]:
# Query by anonymous formula and keep only the fields used further down.
# The already-constructed `mpr` client is reused: opening a second client in a
# `with` block duplicated the API key and rebound `mpr` to a client whose
# context had already exited by the time later cells call it.
docs = mpr.query(criteria={"formula_anonymous": "ABC3"},
                 properties=["task_id", "pretty_formula", "unit_cell_formula", "nsites"])



0 of 4358 done 0.0%
500 of 4358 done 11.5%
1000 of 4358 done 22.9%
1500 of 4358 done 34.4%
2000 of 4358 done 45.9%
2500 of 4358 done 57.4%
3000 of 4358 done 68.8%
3500 of 4358 done 80.3%
4000 of 4358 done 91.8%
4358 of 4358 done 100.0%


In [4]:
# Second data set, fetched by formula pattern. Later cells treat the result as a
# sequence of records and read each record's 'unit_cell_formula' and 'nsites'.
# NOTE(review): presumably "*1*1O3" is a wildcard for "any two elements, one atom
# each, plus O3" -- confirm against the MPRester.get_data documentation.
data1 = mpr.get_data("*1*1O3")

In [5]:
# Report how many records the formula-pattern query returned.
n_data1_records = len(data1)
print(n_data1_records)

2398


In [6]:
# Element pairs of the target compositions, in lookup order. Each pair is
# expanded below into a composition holding one atom of each element plus O3.
# NOTE(review): ('K', 'Re') appears twice in a row; 'Ln' is not an element
# symbol and 'Lr' (lawrencium) looks unintended -- confirm these entries against
# the article the list was taken from.
_TARGET_ELEMENT_PAIRS = [
    ('Cs', 'Pa'), ('Cs', 'U'), ('K', 'Re'), ('K', 'Re'), ('K', 'W'),
    ('Tl', 'Pa'), ('Tl', 'U'), ('Eu', 'Cu'), ('Hg', 'Pa'), ('Na', 'Re'),
    ('Ag', 'Pa'), ('Ag', 'U'), ('Bi', 'Cr'), ('Bi', 'Cu'), ('Bi', 'Lu'),
    ('Bi', 'Rh'), ('Bi', 'V'), ('Ca', 'Co'), ('Ca', 'Pu'), ('Cd', 'Pa'),
    ('Cd', 'Pu'), ('Ce', 'Co'), ('Ce', 'Cu'), ('Ce', 'In'), ('Ce', 'Ni'),
    ('Ce', 'Rh'), ('Ce', 'Ru'), ('Ce', 'Sc'), ('Dy', 'Cu'), ('Dy', 'Ga'),
    ('Er', 'Co'), ('Er', 'Ga'), ('Eu', 'Co'), ('Eu', 'Cr'), ('Eu', 'Ge'),
    ('Eu', 'Hf'), ('Eu', 'Ir'), ('Eu', 'Mn'), ('Eu', 'Mo'), ('Eu', 'Nb'),
    ('Eu', 'Pa'), ('Eu', 'Pb'), ('Eu', 'Pu'), ('Eu', 'Ru'), ('Eu', 'Sn'),
    ('Eu', 'Ti'), ('Eu', 'V'), ('Eu', 'Zr'), ('Gd', 'Cu'), ('Hg', 'Hf'),
    ('Hg', 'Pu'), ('Hg', 'Zr'), ('Ho', 'Ga'), ('Ho', 'V'), ('Lu', 'Co'),
    ('Lu', 'Ga'), ('Lu', 'Ni'), ('Nd', 'Cu'), ('Nd', 'Ru'), ('Pb', 'Pa'),
    ('Pb', 'Pu'), ('Pr', 'Cu'), ('Pr', 'Ln'), ('Pu', 'Ga'), ('Sm', 'Cu'),
    ('Sm', 'Ga'), ('Sm', 'Ru'), ('Sr', 'Cr'), ('Sr', 'Np'), ('Sr', 'Pa'),
    ('Sr', 'U'), ('Tb', 'Cu'), ('Tb', 'Ga'), ('Tb', 'Ni'), ('Tb', 'Sc'),
    ('Tl', 'Mn'), ('Tm', 'Co'), ('Tm', 'Ga'), ('Yb', 'Co'), ('Yb', 'Rh'),
    ('Yb', 'Ru'), ('Yb', 'Sc'), ('Eu', 'Er'), ('Eu', 'Lr'), ('Eu', 'Tm'),
]

# Same shape as the records' 'unit_cell_formula' values the targets are later
# compared against: {element: amount}, float 1.0 for the two cations, int 3 for O.
target = [
    {first: 1.0, second: 1.0, 'O': 3}
    for first, second in _TARGET_ELEMENT_PAIRS
]

# One float counter per target composition, all starting at zero. Each name gets
# its own independent array.
n_targets = len(target)
targetToData1, targetToData2, targetToData3, targetToData4 = (
    np.zeros(n_targets) for _ in range(4)
)
targetToDocs = np.zeros(n_targets)

print(n_targets)

85


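Checking whether the downloaded data contains the first prediction of the article. The comparison below is made on `unit_cell_formula`, so entries whose unit cell does not have 5 sites (`nsites`) are effectively excluded: they can never equal a target composition. 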

In [7]:

def count_unit_cell_matches(records, compositions):
    """Count, for each composition, the records with an equal unit cell formula.

    Parameters
    ----------
    records : iterable of mappings
        Each record must provide a 'unit_cell_formula' entry.
    compositions : iterable of dict
        Compositions to look for, compared with ``==`` against each record's
        'unit_cell_formula'.

    Returns
    -------
    list of int
        One count per composition, in the order of ``compositions``.
    """
    formulas = [record['unit_cell_formula'] for record in records]
    return [
        sum(1 for formula in formulas if formula == composition)
        for composition in compositions
    ]


# Slice-assign instead of accumulating with `+=`: the counter arrays are created
# in an earlier cell, so accumulating would double every count each time this
# cell is re-run. Assigning keeps the same array objects and makes the cell
# idempotent.
targetToData1[:] = count_unit_cell_matches(data1, target)
targetToDocs[:] = count_unit_cell_matches(docs, target)

# One line per target: matches in data1, matches in docs, the composition.
for n_in_data1, n_in_docs, composition in zip(targetToData1, targetToDocs, target):
    print(int(n_in_data1), int(n_in_docs), composition)

1 1 {'Cs': 1.0, 'Pa': 1.0, 'O': 3}
1 1 {'Cs': 1.0, 'U': 1.0, 'O': 3}
1 1 {'K': 1.0, 'Re': 1.0, 'O': 3}
1 1 {'K': 1.0, 'Re': 1.0, 'O': 3}
1 1 {'K': 1.0, 'W': 1.0, 'O': 3}
1 1 {'Tl': 1.0, 'Pa': 1.0, 'O': 3}
1 1 {'Tl': 1.0, 'U': 1.0, 'O': 3}
1 1 {'Eu': 1.0, 'Cu': 1.0, 'O': 3}
1 1 {'Hg': 1.0, 'Pa': 1.0, 'O': 3}
1 1 {'Na': 1.0, 'Re': 1.0, 'O': 3}
1 1 {'Ag': 1.0, 'Pa': 1.0, 'O': 3}
0 0 {'Ag': 1.0, 'U': 1.0, 'O': 3}
0 0 {'Bi': 1.0, 'Cr': 1.0, 'O': 3}
1 1 {'Bi': 1.0, 'Cu': 1.0, 'O': 3}
0 0 {'Bi': 1.0, 'Lu': 1.0, 'O': 3}
0 0 {'Bi': 1.0, 'Rh': 1.0, 'O': 3}
1 1 {'Bi': 1.0, 'V': 1.0, 'O': 3}
2 2 {'Ca': 1.0, 'Co': 1.0, 'O': 3}
0 0 {'Ca': 1.0, 'Pu': 1.0, 'O': 3}
0 0 {'Cd': 1.0, 'Pa': 1.0, 'O': 3}
0 0 {'Cd': 1.0, 'Pu': 1.0, 'O': 3}
0 0 {'Ce': 1.0, 'Co': 1.0, 'O': 3}
1 1 {'Ce': 1.0, 'Cu': 1.0, 'O': 3}
0 0 {'Ce': 1.0, 'In': 1.0, 'O': 3}
1 1 {'Ce': 1.0, 'Ni': 1.0, 'O': 3}
0 0 {'Ce': 1.0, 'Rh': 1.0, 'O': 3}
0 0 {'Ce': 1.0, 'Ru': 1.0, 'O': 3}
0 0 {'Ce': 1.0, 'Sc': 1.0, 'O': 3}
1 1 {'Dy': 1.0, 'Cu': 1.0, '

In [24]:
# Number of target compositions that have at least one match in data1.
count = sum(1 for n_matches in targetToData1 if n_matches != 0)

# Print the heading, then the matched share of all target compositions.
print("How much of the database that is included compared to the article: \n")
print(count / len(targetToData1))

How much of the database that is included compared to the article: 

0.5411764705882353


In [9]:

# Tally, per result set, the records whose 'nsites' value is exactly 5.
countData1 = sum(1 for record in data1 if record['nsites'] == 5)
countDocs = sum(1 for record in docs if record['nsites'] == 5)

# Total record count next to the five-site tally, one line per result set.
print(len(data1), countData1)
print(len(docs), countDocs)

2398 933
4358 1721
