In [1]:
import os

import numpy as np
import pandas as pd
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
# Read the Materials Project API key from the MAPI_KEY environment variable
# instead of committing a credential to the notebook.
# NOTE(review): when MAPI_KEY is unset this passes api_key=None; confirm that
# your matminer/pymatgen version then falls back to its own configured key.
mpdr = MPDataRetrieval(api_key=os.environ.get('MAPI_KEY'))

In [2]:
def extract_xrd_data(xrd_data, n_peaks=10):
    """
    Extracts the strongest XRD peaks from the dictionary obtained from MPD
    
    Parameters:
    ----------
    xrd_data : Dictionary
        The dictionary of data obtained from MPD. Only the 'Cu' entry is read:
        xrd_data['Cu']['pattern'] holds one row per peak and
        xrd_data['Cu']['meta'] holds the column name for each position in a row.
    n_peaks : int, optional
        Number of strongest peaks to keep (default 10). Materials with fewer
        peaks than this are skipped.
    
    Returns:
    ----------
    clean_df : Pandas dataframe or None
        The n_peaks highest-amplitude peaks, ordered from highest to lowest
        amplitude and re-indexed from 0, with the 'hkl' and 'd_spacing'
        columns removed. None if xrd_data is empty/None, has no 'Cu' entry,
        or contains fewer than n_peaks peaks.
    """
    
    # A material without a Cu pattern is skipped instead of crashing the whole batch
    cu_data = xrd_data.get('Cu') if xrd_data else None
    if not cu_data:
        return None
    
    # Skip materials that cannot supply the requested number of peaks
    pattern = cu_data['pattern']
    if len(pattern) < n_peaks:
        return None
    
    # Build the frame from the raw rows, naming the columns from the metadata
    peaks = pd.DataFrame(pattern, columns=cu_data['meta'])
    
    # Chained (non-inplace) steps: drop unused columns, rank by amplitude,
    # renumber the rows, then keep only the strongest n_peaks
    clean_df = (
        peaks
        .drop(['hkl', 'd_spacing'], axis=1)
        .sort_values('amplitude', ascending=False)
        .reset_index(drop=True)
        .iloc[:n_peaks]
    )
    
    return clean_df

In [3]:
# Function to reformat the data after cleaning
# Takes the raw XRD dictionary, cleans it, and turns it into a dictionary where all data points have a unique key
def reformat_xrd_data(xrd_data):
    """
    Flattens the cleaned XRD peaks of one material into a dictionary with one key per data point
    
    Parameters:
    ----------
    xrd_data : Dictionary
        The dictionary of data obtained from MPD (passed unchanged to extract_xrd_data)
    
    Returns:
    ----------
    mat_dict : Dictionary or None
        Keys 'amplitude_0', 'amplitude_1', ... followed by 'two_theta_0', 'two_theta_1', ...
        where the number is the row position in the dataframe returned by
        extract_xrd_data. None if extract_xrd_data did not return a dataframe
        (i.e. the material is skipped).
    """
    
    # Clean the data once and reuse the result for both the check and the reformatting
    clean_df = extract_xrd_data(xrd_data)
    
    # Skips material if no dataframe was returned
    if not isinstance(clean_df, pd.DataFrame):
        return None
    
    # All amplitudes are stored first, then all two theta values, so the key
    # order (and the resulting column order downstream) is stable.
    # The number of keys follows the number of rows in the cleaned dataframe.
    mat_dict = {}
    for column in ('amplitude', 'two_theta'):
        for position, value in enumerate(clean_df[column].values):
            mat_dict[column + '_' + str(position)] = value
    
    return mat_dict

In [4]:
# Function to build one table of XRD peak features, with one row per material
def produce_data(df):
    """
    Builds a table of reformatted XRD data with one row per material
    
    Parameters:
    ----------
    df : Pandas dataframe
        Dataframe with an 'xrd' column holding, for each row, the XRD data
        that is passed to reformat_xrd_data. The index labels are used as
        the row labels of the result.
    
    Returns:
    ----------
    full_df : Pandas dataframe
        One row for every input row for which reformat_xrd_data returned a
        dictionary, labelled with that row's index value, and one column per
        key of that dictionary. Rows for which it returned None are left out.
    """
    
    full_dict = {}
    
    for material_id, xrd_data in df['xrd'].items():
        # Reformat once per material and reuse the result
        mat_dict = reformat_xrd_data(xrd_data)
        
        # Materials that could not be reformatted are skipped
        if mat_dict is not None:
            full_dict[material_id] = mat_dict
    
    full_df = pd.DataFrame.from_dict(full_dict, orient='index')
    
    return full_df

In [5]:
# Fetch the 'xrd' property for the criteria 'Si', then preview the first rows
# of the per-material peak table built from it.
df = mpdr.get_dataframe(properties=['xrd'], criteria='Si')
produce_data(df).head()

MPRestError: REST query returned with error status code 403. Content: b'{"valid_response": false, "error": "API_KEY is not supplied.", "version": {"db": "2018.11", "pymatgen": "2019.2.28", "rest": "2.0"}, "created_at": "2019-03-10T23:23:40.946797"}'