In [47]:
import numpy as np
import pandas as pd
from ast import literal_eval
from tqdm import tqdm

In [1]:
# This code obtains and stores the relevant data from MPD
# NOTE: only needed for regeneration of raw data

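# from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
# from pydash import py_  # presumably the source of py_.chunk used below -- confirm before re-running
# mpdr = MPDataRetrieval() # or mpdr = MPDataRetrieval(api_key='YOUR_KEY')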

#properties = ['xrd.Cu', 'band_gap', 'efermi']
#criteria = {"band_gap": {'$gt': 0, '$lt': 6}, "efermi": {'$exists': True}, 'xrd.Cu': {'$exists': True}}

#res_ids = mpdr.get_dataframe(criteria=criteria, properties=["material_id"]).index.tolist()

#results = pd.DataFrame()
#for chunk in tqdm(py_.chunk(res_ids, 50)): # Number of materials at a time
#    temp_data = mpdr.get_dataframe(criteria={"material_id": {"$in": chunk}}, properties=properties)
#    print(len(temp_data))
#    results = results.append(temp_data)
#    print(len(results))

In [None]:
# Used to transform data into a .csv file
# NOTE: only needed for regeneration of raw data

#results.to_csv('MPD_Data.csv', sep='\t')

In [4]:
# Loads the tab-separated raw MPD export into a pandas dataframe and previews the first rows

MPD_data = pd.read_csv(filepath_or_buffer='MPD_Data_Raw.csv', sep='\t')
MPD_data.head(n=5)

Unnamed: 0,material_id,xrd.Cu,band_gap,efermi
0,mp-760482,"{'meta': ['amplitude', 'hkl', 'two_theta', 'd_...",2.9875,-6.6472
1,mp-672234,"{'meta': ['amplitude', 'hkl', 'two_theta', 'd_...",2.9912,-6.019798
2,mp-561203,"{'meta': ['amplitude', 'hkl', 'two_theta', 'd_...",2.7932,-5.896597
3,mp-12103,"{'meta': ['amplitude', 'hkl', 'two_theta', 'd_...",1.0557,-5.252045
4,mp-622628,"{'meta': ['amplitude', 'hkl', 'two_theta', 'd_...",0.012,-5.228518


In [26]:
def extract_data(MPD_data_row, n_peaks=10):
    """
    Extracts the strongest XRD peaks from the 'xrd.Cu' entry of a single material

    Parameters:
    ----------
    MPD_data_row : Pandas series or dict-like
         A row of data for a single material from the full MPD dataframe.
         Its 'xrd.Cu' entry is either a dictionary with the keys 'meta'
         (column names) and 'pattern' (one list per peak), or the string
         representation of such a dictionary (as read back from a .csv file)
    n_peaks : int, optional
         Number of strongest peaks to keep (default 10)

    Returns:
    ----------
    clean_df: Pandas dataframe
        The 'amplitude' and 'two_theta' values of the n_peaks highest peaks,
        sorted from highest to lowest amplitude and indexed from 0.
        Contains fewer rows when the pattern has fewer than n_peaks peaks
    """

    # The pattern is parsed only once: it is a large string, and parsing it
    # separately for 'pattern' and for 'meta' doubles the cost of every call.
    xrd = MPD_data_row['xrd.Cu']
    if isinstance(xrd, str):
        xrd = literal_eval(xrd)

    # Building the peak table and disposing of the hkl and d-spacing data
    peaks_df = pd.DataFrame(xrd['pattern'], columns=xrd['meta'])
    peaks_df = peaks_df.drop(['hkl', 'd_spacing'], axis=1)

    # Sorting peaks from highest to smallest and renumbering them from 0
    peaks_df = peaks_df.sort_values('amplitude', ascending=False)
    peaks_df = peaks_df.reset_index(drop=True)

    # Dropping all peaks below the top n_peaks
    clean_df = peaks_df[:n_peaks]

    return clean_df

In [27]:
def reformat_data(MPD_data_row):
    """
    Flattens the cleaned data obtained from the extract_data function into a dictionary

    Parameters:
    ----------
    MPD_data_row : Pandas series or dict-like
         A row of data for a single material from the full MPD dataframe.
         It is passed unchanged to extract_data

    Returns:
    ----------
    mat_dict: dict
        The keys 'amplitude_0' ... 'amplitude_9' followed by 'two_theta_0' ...
        'two_theta_9', where the number is the rank of the peak (0 = highest
        amplitude) and each value is taken from the matching row of the
        dataframe returned by extract_data

    Raises:
    ----------
    KeyError
        Expected from the pandas label lookup when extract_data returns fewer
        than 10 peaks
    """

    n_peaks = 10  # number of peaks stored per material

    # Cleaning data and creating empty dictionary
    clean_df = extract_data(MPD_data_row)
    mat_dict = {}

    # All amplitudes are stored first, then all two theta values, so that the
    # key order is amplitude_0..9 followed by two_theta_0..9.
    for column in ('amplitude', 'two_theta'):
        peak_values = clean_df[column]
        for rank in range(n_peaks):
            # .loc makes the label-based lookup explicit; extract_data renumbers
            # the peaks from 0, so the label equals the rank of the peak.
            mat_dict[column + '_' + str(rank)] = peak_values.loc[rank]

    return mat_dict

In [48]:
def produce_data(MPD_data):
    """
    Produces the XRD and DOS data for all the materials passed to the function

    Parameters:
    ----------
    MPD_data : Pandas dataframe
      The dataframe filled with data obtained from MPD. It must contain the
      columns 'material_id', 'xrd.Cu' (string representation of the XRD
      dictionary) and 'efermi'

    Returns:
    ----------
    full_df: Pandas dataframe
        The peaks, two theta values, band gap, and fermi energy for all the materials
        passed to the function, indexed by material id. Materials with fewer than
        10 XRD peaks or no fermi energy are left out
    """

    min_peaks = 10  # reformat_data needs at least this many peaks per material

    # Creating preliminary containers for the XRD data and the row filter
    xrd_data = {}
    keep_row = []

    # Loop to run through each row of the dataframe; every row is materialised once
    # tqdm is used only to monitor progress during testing. MAY NEED TO REMOVE
    for _, row in tqdm(MPD_data.iterrows(), total=len(MPD_data)):

        # Conditional to skip over materials with less than 10 XRD peaks
        # or no fermi energies
        n_found = len(literal_eval(row['xrd.Cu'])['pattern'])
        if n_found >= min_peaks and not np.isnan(row['efermi']):

            # Obtaining and storing the XRD data for a material into a dictionary
            xrd_data[row['material_id']] = reformat_data(row)
            keep_row.append(True)

        else:

            # Rows that failed the conditional are filtered out below
            keep_row.append(False)

    # Keeping only the DOS rows that passed the conditional; a boolean mask avoids
    # overwriting rows one at a time inside the loop. dropna still removes any
    # remaining row with a missing value.
    dos_data = MPD_data.drop(['xrd.Cu'], axis=1).set_index(['material_id'])
    dos_df = dos_data[np.array(keep_row, dtype=bool)].dropna()

    # Creating the final dataframe from the obtained XRD and DOS dataframes
    xrd_df = pd.DataFrame.from_dict(xrd_data, orient='index')
    full_df = pd.concat([xrd_df, dos_df], axis=1, sort=False)

    return full_df

In [50]:
# Runs the full processing pipeline over every material loaded into MPD_data
processed_data = produce_data(MPD_data=MPD_data)

100%|██████████████████████████████████████████████████████████████████████████| 38458/38458 [1:35:17<00:00,  6.73it/s]


In [51]:
# Writes the processed data to a tab-separated .csv file
processed_data.to_csv(path_or_buf='MPD_Data_Processed.csv', sep='\t')

In [54]:
# Final dataframe to be used for analysis, read back with the first column as the index
pd.read_csv(filepath_or_buffer='MPD_Data_Processed.csv', index_col=0, sep='\t')

Unnamed: 0.1,Unnamed: 0,amplitude_0,amplitude_1,amplitude_2,amplitude_3,amplitude_4,amplitude_5,amplitude_6,amplitude_7,amplitude_8,...,two_theta_2,two_theta_3,two_theta_4,two_theta_5,two_theta_6,two_theta_7,two_theta_8,two_theta_9,band_gap,efermi
0,mp-1000,100.0,76.808467,60.517635,29.361717,22.132998,21.657248,18.275875,17.467588,15.621411,...,35.823897,58.193688,44.255935,64.376264,127.447337,153.100605,134.637754,108.920846,1.5930,2.087033
1,mp-10009,100.0,78.502945,61.382764,59.581084,51.834414,42.578817,34.238297,32.474708,26.611831,...,19.268570,24.866325,43.790584,158.910531,161.241471,48.251965,31.649554,62.766975,0.7804,2.669606
2,mp-1001012,100.0,80.540257,76.206648,35.729962,34.640541,30.607739,26.076216,23.924735,18.361390,...,48.832011,23.855655,44.629498,33.990068,135.125592,91.443607,145.702725,41.952291,0.5765,3.071523
3,mp-1001015,100.0,77.813956,60.044813,44.937326,38.637906,25.032308,22.305078,21.849105,19.221977,...,169.262733,32.462334,42.581227,86.418577,64.924776,124.232687,22.798656,132.065583,0.6698,2.212315
4,mp-1001016,100.0,70.020649,63.037973,48.338026,34.510549,31.212189,27.679137,26.613327,24.481458,...,27.919942,168.168263,26.708785,86.053690,55.025631,165.708924,123.501849,77.054857,0.1396,2.638915
5,mp-1001017,100.0,82.510474,62.621787,35.485360,31.972686,30.602577,25.466085,23.383696,23.296121,...,26.111929,158.425177,83.727380,41.454717,75.065414,13.550361,118.984161,56.314790,0.0318,3.047039
6,mp-1001019,100.0,89.137124,69.093395,55.574693,41.139818,28.514250,26.760181,25.650597,22.696139,...,27.525625,178.831035,164.166779,54.200623,75.796799,84.581264,56.823708,13.663195,1.0918,2.398462
7,mp-1001021,100.0,26.911942,20.841552,14.602553,8.726978,6.664802,6.361764,6.176260,5.977624,...,30.949327,25.562458,81.620167,73.255021,148.310554,40.558901,115.073169,55.049578,0.2395,1.890714
8,mp-1001023,100.0,49.914642,48.693448,35.355514,30.012210,28.376814,24.962956,15.128396,13.657916,...,37.865103,41.838454,52.284598,146.515585,162.346414,72.786877,61.728825,89.534079,1.6671,2.443932
9,mp-1001024,100.0,94.750536,91.712566,79.268906,77.773011,39.523632,32.300151,29.849701,28.259961,...,26.427008,167.411573,13.711741,41.969142,158.196285,84.950400,76.112570,121.330229,1.7768,2.190834
