In [5]:
import pandas as pd
import numpy as np
import sklearn
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import keras
from sklearn import preprocessing, metrics 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from keras.models import Model, Sequential
from keras.wrappers.scikit_learn import KerasRegressor
from keras.layers import Input, Dense, Activation
from keras import optimizers
from keras import regularizers
from keras.regularizers import Regularizer

In [73]:
# NOTE: none of these functions are meant for single-point input. Inputs should always be a pd.DataFrame
# with more than one row (not an unreasonable requirement, since sklearn also makes you reshape 1D arrays).

def single_plot(x, y, figsize, title, xtitle, ytitle, xticks, yticks, xlim=(0, 6)):
    """Draws a single scatter plot on a new figure.
    x: x axis data
    y: y axis data
    figsize: tuple of format (xdim, ydim)
    title: tuple of format (title(string), fontsize(int))
    xtitle and ytitle similar to title
    xticks and yticks: tuple of format (array of tick values, fontsize)
    xlim: tuple of format (xmin, xmax) for the x axis limits. Defaults to (0, 6), the range this
          function used before the limits were configurable.
    Returns nothing."""
    fig, ax1 = plt.subplots(figsize=figsize)
    # Everything goes through the axes object rather than the pyplot state machine
    ax1.set_title(title[0], size=title[1])
    ax1.set_xticks(xticks[0])
    ax1.set_yticks(yticks[0])
    ax1.tick_params(axis='x', labelsize=xticks[1])
    ax1.tick_params(axis='y', labelsize=yticks[1])
    ax1.set_xlabel(xtitle[0], size=xtitle[1])
    ax1.set_ylabel(ytitle[0], size=ytitle[1])
    # Limits are applied after the ticks so ticks outside the requested window are clipped
    ax1.set_xlim(xlim[0], xlim[1])
    ax1.scatter(x, y, marker='.', alpha=.7)
    return


def split_and_scale(df, n, yes):
    """Separates a dataframe into targets and predictors, optionally expands the predictors with
       polynomial terms, and scales the predictors.
       This function assumes the properties to be predicted are the last n columns of df.
       df: dataframe holding predictors followed by properties
       n: number of properties to be predicted (number of outputs)
       yes: list, array or tuple of the form:
       (Bool deciding whether to add polynomial terms, degree of highest polynomial, bool deciding whether to only provide interaction terms)
       Returns the unscaled properties, the scaled predictors and the scaler fitted on those predictors."""
    targets, features = split(df, n)
    # Polynomial expansion happens before scaling, so the scaler is fitted on the expanded columns
    features_scaled, features_scaler = scaling(polynomialize(features, yes))
    return targets, features_scaled, features_scaler


def polynomialize(series, yes):
    """Optionally expands the input with polynomial and interaction features.

    series: 2-D data (ndarray or dataframe) to be polynomialized
    yes: list, array or tuple of the form:
    (Bool deciding whether to add polynomial terms, degree of highest polynomial, bool deciding whether to only provide interaction terms)
    Returns the output of PolynomialFeatures.fit_transform when yes[0] is truthy;
    otherwise the input object is handed back untouched."""
    # Nothing requested: hand the input straight back without looking at the other settings
    if not yes[0]:
        return series
    expander = PolynomialFeatures(degree=yes[1], interaction_only=yes[2])
    return expander.fit_transform(series)


def split(df, n):
    """Splits a pd.DataFrame into the properties to be predicted and the predictors.

    The last n columns are the properties; every column before them is a predictor.
    Columns are picked by position, so repeated column labels do not duplicate data
    and n = 0 yields an empty property array instead of the whole frame.

    df: input pd.DataFrame
    n: number of property (output) columns, 0 <= n <= number of columns in df

    Returns (properties, predictors) as two 2-D ndarrays.
    Raises ValueError when n is outside the valid range."""
    n_columns = df.shape[1]
    if not 0 <= n <= n_columns:
        raise ValueError('n must be between 0 and ' + str(n_columns) + ' (the number of columns), got ' + str(n))
    # An explicit cut index avoids the -0 pitfall: columns[-0:] would select every column
    cut = n_columns - n
    properties = df.iloc[:, cut:].values
    predictors = df.iloc[:, :cut].values
    return properties, predictors


def scaling(df_train):
    """Fits a sklearn MinMaxScaler to the given data and scales that data with it.
       df_train: 2-D data to scale (pd.DataFrame or ndarray)

       Returns the scaled data wrapped in a new pd.DataFrame, together with the fitted
       MinMaxScaler so the same transform can be applied to other data later."""
    fitted_scaler = preprocessing.MinMaxScaler()
    fitted_scaler.fit(df_train)
    # The new frame gets default integer labels; the input's column names are not carried over
    scaled_frame = pd.DataFrame(fitted_scaler.transform(df_train))
    return scaled_frame, fitted_scaler


def train_model(df_train, df_validation, model, n, yes=(False, 1, False)):
    """Trains a preconfigured model on the training data and measures its error on the validation data.
       This function assumes whatever parameter(s) being predicted is in the last column(s) of both dataframes.
       df_train: training dataframe
       df_validation: dataframe used to measure accuracy; it is never used for fitting
       model: pre initialized model object exposing fit and predict
       n: number of outputs
       yes: list, array or tuple of the form:
       (Bool deciding whether to add polynomial terms, degree of highest polynomial, bool deciding whether to only provide interaction terms)
       Defaults to no polynomial terms.
       Because this function returns the trained model, more metrics can be performed later that are specific
       to whatever package it is in/the type of model it is.
       Returns the model object, RMSE on validation set and the scaler for predictors
       Note: can only predict data which has been scaled with the scaler this function returns"""
    # The scaler is fitted on the training predictors only
    t_properties, t_predictors_scaled, t_predictors_scaler = split_and_scale(df_train, n, yes)
    # Validation predictors must go through the SAME fitted scaler as the training data. Fitting a second
    # scaler on the validation set would put the two sets on different scales and make the reported
    # error inconsistent with what model_prediction computes using the returned scaler.
    v_properties, v_predictors = split(df_validation, n)
    v_predictors_scaled = pd.DataFrame(t_predictors_scaler.transform(polynomialize(v_predictors, yes)))
    # Supervised learning of predictors and properties to fit the model
    model.fit(t_predictors_scaled, t_properties)
    # Predicting output of validation set
    predictions = model.predict(v_predictors_scaled)
    # RMSE = square root of sklearn's mean squared error (true values first, predictions second)
    val_error = np.sqrt(metrics.mean_squared_error(v_properties, predictions))
    return model, val_error, t_predictors_scaler


def model_prediction(test_data, fitted_model, scaler, n, yes=(False, 1, False)):
    """Takes a fitted model and predicts the output of test data, returns the predicted data and accuracy.
       THIS FUNCTION IS ONLY TO BE USED FOR FUTURE PREDICTIONS OR TESTING(WHICH SHOULD ONLY BE DONE ONCE).
       Do not use this while training a model, that's what the validation data will be used for. We do not
       want to introduce bias into our model by fitting to the test data
       test_data: dataframe whose last n columns are the true properties
       fitted_model: already fitted model exposing predict
       scaler: fitted scaler for the predictors (the one returned by train_model)
       n: number of properties to be predicted (number of outputs), NOT the number of predictors
       yes: list, array or tuple of the form:
       (Bool deciding whether to add polynomial terms, degree of highest polynomial, bool deciding whether to only provide interaction terms)
       Must match the setting used when the model was trained. Defaults to no polynomial terms.
       Returns the predictions and their RMSE against the true properties."""
    # Splitting predictors and properties
    properties, predictors = split(test_data, n)
    predictors = polynomialize(predictors, yes)
    # Only transform here: the scaler was already fitted on the training predictors
    predictors_scaled = scaler.transform(predictors)
    # Predicting based on scaled input predictors
    prediction = fitted_model.predict(predictors_scaled)
    # Calculating RMSE (square root of the mean squared error)
    accuracy_metric = np.sqrt(metrics.mean_squared_error(properties, prediction))

    return prediction, accuracy_metric


def neural_network(input_dimension):
    """Builds a KerasRegressor wrapping a small feed-forward network, ready to be passed to train_model.
       Change the architecture or optimizer settings here. Alternatively, a neural network object can be
       created with Keras elsewhere and passed to the train_model function to be fitted.

       input_dimension: the dimensionality of the input predictors"""
    def build_network():
        # Layers: Dense(1, relu) on the inputs -> Dense(20, relu) -> Dense(1) output without activation.
        # kernel_initializer selects how each layer's weights are initialised.
        # NOTE(review): a single-unit first layer and lr=0.5 look unusual - confirm they are intended.
        net = Sequential()
        net.add(Dense(1, input_dim=input_dimension, kernel_initializer='normal', activation='relu'))
        net.add(Dense(20, kernel_initializer='normal', activation='relu'))
        net.add(Dense(1, kernel_initializer='normal'))
        nadam = optimizers.Nadam(lr=0.5, beta_1=0.9, beta_2=0.999, epsilon=None, schedule_decay=0.004)
        net.compile(loss='mean_squared_error', optimizer=nadam)
        return net
    # The wrapper calls build_network itself when the regressor is fitted
    return KerasRegressor(build_fn=build_network, epochs=150, batch_size=1, verbose=1)


def linear_regression():
    """Returns a fresh, unfitted sklearn LinearRegression
       to be passed to the train_model function."""
    return LinearRegression()

def coefficient_statistics(df):
    """Fits an ordinary least squares model with statsmodels, used to get
       p values, confidence intervals and other metadata for models.
       band_gap is regressed on the ten amplitude_i and ten two_theta_i columns (i = 0..9) of df.
       This function has too specific of a use case for a test function.
       df: dataframe containing band_gap, amplitude_0..9 and two_theta_0..9 columns
       Returns the statsmodels summary of the fitted model."""
    # Building the term list programmatically keeps every peak in the model; the hand-written
    # formula had dropped two_theta_0 while keeping amplitude_0 and two_theta_1..9.
    amplitude_terms = ['amplitude_' + str(i) for i in range(10)]
    two_theta_terms = ['two_theta_' + str(i) for i in range(10)]
    formula = 'band_gap ~ ' + ' + '.join(amplitude_terms + two_theta_terms)
    fit_object = smf.ols(formula=formula, data=df)
    ft = fit_object.fit()
    return ft.summary()

In [220]:
import pandas as pd
import numpy as np
from sklearn import preprocessing


def test_split():
    """split with n=1 must return the last column as properties and the rest as predictors."""
    frame = pd.DataFrame({'column1': [2, 2, 3], 'column2': [1, 3, 5]})
    properties, predictors = xrdos.split(frame, 1)
    # First row: column2 holds the property, column1 the predictor
    assert properties[0] == 1
    assert predictors[0] == 2

def test_scaling():
    """scaling must map the smallest value of a column to 0 and the largest to 1."""
    frame = pd.DataFrame({'column1': [2.0, 2.0, 3.0], 'column2': [1.0, 3.0, 5.0]})
    scaled, fitted_scaler = scaling(frame)
    first_column_min = scaled.loc[0].iloc[0]
    first_column_max = scaled.loc[2].iloc[0]
    assert first_column_min == 0
    assert first_column_max == 1

def test_linear_regression():
    """The object from linear_regression must fit y = x and reproduce the training targets."""
    regr = xrdos.linear_regression()
    x = np.array([0.5, 1.0, 2.0])
    y = np.array([0.5, 1.0, 2.0])
    regr.fit(x.reshape(-1, 1), y.reshape(-1, 1))
    prediction = regr.predict(x.reshape(-1, 1))
    # Check the model output, not the input that was fed in
    assert isinstance(prediction, np.ndarray)
    # Tolerance-based comparison: truncating with int() turns 0.9999999 into 0 and makes the test flaky
    assert np.allclose(prediction.ravel(), y)
    return

def test_neural_network():
    """neural_network must hand back a KerasRegressor wrapper."""
    # neural_network requires the input dimensionality; 1 is enough to build the wrapper
    network = xrdos.neural_network(1)
    assert isinstance(network, keras.wrappers.scikit_learn.KerasRegressor)
    return

def test_split_and_scale():
    """split_and_scale must leave the properties unscaled and scale the predictors."""
    frame = pd.DataFrame({'column1': [2, 2, 3], 'column2': [1, 3, 5]})
    no_polynomial = (False, 1, False)
    properties, predictors_scaled, scaler = xrdos.split_and_scale(frame, 1, no_polynomial)
    # Properties come back as given; the largest predictor is scaled to 1
    assert properties[0] == 1
    assert predictors_scaled.iloc[2].iloc[0] == 1

def test_polynomialize():
    """polynomialize with degree 2, interaction terms only, must return an ndarray
    whose first column is the constant 1 and whose last column is the product of the two inputs."""
    data = {'column1': [2, 2, 3], 'column2': [1, 3, 5]}
    df = pd.DataFrame(data)
    yes = [True, 2, True]
    poly = xrdos.polynomialize(df, yes)
    assert isinstance(poly, np.ndarray)
    assert poly[0, 0] == 1
    # Last row: column1 * column2 = 3 * 5
    assert poly[2, 3] == 15
    return


def test_train_model():
    """train_model must return a fitted LinearRegression, the validation RMSE and a usable scaler."""
    data = {'column1': [2, 2, 3], 'column2': [1, 3, 5]}
    df = pd.DataFrame(data)
    data1 = {'column1': [2.0, 2.0, 3.0], 'column2': [1.0, 3.0, 5.0]}
    # The validation frame is built from its own dict (previously `data` was passed by mistake)
    df1 = pd.DataFrame(data1)
    model, accuracy, scaler = xrdos.train_model(df, df1, xrdos.linear_regression(), 1, [False, 1, False])
    # Only the predictor column of the first row is fed back in; the property column is the target
    first_predictor = df.iloc[[0], :-1].values
    prediction = model.predict(scaler.transform(first_predictor))
    # Predictor value 2 occurs with targets 1 and 3, so the least-squares fit predicts their mean
    assert np.isclose(prediction[0][0], 2.0)
    # Fitted values are [2, 2, 5] against targets [1, 3, 5] -> RMSE = sqrt(2/3)
    assert np.isclose(accuracy, np.sqrt(2.0 / 3.0))
    # Public import path; sklearn.linear_model.base is a private module that newer releases removed
    assert isinstance(model, sklearn.linear_model.LinearRegression)
    
    
def test_model_prediction():
    """model_prediction must scale the predictors with the given scaler, predict, and report the RMSE."""
    data = {'column1': [2, 3, 4], 'column2': [1, 3, 5], 'column3': [1, 5, 10]}
    df = pd.DataFrame(data)
    properties, predictors = xrdos.split(df, 1)
    # The scaler is fitted on the predictors (not the properties) and the model is trained on the
    # scaled predictors, mirroring what train_model does before model_prediction is called.
    scaler = preprocessing.MinMaxScaler()
    predictors_scaled = scaler.fit_transform(predictors)
    fitted_model = xrdos.linear_regression().fit(predictors_scaled, properties)
    prediction, accuracy = xrdos.model_prediction(df, fitted_model, scaler, 1, [False, 1, True])
    # Both scaled predictors equal t = [0, 0.5, 1]; the least-squares line through the targets
    # [1, 5, 10] is 5/6 + 9 t, giving fitted values [5/6, 16/3, 59/6].
    assert np.isclose(prediction[0][0], 5.0 / 6.0)
    # Residuals are [1/6, -1/3, 1/6] -> MSE = 1/18
    assert np.isclose(accuracy, np.sqrt(1.0 / 18.0))
    return

    

In [205]:
error

1.4483989793545256

In [206]:
# Predict on df_test with an already fitted model; yes[0] is False, so no polynomial terms are added.
# NOTE(review): df_test, model and scaler are not defined in the cells shown here - confirm on a fresh run.
test_predictions, test_error = model_prediction(df_test, model, scaler, 1, [False, 2, True])



In [207]:
test_predictions

array([2.1597717, 1.9235963, 2.0795944, ..., 2.1379144, 2.232712 ,
       2.275712 ], dtype=float32)

In [168]:
# Bundle a model with its error and scaler.
# NOTE(review): model2, error2 and scaler2 are not defined in the cells shown here - confirm on a fresh run.
best_model = [model2, error2, scaler2]

In [185]:
# Linear regression on degree-2, interaction-only polynomial features; one output column.
linear, linerror, linscaler = train_model(df_train, df_validation, linear_regression(), 1, [True, 2, True])

In [211]:
# Score the linear model on df_test, using the same polynomial settings it was trained with.
lin_predictions, lin_error = model_prediction(df_test, linear, linscaler, 1, [True, 2, True])

In [6]:
import os

import numpy as np
import pandas as pd
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval

# The Materials Project API key is read from the environment instead of being committed with the notebook.
# When PMG_MAPI_KEY is unset, None is passed through - presumably matminer/pymatgen then fall back to the
# key stored in the pymatgen configuration (TODO confirm for the installed versions).
# NOTE(review): the key that used to be hard-coded here should be treated as leaked and regenerated.
mpdr = MPDataRetrieval(api_key=os.environ.get('PMG_MAPI_KEY'))

In [18]:
# This statement obtains and stores the relevant data (XRD pattern, band gap, Fermi energy) from MPD.
# NOTE(review): criteria='all' appears to be rejected - the CompositionError recorded further down in this
# notebook ("all is an invalid formula!") suggests the string is parsed as a chemical formula.
# An earlier version reportedly used Si as the criteria for testing; the intended full query is still TODO.
MPD_data = mpdr.get_dataframe(criteria='all',properties=['xrd', 'band_gap', 'efermi'])

def extract_data(MPD_data_row):
    """
    Pulls the strongest Cu XRD peaks out of one material's MPD record

    Parameters:
    ----------
    MPD_data_row : Pandas series or dict-like
         A row of data for a single material from the full MPD dataframe; must provide
         ['xrd']['Cu']['pattern'] (the peak rows) and ['xrd']['Cu']['meta'] (their column names)

    Returns:
    ----------
    clean_df: Pandas dataframe
        Up to 10 peaks sorted by descending amplitude, without the hkl and d_spacing columns,
        indexed from 0
    """

    cu_xrd = MPD_data_row['xrd']['Cu']
    # One row per peak, columns named by the accompanying metadata
    all_peaks = pd.DataFrame(cu_xrd['pattern'], columns=cu_xrd['meta'])

    ranked_peaks = (
        all_peaks
        .drop(columns=['hkl', 'd_spacing'])           # only the remaining columns are used downstream
        .sort_values('amplitude', ascending=False)    # strongest peak first
        .reset_index(drop=True)                       # so position i is the i-th strongest peak
    )

    # Keep the ten strongest peaks (fewer if the pattern is shorter)
    return ranked_peaks[:10]

# Function to reformat the data after cleaning.
# Takes the dataframe and turns it into a dictionary where all data points have a unique key.
def reformat_data(MPD_data_row):
    """
    Flattens the cleaned peak table from extract_data into a single dictionary

    Parameters:
    ----------
    MPD_data_row : Pandas series or dict-like
         A row of data for a single material from the full MPD dataframe

    Returns:
    ----------
    mat_dict: dict
        Keys amplitude_0..amplitude_9 followed by two_theta_0..two_theta_9, where index i
        refers to row i of the cleaned peak table
    """

    clean_df = extract_data(MPD_data_row)
    amplitudes = clean_df['amplitude']
    two_thetas = clean_df['two_theta']

    # Amplitudes are inserted first, then the two theta values, so the key order is stable
    mat_dict = {'amplitude_' + str(peak): amplitudes[peak] for peak in range(10)}
    mat_dict.update({'two_theta_' + str(peak): two_thetas[peak] for peak in range(10)})

    return mat_dict

# Function to assemble the XRD and DOS data of every material into one dataframe
def produce_data(MPD_data):
    """
    Produces the XRD and DOS data for all the materials passed to the function

    Materials with fewer than 10 XRD peaks or a missing fermi energy are skipped. Materials with
    any other missing DOS value are left out as well, so every returned row is complete.

    Parameters:
    ----------
    MPD_data : Pandas dataframe
      The dataframe filled with data obtained from MPD; needs an 'xrd' column and an 'efermi' column

    Returns:
    ----------
    full_df: Pandas dataframe
        The peak amplitudes and two theta values followed by the remaining (DOS) columns of MPD_data,
        one row per retained material, indexed like MPD_data
    """

    xrd_data = {}
    keep_row = []

    # iterrows hands over each row once, instead of re-slicing the frame with iloc several times
    for material_id, row in MPD_data.iterrows():

        has_enough_peaks = len(row['xrd']['Cu']['pattern']) >= 10
        # pd.isna also copes with None, which np.isnan would reject with a TypeError
        has_fermi_energy = not pd.isna(row['efermi'])
        keep = has_enough_peaks and has_fermi_energy
        keep_row.append(keep)

        if keep:
            xrd_data[material_id] = reformat_data(row)

    # Rejected rows are filtered out with a positional mask rather than overwritten with NaN,
    # which leaves the dtypes of the remaining columns untouched
    dos_df = MPD_data.drop(['xrd'], axis=1)[np.array(keep_row, dtype=bool)].dropna()
    xrd_df = pd.DataFrame.from_dict(xrd_data, orient='index')

    # Inner join: a material dropped from dos_df for another missing value must not come back
    # as a row of NaN targets next to its XRD data
    full_df = pd.concat([xrd_df, dos_df], axis=1, join='inner', sort=False)

    return full_df

# Keep the result so later cells can reuse it, and preview only the first rows instead of the whole frame
full_df = produce_data(MPD_data)
full_df.head()

CompositionError: all is an invalid formula!

In [11]:
# Make sure the inputs are correct; a wrong column layout or argument is the usual source of errors here.
# n = 2: the last two columns of d_train are the targets (presumably band_gap and efermi - verify the order).
d_train = produce_data(MPD_data)
# train_model and model_prediction require the polynomial settings; (False, 1, False) adds no polynomial terms
model, val_error, scaler = train_model(d_train.iloc[0:12], d_train.iloc[12:25], linear_regression(), 2, (False, 1, False))
# NOTE(review): rows 0:2 were part of the training slice, so this accuracy is a training error,
# not an estimate of how well the model generalizes.
predictions, accuracy = model_prediction(d_train.iloc[0:2], model, scaler, 2, (False, 1, False))

In [12]:
predictions

array([[1.0565    , 5.54577704],
       [1.82      , 3.21901393]])

In [13]:
accuracy

2.7240086067659812e-15