In [21]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import preprocessing, metrics 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from keras.models import Model, Sequential
from keras.wrappers.scikit_learn import KerasRegressor
from keras.layers import Input, Dense, Activation
from keras import optimizers

import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline

In [16]:
matplotlib.style.use('ggplot')

In [22]:
#NOTE: none of these fxns are meant for single point input, input should always be a pd.DataFrame with more
#than one row, not an unreasonable request since sklearn makes you reshape 1D arrays.


# Data cleaning section: 

def split_and_scale(df, n, yes):
    """Splits training dataframe into predictors and properties to be predicted and returns them in 2 new dfs.
       This function assumes all of the predictors are grouped together on the right side of the df.
       df_train: training df
       n: number of properties to be predicted(number of outputs)"""
    properties, predictors = split(df, n)
    # COMMENT OUT THIS LINE IF YOU DONT WANT TO HAVE POLYNOMIAL TERMS IN YOUR TRAINING DATA
    # But note that accuracy is much better with this, but the model will have higher variance
    predictors_polynomial = polynomialize(predictors, yes)
    predictors_scaled_polynomial, predictors_scaler_polynomial = scaling(predictors_polynomial)
    return properties, predictors_scaled_polynomial, predictors_scaler_polynomial 


def polynomialize(series, yes):
    """Adds polynomial features to degree 3, including interaction features. 
    series: an input ndarray of floats to be polynomialized.
    This function returns a ndarray of all of the features specified above."""
    # Creating polynomial object
    if yes[0]:
        poly = PolynomialFeatures(degree = yes[1])
        # Adding polynomial terms
        series = poly.fit_transform(series)
    return series

# Still in development, in case we want to add more terms that aren't polynomial
# def add_nonlinear_terms(df, n):
#     properties = df[df.columns[-n:]]
#     predictors = df[df.columns[:-n]]
#     i = np.arange(len(predictors.columns) * 4)
#     x = 0
#     for column in predictors.values:
#         predictors.assign(i[x]=column**2)
#         predictors.assign(column**3)
#         predictors.assign(np.exp(column))
#         predictors.assign(np.sign(column))
#     return properties, predictors


def split(df, n):
    """Takes an input pd.DataFrame and returns 2 ndarrays of the properties 
    and predictors."""
    properties = df[df.columns[-n:]].values
    predictors = df[df.columns[:-n]].values
    return properties, predictors


def scaling(df_train):
    """This function takes a pd.DataFrame, creates a sklearn.StandardScaler, scales the DataFrame,
       and returns the scaled data in a pd.DataFrame as well as the sklearn.StandardScaler object
       for transforming data back to unscaled form post machine learning.
       df_train: pd.DataFrame(for our purposes should be of shape 20 columns by an arbitrary number of rows)"""
    #Creating scaler object
    scaler = preprocessing.MinMaxScaler()
    #Scaling df_train
    scaled_data = pd.DataFrame(scaler.fit_transform(df_train))
    
    return scaled_data, scaler

# Training/predicting


def train_model(df_train, df_validation, model, n, yes):
    """This function takes a training DataFrame, validation DataFrame and a preconfigured model
       and trains said model on the training data followed by measuring error on validation data and 
       returning both the trained model and accuracy metric. This function assumes whatever parameter(s)
       being predicted is in the last column(s) of df_train.
       n: number of outputs
       because this function returns the trained model, more metrics can be performed later that are specific
       to whatever package it is in/the type of model it is
       Parameters"""
    #generating scaled data and their respective scaler objects
    t_properties, t_predictors_scaled, t_predictors_scaler = split_and_scale(df_train, n, yes)
    v_properties, v_predictors_scaled, v_predictors_scaler = split_and_scale(df_validation, n, yes)
    #supervised learning of predictors and properties to fit model, note: keras does not take pd.DataFrames for
    #training, using .values fixes this
    model.fit(t_predictors_scaled, t_properties)
    #predicting output of validation set
    predictions = pd.DataFrame(model.predict(v_predictors_scaled))
    #calculating RMSE from sklearn package
    val_error = np.sqrt(metrics.mean_squared_error(predictions, v_properties))
    return model, val_error, t_predictors_scaler


def model_prediction(test_data, fitted_model, scaler, n):
    """Takes a fitted model and predicts the output of test data, returns the predicted data and accuracy.
       THIS FUNCTION IS ONLY TO BE USED FOR FUTURE PREDICTIONS OR TESTING(WHICH SHOULD ONLY BE DONE ONCE).
       Do not use this while training a model, that's what the validation data will be used for. We do not 
       want to introduce bias into our model by fitting to the test data
       n = number of predictors"""
    #splitting predictors and properties
    properties, predictors = split(test_data, n)
    predictors = polynomialize(predictors)
    predictors_scaled = scaler.transform(predictors)
    #predicting based on scaled input predictors
    prediction = fitted_model.predict(predictors_scaled)
    #calculating MSE
    accuracy_metric = np.sqrt(metrics.mean_squared_error(properties, prediction))

    return prediction, accuracy_metric

# Below functions initialize all the different types of models we are looking at:


def neural_network():
    """Creates a neural network object to be passed into train_model function, can change properties of net
       here."""
    def model():
        model = Sequential()
        model.add(Dense(50, input_dim=20, kernel_initializer='normal', activation='relu'))
        model.add(Dense(50, kernel_initializer='normal', activation='relu'))
        model.add(Dense(20, kernel_initializer='normal', activation='relu'))
        model.add(Dense(1, kernel_initializer = 'normal'))#kernel_initializer = initial values of outputs i think
        model.compile(optimizer=optimizers.Adam(lr=1.0e-4), loss='mse', metrics = ['accuracy'])
        return model
    network = KerasRegressor(build_fn=model, epochs=150, batch_size=50, verbose=1)
#     network.fit(x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None, validation_split=0.0, 
    return network


def linear_regression():
    """creates a linear regression object"""
    regr = LinearRegression()
    return regr



In [18]:
# to go in cleaning section
# import clean

def test_split():
    data = {'column1': [2, 2, 3], 'column2': [1, 3, 5]}
    df = pd.DataFrame(data)
    one, two = clean.split(df, 1)
    assert one[0] == 1
    assert two[0] == 2
    return

def test_scaling():
    data = {'column1': [2.0, 2.0, 3.0], 'column2': [1.0, 3.0, 5.0]}
    df = pd.DataFrame(data)
    df, scaler = clean.scaling(df)
    assert df.loc[0].iloc[0] == 0
    assert df.loc[2].iloc[0] == 1
    return

# def test_linear_regression():
#     return

regr = LinearRegression()
type(regr)

sklearn.linear_model.base.LinearRegression

In [19]:
def train_test_split(df): 
    train, test_and_val = sklearn.model_selection.train_test_split(df, test_size=.30)
    test, val = sklearn.model_selection.train_test_split(test_and_val, test_size=.15)
    df_train = pd.DataFrame(train)
    df_val = pd.DataFrame(val)
    df_test = pd.DataFrame(test)
    df_train.drop(columns = 'efermi',inplace = True)
    df_val.drop(columns = 'efermi',inplace = True)
    df_test.drop(columns = 'efermi',inplace = True) 
    return df_train, df_val, df_test

In [6]:
data = pd.read_csv('MPD_Data_Processed.csv', sep = '\t', index_col=0)
data.head()

Unnamed: 0,amplitude_0,amplitude_1,amplitude_2,amplitude_3,amplitude_4,amplitude_5,amplitude_6,amplitude_7,amplitude_8,amplitude_9,...,two_theta_2,two_theta_3,two_theta_4,two_theta_5,two_theta_6,two_theta_7,two_theta_8,two_theta_9,band_gap,efermi
mp-1000,100.0,76.808467,60.517635,29.361717,22.132998,21.657248,18.275875,17.467588,15.621411,15.480452,...,35.823897,58.193688,44.255935,64.376264,127.447337,153.100605,134.637754,108.920846,1.593,2.087033
mp-10009,100.0,78.502945,61.382764,59.581084,51.834414,42.578817,34.238297,32.474708,26.611831,24.362379,...,19.26857,24.866325,43.790584,158.910531,161.241471,48.251965,31.649554,62.766975,0.7804,2.669606
mp-1001012,100.0,80.540257,76.206648,35.729962,34.640541,30.607739,26.076216,23.924735,18.36139,15.331991,...,48.832011,23.855655,44.629498,33.990068,135.125592,91.443607,145.702725,41.952291,0.5765,3.071523
mp-1001015,100.0,77.813956,60.044813,44.937326,38.637906,25.032308,22.305078,21.849105,19.221977,16.104948,...,169.262733,32.462334,42.581227,86.418577,64.924776,124.232687,22.798656,132.065583,0.6698,2.212315
mp-1001016,100.0,70.020649,63.037973,48.338026,34.510549,31.212189,27.679137,26.613327,24.481458,20.327996,...,27.919942,168.168263,26.708785,86.05369,55.025631,165.708924,123.501849,77.054857,0.1396,2.638915


In [7]:
df_train, df_validation, df_test = train_test_split(data)
df_train.head()

Unnamed: 0,amplitude_0,amplitude_1,amplitude_2,amplitude_3,amplitude_4,amplitude_5,amplitude_6,amplitude_7,amplitude_8,amplitude_9,...,two_theta_1,two_theta_2,two_theta_3,two_theta_4,two_theta_5,two_theta_6,two_theta_7,two_theta_8,two_theta_9,band_gap
mp-651008,100.0,78.877404,69.497273,62.020021,54.668569,52.144988,49.189664,48.272557,47.844256,41.007436,...,23.659384,18.81094,13.402824,27.416565,33.367866,31.209016,31.849605,26.993525,30.830455,0.3275
mp-1112544,100.0,72.609265,62.918017,37.510808,29.9594,26.65173,25.835442,25.555973,24.884538,24.666135,...,41.119585,24.837604,175.147257,67.458867,159.187804,20.226261,48.635985,50.949291,136.604576,1.0046
mp-556334,100.0,74.303418,73.613068,51.507414,50.961419,43.98273,38.078149,32.232925,30.398982,28.006856,...,31.622812,32.957886,31.565976,25.085241,33.121726,18.91325,21.185866,46.40554,39.46587,3.5264
mp-540341,100.0,98.579284,68.292901,53.313382,47.230059,46.441547,43.30382,30.677619,30.473473,26.28689,...,30.515703,18.918462,31.756397,33.55803,27.45845,13.630336,47.053775,41.510396,37.875211,1.2423
mp-24428,100.0,36.204012,22.793257,22.325667,22.166651,18.716592,18.243829,17.294773,15.764271,15.758957,...,28.542301,27.335451,43.498485,39.131744,37.138187,40.815408,33.059746,25.084479,14.160052,2.0569


In [23]:
model2, error2, scaler2 = train_model(df_train, df_validation, neural_network(), 1, [False])

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [24]:
error2

1.4125969888165737

In [21]:
best_model = [model, error, scaler]

In [None]:
model1, error1, scaler1 = train_model(df_train, df_validation, linear_regression(), 1)

In [94]:
error1

2.309984279022966

In [4]:
# TO REDUCE OVERFITTING: reduce degree of polynomial terms

In [5]:
#model might just not have the right things
#to reorder columns:
#cols = df.columns.tolist()
#df = df[cols] 