In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation

# simple: input_dim = 100, output_dim = 4
# snap: input_dim = 102, output_dim = 31

def get_seq_model(input_dim, output_dim, nlayers, nunits, activation="elu"):
    model = Sequential()

    model.add(Dense(units=nunits, input_dim=input_dim))
    model.add(Activation(activation))
    
    for i in range(nlayers):
        model.add(Dense(units=nunits))
        model.add(Activation(activation))

    model.add(Dense(units=output_dim))
    
    return model

Using Theano backend.


In [2]:
model = get_seq_model(100, 4, 2, 500)

In [3]:
from keras import optimizers

#optimizer = optimizers.Adam(lr=0.00001, beta_1=0.95, beta_2=0.999, epsilon=1e-08, decay=0.0)
optimizer = optimizers.SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=["mse",'accuracy'])

In [4]:
import json
import numpy as np


def load_xas_data(data_file):
    X, Y, xas_ids, problem_ids = [], [], [], []
    
    with open(data_file) as f:
        all_data = json.load(f)
    for d in all_data:
        if d[0][-1] < 0:
            problem_ids.append(d[2])
            continue
        X.append(d[1]) # spectrum
        Y.append(d[0]) # site
        xas_ids.append(d[2]) # xas id
        
    return np.array(X), np.array(Y), xas_ids, problem_ids

In [5]:
data_file = "/global/homes/k/kmathew/NNxas/all_data_processed_120000.json"
X, Y, xas_ids, problem_ids = load_xas_data(data_file)

print(X.shape, Y.shape)

def get_xas_data(num_training, num_validation, num_test, return_mask=False):
        
    # sample the data
    n_total = num_training + num_validation + num_test
    n_tv = num_training + num_validation
    
    test_mask = np.zeros(n_total, dtype=bool)
    test_choice = np.random.choice(n_total, num_test, replace=False)
    test_mask[test_choice] = True
    
    X_test = X[test_mask]
    Y_test = Y[test_mask]
    
    X_tv = X[~test_mask]
    Y_tv = Y[~test_mask]
    
    val_mask = np.zeros(n_tv, dtype=bool)
    val_choice = np.random.choice(n_tv, num_validation, replace=False)
    val_mask[val_choice] = True
    
    X_val = X_tv[val_mask]
    Y_val = Y_tv[val_mask]
    
    X_train = X_tv[~val_mask]
    Y_train = Y_tv[~val_mask]

    # Normalize the data: subtract the mean image
    #mean_image = np.mean(X_train, axis=0)
    #X_train -= mean_image
    #X_val -= mean_image
    #X_test -= mean_image

    # Reshape data to rows
    X_train = X_train.reshape(num_training, -1)
    X_val = X_val.reshape(num_validation, -1)
    X_test = X_test.reshape(num_test, -1)

    if return_mask:
        return X_train, Y_train, X_val, Y_val, X_test, Y_test, test_mask, val_mask
        
    return X_train, Y_train, X_val, Y_val, X_test, Y_test

((113101, 100), (113101, 4))


In [6]:
print(len(problem_ids))

1793


In [7]:
num_total = X.shape[0]
validation_precentage = 10
test_precentage = 10
num_epochs = 50

In [8]:
num_test = int(num_total * test_precentage /100.)
num_validation = int(num_total * validation_precentage /100.)
num_training = num_total - num_validation - num_test

X_train, Y_train, X_val, Y_val, X_test, Y_test, test_mask, val_mask = get_xas_data(num_training, num_validation, num_test, return_mask=True)

print('Train data shape: ', X_train.shape)
print('Train labels shape: ', Y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', Y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', Y_test.shape)

('Train data shape: ', (90481, 100))
('Train labels shape: ', (90481, 4))
('Validation data shape: ', (11310, 100))
('Validation labels shape: ', (11310, 4))
('Test data shape: ', (11310, 100))
('Test labels shape: ', (11310, 4))


In [9]:
# print random sample's Y
Y_train[np.random.choice(num_training)]

array([ 8.        , -2.        ,  3.        ,  0.72309418])

In [10]:
test_xas_ids = np.array(xas_ids)[test_mask]

In [11]:
test_xas_ids[np.random.choice(num_test)]

u'mp-505633-6-XANES-K'

# Train

In [12]:
history = model.fit(X_train, Y_train, epochs=num_epochs, verbose=0, 
                    validation_data=(X_val, Y_val), batch_size=32)
#model.fit(X_train, Y_train, epochs=num_epochs, verbose=1, 
#                    validation_data=(X_val, Y_val), batch_size=32)

In [15]:
history.history["loss"]

[9.6794530173234765,
 3.8929937837060131,
 3.1745974531018875,
 2.8074022308521456,
 2.6429751398268766,
 2.5182256541298145,
 2.4412430792055857,
 2.3660550358108856,
 2.3067497197065587,
 2.2744944410416532,
 2.2284898657356136,
 2.2001236363300269,
 2.1575391527720549,
 2.1323347026128867,
 2.1055383907070255,
 2.0867083388442267,
 2.054877438840454,
 2.0391693094984302,
 2.0193312894620674,
 1.9978162237754693,
 1.9861530818952489,
 1.9742800200159811,
 1.9586015937446413,
 1.9423956556254105,
 1.9260563988912272,
 1.9167377171827509,
 1.9054567643190348,
 1.8909028903188574,
 1.880240913147891,
 1.8746346775228884,
 1.8616071767151732,
 1.8508569535695436,
 1.8434278563249136,
 1.8384546994664948,
 1.8228463467082774,
 1.8187068078966946,
 1.8099655626575908,
 1.8044240265401874,
 1.7932644615351889,
 1.7875862585252031,
 1.782038761807619,
 1.7734547982242916,
 1.7661756443340473,
 1.7618784457265635,
 1.7542480156669642,
 1.749845852909153,
 1.7399298646648433,
 1.73519197938070

# Test

In [16]:
score = model.evaluate(X_test, Y_test, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

('Test accuracy:', 1.8544317812122779)


# Samples

In [17]:
nchoice = np.random.choice(num_test, 1)

# predictions
sample_predictions = model.predict(X_test[nchoice])
print(sample_predictions)
#print(np.round(sample_predictions))

# original
print(Y_test[nchoice])

[[ 28.03184319   1.08210742   5.40386534   1.22905898]]
[[ 28.           2.           4.           0.94442041]]


In [16]:
test_xas_ids[nchoice]

array([u'mp-11026-28-XANES-K'], 
      dtype='<U22')

In [None]:
# check problem_ids for negative average bond length

Tensormol: 3 hidden layers, 1000 neirons, with relu
adam optimizer, minibatch method with l2 loss minimization, learning rate=0.00001, learning momentum=0.95
check: TFMolInstance.py

inverse problem: given spectrum, predict structure
define structure(descriptor): [spacegroup, formula(or just absorbing atom symbol), a, b, c]
                  --> 225 len vec + 110 len vec + 3

keras.preprocessing.sequence.pad_sequences(sequences, maxlen=None, dtype='int32',
    padding='pre', truncating='pre', value=0.)

Transform a list of num_samples sequences (lists of scalars) into a 2D Numpy array of shape (num_samples, num_timesteps). num_timesteps is either the maxlen argument if provided, or the length of the longest sequence otherwise. Sequences that are shorter than num_timesteps are padded with value at the end. Sequences longer than num_timesteps are truncated so that it fits the desired length. Position where padding or truncation happens is determined by padding or truncating, respectively.

#Generate dummy data
import numpy as np
data = np.random.random((1000, 100))
labels = np.random.randint(2, size=(1000, 1))

#Train the model, iterating on the data in batches of 32 samples
model.fit(data, labels, epochs=10, batch_size=32)

#Chi
KerasClassifier
KFold
cross_val_score

def basic_mlp():
    lr = 5e-3
    input_dim = 55
    layer_sizes = [100, 50, 55]
    model = Sequential()
    model.add(Dense(layer_sizes[0], input_dim=input_dim, activation='elu'))
    for i in range(1, len(layer_sizes)):
        if i == len(layer_sizes)-1:
            act = 'softmax'
        else:
            act = 'elu'
        model.add(Dense(layer_sizes[i], activation=act))
    model.compile(loss='categorical_crossentropy', optimizer= Adam(lr=lr), metrics=['accuracy'])
    return model