# Train NN v0.5: Improved version of v0.4

- generates N training samples with varied energies $K_{\alpha_1}$, energy splitting and ratios
- trains one MLP regressor for each parameter
- The trained NNs are tested on training set, dev set and test set
- No optimization has been done yet.

- TODO:
    - Split large datasets into several zipped containers

Changes:

- Removed
    - unnecessary fitting of superposition in gen_set()
- New:
    - Train separate NN for each parameter but with same training set
    - Files are now named after the size of the data set and a custom comment
    - Convergence control while training the NN (but not live yet)
- Bug fixes
    - Found that the parameters (labels) also got scaled; fixed so that they stay on their normal scale (the result was that training was super fast but gave bad results, because it stopped too early after only a few iteration steps)
- gen_set()
    - removed fitting of splines to superposition of voigt profiles 
- read_input_data()
    - Now rescales the data matrix correctly without scaling the labels

In [None]:
"""
Import functions and tools needed
"""
import plotly.offline as py
import plotly.tools as tls
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import numpy as np
import pandas as pd
import matplotlib
import math
import scipy.stats as sc
from scipy.special import wofz
from astropy.modeling.functional_models import Voigt1D 
from scipy.interpolate import UnivariateSpline
from sklearn.neural_network import MLPRegressor as MLPR
import time
from sklearn.externals import joblib
# Redirect stdout
import io 
import sys
# Use subprocesses
import subprocess
# Control figure size
matplotlib.rcParams['figure.figsize']=(7,5)
py.init_notebook_mode()
# Use plotly as figure output
def plotly_show():
    """
    Show the current matplotlib figure through plotly's offline notebook output
    """
    # Hand the active figure directly to plotly's matplotlib converter
    py.iplot_mpl(plt.gcf())
    # The grid is switched on for the matplotlib axes after the conversion call
    plt.grid(True)

In [None]:
def calc_area(x, v):
    """
    Approximates the area beneath a spectrum with a sum of rectangles

    x: grid positions; the rectangle width is the average step between
       the first and the last grid point
    v: spectrum values, indexed in step with x
    Returns the sum of v[i] * width over every index of x
    """
    n_points = len(x)
    # Average spacing of the grid
    step = (x[n_points - 1] - x[0]) / (n_points - 1)
    total = 0
    idx = 0
    # Accumulate one rectangle per grid point, in grid order
    while idx < n_points:
        total += v[idx] * step
        idx += 1
    return total

In [None]:
def gen_set(N, x, noise, Set, verbosity):
    """
    Generates a set with N spectra and stores it as './data/<Set>.spectrum'

    Every spectrum is the superposition of TWO Voigt profiles evaluated on the
    grid x. The line widths are fixed:
            gamma1: HWHM of Lorentzian part of Voigt profile 1
            gamma2: HWHM of Lorentzian part of Voigt profile 2
            sigma1: Standard uncertainty of Gaussian part of Voigt profile 1
            sigma2: Standard uncertainty of Gaussian part of Voigt profile 2
    The following quantities are drawn uniformly at random for each spectrum:
            E:  position of profile 2 (K alpha1), 2014 +- 1
            dE: splitting, profile 1 sits at E - dE, 0.85 +- 0.2
            L1: Lorentzian amplitude of profile 1, 0.18 +- 1/30
            L2: Lorentzian amplitude of profile 2, 0.3 +- 1/30

    Stored matrix (Note: When trained on the grid defined by x then real data
    must also be sampled on the same grid!):
        Dimensions: N x (3 + d), where d is the number of grid points in x
        [E dE R y1 y2 ... yd]
    R is the ratio of the areas (profile 2 / profile 1) and y1..yd are the
    superposed amplitudes multiplied by 1e4.

    Parameters:
        N:         number of spectra to generate
        x:         grid on which the profiles are evaluated
        noise:     if true, Poisson noise is applied to the magnified amplitudes;
                   otherwise the clean superposition is stored
        Set:       name of the set, used for the file name
        verbosity: > 0 prints progress and timing, > 1 additionally plots the
                   runtime per progress interval
    """
    if verbosity > 0:
        start = time.time()
    # Definition of the fixed line-shape parameters
    gamma1, sigma1 = 0.345, 0.07
    gamma2, sigma2 = 0.36, 0.08
    # The widths do not change inside the loop, so convert HWHM/sigma to FWHM once
    fwhm_L1, fwhm_G1 = 2*gamma1, 2*sigma1*np.sqrt(2*np.log(2))
    fwhm_L2, fwhm_G2 = 2*gamma2, 2*sigma2*np.sqrt(2*np.log(2))
    # Creating the empty data matrix with dimensions N x (d+3)
    X = np.zeros((N, len(x)+3))
    runtime = []
    # Report progress roughly ten times; never use a zero interval for small N
    report_every = max(1, N // 10)
    if verbosity > 0:
        loop_time_start = time.time()
    for i in range(N):
        # Random offset (+- 1) around central value of energy E
        E_epsilon = (np.random.random_sample()-0.5)*2
        # Random offset (+- 0.2) around central value of splitting dE
        dE_epsilon = (np.random.random_sample()-0.5)*(0.4)
        # Random offset (+- 1/30) around central value of amplitude L1
        dL1 = (np.random.random_sample()-0.5)/15
        # Random offset (+- 1/30) around central value of amplitude L2
        dL2 = (np.random.random_sample()-0.5)/15
        L1 = 0.18 + dL1
        L2 = 0.3 + dL2
        E = 2014 + E_epsilon
        dE = 0.85 + dE_epsilon
        v1 = Voigt1D(x_0=E-dE, amplitude_L=L1, fwhm_L=fwhm_L1, fwhm_G=fwhm_G1)
        v2 = Voigt1D(x_0=E, amplitude_L=L2, fwhm_L=fwhm_L2, fwhm_G=fwhm_G2)
        profile1 = v1(x)
        profile2 = v2(x)
        # Labels: energy, splitting and ratio of the areas
        X[i, 0] = E
        X[i, 1] = dE
        X[i, 2] = calc_area(x, profile2)/calc_area(x, profile1)
        # Magnify amplitudes to get the poisson function working; the same
        # factor is used without noise so both kinds of file share one scale
        spectrum = (profile1 + profile2) * 1e4
        if noise:
            # Apply poisson noise to the data
            spectrum = np.random.poisson(spectrum)
        # The spline fit formerly used in the noise-free case was removed;
        # the plain superposition is stored instead
        X[i, 3:] = spectrum
        # Runtime control
        if verbosity > 0 and i % report_every == 0:
            time_diff = time.time() - loop_time_start
            runtime.append(time_diff)
            print("Progress: %i/%i, time for loop: %3.2fs" % (i, N, time_diff))
            loop_time_start = time.time()
    # Create pandas dataframe and write it to disk
    # NOTE: to_msgpack is only available in older pandas releases; it is kept
    # because read_set() reads the same format
    df_spectrum = pd.DataFrame(X)
    df_spectrum.to_msgpack('./data/'+Set+'.spectrum')
    # Plot verbosity information about the loop
    if verbosity > 0:
        end = time.time()
        print("Time for generating the "+Set+" set:", end-start)
        if verbosity > 1:
            plt.figure()
            plt.title("Runtime for generating the data set "+Set)
            plt.plot(runtime, label="Runtime per N/10 loops")
            plt.grid(True)
            plt.legend()
            plotly_show()

In [None]:
def read_set(Set):
    """
    Reads the data set './data/<Set>.spectrum' and returns it as a matrix

    The returned matrix has one row per observation (single spectrum). The
    first three columns are left untouched; every column from index 3 on is
    divided by 1e4, which undoes the magnification applied in gen_set().
    No fitting is done and no noise is added here.

    Set: name of the set, used for the file name
    Returns the rescaled matrix; the reading time is printed
    """
    start = time.time()
    # NOTE: read_msgpack is only available in older pandas releases; it is
    # kept because gen_set() writes this format
    X = pd.read_msgpack('./data/'+Set+'.spectrum').values
    # Rescale only the spectrum columns, not the three leading columns
    X[:, 3:] /= 1e4
    end = time.time()
    print("Time for reading "+Set+" set: %3.2fs" % (end-start))
    return X

In [None]:
def scale_input(X):
    """
    Feature scaling for the NN approach. It is "highly recommended" to scale input data to either [0:1] or [-1:+1]
    or standardize it to have mean 0 and variance 1
    Source:
    http://scikit-learn.org/stable/modules/neural_networks_supervised.html#regression

    This function standardizes the feature columns of X (index 3 and up) with
    a StandardScaler and leaves the first three columns unchanged.

    X: matrix whose first three columns are passed through unscaled
    Returns a new matrix with the same column order as X

    Note: the scaler is fitted on the matrix passed in and is not returned,
    so every call uses the statistics of its own input.
    """
    from sklearn.preprocessing import StandardScaler
    unscaled = X[:, :3]
    standardized = StandardScaler().fit_transform(X[:, 3:])
    return np.concatenate((unscaled, standardized), axis=1)

In [None]:
def NN_train(X, model, parameter, scaling, verbosity):
    """
    Trains the given model on the feature columns of X and returns the training score

    X:         matrix with the labels in the first three columns and the
               features in all columns from index 3 on
    model:     estimator providing fit(X, y) and score(X, y)
    parameter: index of the label column used as training target
    scaling:   if true, the features are standardized with scale_input() first
    verbosity: > 0 silences the model's own stdout output during fit and
               prints training time and score, > 1 additionally plots the
               training loss per epoch
    Returns the value of model.score() on the training data
    """
    start = time.time()
    y = np.ravel(X[:, parameter])
    if scaling:
        X = scale_input(X)
    features = X[:, 3:]
    # Swallow the verbose output of model.fit while verbosity is requested;
    # the finally block guarantees stdout comes back even if fit raises
    saved_stdout = sys.stdout
    if verbosity > 0:
        sys.stdout = io.StringIO()
    try:
        model.fit(features, y)
    finally:
        sys.stdout = saved_stdout
    # Save score of training
    score = model.score(features, y)
    end = time.time()
    # Print training statistics depending on verbosity level
    if verbosity > 0:
        print("Training time: %3.2f " % (end - start))
        print("Training score: %3.2f " % (score))
        if verbosity > 1:
            # Use the loss history recorded by the estimator instead of
            # parsing its printed output; empty if the model keeps none
            loss = np.asarray(getattr(model, "loss_curve_", []), dtype=float)
            plt.figure()
            plt.title("Training loss per epoch")
            plt.semilogy(loss, label="Loss")
            plt.grid(True)
            plt.legend()
            plotly_show()
    return score

In [None]:
def test_envir(X_train, X_dev, X_test, model, param, verbosity, scaling):
    """
    Trains a NN on one label column and reports its error on training and dev set

    X_train:   training matrix (labels in columns 0-2, features from column 3)
    X_dev:     dev matrix with the same layout
    X_test:    accepted for interface compatibility; not evaluated here
    model:     estimator handed to NN_train(); it is fitted in place
    param:     index of the label column to train on and to evaluate
               (0: energy, 1: energy splitting, 2: ratio, as written by gen_set())
    verbosity: forwarded to NN_train()
    scaling:   if true, the features of both sets are standardized with a
               scaler fitted on the training features only
    """
    label_names = {0: "energy", 1: "energy splitting", 2: "ratio"}
    label = label_names.get(param, "parameter %s" % param)
    if scaling:
        # Scale here instead of inside NN_train so that the predictions below
        # see the same transformation the model was trained with
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler().fit(X_train[:, 3:])
        X_train = np.concatenate((X_train[:, :3], scaler.transform(X_train[:, 3:])), axis=1)
        X_dev = np.concatenate((X_dev[:, :3], scaler.transform(X_dev[:, 3:])), axis=1)
    # The features are already scaled at this point if requested
    NN_train(X_train, model, param, False, verbosity = verbosity)
    # Save model via
    #joblib.dump(model, './data/1_neural_network.pkl')
    # Load model via
    #model2 = joblib.load('./data/1_neural_network.pkl')
    err_train = X_train[:, param] - model.predict(X_train[:, 3:])
    err_dev = X_dev[:, param] - model.predict(X_dev[:, 3:])
    # Start a fresh figure so the curves do not land on a previously used one
    plt.figure()
    plt.plot(err_train, label=("Train: Loss " + label))
    plt.plot(err_dev, label=("Dev: Loss " + label))
    plt.xlabel("datapoint")
    plt.ylabel("Error in arb. units")
    plt.title("Error on true label")
    plt.legend()
    plt.grid(True)
    print("Mean error per prediction in training set in run %3.2f " % (np.mean(np.absolute(err_train))))
    print("Mean error per prediction in dev set in run %3.2f " % (np.mean(np.absolute(err_dev))))
    plotly_show()

### Begin of testing the algorithms

In [None]:
"""
Defining (Hyper)Parameters
"""
exp = 5 # Exponent of ten used for the number of training samples
factor = 2 # Multiplier applied to 10**exp
N = int(factor*10**(exp)) # Actual number of training samples (factor * 10**exp)
noisee = True # Passed as the `noise` flag to gen_set()
comment = "noise_wosp" # Comment for data file name
data_size = int(N/1000) # Value for labeling the data (in "kilo samples") 
x = np.arange(2008,2018,0.01) # Grid for creating and importing data
# The string below is a disabled definition of three fully parameterised regressors
# (energy, splitting, ratio). It is not executed; only model_energy further down is created.
"""
model_energy = MLPR(hidden_layer_sizes=(100, ), activation="relu", solver="adam", alpha=0.0001, batch_size="auto", 
             learning_rate="constant", learning_rate_init=0.001, power_t=0.5, max_iter=20000, shuffle=True, 
             random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, 
             early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model_splitting = MLPR(hidden_layer_sizes=(100, ), activation="relu", solver="adam", alpha=0.0001, batch_size="auto", 
             learning_rate="constant", learning_rate_init=0.001, power_t=0.5, max_iter=20000, shuffle=True, 
             random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, 
             early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model_ratio = MLPR(hidden_layer_sizes=(100, ), activation="relu", solver="adam", alpha=0.0001, batch_size="auto", 
             learning_rate="constant", learning_rate_init=0.001, power_t=0.5, max_iter=20000, shuffle=True, 
             random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, 
             early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
"""
#model_energy = MLPR(hidden_layer_sizes=(100, ), max_iter=200, activation="relu", verbose = True, 
#                    solver = 'adam',learning_rate = 'constant',learning_rate_init = 1e-3, batch_size = 200)
# The regressor actually used below; all settings not listed keep the library defaults
model_energy = MLPR(max_iter=2000,  activation="relu", verbose = True)

In [None]:
# Generate the training set with N spectra and a dev and a test set with N/10 spectra each.
# The set names are built from factor, data_size and comment; only the training run prints progress.
gen_set(N, x, noise = noisee, Set = "train"+str(factor)+"x"+str(data_size)+"k"+comment, verbosity = 1)
gen_set(int(N/10), x, noise = noisee, Set = "dev"+str(factor)+"x"+str(int(data_size/10))+"k"+comment, verbosity = 0)
gen_set(int(N/10), x, noise = noisee, Set = "test"+str(factor)+"x"+str(int(data_size/10))+"k"+comment, verbosity = 0)

In [None]:
# Read the three sets back, using the same name pattern that was used for generating them
X_train = read_set(Set="train"+str(factor)+"x"+str(data_size)+"k"+comment)
X_dev = read_set(Set="dev"+str(factor)+"x"+str(int(data_size/10))+"k"+comment)
X_test = read_set(Set="test"+str(factor)+"x"+str(int(data_size/10))+"k"+comment)

In [None]:
# Read data sets by fixed name (1x100k and 2x200k variants); the corresponding
# files must already exist in ./data
X_train_100k = read_set(Set = "train1x100knoise_wosp")
X_dev_100k = read_set(Set = "dev1x10knoise_wosp")
X_test_100k = read_set(Set = "test1x10knoise_wosp")
X_train_200k = read_set(Set = "train2x200knoise_wosp")
X_dev_200k = read_set(Set = "dev2x20knoise_wosp")
X_test_200k = read_set(Set = "test2x20knoise_wosp")

In [None]:
# Run the train/evaluate routine on the 100k and the 200k data sets, once for each
# param value 0, 1 and 2. Scaling is switched off and the same model_energy
# instance is passed to every call.
test_envir(X_train_100k, X_dev_100k, X_test_100k, model_energy, param = 0, verbosity = 2, scaling = False)
test_envir(X_train_200k, X_dev_200k, X_test_200k, model_energy, param = 0, verbosity = 2, scaling = False)
test_envir(X_train_100k, X_dev_100k, X_test_100k, model_energy, param = 1, verbosity = 2, scaling = False)
test_envir(X_train_200k, X_dev_200k, X_test_200k, model_energy, param = 1, verbosity = 2, scaling = False)
test_envir(X_train_100k, X_dev_100k, X_test_100k, model_energy, param = 2, verbosity = 2, scaling = False)
test_envir(X_train_200k, X_dev_200k, X_test_200k, model_energy, param = 2, verbosity = 2, scaling = False)

In [None]:
# Plot a sample spectrum from the training set over the grid x.
# range(1) gives a single iteration, so only row 12 (feature columns 3 and up) is drawn.
for i in range(1):
    plt.plot(x, X_train[i+12,3:])
plotly_show()