# Train NN v0.4: Imporved version of v0.3

- generates N training samples with varied energies $K_{\alpha_1}$ 
- trains the MLP regressor
- The trained NN is tested with the training set
- No optimization have been made.

Changes:
- New:
    - Add noise
    - Create costum set of spectra
    - Save trained regressor as model with joblib
- Bug fixes
    - Fixed a bug where the range x was different for generating the data and fitting the data, 
    now results in better error rate
- gen_train_set()
    - renamed to gen_set()
    - implemented option to generate costum filename
    - Using msgpack instead of csv for imporving speed
    - now fits splines instead of read_input_data (for real data thou read should fit data)
- read_input_data()
    - renamed to read_set()
    - implementd option to select costum filename
    - now possible to load data with poisson distributed noise
    - doesn't fit data anymore 

In [7]:
"""
Import functions and tools needed
"""
import plotly.offline as py
import plotly.tools as tls
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import numpy as np
import pandas as pd
import matplotlib
import math
import scipy.stats as sc
from scipy.special import wofz
from astropy.modeling.functional_models import Voigt1D 
from scipy.interpolate import UnivariateSpline
from sklearn.neural_network import MLPRegressor as MLPR
import time
from sklearn.externals import joblib

matplotlib.rcParams['figure.figsize']=(7,5)
py.init_notebook_mode()

def plotly_show():
    fig = plt.gcf()
    py.iplot_mpl(fig)
    plt.grid(True)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [116]:
def gen_set(N, x, noise, Set): 
    """
    - Generates a set with N spectra by using the superposition of TWO Voigt profiles with randomly choosen
        parameters 
            gamma1: HWHM of Lorentzian part of Voigt profile 1 
            gamma2: HWHM of Lorentzian part of Voigt profile 2
            sigma1: Standard uncertainty of Gaussian part of Voigt profile 1
            sigma2: Standard uncertainty of Gaussian part of Voigt profile 2
            epsilon: offset to energy E
        The Energy E (K alpha1) is centered around 2014eV
    """
    start = time.time()
    # Definition of some parameters
    gamma1, sigma1 = 0.2, 0.2
    gamma2, sigma2 = 0.8, 0.4
    labels = np.array(0)
    labels = np.delete(labels, 0)
    # Creating the empty data matrix
    X = np.array(0)
    X = np.delete(X, 0)
    """
    For loop loops N times to create N spectra. The single spectrum is evaluate and fitted
    on range x to get equal x values as features (Note: When trained on grid defined by x then
    real data must also be sampled on same grid!). File format:
        File dimensions: N x (1 + d), where d is number of grid points resulting from x
        [Label x1 x2 ... xd]
    """
    for i in range(N):
        # Generate random distribution (+- 1) around central value of energie E
        epsilon = (np.random.random_sample()-0.5)*2
        E = 2014 + epsilon
        dE = 1.5
        v1 = Voigt1D(x_0=E-dE, amplitude_L=0.15, fwhm_L=2*gamma1, fwhm_G=2*sigma1*np.sqrt(2*np.log(2)))
        v2 = Voigt1D(x_0=E, amplitude_L=0.25, fwhm_L=2*gamma2, fwhm_G=2*sigma2*np.sqrt(2*np.log(2)))
        #x = np.arange(1990,2030,0.1)
        #x = np.linspace(E-dE - 30*gamma1, E + 16*gamma2, 1000)
        # Fit the superposition of the voigt profiles with splines
        sp = UnivariateSpline(x,v1(x)+v2(x),s=10e-5)
        # Save values in two columns in form of [x,v(x)]
        X = np.append(X, E)
        if (noise == True):
            noise_arr = np.random.poisson(1000, len(x))
            noise_arr = noise_arr / 10000
            X = np.append(X, sp(x)+noise_arr).reshape(i+1,len(x)+1)
        else:
            X = np.append(X, sp(x)).reshape(i+1,len(x)+1)
    # Create pandas dataframe
    df_spectrum = pd.DataFrame(X)
    df_spectrum.to_msgpack('./data/'+Set+'.spectrum')
    end = time.time()
    print("Time for generating the "+Set+" set:", end-start)

In [111]:
def read_set(Set):
    """ 
    Read data and store it in (Nxd) Martix, where N donates 
    the observation (single spectrum) and d the dth feature 
    (datapoint given by choosing x). The data gets fitted 
    by the Splines fit. Also, noise is added when reading the
    data if flag is set.
    """
    start = time.time()
    
    X = pd.read_msgpack('./data/'+Set+'.spectrum') 

    end = time.time()
    print("Time for reading "+Set+"set: ", end-start)
    return X

In [121]:
x = np.arange(2005,2025,0.01)
N=1000

gen_set(N, x, noise=True, Set="test1k")
#X = read_set(Set="train")

Time for generating the test1k set: 26.55924677848816


In [177]:
X = read_set(Set="train")
X = X.as_matrix()
print(X[:,:1])

Time for reading trainset:  0.0551447868347168
[[ 2014.09709274]
 [ 2014.67481737]
 [ 2013.25856513]
 [ 2013.39110187]
 [ 2013.81528026]
 [ 2013.92900942]
 [ 2014.15642965]
 [ 2013.28833231]
 [ 2013.3806714 ]
 [ 2013.97307334]
 [ 2014.36053909]
 [ 2014.76230244]
 [ 2013.33857891]
 [ 2014.44859891]
 [ 2014.26054126]
 [ 2014.03892635]
 [ 2014.70930587]
 [ 2013.15576996]
 [ 2014.30831186]
 [ 2013.12124663]
 [ 2013.56829753]
 [ 2014.71869821]
 [ 2014.6925077 ]
 [ 2014.13797528]
 [ 2013.24244216]
 [ 2014.63588484]
 [ 2013.36558578]
 [ 2013.18419474]
 [ 2014.49494701]
 [ 2014.57416534]
 [ 2014.20710759]
 [ 2014.85924079]
 [ 2013.51252986]
 [ 2013.28318597]
 [ 2014.07059277]
 [ 2013.07891717]
 [ 2013.03866068]
 [ 2013.54416461]
 [ 2013.63091095]
 [ 2014.12027959]
 [ 2013.80723307]
 [ 2013.14343254]
 [ 2013.4611037 ]
 [ 2014.47238022]
 [ 2014.58578074]
 [ 2014.33688847]
 [ 2013.20515933]
 [ 2013.762271  ]
 [ 2013.53763685]
 [ 2013.55202675]
 [ 2014.37431611]
 [ 2014.18936189]
 [ 2013.31370803]

In [496]:
def scale_input(X):
    """
    Feature skaling for NN apporach. It is "highly recommended" to scale input data to either [0:1] or [-1:+1] 
    or standardize it to have mean 0 and variance 1
    Source:
    http://scikit-learn.org/stable/modules/neural_networks_supervised.html#regression
    This function standardizes X 
    """
    from sklearn.preprocessing import StandardScaler  
    scaler = StandardScaler()  
    # Don't cheat - fit only on training data
    scaler.fit(X)  
    X = scaler.transform(X)  
    #for i in range(len(X)):
    #    plt.plot(x,X[i])
    #plt.grid(True)
    #plotly_show()
    return X

In [219]:
def NN_train(X, model):
    """
    Trains given model on data X and labels y. Returns trainings score
    """
    start = time.time()
    y = np.ravel(X[:,:1])
    X = X[:,1:]
    model.fit(X, y)
    score = model.score(X,y)
    end = time.time()
    print("Training time: %3.2f " % (end - start))
    print("Training score: %3.2f " % (score))
    return score
    

### Begin of testing the algorithms

In [220]:
"""
Defining (Hyper)Parameters
"""
x = np.arange(2005,2025,0.01)
model = MLPR(max_iter=20000, activation="relu")

In [221]:
X_train = read_set(Set="train10k")
X_train = X_train.as_matrix()
X_dev = read_set(Set="dev1k")
X_dev = X_dev.as_matrix()
X_test = read_set(Set="test1k")
X_test = X_test.as_matrix()

Time for reading train10kset:  0.8794088363647461
Time for reading dev1kset:  0.08221602439880371
Time for reading test1kset:  0.08171463012695312


In [218]:
y = np.ravel(X_train[:,:1])
print(np.shape(y),np.shape(X_train[:,1:]))

(10000,) (10000, 2000)


In [222]:
for i in range(1):
    print("########### RUN", i+1, "###########")
    score = NN_train(X_train, model)
    # Save model via
    joblib.dump(model, './data/1_neural_network.pkl')
    # Load model via
    model2 = joblib.load('./data/1_neural_network.pkl')
    predict_train = model.predict(X_train)
    predict_dev = model.predict(X_dev)
    plt.plot(y_train-predict_train, label=("Train set"))
    plt.plot(y_dev-predict_dev, label=("dev set"))
    plt.xlabel("datapoint")
    plt.ylabel("Error in arb. units")
    plt.title("Error on true label")
    abserr_train = np.absolute(y_train-predict_train)
    abserr_train = np.sum(abserr_train)
    plt.legend()
    plt.grid(True)
    print("Mean error per prediction in training set in run %i: %3.2f " % (i+1, abserr_train/len(X_train)))
plotly_show()

########### RUN 1 ###########
Training time: 105.14 
Training score: -2.40 


ValueError: shapes (10000,2001) and (2000,100) not aligned: 2001 (dim 1) != 2000 (dim 0)