In [1]:
import numpy as np
import matplotlib.pyplot as pyplot
import pandas as pd
from functools import reduce

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.model_selection import train_test_split, learning_curve, StratifiedShuffleSplit

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

import DataPreprocessing

In [2]:
## Load the training and test samples from the .npy files.
# Each call returns a (matrix elements, momenta) pair. The third and fourth
# positional arguments ([] and 1000) and frac=0.1 are forwarded unchanged to
# DataPreprocessing.npy; their meaning is defined there — TODO confirm.
me_train, mom_train = DataPreprocessing.npy(
    'NLO/MG_uuxg/me_1000000.npy',
    'NLO/MG_uuxg/mom_1000000.npy',
    [],
    1000,
    frac=0.1,
)
me_test, mom_test = DataPreprocessing.npy(
    'NLO/MG_uuxg/me_300000.npy',
    'NLO/MG_uuxg/mom_300000.npy',
    [],
    1000,
    frac=0.1,
)

In [4]:
# Element-wise ratio of column 3 to column 0 of the test matrix elements
# (columns follow the 'Born', 'Real', 'Sing', 'Doub' order used below).
print(np.divide(me_test[:, 3], me_test[:, 0]))

[-0.10642161 -0.10642161 -0.10642161 ... -0.10642161 -0.10642161
 -0.10642161]


In [41]:
def calc_RMSE(mom_train, mom_test, me_train, me_test, var_name, n_bins=1000):
    """Fit a binned linear model and report its test-set error.

    The features are standardised and then discretised with
    ``KBinsDiscretizer``; a ``LinearRegression`` is fitted on the transformed
    training features and evaluated on the transformed test features. The
    RMSE and the mean absolute percentage error are printed and returned.

    Parameters
    ----------
    mom_train, mom_test : array-like
        Feature matrices used for fitting and for evaluation. They are not
        modified; the transformed copies are kept in local variables.
    me_train, me_test : array-like
        Regression targets for the training and test sets.
    var_name : str
        Label prefixed to the two printed lines.
    n_bins : int, default 1000
        Number of bins passed to ``KBinsDiscretizer``. The default keeps the
        previous hard-coded value, so existing calls behave as before.

    Returns
    -------
    tuple
        ``(rmse, perc)``: the root mean squared error and the mean absolute
        percentage error on the test set.
    """
    ##Data transformation
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Rescale data.
        ('kbins', KBinsDiscretizer(n_bins=n_bins)),  # Bin the rescaled features.
    ])

    # Fit the transformers on the training set only, then apply the same
    # fitted transformation to the test set.
    feat_train = pipeline.fit_transform(mom_train)
    feat_test = pipeline.transform(mom_test)

    linreg = LinearRegression().fit(feat_train, me_train)  # Linear fit
    pred = linreg.predict(feat_test)  # Prediction on test set

    mse = mean_squared_error(me_test, pred)  # Mean squared error on test set
    rmse = np.sqrt(mse)
    print('{} RMSE: {}'.format(var_name, rmse))

    # Mean absolute error relative to |target|, in percent. Targets equal to
    # zero make this metric inf/nan because of the division.
    perc = np.mean(100 * np.divide(np.abs(me_test - pred), np.abs(me_test)))
    print('{} Percentage Error: {}'.format(var_name, perc))

    return rmse, perc
    
# The weight factors are built from the momenta alone and do not depend on
# which matrix-element column is being fitted, so compute them once instead
# of on every loop iteration.
# Each factor is the element-wise product of the arrays that
# DataPreprocessing.mandel_creation returns for the '1,3' and '2,3' pairs
# (presumably Mandelstam invariants — confirm in DataPreprocessing).
# NOTE(review): hoisting assumes mandel_creation does not modify its input.
div_train = reduce(np.multiply, DataPreprocessing.mandel_creation(['1,3','2,3'], mom_train))
div_test = reduce(np.multiply, DataPreprocessing.mandel_creation(['1,3','2,3'], mom_test))

for i, name in enumerate(['Born', 'Real', 'Sing', 'Doub']):
    # Regression target: column i of the matrix elements times the weight factor.
    temp_train = np.multiply(div_train, me_train[:,i])
    temp_test = np.multiply(div_test, me_test[:,i])

    calc_RMSE(mom_train, mom_test, temp_train, temp_test, name)

Born RMSE: 49.297094974472174
Born Percentage Error: 0.048707223406025485
Real RMSE: 8861.5473544747
Real Percentage Error: 157.28853324185485
Sing RMSE: 1237.5522245235413
Sing Percentage Error: 4.247558545536715
Doub RMSE: 5.246275975816848
Doub Percentage Error: 0.04870722334029269


Plotting Learning Curve

## Accuracy Vs Number of Bins