In [None]:
import numpy as np
import matplotlib.pyplot as pyplot
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.model_selection import train_test_split, learning_curve, StratifiedShuffleSplit

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

import DataPreprocessing

## Obtaining and nicely arranging data

In [None]:
## Read the training and test sets from CSV via DataPreprocessing.csv.
## NOTE(review): the following cell rebinds these same four names from .npy
## files, so only the loader that runs last takes effect.
me_train, mom_train = DataPreprocessing.csv(
    'NLO/MG_uux/me_2Jet_1000000.csv',
    'NLO/MG_uux/mom_2Jet_1000000.csv',
    frac=0.1,
)
me_test, mom_test = DataPreprocessing.csv(
    'NLO/MG_uux/me_2Jet_100000.csv',
    'NLO/MG_uux/mom_2Jet_100000.csv',
    frac=0.1,
)

In [None]:
## Read the training and test sets from .npy files via DataPreprocessing.npy.
## This overwrites me_train/mom_train/me_test/mom_test set by the CSV cell.
## NOTE(review): the meaning of the ['1,3', '2,3'] and 1000 arguments is
## defined in DataPreprocessing and is not visible here.
me_train, mom_train = DataPreprocessing.npy(
    'LO/MG_uuxg/3Jet_3000000.npy',
    'LO/LO_3_0.01_NJet/PS3_0.01_3000000.npy',
    ['1,3', '2,3'],
    1000,
    frac=0.1,
)
me_test, mom_test = DataPreprocessing.npy(
    'LO/MG_uuxg/3Jet_500000.npy',
    'LO/LO_3_0.01_NJet/PS3_0.01_500000.npy',
    ['1,3', '2,3'],
    1000,
    frac=0.1,
)

Adding some extra features

## Single Test

In [None]:
## Input transformation: standardise each feature, then discretise into bins
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('kbins', KBinsDiscretizer(n_bins=1000)),
])

# The pipeline is fitted on the training inputs only; the test inputs are
# pushed through the already-fitted transform.
# NOTE: both names are rebound to their transformed versions, so this cell
# must not be re-run without first reloading the data.
mom_train = pipeline.fit_transform(mom_train)
mom_test = pipeline.transform(mom_test)

In [None]:
## Fit a linear regression on the transformed inputs and score it on the test set
linreg = LinearRegression().fit(mom_train, me_train)

me_predict_lin = linreg.predict(mom_test)  # Prediction on test set
lin_mse = mean_squared_error(me_test, me_predict_lin)
# .format() has to be applied to the string itself: in Python 3 print() returns
# None, so print('...').format(...) raises AttributeError after printing the
# unformatted template.
print('RMSE: {}'.format(np.sqrt(lin_mse)))

# Mean absolute error relative to the true value, in percent
lin_perc = np.mean(100*np.divide(np.abs(me_test - me_predict_lin), me_test))
print('Percentage Error: {}'.format(lin_perc))

In [None]:
# Standard deviation of the test targets (presumably a scale reference for the RMSE above)
print(np.std(me_test))

## Plotting Learning Curve

In [None]:
## Compute learning-curve scores for the fitted linear model
# A stratified shuffle splitter over 200 equal-width bins of me_train is set up
# here.
# NOTE(review): neither `split` nor `train` is handed to learning_curve below
# (it uses cv=5), and the `train` generator is never consumed.
split = StratifiedShuffleSplit()
train = split.split(mom_train, pd.cut(me_train, bins=200))

train_sizes, train_scores, test_scores = learning_curve(
    estimator=linreg,
    X=mom_train,
    y=me_train,
    train_sizes=np.linspace(0.01, 0.1, 6),
    cv=5,
    scoring='neg_mean_squared_error',
    shuffle=True,
    n_jobs=1,
    verbose=1,
)

In [None]:
## Draw the learning curve: RMSE on the training and validation folds vs. training-set size
pyplot.figure()

# The scores come from scoring='neg_mean_squared_error', so they are negated
# before taking the square root to obtain an RMSE per training size.
train_scores_mean = np.sqrt(-np.mean(train_scores, axis=1))
test_scores_mean = np.sqrt(-np.mean(test_scores, axis=1))

# Previously the means were computed but never drawn, so the saved figure
# contained axis labels and nothing else.
pyplot.plot(train_sizes, train_scores_mean, 'o-', label='Training')
pyplot.plot(train_sizes, test_scores_mean, 'o-', label='Cross-validation')

pyplot.xlabel("Training examples")
pyplot.ylabel("RMSE")
pyplot.title("Learning curve")
pyplot.legend(loc='best')
pyplot.savefig('LearningCurveKBins')

In [None]:
# Standard deviation of the test targets, displayed as this cell's output
np.std(me_test)

## Plot test error (RMSE) vs number of bins

In [None]:
def changeBins(n_bins, input_train, input_test, output_train, output_test):
    """Fit scaler -> k-bins -> linear regression and return the test-set RMSE.

    The inputs are standardised and discretised into ``n_bins`` bins per
    feature, a linear regression is fitted on the transformed training inputs,
    and the model is scored on the transformed test inputs.

    Parameters
    ----------
    n_bins : int
        Number of bins passed to ``KBinsDiscretizer``.
    input_train, input_test : array-like
        Training and test inputs; the pipeline is fitted on ``input_train``
        only and then applied to ``input_test``.
    output_train, output_test : array-like
        Training and test targets.

    Returns
    -------
    float
        Root-mean-squared error of the predictions on the test set.
    """
    pipeline = Pipeline([
        ('scaler', StandardScaler()),               # Rescale data
        ('kbins', KBinsDiscretizer(n_bins=n_bins)),  # Bin count under study
    ])

    temp_train = pipeline.fit_transform(input_train)  # Fit on training set only
    temp_test = pipeline.transform(input_test)        # Reuse the fitted transform

    linreg = LinearRegression().fit(temp_train, output_train)
    me_predict = linreg.predict(temp_test)

    lin_mse = mean_squared_error(output_test, me_predict)

    # Return the square root of the MSE. The previous code returned the square
    # root of the mean percentage error, which is not an RMSE and did not match
    # how callers label the result.
    return np.sqrt(lin_mse)

In [None]:
## Reload the data, then record the score returned by changeBins for each bin count
# NOTE(review): the earlier .npy loading cell passes two extra positional
# arguments (['1,3', '2,3'], 1000); confirm that these shorter calls match the
# DataPreprocessing.npy signature.
me_train, mom_train = DataPreprocessing.npy(
    'LO/MG_uuxg/3Jet_3000000.npy',
    'LO/LO_3_0.01_NJet/PS3_0.01_3000000.npy',
    frac=0.1,
)
me_test, mom_test = DataPreprocessing.npy(
    'LO/MG_uuxg/3Jet_500000.npy',
    'LO/LO_3_0.01_NJet/PS3_0.01_500000.npy',
    frac=0.1,
)

# NOTE(review): range(10000, 15000, 5000) yields the single value 10000, so the
# sweep currently evaluates one bin count only.
n_bins_arr = range(10000, 15000, 5000)
result = []
for n_bins in n_bins_arr:
    temp = changeBins(n_bins, mom_train, mom_test, me_train, me_test)
    print(temp)  # progress output, one value per bin count
    result.append(temp)

In [None]:
## Plot the sweep results against the number of bins and save the figure
pyplot.plot(n_bins_arr, result)
pyplot.title('Point k at which kBins overfits')
pyplot.xlabel('Number of Bins')
pyplot.ylabel('RMSE')
pyplot.savefig('RMSE_kBins_Overfit')

In [None]:
# Mean of the test targets
print(np.mean(me_test))

In [None]:
# Standard deviation of the test targets
print(np.std(me_test))