In [37]:
# NOTE: the star import stays first so the explicit imports below keep
# precedence over any same-named objects exported by training_set.
from training_set import *
import csv
import glob
import os

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor

In [41]:
#filename = train_files[1]
#filename

# Training Set -> Dataframes

### Getting list of files holding training data

In [42]:
# Build the list of training CSV paths: one file per (reactor, enrichment)
# pair, where ENRICH[k] holds the enrichments belonging to O_RXTRS[k].
#datapath = "../origen/origen-data/30nov2017_actinides/"
datapath = "../origen-data/30nov2017_actinides/"
train_files = []
for rxtr_idx, o_rxtr in enumerate(O_RXTRS):
    rxtrpath = datapath + o_rxtr + "/"
    for enrich in ENRICH[rxtr_idx]:
        # alternative input: same stem with the "_nucs.csv" suffix
        ecsv = "".join([o_rxtr, "_enr", str(enrich), "_gammas.csv"])
        train_files.append(os.path.join(rxtrpath, ecsv))

### Supporting Functions

In [74]:
def loop_labels(burnup, cooling):
    """Expand per-case burnups into per-row burnup and cooling-time labels.

    The first row is always labelled (burnup 0, cooling 0). After that, every
    burnup case contributes one row at cooling time 0 followed by one row per
    cooling interval, all carrying that case's burnup.

    Parameters
    ----------
    burnup : sequence
        One burnup value per case; indexed by position.
    cooling : sequence or None
        Cooling intervals applied to every case. ``None`` falls back to the
        module-level ``COOLING_INTERVALS``.

    Returns
    -------
    (list, list)
        Burnup labels and cooling-time labels, each of length
        ``1 + len(burnup) * (len(cooling) + 1)``.
    """
    # Honour the argument instead of always reading the global.
    intervals = COOLING_INTERVALS if cooling is None else cooling
    # Cooling sequence shared by every case: discharge (0) then each interval.
    case_coolings = [0] + list(intervals)
    steps_per_case = len(case_coolings)
    burnup_lbl = [0]
    cooling_lbl = [0]
    for case in range(len(burnup)):
        burnup_lbl.extend([burnup[case]] * steps_per_case)
        cooling_lbl.extend(case_coolings)
    return burnup_lbl, cooling_lbl

def label_data(labels, data):
    """Append the four label columns to ``data`` (in place) and return it.

    ``labels`` must provide 'ReactorType', 'Enrichment', 'Burnup' and
    'CoolingInts'. The per-row 'Burnup' and 'CoolingTime' values come from
    ``loop_labels``; 'ReactorType' and 'Enrichment' are inserted as given
    (presumably scalars broadcast to every row -- confirm against the
    contents of TRAIN_LABELS).

    The new columns are placed after the existing ones, in the order
    ReactorType, Enrichment, Burnup, CoolingTime.
    """
    first_free = len(data.columns)
    burnups, coolings = loop_labels(labels['Burnup'], labels['CoolingInts'])
    per_row = {'Burnup': burnups, 'CoolingTime': coolings}
    label_order = ('ReactorType', 'Enrichment', 'Burnup', 'CoolingTime')
    for offset, name in enumerate(label_order):
        # per-row lists take precedence; everything else is read from labels
        values = per_row[name] if name in per_row else labels[name]
        data.insert(loc=first_free + offset, column=name, value=values)
    return data

def format_df(filename):
    """Load a CSV whose header is on the sixth line and return it transposed.

    The first column of the file becomes the index before transposing, so the
    file's remaining columns end up as rows. Duplicate rows are collapsed
    (keeping the last occurrence) and the 'subtotal' column is removed.
    Presumably this reads the nuclide-concentration ("_nucs.csv") files --
    confirm against the data set.
    """
    return (
        pd.read_csv(filename, header=5, index_col=0)
        .T
        .drop_duplicates(keep='last')
        .drop('subtotal', axis=1)
    )

def format_gdf(filename, header_rows=5, n_bins=202):
    """Parse a gamma-spectrum CSV into a DataFrame with one row per time step.

    Layout expected by the parser (blank rows are ignored and not counted):

    * the first ``header_rows`` rows are skipped;
    * the next row supplies the first time stamp (its first field);
    * every later row whose second field is ``'days'`` closes the current
      spectrum and starts a new one, its first field being the time stamp;
    * every other row contributes its second field to the current spectrum.

    Column labels are taken from the first field of the ``n_bins`` rows that
    follow the first time-stamp row. When a label repeats the one directly
    before it, ``'.1'`` is appended so that column names stay unique.

    Parameters
    ----------
    filename : str
        Path of the CSV file.
    header_rows : int, optional
        Number of leading non-empty rows to skip (default 5).
    n_bins : int, optional
        Number of energy-bin rows per spectrum (default 202).

    Returns
    -------
    pandas.DataFrame
        Index: time stamps; columns: bin labels. Values are kept as the
        strings read from the file. Duplicate rows are dropped, keeping the
        last occurrence.

    Raises
    ------
    IndexError
        If a row after the first time-stamp row has fewer than two fields.
    """
    first_time_row = header_rows + 1
    first_bin_row = first_time_row + 1
    last_bin_row = first_bin_row + n_bins - 1
    time_idx = []
    spectrum = []
    spectra = []
    gamma_bins = []
    with open(filename) as f:
        row_num = 0
        for row in csv.reader(f, delimiter=','):
            if not row:
                continue
            row_num += 1
            if row_num < first_time_row:
                continue
            if row_num == first_time_row:
                time_idx.append(row[0])
            elif row[1] == 'days':
                # a new time step begins: bank the spectrum collected so far
                spectra.append(spectrum)
                time_idx.append(row[0])
                spectrum = []
            else:
                if first_bin_row <= row_num <= last_bin_row:
                    label = row[0]
                    # guard on non-empty list so a missing first bin row
                    # cannot raise IndexError
                    if gamma_bins and gamma_bins[-1] == label:
                        label = label + '.1'
                    gamma_bins.append(label)
                spectrum.append(row[1])
    # the final spectrum has no trailing 'days' row to flush it
    spectra.append(spectrum)
    data = pd.DataFrame(spectra, index=time_idx, columns=gamma_bins)
    return data.drop_duplicates(keep='last')

### Main formatting function

In [85]:
def dataframeXY(all_files):
    """Load every training file, attach its labels, and stack the results.

    Parameters
    ----------
    all_files : iterable of str
        Paths to the training CSVs. The position of each path selects the
        matching entries of TRAIN_LABELS['ReactorType'], ['Enrichment'] and
        ['Burnup'].

    Returns
    -------
    pandas.DataFrame
        Concatenation of all labelled frames; cells left empty by the
        concatenation are filled with 0.
    """
    all_data = []
    # enumerate gives each file its own position; list.index() rescans the
    # list on every pass and returns the first match for a repeated path.
    for idx, f in enumerate(all_files):
        #data = format_df(f)
        data = format_gdf(f)
        labels = {'ReactorType': TRAIN_LABELS['ReactorType'][idx],
                  #'OrigenReactor': TRAIN_LABELS['OrigenReactor'][idx],
                  'Enrichment': TRAIN_LABELS['Enrichment'][idx],
                  'Burnup': TRAIN_LABELS['Burnup'][idx],
                  'CoolingInts': COOLING_INTERVALS
                  }
        all_data.append(label_data(labels, data))
    dfXY = pd.concat(all_data)
    return dfXY.fillna(value=0)
trainXY = dataframeXY(train_files)

In [76]:
trainXY.shape

(9779, 206)

### Split into different Y's for separate ML models

In [86]:
def splitXY(dfXY):
    """Split a labelled frame into features and four label series.

    Returns a 5-tuple: the frame without the label columns, followed by the
    'ReactorType', 'CoolingTime', 'Enrichment' and 'Burnup' columns, in that
    order. The input frame is not modified.
    """
    label_cols = ['ReactorType', 'CoolingTime', 'Enrichment', 'Burnup']
    features = dfXY.drop(label_cols, axis=1)
    reactor, cooling, enrich, burnup = [dfXY.loc[:, name] for name in label_cols]
    return features, reactor, cooling, enrich, burnup

In [87]:
# Separate the features from the four label series returned by splitXY, in
# order: reactor type (rY), cooling time (cY), enrichment (eY), burnup (bY).
trainX, rY, cY, eY, bY = splitXY(trainXY)

In [88]:
# Preview the first rows of the feature matrix (label columns removed).
trainX.head()

Unnamed: 0,1.000e-11,1.315e-11,1.315e-11.1,1.728e-11,1.728e-11.1,2.272e-11,2.272e-11.1,2.987e-11,2.987e-11.1,3.927e-11,...,2.546e+00.1,3.348e+00,3.348e+00.1,4.401e+00,4.401e+00.1,5.786e+00,5.786e+00.1,7.607e+00,7.607e+00.1,1.000e+01
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4807.0,4807.0,1731.0,1731.0,483.6,483.6,87.17,87.17,10.05,10.05
100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.859e+16,7.859e+16,3.147e+16,3.147e+16,3250000000000000.0,3250000000000000.0,111700000000000.0,111700000000000.0,1711000000.0,1711000000.0
107.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,111900000000000.0,111900000000000.0,5100000000.0,5100000000.0,4121.0,4121.0,736.7,736.7,84.35,84.35
130.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,32910000000000.0,32910000000000.0,4831000000.0,4831000000.0,4053.0,4053.0,724.4,724.4,82.93,82.93
465.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,631600000000.0,631600000000.0,2585000000.0,2585000000.0,3545.0,3545.0,632.8,632.8,72.37,72.37


In [83]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
import numpy as np
import pandas as pd

In [89]:
# Scale the features without centering them (with_mean=False).
# NOTE(review): this rebinds trainX to whatever scale() returns, so the
# DataFrame from splitXY is no longer reachable under this name. The scaling
# statistics are also computed on the full data set before cross-validation,
# so held-out folds influence the scaling -- consider a Pipeline with a
# scaler inside cross_val_predict.
trainX = scale(trainX, with_mean=False)

In [90]:
# Reactor type: k-nearest-neighbour classification under two distance
# metrics -- L1 (Manhattan) and L2 (Euclidean) -- scored on 10-fold
# cross-validated predictions.
l1knc = KNeighborsClassifier(metric='l1', p=1)
l2knc = KNeighborsClassifier(metric='l2', p=2)
l1knc_pred, l2knc_pred = [
    cross_val_predict(knc, trainX, rY, cv=10) for knc in (l1knc, l2knc)
]
# Per-class precision / recall / F1: L1 report first, then L2
for knc_pred in (l1knc_pred, l2knc_pred):
    print(metrics.classification_report(rY, knc_pred))

             precision    recall  f1-score   support

        bwr       0.99      0.99      0.99      4572
       phwr       1.00      0.88      0.94       381
        pwr       0.99      1.00      0.99      4826

avg / total       0.99      0.99      0.99      9779

             precision    recall  f1-score   support

        bwr       0.99      0.99      0.99      4572
       phwr       1.00      0.90      0.95       381
        pwr       0.99      1.00      0.99      4826

avg / total       0.99      0.99      0.99      9779



In [91]:
# Enrichment: k-nearest-neighbour regression with L1 and L2 distances,
# scored on 10-fold cross-validated predictions.
el1knr = KNeighborsRegressor(metric='l1', p=1)
el2knr = KNeighborsRegressor(metric='l2', p=2)
el1knr_pred, el2knr_pred = [
    cross_val_predict(knr, trainX, eY, cv=10) for knr in (el1knr, el2knr)
]
# Output order: MSE (L1, L2), then explained variance (L1, L2)
for e_score in (metrics.mean_squared_error, metrics.explained_variance_score):
    for e_pred in (el1knr_pred, el2knr_pred):
        print(e_score(eY, e_pred))

0.239846734934
0.268509084868
0.87083113315
0.854826358701


In [92]:
# Cooling time: k-nearest-neighbour regression with L1 and L2 distances,
# scored on 10-fold cross-validated predictions.
cl1knr = KNeighborsRegressor(metric='l1', p=1)
cl2knr = KNeighborsRegressor(metric='l2', p=2)
cl1knr_pred, cl2knr_pred = [
    cross_val_predict(knr, trainX, cY, cv=10) for knr in (cl1knr, cl2knr)
]
# Output order: MSE (L1, L2), then explained variance (L1, L2)
for c_score in (metrics.mean_squared_error, metrics.explained_variance_score):
    for c_pred in (cl1knr_pred, cl2knr_pred):
        print(c_score(cY, c_pred))

5441.71460826
4560.70909032
0.992141072627
0.993413150323


In [93]:
# Burnup: k-nearest-neighbour regression with L1 and L2 distances,
# scored on 10-fold cross-validated predictions.
bl1knr = KNeighborsRegressor(metric='l1', p=1)
bl2knr = KNeighborsRegressor(metric='l2', p=2)
bl1knr_pred, bl2knr_pred = [
    cross_val_predict(knr, trainX, bY, cv=10) for knr in (bl1knr, bl2knr)
]
# Output order: MSE (L1, L2), then explained variance (L1, L2)
for b_score in (metrics.mean_squared_error, metrics.explained_variance_score):
    for b_pred in (bl1knr_pred, bl2knr_pred):
        print(b_score(bY, b_pred))

27671363.1251
27925438.1839
0.920394908355
0.919651817902
