In [2]:
import pandas as pd

## Aligning SFCOMPO test cases to training set design

### A. Reactors
1. Delete RBMK, MAGNOX, AGR from test set (did not simulate these)
2. Ensure ReactorType column matches options from training set, and preserve this information by adding that text to the ReactorName column

### B. Metadata, other information, DUPLICATES!
1. Confirm units match training set (burnup)
2. Check script to allow for additional columns (sample ref, etc) as well as OrigenReactor vs. ReactorName (may decide to delete for now to get script running faster)
3. Discovered duplicates. How to handle? Average? Take value with lower uncertainty? Right now (10 March), keeping first duplicate entry.

### C. Isotopes
1. Melt isotope concentrations from single column to their own columns. 
  - Using concentration instead of value for unit conformity. 
  - Need to double check, but I think uncertainties apply to 'values' and not 'concentration'
  - Keeping concentrations, but train set is in g-atoms (moles), so only ratios will be useable until new gram-output training set is simulated.
2. Keep only list of 15 nuclides

In [3]:
pklfile = '~/prep-pkls/nucmoles_opusupdate_aug2019/not-scaled_15nuc.pkl'
sfcompofile = '../clean/sfcompoDB_clean.csv'

In [4]:
train = pd.read_pickle(pklfile)
test = pd.read_csv(sfcompofile)

In [5]:
train.head()

Unnamed: 0,Burnup,CoolingTime,Enrichment,OrigenReactor,ReactorType,ba136,ba138,cs133,cs134,cs135,...,eu153,eu154,pu239,pu240,pu241,pu242,sm149,sm150,sm152,total
0.000e+00,0.0,0.0,0.53,ce14x14,pwr,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4202.0
5.733e+01.1,2006.54,0.000839,0.53,ce14x14,pwr,0.000593,0.6109,0.5142,0.002411,0.1187,...,0.0155,0.000451,2.575,0.1198,0.006937,0.000123,0.008141,0.08695,0.04212,4211.0
5.792e+01,2006.54,0.592485,0.53,ce14x14,pwr,0.0006,0.6114,0.5204,0.002411,0.1209,...,0.01568,0.000451,2.605,0.1198,0.006937,0.000123,0.009107,0.08695,0.04212,4211.0
5.844e+01,2006.54,1.109213,0.53,ce14x14,pwr,0.000607,0.6114,0.5256,0.00241,0.1224,...,0.0158,0.000451,2.627,0.1198,0.006936,0.000123,0.00982,0.08695,0.04212,4211.0
6.579e+01,2006.54,8.4604,0.53,ce14x14,pwr,0.000681,0.6114,0.5759,0.002394,0.1238,...,0.01637,0.00045,2.746,0.1198,0.006929,0.000123,0.01347,0.08695,0.04212,4211.0


In [6]:
test.head(5)

Unnamed: 0,ReactorName,ReactorType,Fuel type,Burnup,BurnupUnit,Enrichment,EnrichmentUnit,SampleRef,Measurement,Isotope,Value,Unit,Concentration,ConcentrationUnit,Sigma,Uncertainty,UncertaintyUnit
0,Balakovo-2,VVER-1000,UO2,45.1,GW*d/tUi,4.4,wt%,BAL-2|1476|42|15,Isotopic Concentration,Am241,0.048,mg/gUi,0.048,mg/gUi,2.0,4.17,%
1,Balakovo-2,VVER-1000,UO2,45.1,GW*d/tUi,4.4,wt%,BAL-2|1476|42|15,Isotopic Concentration,Am243,0.141,mg/gUi,0.141,mg/gUi,2.0,2.84,%
2,Balakovo-2,VVER-1000,UO2,45.1,GW*d/tUi,4.4,wt%,BAL-2|1476|42|15,Isotopic Concentration,Cm242,0.021,mg/gUi,0.021,mg/gUi,2.0,9.52,%
3,Balakovo-2,VVER-1000,UO2,45.1,GW*d/tUi,4.4,wt%,BAL-2|1476|42|15,Isotopic Concentration,Cm244,0.055,mg/gUi,0.055,mg/gUi,2.0,10.91,%
4,Balakovo-2,VVER-1000,UO2,45.1,GW*d/tUi,4.4,wt%,BAL-2|1476|42|15,Isotopic Concentration,Nd142,0.033,mg/gUi,0.033,mg/gUi,2.0,3.03,%


In [7]:
test.ReactorType.unique()

array(['VVER-1000', 'MAGNOX', 'PWR', 'BWR', 'AGR', 'VVER-440', 'RBMK',
       'CANDU'], dtype=object)

In [8]:
len(test.SampleRef.unique())

652

### A. Changes to ReactorName and ReactorType

In [9]:
test = test[(test.ReactorType != 'MAGNOX') & (test.ReactorType != 'AGR') & (test.ReactorType != 'RBMK')]

In [10]:
test.ReactorType.unique()

array(['VVER-1000', 'PWR', 'BWR', 'VVER-440', 'CANDU'], dtype=object)

In [11]:
len(test.SampleRef.unique())

544

In [12]:
test.ReactorName = test.ReactorName + '_' + test.ReactorType
test.ReactorName

0        Balakovo-2_VVER-1000
1        Balakovo-2_VVER-1000
2        Balakovo-2_VVER-1000
3        Balakovo-2_VVER-1000
4        Balakovo-2_VVER-1000
                 ...         
12210            Yankee-1_PWR
12211            Yankee-1_PWR
12212            Yankee-1_PWR
12213            Yankee-1_PWR
12214            Yankee-1_PWR
Name: ReactorName, Length: 9944, dtype: object

In [13]:
rtypes = {'PWR' : 'pwr', 'BWR' : 'bwr', 'CANDU' : 'phwr', 'VVER-1000' : 'pwr', 'VVER-440' : 'pwr'}
test.ReactorType.replace(rtypes, inplace=True)

### B. Changes to Burnup Units

In [14]:
# only run once you idiot!
test.Burnup = 1000 * test.Burnup
test.BurnupUnit.unique()

array(['GW*d/tUi'], dtype=object)

In [15]:
test.BurnupUnit.replace({'GW*d/tUi' : 'MW*d/tUi'}, inplace=True)

### C. Matching Iso Column Format

In [16]:
test.ConcentrationUnit.unique()

array(['mg/gUi'], dtype=object)

In [17]:
test.Isotope = test.Isotope.str.lower()

In [18]:
nuc15 = ['cs133', 'cs134', 'cs135', 'cs137', 'eu153', 
         'eu154', 'ba136', 'ba138', 'sm149', 'sm150', 
         'sm152', 'pu239', 'pu240', 'pu241', 'pu242'
        ]
test = test[test.Isotope.isin(nuc15)]

#### Testing pivot

In [19]:
import numpy as np
sample = [1, 1, 1, 2, 2, 3]
iso = ['A', 'B', 'C', 'A', 'B', 'A']
val = np.random.rand(6)

df = pd.DataFrame({'SampleRef' : sample, 'Isotope' : iso, 'Concentration' : val})
df

Unnamed: 0,SampleRef,Isotope,Concentration
0,1,A,0.316762
1,1,B,0.69192
2,1,C,0.405783
3,2,A,0.170334
4,2,B,0.723071
5,3,A,0.442258


In [20]:
#df.duplicated(['SampleRef', 'Isotope'])
df.pivot(index='SampleRef', columns='Isotope', values='Concentration')

Isotope,A,B,C
SampleRef,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.316762,0.69192,0.405783
2,0.170334,0.723071,
3,0.442258,,


### Deleting Duplicate Entries

In [21]:
test.drop_duplicates(['SampleRef', 'Isotope'], inplace=True)

#### pivot attempt

In [22]:
df = test[['SampleRef', 'Isotope', 'Concentration']]
df.head()

Unnamed: 0,SampleRef,Isotope,Concentration
10,BAL-2|1476|42|15,pu239,6.07
11,BAL-2|1476|42|15,pu240,2.58
12,BAL-2|1476|42|15,pu241,1.7
13,BAL-2|1476|42|15,pu242,0.69
22,BAL-2|1476|42|31,pu239,6.19


#### pivot on full test db

In [23]:
features = df.pivot(index='SampleRef', columns='Isotope', values='Concentration')
features

Isotope,cs133,cs134,cs135,cs137,eu153,eu154,pu239,pu240,pu241,pu242,sm149,sm150,sm152
SampleRef,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
BAL-2|1476|42|15,,,,,,,6.070,2.580,1.700,0.690,,,
BAL-2|1476|42|31,,,,,,,6.190,2.650,1.700,0.700,,,
BAL-2|1476|42|6,,,,,,,6.100,2.670,1.760,0.720,,,
BAL-3|1591|23|581,,,,,,,6.080,2.680,1.760,0.880,,,
BAL-3|1591|23|912,,,,,,,6.260,2.620,1.760,0.750,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
YNK-1|F5|C-F6|G-119,,,,,,,6.810,1.598,1.154,0.208,,,
YNK-1|F5|C-F6|G-120,,,,,,,6.965,1.607,1.115,0.216,,,
YNK-1|F5|C-F6|G-121,,,,,,,5.539,0.994,0.590,0.067,,,
YNK-1|F5|SE-F5|G-122,,,,,,,6.373,2.182,1.438,0.403,,,


In [24]:
# tried groupby functionality, but drop_duplicates worked faster!
to_remove = ['Isotope', 'Concentration', 'Value', 'Unit', 'Sigma', 'Uncertainty', 'UncertaintyUnit', 'Measurement']
labels = test.drop(columns=to_remove)
labels.drop_duplicates(labels.columns.tolist(), inplace=True)
labels.set_index('SampleRef', inplace = True)
labels

Unnamed: 0_level_0,ReactorName,ReactorType,Fuel type,Burnup,BurnupUnit,Enrichment,EnrichmentUnit,ConcentrationUnit
SampleRef,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BAL-2|1476|42|15,Balakovo-2_VVER-1000,pwr,UO2,45100.0,MW*d/tUi,4.4,wt%,mg/gUi
BAL-2|1476|42|31,Balakovo-2_VVER-1000,pwr,UO2,45800.0,MW*d/tUi,4.4,wt%,mg/gUi
BAL-2|1476|42|6,Balakovo-2_VVER-1000,pwr,UO2,46700.0,MW*d/tUi,4.4,wt%,mg/gUi
BAL-3|1591|23|912,Balakovo-3_VVER-1000,pwr,UO2,46200.0,MW*d/tUi,4.4,wt%,mg/gUi
BAL-3|1591|23|581,Balakovo-3_VVER-1000,pwr,UO2,47900.0,MW*d/tUi,4.4,wt%,mg/gUi
...,...,...,...,...,...,...,...,...
YNK-1|E6|NW-A1|G-127,Yankee-1_PWR,pwr,UO2,42500.0,MW*d/tUi,3.4,wt%,mg/gUi
YNK-1|E6|NW-A1|G-128,Yankee-1_PWR,pwr,UO2,42500.0,MW*d/tUi,3.4,wt%,mg/gUi
YNK-1|E6|NW-A1|N-21,Yankee-1_PWR,pwr,UO2,42600.0,MW*d/tUi,3.4,wt%,mg/gUi
YNK-1|E6|NW-A1|T-165,Yankee-1_PWR,pwr,UO2,43200.0,MW*d/tUi,3.4,wt%,mg/gUi


Ensure sample refs are the same

In [25]:
labels_idx = labels.index.tolist().sort()
features_idx = features.index.tolist().sort()
labels_idx == features_idx

True

Checking out nulls for this case -- most samples only include Pu and some Pu/U (A few have all 13)

In [26]:
#features['null_count'] = features.isnull().sum(axis=1)
#print(features['null_count'].mean())
#features = features.sort_values('null_count', ascending=True).drop('null_count', axis=1)

In [27]:
formatted = pd.concat([labels, features], axis=1)
formatted

Unnamed: 0,ReactorName,ReactorType,Fuel type,Burnup,BurnupUnit,Enrichment,EnrichmentUnit,ConcentrationUnit,cs133,cs134,...,cs137,eu153,eu154,pu239,pu240,pu241,pu242,sm149,sm150,sm152
BAL-2|1476|42|15,Balakovo-2_VVER-1000,pwr,UO2,45100.0,MW*d/tUi,4.4,wt%,mg/gUi,,,...,,,,6.070,2.580,1.700,0.690,,,
BAL-2|1476|42|31,Balakovo-2_VVER-1000,pwr,UO2,45800.0,MW*d/tUi,4.4,wt%,mg/gUi,,,...,,,,6.190,2.650,1.700,0.700,,,
BAL-2|1476|42|6,Balakovo-2_VVER-1000,pwr,UO2,46700.0,MW*d/tUi,4.4,wt%,mg/gUi,,,...,,,,6.100,2.670,1.760,0.720,,,
BAL-3|1591|23|912,Balakovo-3_VVER-1000,pwr,UO2,46200.0,MW*d/tUi,4.4,wt%,mg/gUi,,,...,,,,6.260,2.620,1.760,0.750,,,
BAL-3|1591|23|581,Balakovo-3_VVER-1000,pwr,UO2,47900.0,MW*d/tUi,4.4,wt%,mg/gUi,,,...,,,,6.080,2.680,1.760,0.880,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YNK-1|E6|NW-A1|G-127,Yankee-1_PWR,pwr,UO2,42500.0,MW*d/tUi,3.4,wt%,mg/gUi,,,...,,,,5.937,2.533,1.751,0.831,,,
YNK-1|E6|NW-A1|G-128,Yankee-1_PWR,pwr,UO2,42500.0,MW*d/tUi,3.4,wt%,mg/gUi,,,...,,,,5.849,2.552,1.761,0.803,,,
YNK-1|E6|NW-A1|N-21,Yankee-1_PWR,pwr,UO2,42600.0,MW*d/tUi,3.4,wt%,mg/gUi,,,...,,,,6.461,2.718,1.839,0.794,,,
YNK-1|E6|NW-A1|T-165,Yankee-1_PWR,pwr,UO2,43200.0,MW*d/tUi,3.4,wt%,mg/gUi,,,...,,,,6.199,2.679,1.810,0.823,,,


#### For now, formatting to match ORIGEN DB exactly

In [28]:
formatted.rename(columns = {'Fuel type' : 'FuelType', 'ReactorName' : 'OrigenReactor'}, inplace = True)
to_remove = ['FuelType', 'BurnupUnit', 'EnrichmentUnit', 'ConcentrationUnit']
formatted = formatted.drop(columns=to_remove)
formatted['ba136'] = 0.0
formatted['ba138'] = 0.0
formatted['CoolingTime'] = 0.0
formatted.fillna(0, inplace=True)
#hard code column order for now
cols = ['Burnup', 'CoolingTime', 'Enrichment', 'OrigenReactor', 'ReactorType', 'ba136', 'ba138', 'cs133', 'cs134', 'cs135', 'cs137', 'eu153', 'eu154', 'pu239', 'pu240', 'pu241', 'pu242', 'sm149', 'sm150', 'sm152']
formatted = formatted[cols]
formatted

Unnamed: 0,Burnup,CoolingTime,Enrichment,OrigenReactor,ReactorType,ba136,ba138,cs133,cs134,cs135,cs137,eu153,eu154,pu239,pu240,pu241,pu242,sm149,sm150,sm152
BAL-2|1476|42|15,45100.0,0.0,4.4,Balakovo-2_VVER-1000,pwr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.070,2.580,1.700,0.690,0.0,0.0,0.0
BAL-2|1476|42|31,45800.0,0.0,4.4,Balakovo-2_VVER-1000,pwr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.190,2.650,1.700,0.700,0.0,0.0,0.0
BAL-2|1476|42|6,46700.0,0.0,4.4,Balakovo-2_VVER-1000,pwr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.100,2.670,1.760,0.720,0.0,0.0,0.0
BAL-3|1591|23|912,46200.0,0.0,4.4,Balakovo-3_VVER-1000,pwr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.260,2.620,1.760,0.750,0.0,0.0,0.0
BAL-3|1591|23|581,47900.0,0.0,4.4,Balakovo-3_VVER-1000,pwr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.080,2.680,1.760,0.880,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YNK-1|E6|NW-A1|G-127,42500.0,0.0,3.4,Yankee-1_PWR,pwr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.937,2.533,1.751,0.831,0.0,0.0,0.0
YNK-1|E6|NW-A1|G-128,42500.0,0.0,3.4,Yankee-1_PWR,pwr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.849,2.552,1.761,0.803,0.0,0.0,0.0
YNK-1|E6|NW-A1|N-21,42600.0,0.0,3.4,Yankee-1_PWR,pwr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.461,2.718,1.839,0.794,0.0,0.0,0.0
YNK-1|E6|NW-A1|T-165,43200.0,0.0,3.4,Yankee-1_PWR,pwr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.199,2.679,1.810,0.823,0.0,0.0,0.0


In [29]:
formatted.to_pickle('sfcompo_formatted.pkl')

In [45]:
# Testing pandas functionality for different script
testme = formatted.sample(5)
lbls = ['ReactorType', 'CoolingTime', 'Enrichment', 'Burnup', 'OrigenReactor']

In [47]:
for sim_idx, row in testme.iterrows():
    test_sample = testme.loc[testme.index == sim_idx].drop(lbls, axis=1)
    print(isinstance(test_sample, pd.DataFrame))
    test_answer = testme.loc[testme.index == sim_idx, lbls]
    #print(test_answer)
    test_test_sample = row.drop(lbls)
    print(isinstance(test_test_sample, pd.Series))
    test_test_answer = row[lbls]
    #print(test_test_answer)

True
True
True
True
True
True
True
True
True
True
