# Cleaning Cross Section Data Text File

Let us import the necessary modules.

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 50)

# Cleaning Data

Data contains whitespace and special characters that we need to deal with. Additionally, we see that some columns do not have values but they have a value: a string of spaces. Pandas does not recognizes them as NaN values so we have to manually take care of them. We will also drop the references the YY and the SubEntry Number. 

In [2]:
colnames = ["Prj", "Targ", "Target_Meta_State", "MF", "MT", "PXC",  "Energy",  "dEnergy",  "Data", \
            "dData",   "Cos/LO",   "dCos/LO",   "ELV/HL",  "dELV/HL", "I78", "Refer", "(YY)", "EntrySubP"]
df = pd.read_csv("../ML_Data/all_cross_sections_v1.txt", names=colnames, header=None, index_col=False, sep=";")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# make string version of original column
df['Targ'] = df['Targ'].astype(str)

# Making Sure all rows have the same number of values
max_length = 5
df.Targ = df.Targ.apply(lambda x: '0'*(max_length - len(x)) + x)

# Target feature is formated as ZZAAA
df['Z'] = df['Targ'].str[0:2].astype(int).fillna(0)
df['M'] = df['Targ'].str[2:5].astype(int).fillna(0)

# Calculating number of neutrons = mass number - protons
df['N'] = df['M'] - df["Z"]

We assume that `Target_Meta_State` with unknown values are `G` for Ground State.

In [4]:
# unmarked rows are assumed to be at  ground state so we assign G value
df["Target_Meta_State"] = df["Target_Meta_State"].replace(to_replace=" ", value="G")

We assume that the `Frame` feature unknown values are `L` for Lab Frame and `Product_Meta_State`'s missing values are `G` for Ground State.

In [5]:
# PXC describes three different variables
# We extract the data to independent features
df['Product_Meta_State'] = df['PXC'].astype(str).str[0:1].replace(to_replace=" ", value="G")
df['Frame'] = df['PXC'].astype(str).str[2:3].replace(to_replace=" ", value="L")

In [6]:
df.drop(columns=["(YY)", "EntrySubP", 'Targ', "PXC"], inplace=True)

In [7]:
df.head()

Unnamed: 0,Prj,Target_Meta_State,MF,MT,Energy,dEnergy,Data,dData,Cos/LO,dCos/LO,ELV/HL,dELV/HL,I78,Refer,Z,M,N,Product_Meta_State,Frame
0,1,G,3,1,8.8200+7,882000.0,0.03,1.5232-3,,,,,,"D.F.MEASDAY,ET.AL. (66)",0,1,1,G,L
1,1,G,3,1,9.8100+7,981000.0,0.0291,1.5162-3,,,,,,"D.F.MEASDAY,ET.AL. (66)",0,1,1,G,L
2,1,G,3,1,1.1000+8,1100000.0,0.0279,1.4147-3,,,,,,"D.F.MEASDAY,ET.AL. (66)",0,1,1,G,L
3,1,G,3,1,1.1960+8,1196000.0,0.0264,1.4031-3,,,,,,"D.F.MEASDAY,ET.AL. (66)",0,1,1,G,L
4,1,G,3,1,1.2940+8,1294000.0,0.0256,1.3972-3,,,,,,"D.F.MEASDAY,ET.AL. (66)",0,1,1,G,L


# Fixing numerical features formatting.

In [8]:
# Defining Numerical Columns to Fix and casting them as strings
cols = ["Energy", "dEnergy", "Data", "dData", "Cos/LO", "dCos/LO", "ELV/HL", "dELV/HL"]
df[cols] = df[cols].astype(str)

In [9]:
# df[cols] = df[cols].replace(to_replace="         ", value="0.0000000")
df[cols] = df[cols].replace(to_replace="         ", value=np.nan)

# We now strip values that may contain quatation marks and starting and trailing spaces
for col in cols:
    df[col] = df[col].str.strip("\"")
    df[col] = df[col].str.strip()
    
# df[cols] = df[cols].replace(to_replace="", value="0.0000000")
df[cols] = df[cols].replace(to_replace="", value=np.nan)

In [10]:
# For the numerical values we know per formatting that each of them should be 9 characters in length
max_length = 9

for col in cols:
    df[col] = df[col].apply(lambda x: x if pd.isnull(x) else ' '*(max_length - len(x)) + x) 

In [11]:
# Add appropiate formating for python to recognize it as numerical 
for col in cols:
    new_col = []
    values = df[col].values
    for x in values:
        if pd.isnull(x):
            new_col.append(x)
        elif "+" == x[7]:
            y = x[0:7]
            z = x[7:]
            new_col.append(y + "E" + z)
        elif "+" == x[6]:
            y = x[0:6]
            z = x[6:]
            new_col.append(y + "E" + z)
        elif "-" == x[7]:
            y = x[0:7]
            z = x[7:]
            new_col.append(y + "E" + z)
        elif "-" == x[6]:
            y = x[0:6]
            z = x[6:]
            new_col.append(y + "E" + z)
        else:
            new_col.append(x)
    df[col] = new_col

In [12]:
# We now convert the columns to numerical
for col in cols:
    df[col] = df[col].astype(float)
    print("Finish converting {} to float.".format(col))

Finish converting Energy to float.
Finish converting dEnergy to float.
Finish converting Data to float.
Finish converting dData to float.
Finish converting Cos/LO to float.
Finish converting dCos/LO to float.
Finish converting ELV/HL to float.
Finish converting dELV/HL to float.


# Specifying Categorical Columns

In [13]:
cat_cols = ["Target_Meta_State", "MF", "MT", "I78", "Product_Meta_State", "Frame"]

# Convering all columns to strings and stripping whitespace
for col in cat_cols:
    df[col] = df[col].astype(str)
    df[col] = df[col].str.strip("\"")
    df[col] = df[col].str.strip()

In [14]:
# Replace empty values in I78 for L representing Low
df["I78"] = df["I78"].replace(to_replace="", value="L")

# Exporting Cleaned Data

In [79]:
df.to_csv("../ML_Data/working_xs.csv", index=False)

# Appending Additional Information from EXFOR

In [15]:
dirpath = "Extracted_Text/"

In [16]:
# Reading experiments reaction notation 
df1 = pd.read_csv(dirpath + "reaction_notation.txt", delim_whitespace=True, header=None)
df1.columns = ["Reaction", "Type"]

# Reading Experiment Titles
df2 = pd.read_csv(dirpath + "titles.txt", sep="#TITLE      ", header=None, engine="python")
df2.columns = ["Keyword", "Title"]

# Reading Data Points per Experiment
df3 = pd.read_csv(dirpath + "data_points_per_experiment_refined.txt",  delim_whitespace=True, header=None)
df3.columns = ["Data", "Multiple"]

# Reading Experiment Year 
df4 = pd.read_csv(dirpath + "years.txt", delim_whitespace=True, header=None)
df4.columns = ["Keyword", "Year"]

# Reading Experiment Date 
df5 = pd.read_csv(dirpath + "authors.txt", sep="    ", header=None, engine="python")
df5.columns = ["Keyword", "Author"]

# Reading Experiment Institute 
df6 = pd.read_csv(dirpath + "institude.txt", sep="  ", header=None, engine="python")
df6.columns = ["Keyword", "Institute"]

# Reading Experiment Year 
df7 = pd.read_csv(dirpath + "dates.txt", delim_whitespace=True, header=None)
df7.columns = ["Keyword", "Date"]

# Reading Experiment Refere
df8 = pd.read_csv(dirpath + "references.txt", sep="#REFERENCE  ", header=None, engine="python")
df8.columns = ["Keyword", "Reference"]

In [17]:
# Merging Datapoints, notation and titles and expanding based on datapoints
pre_final = pd.concat([df3, df1, df2, df4, df5, df6, df7, df8], axis=1)
final = pre_final.reindex(pre_final.index.repeat(pre_final.Multiple))
final['position'] = final.groupby(level=0).cumcount() + 1

# Extracting projectile and outogoing particle
final["reaction_notation"] = final.Type.str.extract('.*\((.*)\).*')

In [18]:
final["reaction_notation2"] = final["reaction_notation"].apply(lambda x: x.split(')')[0])
final = pd.concat([final, final["reaction_notation2"].str.split(',', expand=True)], axis=1)

In [19]:
final.head()

Unnamed: 0,Data,Multiple,Reaction,Type,Keyword,Title,Keyword.1,Year,Keyword.2,Author,Keyword.3,Institute,Keyword.4,Date,Keyword.5,Reference,position,reaction_notation,reaction_notation2,0,1
0,#DATA,7,#REACTION,"0-NN-1(N,TOT),,SIG",,"NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",#YEAR,1966,#AUTHOR1,D.F.Measday+,#INSTITUTE,(1USAHRV),#DATE,19800804,,"Jour. Nuclear Physics Vol.85, p.142, 1966",1,"N,TOT","N,TOT",N,TOT
0,#DATA,7,#REACTION,"0-NN-1(N,TOT),,SIG",,"NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",#YEAR,1966,#AUTHOR1,D.F.Measday+,#INSTITUTE,(1USAHRV),#DATE,19800804,,"Jour. Nuclear Physics Vol.85, p.142, 1966",2,"N,TOT","N,TOT",N,TOT
0,#DATA,7,#REACTION,"0-NN-1(N,TOT),,SIG",,"NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",#YEAR,1966,#AUTHOR1,D.F.Measday+,#INSTITUTE,(1USAHRV),#DATE,19800804,,"Jour. Nuclear Physics Vol.85, p.142, 1966",3,"N,TOT","N,TOT",N,TOT
0,#DATA,7,#REACTION,"0-NN-1(N,TOT),,SIG",,"NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",#YEAR,1966,#AUTHOR1,D.F.Measday+,#INSTITUTE,(1USAHRV),#DATE,19800804,,"Jour. Nuclear Physics Vol.85, p.142, 1966",4,"N,TOT","N,TOT",N,TOT
0,#DATA,7,#REACTION,"0-NN-1(N,TOT),,SIG",,"NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",#YEAR,1966,#AUTHOR1,D.F.Measday+,#INSTITUTE,(1USAHRV),#DATE,19800804,,"Jour. Nuclear Physics Vol.85, p.142, 1966",5,"N,TOT","N,TOT",N,TOT


In [20]:
# Formatting Columns
new_columns = list(final.columns)[:19]
new_columns.extend(["Projectile", "Out"])
final.columns = new_columns

In [21]:
# Indexing only required information and saving file
final = final[["Type", "Title", "Year", "Institute", "Author", "Date", "Reference", "Out"]]

# Verify all data matches.
df.shape[0] == final.shape[0]

# Reset Indexes to make copying faster
df = df.reset_index(drop=True)
final = final.reset_index(drop=True)

In [22]:
final.head()

Unnamed: 0,Type,Title,Year,Institute,Author,Date,Reference,Out
0,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,(1USAHRV),D.F.Measday+,19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT
1,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,(1USAHRV),D.F.Measday+,19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT
2,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,(1USAHRV),D.F.Measday+,19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT
3,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,(1USAHRV),D.F.Measday+,19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT
4,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,(1USAHRV),D.F.Measday+,19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT


In [23]:
# Assign newly extracted data to main dataframe
df["Type"] = final["Type"]
df["Title"] = final["Title"]
df["Year"] = final["Year"]
df["Author"] = final["Author"]
df["Institute"] = final["Institute"]
df["Date"] = final["Date"]
df["Reference"] = final["Reference"]
df["Out"] = final["Out"]

In [24]:
df.head()

Unnamed: 0,Prj,Target_Meta_State,MF,MT,Energy,dEnergy,Data,dData,Cos/LO,dCos/LO,ELV/HL,dELV/HL,I78,Refer,Z,M,N,Product_Meta_State,Frame,Type,Title,Year,Author,Institute,Date,Reference,Out
0,1,G,3,1,88200000.0,882000.0,0.03,0.001523,,,,,L,"D.F.MEASDAY,ET.AL. (66)",0,1,1,G,L,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,D.F.Measday+,(1USAHRV),19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT
1,1,G,3,1,98100000.0,981000.0,0.0291,0.001516,,,,,L,"D.F.MEASDAY,ET.AL. (66)",0,1,1,G,L,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,D.F.Measday+,(1USAHRV),19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT
2,1,G,3,1,110000000.0,1100000.0,0.0279,0.001415,,,,,L,"D.F.MEASDAY,ET.AL. (66)",0,1,1,G,L,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,D.F.Measday+,(1USAHRV),19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT
3,1,G,3,1,119600000.0,1196000.0,0.0264,0.001403,,,,,L,"D.F.MEASDAY,ET.AL. (66)",0,1,1,G,L,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,D.F.Measday+,(1USAHRV),19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT
4,1,G,3,1,129400000.0,1294000.0,0.0256,0.001397,,,,,L,"D.F.MEASDAY,ET.AL. (66)",0,1,1,G,L,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,D.F.Measday+,(1USAHRV),19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT


Shape must be 6007126

In [25]:
df.Title = df.Title.fillna("No Title")
df = df[df.N != -1]

In [26]:
df.shape

(6002504, 27)

In [92]:
df.shape

(6002504, 27)

In [37]:
# Save Dataframe
df.to_csv("../ML_Data/working_xs.csv", index=False)

# Merging EXFOR and AME Data

In [93]:
df_workxs = pd.read_csv("../ML_Data/working_xs.csv")

In [28]:
df_workxs.columns

Index(['Prj', 'Target_Meta_State', 'MF', 'MT', 'Energy', 'dEnergy', 'Data',
       'dData', 'Cos/LO', 'dCos/LO', 'ELV/HL', 'dELV/HL', 'I78', 'Refer', 'Z',
       'M', 'N', 'Product_Meta_State', 'Frame', 'Type', 'Title', 'Year',
       'Author', 'Institute', 'Date', 'Reference', 'Out'],
      dtype='object')

In [29]:
masses = pd.read_csv("./AME_Files/AME_Isotopic_Properties.csv").rename(
    columns={'N': 'Neutrons', 'A': 'Mass_Number', 'Neutrons':'N', 'Mass_Number':'M'})
masses.head()

Unnamed: 0,Neutrons,Z,Mass_Number,EL,O,Mass_Excess,dMass_Excess,Binding_Energy,dBinding_Energy,B_Decay_Energy,dB_Decay_Energy,Atomic_Mass_Micro,dAtomic_Mass_Micro,S(2n),dS(2n),S(2p),dS(2p),Q(a),dQ(a),Q(2B-),dQ(2B-),Q(ep),dQ(ep),Q(B-n),dQ(B-n),S(n),dS(n),S(p),dS(p),Q(4B-),dQ(4B-),"Q(d,a)","dQ(d,a)","Q(p,a)","dQ(p,a)","Q(n,a)","dQ(n,a)","Q(g,p)","Q(g,n)","Q(g,pn)","Q(g,d)","Q(g,t)","Q(g,He3)","Q(g,2p)","Q(g,2n)","Q(g,a)","Q(p,n)","Q(p,2p)","Q(p,pn)","Q(p,d)","Q(p,2n)","Q(p,t)","Q(p,3He)","Q(n,2p)","Q(n,np)","Q(n,d)","Q(n,2n)","Q(n,t)","Q(n,3He)","Q(d,t)","Q(d,3He)","Q(3He,t)","Q(3He,a)","Q(t,a)",N,M,Flag
0,1,0,1,n,Other,8071.31713,0.00046,0.0,0.0,782.347,0.0,1008665.0,0.00049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0005,0.0,-0.0,2224.566,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,6257.229,0.0,763.755,20577.6194,0.0,1,1,I
1,0,1,1,H,Other,7288.97061,9e-05,0.0,0.0,18244.328,289.9558,1007825.0,9e-05,2025.412,292.506,0.0,0.0,0.0,0.0,13762.268,719.024,0.0,0.0,17514.9925,362.0875,1096.973333,256.595,0.0,0.0,8007.5,1511.5,20717.915,0.0,20613.86,50.0,0.0,0.0,-0.0,-1096.973333,-5353.1789,-3128.6129,799.9951,0.0,0.0,-2025.412,0.0,17461.9815,-0.0,-1096.973333,1127.592667,16732.646,6456.3829,2364.8615,0.0,-0.0,2224.566,-1096.973333,3128.6161,0.0,5160.255667,5493.4744,18225.736,19480.646067,19813.8649,0,1,I
2,1,1,2,H,Other,13135.72176,0.00011,1112.283,0.0,18244.328,289.9558,2014102.0,0.00012,2025.412,292.506,0.0,0.0,0.0,0.0,13762.268,719.024,0.0,0.0,17514.9925,362.0875,2224.57,0.0,2224.57,0.0,8007.5,1511.5,23846.53,0.0,20613.86,50.0,0.0,0.0,-2224.57,-2224.57,-2224.5639,0.0021,799.9951,0.0,0.0,-2025.412,0.0,17461.9815,-2224.57,-2224.57,-0.004,16732.646,6456.3829,5493.4765,0.0,-2224.57,-0.004,-2224.57,6257.2311,0.0,4032.659,3268.9044,18225.736,18353.0494,17589.2949,1,2,I
3,2,1,3,H,Other,14949.80993,0.00022,2827.265,0.0,18.592,0.0,3016049.0,0.00023,8481.79,0.0,0.0,0.0,0.0,0.0,-13717.0,2000.0,0.0,0.0,17514.9925,362.0875,6257.23,0.0,1112.285,0.0,8007.5,1511.5,17589.3,0.0,19813.86,0.0,0.0,0.0,-1112.285,-6257.23,-8481.7939,-6257.2279,-0.0049,0.0,0.0,-8481.79,0.0,-763.7545,-1112.285,-6257.23,-4032.664,16732.646,0.0049,-763.7535,0.0,-1112.285,1112.281,-6257.23,0.0011,0.0,-0.001,4381.1894,0.0,14320.3894,18701.5799,2,3,I
4,1,2,3,He,Other,14931.21793,0.00021,2572.68,0.0,-13736.0,2000.0,3016029.0,0.00022,4013.16,30.3,7718.04,0.0,367.5,10.0,12743.205,359.296667,0.0,0.0,-2571.224286,344.341429,3176.184286,29.417143,5493.47,0.0,14022.91,52.653333,18353.05,0.0,4173.344286,210.882857,20577.62,0.0,-5493.47,-3176.184286,-7718.0439,-5493.4779,-15640.520614,0.0006,-7718.04,-4013.16,367.5,-14518.3465,-5493.47,-3176.184286,-951.618286,-3353.570786,4468.6349,-0.0035,0.0,-5493.47,-3268.904,-3176.184286,763.7511,0.0004,3081.044714,0.0044,-13754.592,17401.435114,14320.3949,1,3,I


In [34]:
df_workxs = df_workxs.reset_index(drop=True)
masses = masses.reset_index(drop=True)

In [35]:
df_workxs.shape

(6002504, 27)

In [36]:
df = df_workxs.merge(masses, on=['N', 'Z'], how='left')

In [37]:
df.shape

(6002504, 92)

# Neutron Induced Cross Section vs Energy Data 

MF are ENDF labels and are used to store different types of data:

- MF=1 contains descriptive and miscellaneous data,
- MF=2 contains resonance parameter data,
- MF=3 contains reaction cross sections vs energy,
- MF=4 contains angular distributions,
- MF=5 contains energy distributions,
- MF=6 contains energy-angle distributions,
- MF=7 contains thermal scattering data,
- MF=8 contains radioactivity data
- MF=9-10 contain nuclide production data,
- MF=12-15 contain photon production data, and
- MF=30-36 contain covariance data.

In [38]:
df.MF = df.MF.astype(str)
df.MT = df.MT.astype(str)

In [39]:
df = df[df["MF"] == "3"]
df = df[df["MT"] < "999"] # Cross Section Ratios

In [40]:
df.shape

(4642144, 92)

In [41]:
df.columns

Index(['Prj', 'Target_Meta_State', 'MF', 'MT', 'Energy', 'dEnergy', 'Data',
       'dData', 'Cos/LO', 'dCos/LO', 'ELV/HL', 'dELV/HL', 'I78', 'Refer', 'Z',
       'M_x', 'N', 'Product_Meta_State', 'Frame', 'Type', 'Title', 'Year',
       'Author', 'Institute', 'Date', 'Reference', 'Out', 'Neutrons',
       'Mass_Number', 'EL', 'O', 'Mass_Excess', 'dMass_Excess',
       'Binding_Energy', 'dBinding_Energy', 'B_Decay_Energy',
       'dB_Decay_Energy', 'Atomic_Mass_Micro', 'dAtomic_Mass_Micro', 'S(2n)',
       'dS(2n)', 'S(2p)', 'dS(2p)', 'Q(a)', 'dQ(a)', 'Q(2B-)', 'dQ(2B-)',
       'Q(ep)', 'dQ(ep)', 'Q(B-n)', 'dQ(B-n)', 'S(n)', 'dS(n)', 'S(p)',
       'dS(p)', 'Q(4B-)', 'dQ(4B-)', 'Q(d,a)', 'dQ(d,a)', 'Q(p,a)', 'dQ(p,a)',
       'Q(n,a)', 'dQ(n,a)', 'Q(g,p)', 'Q(g,n)', 'Q(g,pn)', 'Q(g,d)', 'Q(g,t)',
       'Q(g,He3)', 'Q(g,2p)', 'Q(g,2n)', 'Q(g,a)', 'Q(p,n)', 'Q(p,2p)',
       'Q(p,pn)', 'Q(p,d)', 'Q(p,2n)', 'Q(p,t)', 'Q(p,3He)', 'Q(n,2p)',
       'Q(n,np)', 'Q(n,d)', 'Q(n,2n)', 'Q(n,t)',

In [42]:
columns_drop = ["MF", "Cos/LO", "dCos/LO", "Prj", "M_x", "M_y", "N"]
df = df.drop(columns=columns_drop)

In [43]:
df["O"].fillna(value="Other", inplace=True)
df["Title"].fillna(value="No Title", inplace=True)

In [44]:
df = df[~df.Neutrons.isna()]

In [45]:
df["Neutrons"] = df["Neutrons"].astype(int)
df["Mass_Number"] = df["Mass_Number"].astype(int)

In [46]:
df["Reference"] = df["Author"] + " " + df["Reference"]
df = df.drop(columns=["Refer", "Author"])

In [47]:
df = df.rename(columns={"Z":"Protons", "EL":"Element", "O":"Origin", "Type":"Reaction_Notation"})

In [48]:
df.columns

Index(['Target_Meta_State', 'MT', 'Energy', 'dEnergy', 'Data', 'dData',
       'ELV/HL', 'dELV/HL', 'I78', 'Protons', 'Product_Meta_State', 'Frame',
       'Reaction_Notation', 'Title', 'Year', 'Institute', 'Date', 'Reference',
       'Out', 'Neutrons', 'Mass_Number', 'Element', 'Origin', 'Mass_Excess',
       'dMass_Excess', 'Binding_Energy', 'dBinding_Energy', 'B_Decay_Energy',
       'dB_Decay_Energy', 'Atomic_Mass_Micro', 'dAtomic_Mass_Micro', 'S(2n)',
       'dS(2n)', 'S(2p)', 'dS(2p)', 'Q(a)', 'dQ(a)', 'Q(2B-)', 'dQ(2B-)',
       'Q(ep)', 'dQ(ep)', 'Q(B-n)', 'dQ(B-n)', 'S(n)', 'dS(n)', 'S(p)',
       'dS(p)', 'Q(4B-)', 'dQ(4B-)', 'Q(d,a)', 'dQ(d,a)', 'Q(p,a)', 'dQ(p,a)',
       'Q(n,a)', 'dQ(n,a)', 'Q(g,p)', 'Q(g,n)', 'Q(g,pn)', 'Q(g,d)', 'Q(g,t)',
       'Q(g,He3)', 'Q(g,2p)', 'Q(g,2n)', 'Q(g,a)', 'Q(p,n)', 'Q(p,2p)',
       'Q(p,pn)', 'Q(p,d)', 'Q(p,2n)', 'Q(p,t)', 'Q(p,3He)', 'Q(n,2p)',
       'Q(n,np)', 'Q(n,d)', 'Q(n,2n)', 'Q(n,t)', 'Q(n,3He)', 'Q(d,t)',
       'Q(d,3He)', '

In [49]:
# Assuming Unknown values are ground state
df["Product_Meta_State"] = df["Product_Meta_State"].astype(str)
df["Product_Meta_State"] = df["Product_Meta_State"].replace(to_replace="?", value="G")

In [50]:
df["Element_w_A"] = df["Mass_Number"].astype(str) + df.Element

# Uncertainty Missing Values

The uncertainty is not given for every experiment. Missing values happen when they are not specified in the entries and are given in the respective paper, or are simply not given. In any case, it will be very tidius to go one by one finding uncertanties. For this, we take the mean of the current uncertanties and fill missing values using the mean uncertantity multiply times the energy values. 

**it would be better to assign mean uncertainty per facility, per author, or per dataset**

In [51]:
df.columns

Index(['Target_Meta_State', 'MT', 'Energy', 'dEnergy', 'Data', 'dData',
       'ELV/HL', 'dELV/HL', 'I78', 'Protons', 'Product_Meta_State', 'Frame',
       'Reaction_Notation', 'Title', 'Year', 'Institute', 'Date', 'Reference',
       'Out', 'Neutrons', 'Mass_Number', 'Element', 'Origin', 'Mass_Excess',
       'dMass_Excess', 'Binding_Energy', 'dBinding_Energy', 'B_Decay_Energy',
       'dB_Decay_Energy', 'Atomic_Mass_Micro', 'dAtomic_Mass_Micro', 'S(2n)',
       'dS(2n)', 'S(2p)', 'dS(2p)', 'Q(a)', 'dQ(a)', 'Q(2B-)', 'dQ(2B-)',
       'Q(ep)', 'dQ(ep)', 'Q(B-n)', 'dQ(B-n)', 'S(n)', 'dS(n)', 'S(p)',
       'dS(p)', 'Q(4B-)', 'dQ(4B-)', 'Q(d,a)', 'dQ(d,a)', 'Q(p,a)', 'dQ(p,a)',
       'Q(n,a)', 'dQ(n,a)', 'Q(g,p)', 'Q(g,n)', 'Q(g,pn)', 'Q(g,d)', 'Q(g,t)',
       'Q(g,He3)', 'Q(g,2p)', 'Q(g,2n)', 'Q(g,a)', 'Q(p,n)', 'Q(p,2p)',
       'Q(p,pn)', 'Q(p,d)', 'Q(p,2n)', 'Q(p,t)', 'Q(p,3He)', 'Q(n,2p)',
       'Q(n,np)', 'Q(n,d)', 'Q(n,2n)', 'Q(n,t)', 'Q(n,3He)', 'Q(d,t)',
       'Q(d,3He)', '

In [52]:
df.Reference = df.Reference.fillna(df["Title"])

In [53]:
df.drop(columns=["ELV/HL", "dELV/HL"], inplace=True)

In [54]:
df.columns[df.isna().any()].tolist()

['dEnergy', 'dData']

# Exploring Uncertainty

In [55]:
import matplotlib.pyplot as plt
import seaborn as sns

In [56]:
df["Uncertainty_E"] = df["dEnergy"]/df["Energy"]
df["Uncertainty_D"] = df["dData"]/df["Data"]

In [57]:
df_copy = df.copy()

In [244]:
df = df_copy.copy()

In [58]:
df[["Uncertainty_E", "Uncertainty_D"]].isna().sum()

Uncertainty_E    3915499
Uncertainty_D     828266
dtype: int64

In [59]:
df["Uncertainty_E"] = df.groupby("Institute")['Uncertainty_E'].transform(lambda x: x.fillna(x.mean()))
df["Uncertainty_D"] = df.groupby("Institute")['Uncertainty_D'].transform(lambda x: x.fillna(x.mean()))

In [60]:
df[["Uncertainty_E", "Uncertainty_D"]].isna().sum()

Uncertainty_E    90933
Uncertainty_D      943
dtype: int64

In [61]:
df["Uncertainty_E"] = df.groupby("Element_w_A")['Uncertainty_E'].transform(lambda x: x.fillna(x.mean()))
df["Uncertainty_D"] = df.groupby("Element_w_A")['Uncertainty_D'].transform(lambda x: x.fillna(x.mean()))

In [62]:
df[["Uncertainty_E", "Uncertainty_D"]].isna().sum()

Uncertainty_E    4
Uncertainty_D    0
dtype: int64

In [63]:
df.Uncertainty_E = df.Uncertainty_E.fillna(df.Uncertainty_E.mean())

In [64]:
df[["Uncertainty_E", "Uncertainty_D"]].isna().sum()

Uncertainty_E    0
Uncertainty_D    0
dtype: int64

In [65]:
df["Nuc_Radius_fm"] = 1.25 * np.power(df["Mass_Number"], 1/3)
df["Neut_Nuc_Rad_Ratio"] = 0.8 / df["Nuc_Radius_fm"]

In [66]:
df.shape

(4641257, 86)

In [67]:
df[["dEnergy", "dData"]].isna().sum()

dEnergy    3915499
dData       828266
dtype: int64

In [68]:
df.dEnergy = df.dEnergy.fillna(df.Energy * df.Uncertainty_E)
df.dData = df.dData.fillna(df.Data * df.Uncertainty_D)

In [69]:
df.Uncertainty_D = df.Uncertainty_D.replace(to_replace=np.inf, value=0)

In [70]:
df[["dEnergy", "dData"]].isna().sum()

dEnergy       0
dData      4079
dtype: int64

In [71]:
df.fillna(value=0, inplace=True)

In [72]:
df[["dEnergy", "dData"]].isna().sum()

dEnergy    0
dData      0
dtype: int64

# Renaming

In [73]:
df.columns

Index(['Target_Meta_State', 'MT', 'Energy', 'dEnergy', 'Data', 'dData', 'I78',
       'Protons', 'Product_Meta_State', 'Frame', 'Reaction_Notation', 'Title',
       'Year', 'Institute', 'Date', 'Reference', 'Out', 'Neutrons',
       'Mass_Number', 'Element', 'Origin', 'Mass_Excess', 'dMass_Excess',
       'Binding_Energy', 'dBinding_Energy', 'B_Decay_Energy',
       'dB_Decay_Energy', 'Atomic_Mass_Micro', 'dAtomic_Mass_Micro', 'S(2n)',
       'dS(2n)', 'S(2p)', 'dS(2p)', 'Q(a)', 'dQ(a)', 'Q(2B-)', 'dQ(2B-)',
       'Q(ep)', 'dQ(ep)', 'Q(B-n)', 'dQ(B-n)', 'S(n)', 'dS(n)', 'S(p)',
       'dS(p)', 'Q(4B-)', 'dQ(4B-)', 'Q(d,a)', 'dQ(d,a)', 'Q(p,a)', 'dQ(p,a)',
       'Q(n,a)', 'dQ(n,a)', 'Q(g,p)', 'Q(g,n)', 'Q(g,pn)', 'Q(g,d)', 'Q(g,t)',
       'Q(g,He3)', 'Q(g,2p)', 'Q(g,2n)', 'Q(g,a)', 'Q(p,n)', 'Q(p,2p)',
       'Q(p,pn)', 'Q(p,d)', 'Q(p,2n)', 'Q(p,t)', 'Q(p,3He)', 'Q(n,2p)',
       'Q(n,np)', 'Q(n,d)', 'Q(n,2n)', 'Q(n,t)', 'Q(n,3He)', 'Q(d,t)',
       'Q(d,3He)', 'Q(3He,t)', 'Q(3He,a)'

In [74]:
df = df.rename(columns={"Protons":"Target_Protons", "Neutrons":"Target_Neutrons", "Mass_Number":"Target_Mass_Number"})
df = df.drop(columns=["Uncertainty_D", "Uncertainty_E"])
df["Compound_Neutrons"] = df["Target_Neutrons"] + 1
df["Compound_Mass_Number"] = df["Target_Mass_Number"] + 1

In [75]:
df.to_csv("../ML_Data/working_xs_v1.csv", index=False)

# Unskewing Data Points (Energy)

In [76]:
skewed_cols = ["Energy", "dEnergy"]
for col in skewed_cols:
    log_values = np.log10(df[col].values)
    df[col] = log_values

  This is separate from the ipykernel package so we can avoid doing imports until


In [77]:
df["Out"].value_counts()

TOT       3266572
G          541400
F          533930
INL        174304
EL          37417
X           26852
P           17599
A           16856
2N          11759
ABS          7750
T            2094
NON          1291
SCT          1225
3N            601
D             391
N+P           246
T+A           141
N+A           140
4N            135
N+T           128
2N+A           98
HE3            62
2A             52
N+2A           42
P+A            42
2P             36
N+D            23
6N             18
5N             17
2N+P           11
7N             11
8N              6
N+D+A           5
T+2A            2
2N+P+A          1
Name: Out, dtype: int64

In [78]:
df.to_csv("../ML_Data/working_xs_v1_unsk.csv", index=False)