# Cleaning Cross Section Data Text File

Let us import the necessary modules.

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 50)

# Cleaning Data

Data contains whitespace and special characters that we need to deal with. Additionally, we see that some columns do not have values but they have a value: a string of spaces. Pandas does not recognizes them as NaN values so we have to manually take care of them. We will also drop the references the YY and the SubEntry Number. 

In [181]:
colnames = ["Prj", "Targ", "Target_Meta_State", "MF", "MT", "PXC",  "Energy",  "dEnergy",  "Data", \
            "dData",   "Cos/LO",   "dCos/LO",   "ELV/HL",  "dELV/HL", "I78", "Refer", "(YY)", "EntrySubP"]
df = pd.read_csv("../ML_Data/all_cross_sections_v1.txt", names=colnames, header=None, index_col=False, sep=";")

  interactivity=interactivity, compiler=compiler, result=result)


In [182]:
# make string version of original column
df['Targ'] = df['Targ'].astype(str)

# Making Sure all rows have the same number of values
max_length = 5
df.Targ = df.Targ.apply(lambda x: '0'*(max_length - len(x)) + x)

# Target feature is formated as ZZAAA
df['Z'] = df['Targ'].str[0:2].astype(int).fillna(0)
df['M'] = df['Targ'].str[2:5].astype(int).fillna(0)

# Calculating number of neutrons = mass number - protons
df['N'] = df['M'] - df["Z"]

We assume that `Target_Meta_State` with unknown values are not Ground State. Instead they are filled with `All` per IAEA instructions.

In [183]:
df["Target_Meta_State"].unique()

array([' ', 'M', '1', '2'], dtype=object)

In [184]:
# unmarked rows are assumed to be at  ground state so we assign G value
df["Target_Meta_State"] = df["Target_Meta_State"].replace(to_replace=" ", value="All")

We assume that the `Frame` feature unknown values are `L` for Lab Frame and `Product_Meta_State`'s missing values are `G` for Ground State.

In [185]:
# PXC describes three different variables
# We extract the data to independent features
df['Product_Meta_State'] = df['PXC'].astype(str).str[0:1].replace(to_replace=" ", value="All")
df['EXFOR_Status'] = df['PXC'].astype(str).str[1:2].replace(to_replace=" ", value="Other")
df['Frame'] = df['PXC'].astype(str).str[2:3].replace(to_replace=" ", value="L")

In [187]:
df["(YY)"] = df["(YY)"].astype(str) 
df["EntrySubP"] = df["(YY)"]

# Fixing numerical features formatting.

In [188]:
# Defining Numerical Columns to Fix and casting them as strings
cols = ["Energy", "dEnergy", "Data", "dData", "Cos/LO", "dCos/LO", "ELV/HL", "dELV/HL"]
df[cols] = df[cols].astype(str)

In [189]:
# df[cols] = df[cols].replace(to_replace="         ", value="0.0000000")
df[cols] = df[cols].replace(to_replace="         ", value=np.nan)

# We now strip values that may contain quatation marks and starting and trailing spaces
for col in cols:
    df[col] = df[col].str.strip("\"")
    df[col] = df[col].str.strip()
    
# df[cols] = df[cols].replace(to_replace="", value="0.0000000")
df[cols] = df[cols].replace(to_replace="", value=np.nan)

In [190]:
# For the numerical values we know per formatting that each of them should be 9 characters in length
max_length = 9

for col in cols:
    df[col] = df[col].apply(lambda x: x if pd.isnull(x) else ' '*(max_length - len(x)) + x) 

In [191]:
# Add appropiate formating for python to recognize it as numerical 
for col in cols:
    new_col = []
    values = df[col].values
    for x in values:
        if pd.isnull(x):
            new_col.append(x)
        elif "+" == x[7]:
            y = x[0:7]
            z = x[7:]
            new_col.append(y + "E" + z)
        elif "+" == x[6]:
            y = x[0:6]
            z = x[6:]
            new_col.append(y + "E" + z)
        elif "-" == x[7]:
            y = x[0:7]
            z = x[7:]
            new_col.append(y + "E" + z)
        elif "-" == x[6]:
            y = x[0:6]
            z = x[6:]
            new_col.append(y + "E" + z)
        else:
            new_col.append(x)
    df[col] = new_col

In [192]:
# We now convert the columns to numerical
for col in cols:
    df[col] = df[col].astype(float)
    print("Finish converting {} to float.".format(col))

Finish converting Energy to float.
Finish converting dEnergy to float.
Finish converting Data to float.
Finish converting dData to float.
Finish converting Cos/LO to float.
Finish converting dCos/LO to float.
Finish converting ELV/HL to float.
Finish converting dELV/HL to float.


# Specifying Categorical Columns

In [193]:
cat_cols = ["Target_Meta_State", "MF", "MT", "I78", "Product_Meta_State", "Frame"]

# Convering all columns to strings and stripping whitespace
for col in cat_cols:
    df[col] = df[col].astype(str)
    df[col] = df[col].str.strip("\"")
    df[col] = df[col].str.strip()

In [194]:
df.I78.unique()

array(['', 'L', 'E', 'D'], dtype=object)

In [195]:
# Replace empty values in I78 for L representing Low
df["I78"] = df["I78"].replace(to_replace="", value="Other")

In [196]:
df.drop(columns=["(YY)", 'Targ', "PXC"], inplace=True)

# Exporting Cleaned Data

In [99]:
df.to_csv("../ML_Data/working_xs.csv", index=False)

# Appending Additional Information from EXFOR

In [197]:
dirpath = "EXFOR/Extracted_Text/"

In [198]:
# Reading experiments reaction notation 
df1 = pd.read_csv(dirpath + "reaction_notation.txt", delim_whitespace=True, header=None)
df1.columns = ["Reaction", "Type"]

# Reading Experiment Titles
df2 = pd.read_csv(dirpath + "titles.txt", sep="#TITLE      ", header=None, engine="python")
df2.columns = ["Keyword", "Title"]

# Reading Data Points per Experiment
df3 = pd.read_csv(dirpath + "data_points_per_experiment_refined.txt",  delim_whitespace=True, header=None)
df3.columns = ["Data", "Multiple"]

# Reading Experiment Year 
df4 = pd.read_csv(dirpath + "years.txt", delim_whitespace=True, header=None)
df4.columns = ["Keyword", "Year"]

# Reading Experiment Date 
df5 = pd.read_csv(dirpath + "authors.txt", sep="    ", header=None, engine="python")
df5.columns = ["Keyword", "Author"]

# Reading Experiment Institute 
df6 = pd.read_csv(dirpath + "institude.txt", sep="  ", header=None, engine="python")
df6.columns = ["Keyword", "Institute"]

# Reading Experiment Year 
df7 = pd.read_csv(dirpath + "dates.txt", delim_whitespace=True, header=None)
df7.columns = ["Keyword", "Date"]

# Reading Experiment Refere
df8 = pd.read_csv(dirpath + "references.txt", sep="#REFERENCE  ", header=None, engine="python")
df8.columns = ["Keyword", "Reference"]

In [199]:
# Merging Datapoints, notation and titles and expanding based on datapoints
pre_final = pd.concat([df3, df1, df2, df4, df5, df6, df7, df8], axis=1)
final = pre_final.reindex(pre_final.index.repeat(pre_final.Multiple))
final['position'] = final.groupby(level=0).cumcount() + 1

# Extracting projectile and outogoing particle
final["reaction_notation"] = final.Type.str.extract('.*\((.*)\).*')

In [200]:
final["reaction_notation2"] = final["reaction_notation"].apply(lambda x: x.split(')')[0])
final = pd.concat([final, final["reaction_notation2"].str.split(',', expand=True)], axis=1)

In [201]:
# Formatting Columns
new_columns = list(final.columns)[:19]
new_columns.extend(["Projectile", "Out"])
final.columns = new_columns

In [202]:
# Indexing only required information and saving file
final = final[["Type", "Title", "Year", "Institute", "Author", "Date", "Reference", "Out"]]

# Verify all data matches.
df.shape[0] == final.shape[0]

# Reset Indexes to make copying faster
df = df.reset_index(drop=True)
final = final.reset_index(drop=True)

In [203]:
final.head()

Unnamed: 0,Type,Title,Year,Institute,Author,Date,Reference,Out
0,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,(1USAHRV),D.F.Measday+,19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT
1,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,(1USAHRV),D.F.Measday+,19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT
2,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,(1USAHRV),D.F.Measday+,19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT
3,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,(1USAHRV),D.F.Measday+,19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT
4,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,(1USAHRV),D.F.Measday+,19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT


In [204]:
# Assign newly extracted data to main dataframe
df["Type"] = final["Type"]
df["Title"] = final["Title"]
df["Year"] = final["Year"]
df["Author"] = final["Author"]
df["Institute"] = final["Institute"]
df["Date"] = final["Date"]
df["Reference"] = final["Reference"]
df["Out"] = final["Out"]

In [205]:
df.head()

Unnamed: 0,Prj,Target_Meta_State,MF,MT,Energy,dEnergy,Data,dData,Cos/LO,dCos/LO,ELV/HL,dELV/HL,I78,Refer,EntrySubP,Z,M,N,Product_Meta_State,EXFOR_Status,Frame,Type,Title,Year,Author,Institute,Date,Reference,Out
0,1,All,3,1,88200000.0,882000.0,0.03,0.001523,,,,,Other,"D.F.MEASDAY,ET.AL. (66)",11152,0,1,1,All,D,L,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,D.F.Measday+,(1USAHRV),19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT
1,1,All,3,1,98100000.0,981000.0,0.0291,0.001516,,,,,Other,"D.F.MEASDAY,ET.AL. (66)",11152,0,1,1,All,D,L,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,D.F.Measday+,(1USAHRV),19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT
2,1,All,3,1,110000000.0,1100000.0,0.0279,0.001415,,,,,Other,"D.F.MEASDAY,ET.AL. (66)",11152,0,1,1,All,D,L,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,D.F.Measday+,(1USAHRV),19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT
3,1,All,3,1,119600000.0,1196000.0,0.0264,0.001403,,,,,Other,"D.F.MEASDAY,ET.AL. (66)",11152,0,1,1,All,D,L,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,D.F.Measday+,(1USAHRV),19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT
4,1,All,3,1,129400000.0,1294000.0,0.0256,0.001397,,,,,Other,"D.F.MEASDAY,ET.AL. (66)",11152,0,1,1,All,D,L,"0-NN-1(N,TOT),,SIG","NEUTRON TOTAL CROSS SECTIONS FOR NEUTRONS, PRO...",1966,D.F.Measday+,(1USAHRV),19800804,"Jour. Nuclear Physics Vol.85, p.142, 1966",TOT


Shape must be 6007126

In [206]:
df.Title = df.Title.fillna("No Title")
# df = df[df.N != -1]

In [207]:
df.shape

(6007126, 29)

In [114]:
# Save Dataframe
df.to_csv("../ML_Data/working_xs.csv", index=False)

# Merging EXFOR and AME Data

In [208]:
df_workxs = df.copy()
# df_workxs = pd.read_csv("../ML_Data/working_xs.csv")

In [209]:
df_workxs.columns

Index(['Prj', 'Target_Meta_State', 'MF', 'MT', 'Energy', 'dEnergy', 'Data',
       'dData', 'Cos/LO', 'dCos/LO', 'ELV/HL', 'dELV/HL', 'I78', 'Refer',
       'EntrySubP', 'Z', 'M', 'N', 'Product_Meta_State', 'EXFOR_Status',
       'Frame', 'Type', 'Title', 'Year', 'Author', 'Institute', 'Date',
       'Reference', 'Out'],
      dtype='object')

In [210]:
masses = pd.read_csv("./AME_Files/AME_Isotopic_Properties.csv").rename(
    columns={'N': 'Neutrons', 'A': 'Mass_Number', 'Neutrons':'N', 'Mass_Number':'M'})
masses.head()

Unnamed: 0,Neutrons,Z,Mass_Number,EL,O,Mass_Excess,dMass_Excess,Binding_Energy,dBinding_Energy,B_Decay_Energy,dB_Decay_Energy,Atomic_Mass_Micro,dAtomic_Mass_Micro,S(2n),dS(2n),S(2p),dS(2p),Q(a),dQ(a),Q(2B-),dQ(2B-),Q(ep),dQ(ep),Q(B-n),dQ(B-n),S(n),dS(n),S(p),dS(p),Q(4B-),dQ(4B-),"Q(d,a)","dQ(d,a)","Q(p,a)","dQ(p,a)","Q(n,a)","dQ(n,a)","Q(g,p)","Q(g,n)","Q(g,pn)","Q(g,d)","Q(g,t)","Q(g,He3)","Q(g,2p)","Q(g,2n)","Q(g,a)","Q(p,n)","Q(p,2p)","Q(p,pn)","Q(p,d)","Q(p,2n)","Q(p,t)","Q(p,3He)","Q(n,2p)","Q(n,np)","Q(n,d)","Q(n,2n)","Q(n,t)","Q(n,3He)","Q(d,t)","Q(d,3He)","Q(3He,t)","Q(3He,a)","Q(t,a)",N,M,Flag
0,1,0,1,n,Other,8071.31713,0.00046,0.0,0.0,782.347,0.0,1008665.0,0.00049,15404.483723,162.252148,13771.880283,158.524008,-1125.3436,142.081942,-232.1475,160.8227,-6859.135662,158.184195,-7754.629285,161.152354,0.0,0.0,6889.086305,162.896488,-343.812798,175.128817,11405.545508,176.651049,5916.436032,171.776354,6730.114706,163.516658,-6889.086305,-0.0,-14665.548392,-12440.982392,-13897.428868,-13847.504694,-13771.880283,-15404.483723,-1125.3436,0.0005,-6889.086305,-0.0,2224.566,-8536.975785,-6922.688823,-6947.507992,-6076.789162,-6889.086305,-4664.520305,-0.0,-6183.753392,-6053.839883,6257.229,-1395.611905,763.755,20577.6194,12924.778595,1,1,I
1,0,1,1,H,Other,7288.97061,9e-05,0.0,0.0,18244.328,289.9558,1007825.0,9e-05,2025.412,292.506,13771.880283,158.524008,-1125.3436,142.081942,13762.268,719.024,-6859.135662,158.184195,17514.9925,362.0875,1096.973333,256.595,0.0,0.0,8007.5,1511.5,20717.915,0.0,20613.86,50.0,6730.114706,163.516658,-0.0,-1096.973333,-5353.1789,-3128.6129,799.9951,-13847.504694,-13771.880283,-2025.412,-1125.3436,17461.9815,-0.0,-1096.973333,1127.592667,16732.646,6456.3829,2364.8615,-6076.789162,-0.0,2224.566,-1096.973333,3128.6161,-6053.839883,5160.255667,5493.4744,18225.736,19480.646067,19813.8649,0,1,I
2,1,1,2,H,Other,13135.72176,0.00011,1112.283,0.0,18244.328,289.9558,2014102.0,0.00012,2025.412,292.506,13771.880283,158.524008,-1125.3436,142.081942,13762.268,719.024,-6859.135662,158.184195,17514.9925,362.0875,2224.57,0.0,2224.57,0.0,8007.5,1511.5,23846.53,0.0,20613.86,50.0,6730.114706,163.516658,-2224.57,-2224.57,-2224.5639,0.0021,799.9951,-13847.504694,-13771.880283,-2025.412,-1125.3436,17461.9815,-2224.57,-2224.57,-0.004,16732.646,6456.3829,5493.4765,-6076.789162,-2224.57,-0.004,-2224.57,6257.2311,-6053.839883,4032.659,3268.9044,18225.736,18353.0494,17589.2949,1,2,I
3,2,1,3,H,Other,14949.80993,0.00022,2827.265,0.0,18.592,0.0,3016049.0,0.00023,8481.79,0.0,13771.880283,158.524008,-1125.3436,142.081942,-13717.0,2000.0,-6859.135662,158.184195,17514.9925,362.0875,6257.23,0.0,1112.285,0.0,8007.5,1511.5,17589.3,0.0,19813.86,0.0,6730.114706,163.516658,-1112.285,-6257.23,-8481.7939,-6257.2279,-0.0049,-13847.504694,-13771.880283,-8481.79,-1125.3436,-763.7545,-1112.285,-6257.23,-4032.664,16732.646,0.0049,-763.7535,-6076.789162,-1112.285,1112.281,-6257.23,0.0011,-6053.839883,-0.001,4381.1894,0.0,14320.3894,18701.5799,2,3,I
4,1,2,3,He,Other,14931.21793,0.00021,2572.68,0.0,-13736.0,2000.0,3016029.0,0.00022,4013.16,30.3,7718.04,0.0,367.5,10.0,12743.205,359.296667,-6859.135662,158.184195,-2571.224286,344.341429,3176.184286,29.417143,5493.47,0.0,14022.91,52.653333,18353.05,0.0,4173.344286,210.882857,20577.62,0.0,-5493.47,-3176.184286,-7718.0439,-5493.4779,-15640.520614,0.0006,-7718.04,-4013.16,367.5,-14518.3465,-5493.47,-3176.184286,-951.618286,-3353.570786,4468.6349,-0.0035,-6076.789162,-5493.47,-3268.904,-3176.184286,763.7511,0.0004,3081.044714,0.0044,-13754.592,17401.435114,14320.3949,1,3,I


In [211]:
df_workxs = df_workxs.reset_index(drop=True)
masses = masses.reset_index(drop=True)

In [212]:
df_workxs.shape

(6007126, 29)

In [213]:
df = df_workxs.merge(masses, on=['N', 'Z'], how='left')

In [214]:
df.shape

(6007126, 94)

# Neutron Induced Cross Section vs Energy Data 

MF are ENDF labels and are used to store different types of data:

- MF=1 contains descriptive and miscellaneous data,
- MF=2 contains resonance parameter data,
- MF=3 contains reaction cross sections vs energy,
- MF=4 contains angular distributions,
- MF=5 contains energy distributions,
- MF=6 contains energy-angle distributions,
- MF=7 contains thermal scattering data,
- MF=8 contains radioactivity data
- MF=9-10 contain nuclide production data,
- MF=12-15 contain photon production data, and
- MF=30-36 contain covariance data.

In [215]:
df.MF = df.MF.astype(str)
df.MT = df.MT.astype(int)

In [216]:
df = df[df["MF"] == "3"]

In [217]:
df = df[df["MT"] < 999] # Cross Section Ratios

In [218]:
df.shape

(4534386, 94)

In [219]:
df.columns

Index(['Prj', 'Target_Meta_State', 'MF', 'MT', 'Energy', 'dEnergy', 'Data',
       'dData', 'Cos/LO', 'dCos/LO', 'ELV/HL', 'dELV/HL', 'I78', 'Refer',
       'EntrySubP', 'Z', 'M_x', 'N', 'Product_Meta_State', 'EXFOR_Status',
       'Frame', 'Type', 'Title', 'Year', 'Author', 'Institute', 'Date',
       'Reference', 'Out', 'Neutrons', 'Mass_Number', 'EL', 'O', 'Mass_Excess',
       'dMass_Excess', 'Binding_Energy', 'dBinding_Energy', 'B_Decay_Energy',
       'dB_Decay_Energy', 'Atomic_Mass_Micro', 'dAtomic_Mass_Micro', 'S(2n)',
       'dS(2n)', 'S(2p)', 'dS(2p)', 'Q(a)', 'dQ(a)', 'Q(2B-)', 'dQ(2B-)',
       'Q(ep)', 'dQ(ep)', 'Q(B-n)', 'dQ(B-n)', 'S(n)', 'dS(n)', 'S(p)',
       'dS(p)', 'Q(4B-)', 'dQ(4B-)', 'Q(d,a)', 'dQ(d,a)', 'Q(p,a)', 'dQ(p,a)',
       'Q(n,a)', 'dQ(n,a)', 'Q(g,p)', 'Q(g,n)', 'Q(g,pn)', 'Q(g,d)', 'Q(g,t)',
       'Q(g,He3)', 'Q(g,2p)', 'Q(g,2n)', 'Q(g,a)', 'Q(p,n)', 'Q(p,2p)',
       'Q(p,pn)', 'Q(p,d)', 'Q(p,2n)', 'Q(p,t)', 'Q(p,3He)', 'Q(n,2p)',
       'Q(n,np)', '

In [220]:
columns_drop = ["MF", "Cos/LO", "dCos/LO", "Prj", "M_x", "M_y", "N"]
df = df.drop(columns=columns_drop)

In [221]:
df["O"].fillna(value="Other", inplace=True)

In [222]:
df_copy = df.copy()

In [223]:
df.shape

(4534386, 87)

In [224]:
# df = df[~df.Neutrons.isna()]

# Checkpoint

In [138]:
df.to_csv("../ML_Data/working_xs.csv", index=False)

# Exploring Missing Values

In [225]:
# We get rid of heavy water measurments
df = df[~df.EL.isna()]

In [226]:
df["Neutrons"] = df["Neutrons"].astype(int)
df["Mass_Number"] = df["Mass_Number"].astype(int)

In [227]:
df["Reference"] = df["Author"] + " " + df["Reference"]
df = df.drop(columns=["Refer", "Author"])

In [228]:
df.columns[df.isna().any()].tolist()

['dEnergy', 'dData', 'ELV/HL', 'dELV/HL', 'Reference']

In [229]:
df = df.rename(columns={"Z":"Protons", "EL":"Element", "O":"Origin", "Type":"Reaction_Notation"})

In [230]:
df.columns

Index(['Target_Meta_State', 'MT', 'Energy', 'dEnergy', 'Data', 'dData',
       'ELV/HL', 'dELV/HL', 'I78', 'EntrySubP', 'Protons',
       'Product_Meta_State', 'EXFOR_Status', 'Frame', 'Reaction_Notation',
       'Title', 'Year', 'Institute', 'Date', 'Reference', 'Out', 'Neutrons',
       'Mass_Number', 'Element', 'Origin', 'Mass_Excess', 'dMass_Excess',
       'Binding_Energy', 'dBinding_Energy', 'B_Decay_Energy',
       'dB_Decay_Energy', 'Atomic_Mass_Micro', 'dAtomic_Mass_Micro', 'S(2n)',
       'dS(2n)', 'S(2p)', 'dS(2p)', 'Q(a)', 'dQ(a)', 'Q(2B-)', 'dQ(2B-)',
       'Q(ep)', 'dQ(ep)', 'Q(B-n)', 'dQ(B-n)', 'S(n)', 'dS(n)', 'S(p)',
       'dS(p)', 'Q(4B-)', 'dQ(4B-)', 'Q(d,a)', 'dQ(d,a)', 'Q(p,a)', 'dQ(p,a)',
       'Q(n,a)', 'dQ(n,a)', 'Q(g,p)', 'Q(g,n)', 'Q(g,pn)', 'Q(g,d)', 'Q(g,t)',
       'Q(g,He3)', 'Q(g,2p)', 'Q(g,2n)', 'Q(g,a)', 'Q(p,n)', 'Q(p,2p)',
       'Q(p,pn)', 'Q(p,d)', 'Q(p,2n)', 'Q(p,t)', 'Q(p,3He)', 'Q(n,2p)',
       'Q(n,np)', 'Q(n,d)', 'Q(n,2n)', 'Q(n,t)', 'Q(n,3

In [231]:
# # Assuming Unknown values are ground state
# df["Product_Meta_State"] = df["Product_Meta_State"].astype(str)
# df["Product_Meta_State"] = df["Product_Meta_State"].replace(to_replace="?", value="G")

In [232]:
df["Element_w_A"] = df["Mass_Number"].astype(str) + df.Element

# Uncertainty Missing Values

The uncertainty is not given for every experiment. Missing values happen when they are not specified in the entries and are given in the respective paper, or are simply not given. In any case, it will be very tidius to go one by one finding uncertanties. For this, we take the mean of the current uncertanties and fill missing values using the mean uncertantity multiply times the energy values. 

**it would be better to assign mean uncertainty per facility, per author, or per dataset**

In [233]:
df.columns

Index(['Target_Meta_State', 'MT', 'Energy', 'dEnergy', 'Data', 'dData',
       'ELV/HL', 'dELV/HL', 'I78', 'EntrySubP', 'Protons',
       'Product_Meta_State', 'EXFOR_Status', 'Frame', 'Reaction_Notation',
       'Title', 'Year', 'Institute', 'Date', 'Reference', 'Out', 'Neutrons',
       'Mass_Number', 'Element', 'Origin', 'Mass_Excess', 'dMass_Excess',
       'Binding_Energy', 'dBinding_Energy', 'B_Decay_Energy',
       'dB_Decay_Energy', 'Atomic_Mass_Micro', 'dAtomic_Mass_Micro', 'S(2n)',
       'dS(2n)', 'S(2p)', 'dS(2p)', 'Q(a)', 'dQ(a)', 'Q(2B-)', 'dQ(2B-)',
       'Q(ep)', 'dQ(ep)', 'Q(B-n)', 'dQ(B-n)', 'S(n)', 'dS(n)', 'S(p)',
       'dS(p)', 'Q(4B-)', 'dQ(4B-)', 'Q(d,a)', 'dQ(d,a)', 'Q(p,a)', 'dQ(p,a)',
       'Q(n,a)', 'dQ(n,a)', 'Q(g,p)', 'Q(g,n)', 'Q(g,pn)', 'Q(g,d)', 'Q(g,t)',
       'Q(g,He3)', 'Q(g,2p)', 'Q(g,2n)', 'Q(g,a)', 'Q(p,n)', 'Q(p,2p)',
       'Q(p,pn)', 'Q(p,d)', 'Q(p,2n)', 'Q(p,t)', 'Q(p,3He)', 'Q(n,2p)',
       'Q(n,np)', 'Q(n,d)', 'Q(n,2n)', 'Q(n,t)', 'Q(n,3

In [234]:
df.Reference = df.Reference.fillna(df["Title"])

In [235]:
df.columns[df.isna().any()].tolist()

['dEnergy', 'dData', 'ELV/HL', 'dELV/HL']

# Checkpoint

In [236]:
df.to_csv("../ML_Data/working_xs.csv", index=False)

# Exploring Uncertainty

In [492]:
df = pd.read_csv("../ML_Data/working_xs.csv")

In [494]:
# missing_uncertanties_institute = df[["Institute","dEnergy"]].drop('Institute', 1).isna().groupby(df.Institute, sort=False).sum().reset_index()
# missing_uncertanties_institute = missing_uncertanties_institute[missing_uncertanties_institute.dEnergy > 0]
# missing_uncertanties_institute = missing_uncertanties_institute.sort_values('dEnergy', ascending=False)

# missing_uncertanties_reference = df[["Institute","dEnergy"]].drop('Institute', 1).isna().groupby(df.Institute, sort=False).sum().reset_index()
# missing_uncertanties_reference = missing_uncertanties_reference[missing_uncertanties_reference.dEnergy > 0]
# missing_uncertanties_reference = missing_uncertanties_reference.sort_values('dEnergy', ascending=False)

# missing_uncertanties_reference.to_csv("./Extracted_Text/missing_unc_ref.csv", index=False)
# missing_uncertanties_institute.to_csv("./Extracted_Text/missing_unc_ins.csv", index=False)

In [237]:
df["Uncertainty_E"] = df["dEnergy"]/df["Energy"]
df["Uncertainty_D"] = df["dData"]/df["Data"]
df["Uncertainty_ELV"] = df["dELV/HL"]/df["ELV/HL"]

In [270]:
df_copy = df.copy()

In [269]:
df = df_copy.copy()

In [271]:
df[["Uncertainty_E", "Uncertainty_D", "Uncertainty_ELV"]].isna().sum()

Uncertainty_E      3885139
Uncertainty_D       822976
Uncertainty_ELV    4528759
dtype: int64

### Fill by Reaction Channel

In [272]:
df["Uncertainty_E"] = df[["MT", "Uncertainty_E"]].groupby("MT").transform(lambda x: x.fillna(x.mean()))
df["Uncertainty_D"] = df[["MT", "Uncertainty_D"]].groupby("MT").transform(lambda x: x.fillna(x.mean()))
df["Uncertainty_ELV"] = df[["MT", "Uncertainty_ELV"]].groupby("MT").transform(lambda x: x.fillna(x.mean()))

In [273]:
df[["Uncertainty_E", "Uncertainty_D", "Uncertainty_ELV"]].isna().sum()

Uncertainty_E            6
Uncertainty_D            0
Uncertainty_ELV    3931364
dtype: int64

### Fill by Institute

In [274]:
df["Uncertainty_E"] = df[["Institute", "Uncertainty_E"]].groupby("Institute").transform(lambda x: x.fillna(x.mean()))
df["Uncertainty_D"] = df[["Institute", "Uncertainty_D"]].groupby("Institute").transform(lambda x: x.fillna(x.mean()))
df["Uncertainty_ELV"] = df[["Institute", "Uncertainty_ELV"]].groupby("Institute").transform(lambda x: x.fillna(x.mean()))

In [275]:
df[["Uncertainty_E", "Uncertainty_D", "Uncertainty_ELV"]].isna().sum()

Uncertainty_E           0
Uncertainty_D           0
Uncertainty_ELV    116872
dtype: int64

### Fill by Isotope

In [276]:
df["Uncertainty_E"] = df[["Element_w_A", "Uncertainty_E"]].groupby("Element_w_A").transform(lambda x: x.fillna(x.mean()))
df["Uncertainty_D"] = df[["Element_w_A", "Uncertainty_D"]].groupby("Element_w_A").transform(lambda x: x.fillna(x.mean()))
df["Uncertainty_ELV"] = df[["Element_w_A", "Uncertainty_ELV"]].groupby("Element_w_A").transform(lambda x: x.fillna(x.mean()))

In [277]:
df[["Uncertainty_E", "Uncertainty_D", "Uncertainty_ELV"]].isna().sum()

Uncertainty_E      0
Uncertainty_D      0
Uncertainty_ELV    8
dtype: int64

In [278]:
df["Uncertainty_ELV"] = df[["I78", "Uncertainty_ELV"]].groupby("I78").transform(lambda x: x.fillna(x.mean()))

In [279]:
df[["Uncertainty_E", "Uncertainty_D", "Uncertainty_ELV"]].isna().sum()

Uncertainty_E      0
Uncertainty_D      0
Uncertainty_ELV    0
dtype: int64

In [280]:
df["Nuc_Radius_fm"] = 1.25 * np.power(df["Mass_Number"], 1/3)
df["Neut_Nuc_Rad_Ratio"] = 0.8 / df["Nuc_Radius_fm"]

In [281]:
df.shape

(4533499, 91)

### Having Filled Uncertainty Fraction Values let us fill the actual Uncertainties

In [282]:
df[["dEnergy", "dData", "dELV/HL"]].isna().sum()

dEnergy    3885139
dData       822976
dELV/HL    4528690
dtype: int64

In [283]:
df.dEnergy = df.dEnergy.fillna(df.Energy * df.Uncertainty_E)
df.dData = df.dData.fillna(df.Data * df.Uncertainty_D)
df["dELV/HL"] = df["dELV/HL"].fillna(df["ELV/HL"] * df["Uncertainty_ELV"])

In [284]:
df.Uncertainty_D = df.Uncertainty_D.replace(to_replace=np.inf, value=0)

In [285]:
df.dData = df.dData.replace(to_replace=np.nan, value=0)
df["dELV/HL"] = df["dELV/HL"].replace(to_replace=np.nan, value=0)

In [286]:
df[["dEnergy", "dData", "dELV/HL"]].isna().sum()

dEnergy    0
dData      0
dELV/HL    0
dtype: int64

In [287]:
df["ELV/HL"] = df["ELV/HL"].replace(to_replace=np.nan, value=0)

In [288]:
df.fillna(value=0, inplace=True)

In [289]:
df.columns

Index(['Target_Meta_State', 'MT', 'Energy', 'dEnergy', 'Data', 'dData',
       'ELV/HL', 'dELV/HL', 'I78', 'EntrySubP', 'Protons',
       'Product_Meta_State', 'EXFOR_Status', 'Frame', 'Reaction_Notation',
       'Title', 'Year', 'Institute', 'Date', 'Reference', 'Out', 'Neutrons',
       'Mass_Number', 'Element', 'Origin', 'Mass_Excess', 'dMass_Excess',
       'Binding_Energy', 'dBinding_Energy', 'B_Decay_Energy',
       'dB_Decay_Energy', 'Atomic_Mass_Micro', 'dAtomic_Mass_Micro', 'S(2n)',
       'dS(2n)', 'S(2p)', 'dS(2p)', 'Q(a)', 'dQ(a)', 'Q(2B-)', 'dQ(2B-)',
       'Q(ep)', 'dQ(ep)', 'Q(B-n)', 'dQ(B-n)', 'S(n)', 'dS(n)', 'S(p)',
       'dS(p)', 'Q(4B-)', 'dQ(4B-)', 'Q(d,a)', 'dQ(d,a)', 'Q(p,a)', 'dQ(p,a)',
       'Q(n,a)', 'dQ(n,a)', 'Q(g,p)', 'Q(g,n)', 'Q(g,pn)', 'Q(g,d)', 'Q(g,t)',
       'Q(g,He3)', 'Q(g,2p)', 'Q(g,2n)', 'Q(g,a)', 'Q(p,n)', 'Q(p,2p)',
       'Q(p,pn)', 'Q(p,d)', 'Q(p,2n)', 'Q(p,t)', 'Q(p,3He)', 'Q(n,2p)',
       'Q(n,np)', 'Q(n,d)', 'Q(n,2n)', 'Q(n,t)', 'Q(n,3

In [290]:
# df[df.Reaction_Notation.str.contains("RAW")]

# Ordering and Renaming

In [291]:
# Use this for ordering
new_order = list(df.columns)[:24]
new_order_2 = list(df.columns)[-7:]
new_order.extend(new_order_2)
nuclear_data_target = list(df.columns)[24:-7]
new_order.extend(nuclear_data_target)

# use these for renaming
nuclear_data_target_cols = ["Target_" + s for s in nuclear_data_target]

In [292]:
df = df[new_order]

In [293]:
df.columns

Index(['Target_Meta_State', 'MT', 'Energy', 'dEnergy', 'Data', 'dData',
       'ELV/HL', 'dELV/HL', 'I78', 'EntrySubP', 'Protons',
       'Product_Meta_State', 'EXFOR_Status', 'Frame', 'Reaction_Notation',
       'Title', 'Year', 'Institute', 'Date', 'Reference', 'Out', 'Neutrons',
       'Mass_Number', 'Element', 'Flag', 'Element_w_A', 'Uncertainty_E',
       'Uncertainty_D', 'Uncertainty_ELV', 'Nuc_Radius_fm',
       'Neut_Nuc_Rad_Ratio', 'Origin', 'Mass_Excess', 'dMass_Excess',
       'Binding_Energy', 'dBinding_Energy', 'B_Decay_Energy',
       'dB_Decay_Energy', 'Atomic_Mass_Micro', 'dAtomic_Mass_Micro', 'S(2n)',
       'dS(2n)', 'S(2p)', 'dS(2p)', 'Q(a)', 'dQ(a)', 'Q(2B-)', 'dQ(2B-)',
       'Q(ep)', 'dQ(ep)', 'Q(B-n)', 'dQ(B-n)', 'S(n)', 'dS(n)', 'S(p)',
       'dS(p)', 'Q(4B-)', 'dQ(4B-)', 'Q(d,a)', 'dQ(d,a)', 'Q(p,a)', 'dQ(p,a)',
       'Q(n,a)', 'dQ(n,a)', 'Q(g,p)', 'Q(g,n)', 'Q(g,pn)', 'Q(g,d)', 'Q(g,t)',
       'Q(g,He3)', 'Q(g,2p)', 'Q(g,2n)', 'Q(g,a)', 'Q(p,n)', 'Q(p,2p)'

In [294]:
df = df.rename(columns={"Protons":"Target_Protons", "Neutrons":"Target_Neutrons", 
                        "Mass_Number":"Target_Mass_Number", "Element":"Target_Element", 
                        "Flag": "Target_Flag", "Nuc_Radius_fm":"Target_Radius", 
                        "Neut_Nuc_Rad_Ratio":"Target_Neut_Rad_Ratio", "Element_w_A":"Target_Element_w_A"})
df = df.drop(columns=["Uncertainty_D", "Uncertainty_E", "Uncertainty_ELV"])

In [298]:
new_order = list(df.columns)[:28]
nuclear_data_target = list(df.columns)[28:]
nuclear_data_target_cols = ["Target_" + s for s in nuclear_data_target]
new_order.extend(nuclear_data_target_cols)

In [299]:
df.columns = new_order

In [300]:
df.to_csv("../ML_Data/working_xs_v1.csv", index=False)

# Adding Compound Nucleus Info

In [387]:
df = pd.read_csv("../ML_Data/working_xs_v1.csv")

In [301]:
df["Compound_Neutrons"] = df.Target_Neutrons + 1
df["Compound_Mass_Number"] = df.Target_Mass_Number + 1
df["Compound_Protons"] = df.Target_Protons

In [302]:
df_copy = df.copy()

In [303]:
masses = pd.read_csv("./AME_Files/AME_Isotopic_Properties.csv")
masses = masses[masses.Flag == "I"]
masses = masses.drop(columns=["Neutrons", "Mass_Number", "Flag"])
masses = masses.rename(columns={'N': 'Neutrons', 'A': 'Mass_Number', "Z":"Protons", "O":"Origin"})

In [304]:
nuclear_data_compound = list(masses.columns)
nuclear_data_compound_cols = ["Compound_" + s for s in nuclear_data_compound]

In [305]:
masses.columns = nuclear_data_compound_cols

In [306]:
masses.head()

Unnamed: 0,Compound_Neutrons,Compound_Protons,Compound_Mass_Number,Compound_EL,Compound_Origin,Compound_Mass_Excess,Compound_dMass_Excess,Compound_Binding_Energy,Compound_dBinding_Energy,Compound_B_Decay_Energy,Compound_dB_Decay_Energy,Compound_Atomic_Mass_Micro,Compound_dAtomic_Mass_Micro,Compound_S(2n),Compound_dS(2n),Compound_S(2p),Compound_dS(2p),Compound_Q(a),Compound_dQ(a),Compound_Q(2B-),Compound_dQ(2B-),Compound_Q(ep),Compound_dQ(ep),Compound_Q(B-n),Compound_dQ(B-n),Compound_S(n),Compound_dS(n),Compound_S(p),Compound_dS(p),Compound_Q(4B-),Compound_dQ(4B-),"Compound_Q(d,a)","Compound_dQ(d,a)","Compound_Q(p,a)","Compound_dQ(p,a)","Compound_Q(n,a)","Compound_dQ(n,a)","Compound_Q(g,p)","Compound_Q(g,n)","Compound_Q(g,pn)","Compound_Q(g,d)","Compound_Q(g,t)","Compound_Q(g,He3)","Compound_Q(g,2p)","Compound_Q(g,2n)","Compound_Q(g,a)","Compound_Q(p,n)","Compound_Q(p,2p)","Compound_Q(p,pn)","Compound_Q(p,d)","Compound_Q(p,2n)","Compound_Q(p,t)","Compound_Q(p,3He)","Compound_Q(n,2p)","Compound_Q(n,np)","Compound_Q(n,d)","Compound_Q(n,2n)","Compound_Q(n,t)","Compound_Q(n,3He)","Compound_Q(d,t)","Compound_Q(d,3He)","Compound_Q(3He,t)","Compound_Q(3He,a)","Compound_Q(t,a)"
0,1,0,1,n,Other,8071.31713,0.00046,0.0,0.0,782.347,0.0,1008665.0,0.00049,15404.483723,162.252148,13771.880283,158.524008,-1125.3436,142.081942,-232.1475,160.8227,-6859.135662,158.184195,-7754.629285,161.152354,0.0,0.0,6889.086305,162.896488,-343.812798,175.128817,11405.545508,176.651049,5916.436032,171.776354,6730.114706,163.516658,-6889.086305,-0.0,-14665.548392,-12440.982392,-13897.428868,-13847.504694,-13771.880283,-15404.483723,-1125.3436,0.0005,-6889.086305,-0.0,2224.566,-8536.975785,-6922.688823,-6947.507992,-6076.789162,-6889.086305,-4664.520305,-0.0,-6183.753392,-6053.839883,6257.229,-1395.611905,763.755,20577.6194,12924.778595
1,0,1,1,H,Other,7288.97061,9e-05,0.0,0.0,18244.328,289.9558,1007825.0,9e-05,2025.412,292.506,13771.880283,158.524008,-1125.3436,142.081942,13762.268,719.024,-6859.135662,158.184195,17514.9925,362.0875,1096.973333,256.595,0.0,0.0,8007.5,1511.5,20717.915,0.0,20613.86,50.0,6730.114706,163.516658,-0.0,-1096.973333,-5353.1789,-3128.6129,799.9951,-13847.504694,-13771.880283,-2025.412,-1125.3436,17461.9815,-0.0,-1096.973333,1127.592667,16732.646,6456.3829,2364.8615,-6076.789162,-0.0,2224.566,-1096.973333,3128.6161,-6053.839883,5160.255667,5493.4744,18225.736,19480.646067,19813.8649
2,1,1,2,H,Other,13135.72176,0.00011,1112.283,0.0,18244.328,289.9558,2014102.0,0.00012,2025.412,292.506,13771.880283,158.524008,-1125.3436,142.081942,13762.268,719.024,-6859.135662,158.184195,17514.9925,362.0875,2224.57,0.0,2224.57,0.0,8007.5,1511.5,23846.53,0.0,20613.86,50.0,6730.114706,163.516658,-2224.57,-2224.57,-2224.5639,0.0021,799.9951,-13847.504694,-13771.880283,-2025.412,-1125.3436,17461.9815,-2224.57,-2224.57,-0.004,16732.646,6456.3829,5493.4765,-6076.789162,-2224.57,-0.004,-2224.57,6257.2311,-6053.839883,4032.659,3268.9044,18225.736,18353.0494,17589.2949
3,2,1,3,H,Other,14949.80993,0.00022,2827.265,0.0,18.592,0.0,3016049.0,0.00023,8481.79,0.0,13771.880283,158.524008,-1125.3436,142.081942,-13717.0,2000.0,-6859.135662,158.184195,17514.9925,362.0875,6257.23,0.0,1112.285,0.0,8007.5,1511.5,17589.3,0.0,19813.86,0.0,6730.114706,163.516658,-1112.285,-6257.23,-8481.7939,-6257.2279,-0.0049,-13847.504694,-13771.880283,-8481.79,-1125.3436,-763.7545,-1112.285,-6257.23,-4032.664,16732.646,0.0049,-763.7535,-6076.789162,-1112.285,1112.281,-6257.23,0.0011,-6053.839883,-0.001,4381.1894,0.0,14320.3894,18701.5799
4,1,2,3,He,Other,14931.21793,0.00021,2572.68,0.0,-13736.0,2000.0,3016029.0,0.00022,4013.16,30.3,7718.04,0.0,367.5,10.0,12743.205,359.296667,-6859.135662,158.184195,-2571.224286,344.341429,3176.184286,29.417143,5493.47,0.0,14022.91,52.653333,18353.05,0.0,4173.344286,210.882857,20577.62,0.0,-5493.47,-3176.184286,-7718.0439,-5493.4779,-15640.520614,0.0006,-7718.04,-4013.16,367.5,-14518.3465,-5493.47,-3176.184286,-951.618286,-3353.570786,4468.6349,-0.0035,-6076.789162,-5493.47,-3268.904,-3176.184286,763.7511,0.0004,3081.044714,0.0044,-13754.592,17401.435114,14320.3949


In [307]:
df = df.reset_index(drop=True)
masses = masses.reset_index(drop=True)

df = df.merge(masses, on=['Compound_Neutrons', 'Compound_Protons'], how='left')

In [308]:
df[df.isna().any(axis=1)].Target_Element_w_A.unique()

array(['1n'], dtype=object)

In [309]:
df = df.drop(columns=["Compound_Mass_Number_y"])
df = df.rename(columns={'Compound_Mass_Number_x': 'Compound_Mass_Number'})

In [310]:
q_value = [col for col in df.columns if 'Q' in col]
df = df.drop(columns=q_value)

In [312]:
df.shape

(4533499, 66)

In [314]:
df.to_csv("../ML_Data/working_xs_v2.csv", index=False)

# Without RAW Dataset

In [315]:
df_no_raw = df[~df.Reaction_Notation.str.contains("RAW")]

In [316]:
df_no_raw.shape

(4221987, 66)

In [317]:
df_no_raw.to_csv("../ML_Data/working_xs_v2_unraw.csv", index=False)

# Clever NaN Fillings for Compounds

- df[df.Type.str.contains("WTR")]
- df[df.Type.str.contains("BNZ")]
- df[df.Type.str.contains("D2O")]
- df[df.Type.str.contains("DXX")]
- df[df.Type.str.contains("PLE")]
- df[df.Type.str.contains("MTH")]
- df[df.Type.str.contains("CXX")]
- df[df.Type.str.contains("D2O")]

In [256]:
# df = pd.read_csv("../ML_Data/working_xs.csv")

In [258]:
to_fill = list(df.columns)[25:-1]

In [259]:
df.loc[df.Type.str.contains("WTR"), ['Z', 'Neutrons', 'Mass_Number', 'EL', 'Flag']] = 10, 8, 18, "Water", "C"
df.loc[df.Type.str.contains("D2O"), ['Z', 'Neutrons', 'Mass_Number', 'EL', 'Flag']] = 10, 10, 20, "Heavy_Water", "C"
df.loc[df.Type.str.contains("BNZ"), ['Z', 'Neutrons', 'Mass_Number', 'EL', 'Flag']] = 42, 36, 78, "Benzene", "C"

df.loc[(df.Type.str.contains("CXX")) & (df.Title.str.contains("ethanol")), 
       ['Z', 'Neutrons', 'Mass_Number', 'EL', 'Flag']] = 26, 20, 46, "1H_Ethanol", "C"

df.loc[(df.Type.str.contains("CXX")) & (df.Title.str.contains("2-propanol")), 
       ['Z', 'Neutrons', 'Mass_Number', 'EL', 'Flag']] = 34, 26, 60, "1H_Propanol", "C"

df.loc[(df.Type.str.contains("DXX")) & (df.Title.str.contains("ethanol")), 
       ['Z', 'Neutrons', 'Mass_Number', 'EL', 'Flag']] = 26, 26, 52, "2H_Ethanol", "C"

df.loc[(df.Type.str.contains("DXX")) & (df.Title.str.contains("2-propanol")), 
       ['Z', 'Neutrons', 'Mass_Number', 'EL', 'Flag']] = 34, 34, 68, "2H_Propanol", "C"

In [260]:
df.shape

(4645678, 85)

In [261]:
df = df[~df.Type.str.contains("PLE")]
df = df[~df.Type.str.contains("MTH")]

In [262]:
df.shape

(4643849, 85)

In [263]:
oxygen_16 = df[(df.Neutrons == 8) & (df.Z == 8) & (df.Flag == "I")][to_fill].drop_duplicates()
hydrogen_1 = df[(df.Neutrons == 0) & (df.Z == 1) & (df.Flag == "I")][to_fill].drop_duplicates()
hydrogen_2 = df[(df.Neutrons == 1) & (df.Z == 1) & (df.Flag == "I")][to_fill].drop_duplicates()
carbon_12 = df[(df.Neutrons == 6) & (df.Z == 6) & (df.Flag == "I")][to_fill].drop_duplicates()

In [264]:
ethanol_1 = (16/46)*oxygen_16.values + (6/46)*hydrogen_1.values + (24/46)*carbon_12.values # 1 ethanol
propanol_1 = (16/60)*oxygen_16.values + (8/60)*hydrogen_1.values + (36/60)*carbon_12.values # 1 propanol
ethanol_2 = (16/52)*oxygen_16.values + (12/52)*hydrogen_2.values + (24/52)*carbon_12.values # 2 ethanol
propanol_2 = (16/68)*oxygen_16.values + (16/68)*hydrogen_2.values + (36/68)*carbon_12.values # 2 propanol
water = (16/18)*oxygen_16.values + (2/18)*hydrogen_1.values # water
heavy_water = (16/20)*oxygen_16.values + (4/20)*hydrogen_2.values # heavywater
benzene = (72/78)*carbon_12.values + (6/78)*hydrogen_1.values # benzene

In [265]:
df.loc[df.Type.str.contains("WTR"), to_fill] = water
df.loc[df.Type.str.contains("D2O"), to_fill] = heavy_water
df.loc[df.Type.str.contains("BNZ"), to_fill] = benzene
df.loc[(df.Type.str.contains("CXX")) & (df.Title.str.contains("ethanol")), to_fill] = ethanol_1
df.loc[(df.Type.str.contains("CXX")) & (df.Title.str.contains("2-propanol")), to_fill] = propanol_1
df.loc[(df.Type.str.contains("DXX")) & (df.Title.str.contains("ethanol")), to_fill] = ethanol_2
df.loc[(df.Type.str.contains("DXX")) & (df.Title.str.contains("2-propanol")), to_fill] = propanol_2

In [266]:
df.loc[df.Type.str.contains("WTR"), "Atomic_Mass_Micro"] = 1.801528E7
df.loc[df.Type.str.contains("D2O"), "Atomic_Mass_Micro"] = 2.00276E7
df.loc[df.Type.str.contains("BNZ"), "Atomic_Mass_Micro"] = 7.811E7
df.loc[(df.Type.str.contains("CXX")) & (df.Title.str.contains("ethanol")), "Atomic_Mass_Micro"] = 4.6069E7
df.loc[(df.Type.str.contains("CXX")) & (df.Title.str.contains("2-propanol")), "Atomic_Mass_Micro"] = 6.0096E7
df.loc[(df.Type.str.contains("DXX")) & (df.Title.str.contains("ethanol")), "Atomic_Mass_Micro"] = 5.2E7
df.loc[(df.Type.str.contains("DXX")) & (df.Title.str.contains("2-propanol")), "Atomic_Mass_Micro"] = 6.8E7

In [None]:
# df.loc[df.Type.str.contains("WTR"), to_fill] = np.nan
# df.loc[df.Type.str.contains("D2O"), to_fill] = np.nan
# df.loc[df.Type.str.contains("BNZ"), to_fill] = np.nan
# df.loc[(df.Type.str.contains("CXX")) & (df.Title.str.contains("ethanol")), to_fill] = np.nan
# df.loc[(df.Type.str.contains("CXX")) & (df.Title.str.contains("2-propanol")), to_fill] = np.nan
# df.loc[(df.Type.str.contains("DXX")) & (df.Title.str.contains("ethanol")), to_fill] = np.nan
# df.loc[(df.Type.str.contains("DXX")) & (df.Title.str.contains("2-propanol")), to_fill] = np.nan