# Extracting and Formatting Data from the EXFOR files

We first import the necessary modules.

In [1]:
import natsort
from natsort import natsorted
import os
import shutil

First we create a list containing the file names of all entries per element in the `neutrons` directory which was extracted from the EXFOR database.

In [2]:
# Walk the EXFOR neutrons tree and gather the path of every .c4 file in it.
exfor_directory = "./EXFOR/neutrons_2019_07_18/"

print("Searching directory for .c4 files...")
names = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk(exfor_directory)
    for filename in filenames
    if filename.endswith(".c4")
]

print("Gathered {} .c4 files.".format(len(names)))
# Natural sort of the collected paths (natsort.natsorted).
names = natsorted(names)

Searching directory for .c4 files...
Gathered 623 .c4 files.


A directory will be specified to store all our extracted text data.

In [4]:
# Start from an empty output directory: the extraction cells further down
# open their output files in append mode, so leftovers from an earlier run
# would otherwise be extended instead of replaced.
dirpath = "Extracted_Text"
if os.path.isdir(dirpath):
    # Report the directory that is actually being reset.
    print("Resetting {} directory...".format(dirpath))
    shutil.rmtree(dirpath)
# Reached both for a fresh start and after the reset above.
os.makedirs(dirpath)

We now extract the experimental data available in each file in the `neutrons` directory and format it so that it can be read by Python. The column layout is fixed by the EXFOR format, but the files contain no delimiter, whereas tools like `pandas` are easiest to use with character-delimited files. Because of this we insert a delimiter (the code below uses `;`) at positions 5, 11, 12, 15, 19, 22, 31, 40, 49, 58, 67, 76, 85, 94, 95, 122, and 127.

In [19]:
# Concatenate the data rows of every collected EXFOR file into a single text
# file. A data section starts after the fixed-width column ruler line and ends
# at the "#/DATA" line; neither marker line is copied.
print("Extracting numerical experimental data from collected EXFOR neutron files ...")
column_ruler = r"#---><---->o<-><-->ooo<-------><-------><-------><-------><-------><-------><-------><-------><-><-----------------------><---><->o"
# The output is opened once, in write mode: re-running the cell replaces the
# file instead of appending a second copy of every row, and the file exists
# (possibly empty) even when `names` is empty.
with open('../ML_Data/all_cross_sections.txt', 'w') as outfile:
    for c4_path in names:
        with open(c4_path) as infile:
            copy = False  # True while inside a data section
            for line in infile:
                if line.startswith(column_ruler):
                    copy = True
                elif line.startswith("#/DATA"):
                    copy = False
                elif copy:
                    outfile.write(line)
print("Finished extracting all experimental data.")

Extracting numerical experimental data from collected EXFOR neutron files ...
Finished extracting all experimental data.


In [20]:
# Turn the fixed-width rows gathered in all_cross_sections.txt into
# ';'-delimited rows by splitting each row at the column boundaries below.
print("Formatting EXFOR cross section data...")
column_breaks = [5, 11, 12, 15, 19, 22, 31, 40, 49, 58, 67, 76, 85, 94, 95, 122, 127]
# Pair every field start with its end; the last field runs to the end of the row.
field_bounds = list(zip([0] + column_breaks, column_breaks + [None]))
with open('../ML_Data/all_cross_sections.txt') as infile, open('../ML_Data/all_cross_sections_v1.txt', 'w') as outfile:
    for line in infile:
        if not line.strip():
            continue  # skip blank rows
        # Split off the line terminator first so that, for a row shorter than
        # the last column boundary, the delimiters stay on the row's own line
        # instead of landing after the newline.
        record = line.rstrip('\n')
        fields = [record[start:end] for start, end in field_bounds]
        outfile.write(';'.join(fields) + '\n')
print("Finished formating EXFOR data.")
# The undelimited intermediate file is no longer needed.
os.remove('../ML_Data/all_cross_sections.txt')

Formatting EXFOR cross section data...
Finished formating EXFOR data.


The EXFOR files contain more useful information worth extracting which can be deleted later at convenience. Some of these features include:

- #AUTHOR1
- #YEAR
- #INSTITUTE 
- #TITLE
- #REFERENCE
- #DATE
- #REACTION
- #DATA (for data points)

We now extract the `Author` for each experiment,

In [8]:
# Append every line that starts with "#AUTHOR1" to authors.txt, visiting the
# input files in the order given by `names`.
print("Extracting AUTHOR1 ...")
authors_path = dirpath + '/authors.txt'
for c4_path in names:
    with open(c4_path) as source, open(authors_path, 'a') as sink:
        sink.writelines(row for row in source if row.startswith("#AUTHOR1"))
print("Finished extracting AUTHOR1s.")

Extracting AUTHOR1 ...
Finished extracting AUTHOR1s.


the `Year` when the experiment took place,

In [9]:
# Append every line that starts with "#YEAR" to years.txt, one input file
# at a time.
print("Extracting YEAR ...")
for c4_path in names:
    with open(c4_path) as source:
        year_rows = [row for row in source if row.startswith("#YEAR")]
    # Opened in append mode for every input file, even when nothing matched.
    with open(dirpath + '/years.txt', 'a') as sink:
        sink.writelines(year_rows)
print("Finished extracting YEARs.")

Extracting YEAR ...
Finished extracting YEARs.


the `Institute` where the experiment took place,

In [10]:
# Append every line that starts with "#INSTITUTE" to institude.txt (the file
# name is kept exactly as spelled here).
print("Extracting Institute ...")
institute_path = dirpath + '/institude.txt'
for c4_path in names:
    with open(c4_path) as source, open(institute_path, 'a') as sink:
        for row in source:
            if not row.startswith("#INSTITUTE"):
                continue
            sink.write(row)
print("Finished extracting Institudes.")

Extracting Institute ...
Finished extracting Institudes.


the `date` when the experiment took place,

In [11]:
# Append every line that starts with "#DATE" to dates.txt, visiting the
# input files in the order given by `names`.
print("Extracting DATE ...")
for c4_path in names:
    with open(c4_path) as source, open(dirpath + '/dates.txt', 'a') as sink:
        date_rows = filter(lambda row: row.startswith("#DATE"), source)
        sink.writelines(date_rows)
print("Finished extracting DATEs.")

Extracting DATE ...
Finished extracting DATEs.


We now extract the `Title` for each experiment,

In [12]:
# Write one line per "#TITLE" entry to titles.txt, folding up to three "#+"
# continuation lines into the same output line (space-separated).
print("Extracting TITLE ...")
for c4_path in names:
    with open(c4_path, "r") as infile, open(dirpath + '/titles.txt', 'a') as outfile:
        lines = infile.readlines()
        for z, line in enumerate(lines):
            if not line.startswith("#TITLE"):
                continue
            parts = [line.strip('\n')]
            # Continuations are probed at every second line after the tag and
            # only while they are consecutive.
            # NOTE(review): presumably the input is double-spaced - confirm.
            for offset in (2, 4, 6):
                k = z + offset
                # The bounds check keeps a tag near the end of the file from
                # raising IndexError; it is treated as "no more continuations".
                if k >= len(lines) or not lines[k].startswith("#+"):
                    break
                parts.append(lines[k].strip('#+').strip())
            outfile.write(" ".join(parts) + "\n")
print("Finished extracting TITLEs.")

Extracting TITLE ...
Finished extracting TITLEs.


and the `reference`.

In [14]:
# Write one line per "#REFERENCE" entry to references.txt, folding up to three
# "#+" continuation lines into the same output line (space-separated).
print("Extracting References ...")
for c4_path in names:
    with open(c4_path, "r") as infile, open(dirpath + '/references.txt', 'a') as outfile:
        lines = infile.readlines()
        for z, line in enumerate(lines):
            if not line.startswith("#REFERENCE"):
                continue
            parts = [line.strip('\n')]
            # Continuations are probed at every second line after the tag and
            # only while they are consecutive.
            # NOTE(review): presumably the input is double-spaced - confirm.
            for offset in (2, 4, 6):
                k = z + offset
                # The bounds check keeps a tag near the end of the file from
                # raising IndexError; it is treated as "no more continuations".
                if k >= len(lines) or not lines[k].startswith("#+"):
                    break
                parts.append(lines[k].strip('#+').strip())
            outfile.write(" ".join(parts) + "\n")
print("Finished extracting references.")

Extracting References ...
Finished extracting references.


We will now extract the experimental reaction in `reaction notation` available in each EXFOR experimental data entry.

In [15]:
# Write one line per "#REACTION" entry to reaction_notation.txt, folding up to
# three "#+" continuation lines into the same output line (space-separated).
print("Extracting reaction notation ...")
for c4_path in names:
    with open(c4_path, "r") as infile, open(dirpath + '/reaction_notation.txt', 'a') as outfile:
        lines = infile.readlines()
        for z, line in enumerate(lines):
            if not line.startswith("#REACTION"):
                continue
            parts = [line.strip('\n')]
            # Continuations are probed at every second line after the tag and
            # only while they are consecutive.
            # NOTE(review): presumably the input is double-spaced - confirm.
            for offset in (2, 4, 6):
                k = z + offset
                # The bounds check keeps a tag near the end of the file from
                # raising IndexError; it is treated as "no more continuations".
                if k >= len(lines) or not lines[k].startswith("#+"):
                    break
                parts.append(lines[k].strip('#+').strip())
            outfile.write(" ".join(parts) + "\n")
print("Finished extracting REACTION NOTATION.")

Extracting reaction notation ...
Finished extracting REACTION NOTATION.


Next, we extract the `number of datapoints per experiment`.

In [18]:
# Write one line per "#DATA " entry to data_points_per_experiment_refined.txt,
# folding up to three "#+" continuation lines into the same output line.
# The trailing space in the tag is part of the match.
# NOTE(review): presumably this line carries the number of data points of
# the experiment - confirm against the EXFOR file layout.
print("Extracting number of data points per experiment ...")
for c4_path in names:
    with open(c4_path, "r") as infile, open(dirpath + '/data_points_per_experiment_refined.txt', 'a') as outfile:
        lines = infile.readlines()
        for z, line in enumerate(lines):
            if not line.startswith("#DATA "):
                continue
            parts = [line.strip('\n')]
            # Continuations are probed at every second line after the tag and
            # only while they are consecutive.
            # NOTE(review): presumably the input is double-spaced - confirm.
            for offset in (2, 4, 6):
                k = z + offset
                # The bounds check keeps a tag near the end of the file from
                # raising IndexError; it is treated as "no more continuations".
                if k >= len(lines) or not lines[k].startswith("#+"):
                    break
                parts.append(lines[k].strip('#+').strip())
            outfile.write(" ".join(parts) + "\n")
print("Finished extracting number of data points per experiment.")

Extracting reaction notation ...
Finished extracting REACTION NOTATION.
