# Extracting and Formatting Data from the EXFOR files

We first import the necessary modules.

In [1]:
import natsort
from natsort import natsorted
import os
import shutil

# Recursively collect every .c4 file found under the EXFOR neutrons directory
exfor_directory = "./EXFOR/neutrons_2019_07_18/"

print("Searching directory for .c4 files...")
names = [
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk(exfor_directory)
    for entry in entries
    if entry.endswith(".c4")
]

print("Gathered {} .c4 files.".format(len(names)))
# Order the collected paths with natsort's natural ordering
names = natsorted(names)


# Start from an empty output directory: drop any previous one, then recreate it
dirpath = "./EXFOR/Extracted_Text"
if os.path.isdir(dirpath):
    print("Resetting Extracted_Test directory...")
    shutil.rmtree(dirpath)
os.makedirs(dirpath)

print("Extracting numerical experimental data from collected EXFOR neutron files ...")
# Marker line in each .c4 file: the lines after it are copied until "#/DATA"
data_header = r"#---><---->o<-><-->ooo<-------><-------><-------><-------><-------><-------><-------><-------><-><-----------------------><---><->o"
# The output is opened once, in 'w' mode, instead of being re-opened in append
# mode for every input file. A leftover file from an interrupted earlier run
# is therefore overwritten rather than appended to (which duplicated data).
with open('../ML_Data/all_cross_sections.txt', 'w') as outfile:
    for c4_path in names:
        with open(c4_path) as infile:
            copy = False  # True while we are between the marker and "#/DATA"
            for line in infile:
                if line.startswith(data_header):
                    copy = True
                elif line.startswith(r"#/DATA"):
                    copy = False
                elif copy:
                    outfile.write(line)
print("Finished extracting all experimental data.")

# Turn the fixed-width rows into ';'-delimited rows by inserting a semicolon
# at each column boundary, then delete the intermediate undelimited file.
print("Formatting EXFOR cross section data...")
column_breaks = [5, 11, 12, 15, 19, 22, 31, 40, 49, 58, 67, 76, 85, 94, 95, 122, 127]
# Every earlier insertion shifts the rest of the row right by one character,
# so the n-th boundary is inserted at (boundary + n).
insert_positions = [shift + boundary for shift, boundary in enumerate(column_breaks)]
with open('../ML_Data/all_cross_sections.txt') as raw_rows, \
        open('../ML_Data/all_cross_sections_v1.txt', 'w') as delimited_rows:
    for row in raw_rows:
        if not row.strip():
            # Blank / whitespace-only rows are dropped
            continue
        chars = list(row)
        for position in insert_positions:
            chars.insert(position, ';')
        delimited_rows.write("".join(chars))
print("Finished formating EXFOR data.")
os.remove('../ML_Data/all_cross_sections.txt')

- #AUTHOR1
- #YEAR
- #INSTITUTE 
- #TITLE
- #REFERENCE
- #DATE
- #REACTION
- #DATA (for data points)


# Copy every "#AUTHOR1" line from all collected .c4 files into authors.txt
print("Extracting AUTHOR1 ...")
# Open the output once rather than re-opening it in append mode per input file
with open(dirpath + '/authors.txt', 'a') as outfile:
    for c4_path in names:
        with open(c4_path) as infile:
            outfile.writelines(
                line for line in infile if line.startswith("#AUTHOR1")
            )
print("Finished extracting AUTHOR1s.")


# Copy every "#YEAR" line from all collected .c4 files into years.txt
print("Extracting YEAR ...")
# Open the output once rather than re-opening it in append mode per input file
with open(dirpath + '/years.txt', 'a') as outfile:
    for c4_path in names:
        with open(c4_path) as infile:
            outfile.writelines(
                line for line in infile if line.startswith("#YEAR")
            )
print("Finished extracting YEARs.")


# #INSTITUTE
# Copy every "#INSTITUTE" line from all collected .c4 files into institude.txt
# (the misspelled file name is kept so anything reading it keeps working).
print("Extracting Institute ...")
# Open the output once rather than re-opening it in append mode per input file
with open(dirpath + '/institude.txt', 'a') as outfile:
    for c4_path in names:
        with open(c4_path) as infile:
            outfile.writelines(
                line for line in infile if line.startswith("#INSTITUTE")
            )
print("Finished extracting Institudes.")


# #DATE
# Copy every "#DATE" line from all collected .c4 files into dates.txt
print("Extracting DATE ...")
# Open the output once rather than re-opening it in append mode per input file
with open(dirpath + '/dates.txt', 'a') as outfile:
    for c4_path in names:
        with open(c4_path) as infile:
            outfile.writelines(
                line for line in infile if line.startswith("#DATE")
            )
print("Finished extracting DATEs.")


# #TITLE
# Write one line per "#TITLE" entry to titles.txt. Up to three "#+"
# continuation lines, looked up 2, 4 and 6 lines below the tag line, are
# appended to the tag line separated by single spaces.
print("Extracting TITLE ...")
# Open the output once rather than re-opening it in append mode per input file
with open(dirpath + '/titles.txt', 'a') as outfile:
    for c4_path in names:
        with open(c4_path, "r") as infile:
            lines = infile.readlines()
        for z, line in enumerate(lines):
            if not line.startswith("#TITLE"):
                continue
            continuations = []
            for offset in (2, 4, 6):
                position = z + offset
                # The bounds check prevents an IndexError when the tag sits
                # near the end of the file; stop at the first non-"#+" line.
                if position >= len(lines) or not lines[position].startswith("#+"):
                    break
                continuations.append(lines[position].strip('#+').strip())
            if continuations:
                outfile.write(" ".join([line.strip('\n')] + continuations) + "\n")
            else:
                # No continuation: the tag line is copied unchanged
                outfile.write(line)
print("Finished extracting TITLEs.")


# #REFERENCE
# Write one line per "#REFERENCE" entry to references.txt. Up to three "#+"
# continuation lines, looked up 2, 4 and 6 lines below the tag line, are
# appended to the tag line separated by single spaces.
print("Extracting References ...")
# Open the output once rather than re-opening it in append mode per input file
with open(dirpath + '/references.txt', 'a') as outfile:
    for c4_path in names:
        with open(c4_path, "r") as infile:
            lines = infile.readlines()
        for z, line in enumerate(lines):
            if not line.startswith("#REFERENCE"):
                continue
            continuations = []
            for offset in (2, 4, 6):
                position = z + offset
                # The bounds check prevents an IndexError when the tag sits
                # near the end of the file; stop at the first non-"#+" line.
                if position >= len(lines) or not lines[position].startswith("#+"):
                    break
                continuations.append(lines[position].strip('#+').strip())
            if continuations:
                outfile.write(" ".join([line.strip('\n')] + continuations) + "\n")
            else:
                # No continuation: the tag line is copied unchanged
                outfile.write(line)
print("Finished extracting references.")


# #REACTION
# Write one line per "#REACTION" entry to reaction_notation.txt. Up to three
# "#+" continuation lines, looked up 2, 4 and 6 lines below the tag line, are
# appended to the tag line separated by single spaces.
print("Extracting reaction notation ...")
# Open the output once rather than re-opening it in append mode per input file
with open(dirpath + '/reaction_notation.txt', 'a') as outfile:
    for c4_path in names:
        with open(c4_path, "r") as infile:
            lines = infile.readlines()
        for z, line in enumerate(lines):
            if not line.startswith("#REACTION"):
                continue
            continuations = []
            for offset in (2, 4, 6):
                position = z + offset
                # The bounds check prevents an IndexError when the tag sits
                # near the end of the file; stop at the first non-"#+" line.
                if position >= len(lines) or not lines[position].startswith("#+"):
                    break
                continuations.append(lines[position].strip('#+').strip())
            if continuations:
                outfile.write(" ".join([line.strip('\n')] + continuations) + "\n")
            else:
                # No continuation: the tag line is copied unchanged
                outfile.write(line)
print("Finished extracting REACTION NOTATION.")


# #DATA (for data points)
# Write one line per "#DATA " entry to data_points_per_experiment_refined.txt.
# Up to three "#+" continuation lines, looked up 2, 4 and 6 lines below the
# tag line, are appended to the tag line separated by single spaces.
# The progress messages previously said "reaction notation" (copy-paste slip).
print("Extracting data points per experiment ...")
# Open the output once rather than re-opening it in append mode per input file
with open(dirpath + '/data_points_per_experiment_refined.txt', 'a') as outfile:
    for c4_path in names:
        with open(c4_path, "r") as infile:
            lines = infile.readlines()
        for z, line in enumerate(lines):
            # The trailing space in the tag is part of the match
            if not line.startswith("#DATA "):
                continue
            continuations = []
            for offset in (2, 4, 6):
                position = z + offset
                # The bounds check prevents an IndexError when the tag sits
                # near the end of the file; stop at the first non-"#+" line.
                if position >= len(lines) or not lines[position].startswith("#+"):
                    break
                continuations.append(lines[position].strip('#+').strip())
            if continuations:
                outfile.write(" ".join([line.strip('\n')] + continuations) + "\n")
            else:
                # No continuation: the tag line is copied unchanged
                outfile.write(line)
print("Finished extracting data points per experiment.")