# OLD DATA REFORMATTING

### How this Notebook Works:
* Run the cell below and follow the prompts
    * Select a directory containing the files you wish to convert to pandas friendly csv
    * Once completed, select whether or not to delete the old files
* This script searches through a given directory and finds all files with a specific format
* Once files are found it breaks up the headers and extracts relavent information
* X and Y values and relavent information is saved in a new CSV file for easier readability

In [1]:
import os
import re
import csv

# scan through directory finding all unprocessed files
def find_files_to_edit(directory):
    # filename ends with this datetime formatting
    name_finder = re.compile(r'_\d\d\d\d-\d\d-\d\d-\d\d\d\d\d\d$')
    file_list = []
    for filename in os.listdir(directory):
        if name_finder.search(filename) is not None:
            file_list.append(directory + '\\' + filename)
            print('Added file')
        else:
            print('Not adding file: ' + filename)

    return file_list


# take file and make it a csv formatted to be easily read as a pandas dataframe
def unprocessed_to_csv(file):
    headers = []  # list of strings of ALL processed headers
    x, y, x2 = [], [], []
    misc_headers = []  # unprocessed misc headers
    misc_val = []  # values associated with misc headers
    r = 6
    x2_flag = False
    # open the file, check the first 5-6 rows and pull out headers and values, goal is to make single header line
    with open(file, 'r') as f:
        data = f.readlines()
        data_set = data[7].split(' ')
        for i in range(6):
            words = " ".join(data[i].split()).split(' ') # removes extra spaces
            # no gaussmeter data
            if words[0] == 'Number' and len(data_set) == 3:
                headers.append(words[1].rstrip())
                headers.append(words[2].rstrip())
            # accounts for x2 values from gaussmeter
            elif words[0] == 'Number' and len(data_set) == 4:
                headers.append(words[1].rstrip())
                headers.append(words[2].rstrip())
                headers.append(words[3].rstrip())
                x2_flag = True
            # if not a blank line, check if float. Data may have 5-6 rows of headers
            elif words[0] != '\n':
                try:
                    type(float(words[0])) == float
                    r = 5
                except Exception:
                    j = ''.join([w + ' ' for w in words])
                    misc_headers.append(j)
            else:
                pass
        for q in range(r, len(data)):  # read x y datapoints
            d = data[q].split(' ')
            x.append(d[1])
            y.append(d[2])
            if x2_flag:  # include x2 data if there
                x2.append(d[3])
        f.close()

        # process the misc headers to be title (units) with an associated value
        for string in misc_headers:
            s = string.split(':')
            if len(s) == 2:  # most common case of header: value(units)
                clean_string(s[0], s[1], headers, misc_val)
            elif len(s) == 3:  # case where there is header: value(units) header: value(units) STFMR Data
                mid = s[1].split('S')
                mid[0] = mid[0].rstrip()
                mid[1] = 'S' + mid[1]
                clean_string(s[0], mid[0], headers, misc_val)
                clean_string(mid[1], s[2], headers, misc_val)
            elif len(s) == 1:  # weird case where no : is included ie header value(units)
                value = re.findall(r"[-+]?\d*\.?\d+|[-+]?\d+", s[0])
                title = re.sub(r"[-+]?\d*\.?\d+|[-+]?\d+", '', s[0])

                headers.append(title.rstrip())
                if value == []:
                    misc_val.append(title.rstrip())
                else:
                    misc_val.append(value[0].rstrip())
            else:
                print("Error couldn't process headers for for file: ", f)

    return headers, x, y, x2, misc_val

# from two strings sets with a formatting like title: value(units) find string title + units and string value, append to lists


def clean_string(first_half, second_half, h, m):
    nums = re.findall(r"[-+]?\d*\.?\d+|[-+]?\d+", second_half)
    if nums != []:
        units = re.sub(r"[-+]?\d*\.?\d+|[-+]?\d+", '', second_half)
        # header including the units
        title = (first_half.rstrip() + units.rstrip())
        value = (nums[0].rstrip())  # float value associated
    else:
        title = (first_half.rstrip())  # header title, no known units
        # associated value (won't be a float)
        value = (second_half.rstrip().lstrip())

    h.append(title)
    m.append(value)
    
    
# directory = r'C:\Users\Neuromancer\Desktop\Data-Analysis\AMR-data'
directory = input('Please select the file directory to clean: ')
file_list = find_files_to_edit(directory)

for file in file_list:
    headers, x, y, x2, misc_val = unprocessed_to_csv(file)
    with open(file + '.csv', 'w', newline='', encoding='utf-8') as csvfile:
        rewriter = csv.writer(csvfile, dialect='excel')
        rewriter.writerow(headers)
        if len(x2) == 0:
            for counter, elem in enumerate(x):
                rewriter.writerow([elem, y[counter]] + misc_val)
        else:
            for counter, elem in enumerate(x):
                rewriter.writerow(
                    [elem, y[counter], x2[counter]] + misc_val)
        csvfile.close()

q = input('Do you wish to delete old files? (y/n): ')
if q == 'y' or q == 'Y':
    for file in file_list:
        os.remove(file)
    print('All files deleted.')
else:
    print('Not deleting files.')

Please select the file directory to clean:  C:\Users\Neuromancer\Desktop\Presentation\Pulse_Switching_Example


Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
Added file
['Hx', 'field:', '-100.0(Oe)']
['0.05', '(s)', 'pulse', 'width']
['Initial', 'Resistance:', '1454.675Ohm']
['Reset', 'Field', 'strength', 'at:', '0.0(Oe)', 'Reset', 'field', 'applied', 'for:', '1.0(s)']
['Number', 'Applied', 'Current', '(mA)', 'Resistance(Ohm)']
['0', '-12.0', '-0.0854909598']


IndexError: list index out of range