In [2]:
from pathlib import Path

In [3]:
data_folder = "clean_star/"

In [5]:

def process_line(line):
    #return re.sub("[^\w]", " ",line).split()
    return line[0:6].replace("-", "").split()
    


file_list = Path(data_folder).glob("*.txt")

for file in file_list:
    with open(file, "r") as f:
        fields = set([process_line(line)[0] for line in f])

fields

{'CAPT',
 'CAT',
 'CNTRY',
 'COMM',
 'COND',
 'DOI',
 'DOR',
 'EDIT',
 'FOLD',
 'IIRN',
 'IMAGE',
 'IU',
 'KYWDS',
 'NAME',
 'NU',
 'ORG',
 'PAGE',
 'PDS',
 'PHYS',
 'PWC',
 'REC',
 'REEL',
 'REELC',
 'REFNO',
 'RPTNO',
 'RSCHR',
 'SEC',
 'SERV',
 'SRCNO',
 'SUBJ',
 'TTL',
 'TYPE',
 'VOL'}

In [33]:
header_file_name = "headers.txt"

with open(header_file_name, "w") as f:
    for field in sorted(fields):
        f.write(field+"\n")

## Parsing Data Files

In [5]:
import csv
from pathlib import Path

In [61]:
rec_dictionary = { field:"" for field in fields}


In [58]:
test_data = """--REC--1
IU     09/26/2005
TYPE   Message
DOR    04 05 04
DOI    04 05 04
IIRN   042037ZMAY04
SUBJ   IIR 6024015304 Two crash sites, one wounded American and one possible American killed in action in Xieng Khouang Province, Laos
CAT    MISC PW FILES
PAGE   15-16
REEL   523
REELC  1
FOLD   122-2
CNTRY  Laos
KYWDS  Crash site
RSCHR  dlo
SUBJ   IIR 6024015304 Two crash sites, one wounded American and one possible American killed in action in Xieng Khouang Province, Laos
CAT    MISC PW FILES
PAGE   15-16
REEL   523
REELC  1
FOLD   122-2
CNTRY  Laos
KYWDS  Crash site
RSCHR  dlo
--REC--2
IU     10/09/2003
NAME   Egger, J.C., Jr.
REFNO  0890-0-01
SERV   USAF
TYPE   Message
DOR    94 08 15
DOI    94 08 15
IIRN   151045zAUG94
ORG    CJMAO
SUBJ   Identification of remains
CAT    Casualty files
PAGE   3-4
REEL   512
REELC  0
FOLD   115-8
CNTRY  North Vietnam, pre-1975
KYWDS  Remains
RSCHR  jego
COMM   Cases 1623, 0245 also cited.
SUBJ   Identification of remains
CAT    Casualty files
PAGE   3-4
REEL   512
REELC  0
FOLD   115-8
CNTRY  North Vietnam, pre-1975
KYWDS  Remains
RSCHR  jego
COMM   Cases 1623, 0245 also cited.
--REC--3
IU     10/09/2003
NAME   Egger, J.C. Jr
REFNO  0890-0-01
SERV   USAF
TYPE   Message
DOR    92 03 18
DOI    92 03 18
IIRN   182102zMAR92
ORG    JTF-FA
SUBJ   Analysis of MR 4 Document
CAT    Casualty files
PAGE   5-10
REEL   512
REELC  1
FOLD   115-8
CNTRY  North Vietnam, pre-1975
KYWDS  Aircraft downed
RSCHR  jego
COMM   Concerns translation of Vietnamese document that details American aircraft losses. In ledger format.
SUBJ   Analysis of MR 4 Document
CAT    Casualty files
PAGE   5-10
REEL   512
REELC  1
FOLD   115-8
CNTRY  North Vietnam, pre-1975
KYWDS  Aircraft downed
RSCHR  jego
COMM   Concerns translation of Vietnamese document that details American aircraft losses. In ledger format.
--REC--4
IU     10/09/2003
NAME   Egger, J.C. Jr
REFNO  0890-0-01
SERV   USAF
TYPE   Message
DOR    94 08 19
DOI    67 11 03
IIRN   191300zAUG94
ORG    SECDEF
SUBJ   Public Affairs-POW/MIA-Press Guidance for identification of remains-Cambodia and Vietnam-August 94.
CAT    Casualty files
PAGE   11-15
REEL   512
REELC  1
FOLD   115-8
CNTRY  Cambodia
CNTRY  North Vietnam, pre-1975
KYWDS  Remains
KYWDS  Aircraft downed
KYWDS  Quang Tri Province
RSCHR  jego
COMM   The remains of three American servicemen previously unaccounted for in Indochina, have been identified.
SUBJ   Public Affairs-POW/MIA-Press Guidance for identification of remains-Cambodia and Vietnam-August 94.
CAT    Casualty files
PAGE   11-15
REEL   512
REELC  1
FOLD   115-8
CNTRY  Cambodia
CNTRY  North Vietnam, pre-1975
KYWDS  Remains
KYWDS  Aircraft downed
KYWDS  Quang Tri Province
RSCHR  jego
COMM   The remains of three American servicemen previously unaccounted for in Indochina, have been identified."""

## Putting It Together

The following program processes [QuadraSTAR](https://lucidea.com/cuadrastar/) data files and converts the *narrow* data format into a *wide* data format (see [wikipedia for more discussion](https://en.wikipedia.org/wiki/Wide_and_narrow_data). 

A QuadraSTAR record looks like this:
```
--REC--4
IU     10/09/2003
NAME   Egger, J.C. Jr
REFNO  0890-0-01
SERV   USAF
TYPE   Message
DOR    94 08 19
DOI    67 11 03
IIRN   191300zAUG94
ORG    SECDEF
SUBJ   Public Affairs-POW/MIA-Press Guidance for identification of remains-Cambodia and Vietnam-August 94.
CAT    Casualty files
PAGE   11-15
REEL   512
REELC  1
FOLD   115-8
CNTRY  Cambodia
CNTRY  North Vietnam, pre-1975
KYWDS  Remains
KYWDS  Aircraft downed
KYWDS  Quang Tri Province
RSCHR  jego
COMM   The remains of three American servicemen previously unaccounted for in Indochina, have been identified.
SUBJ   Public Affairs-POW/MIA-Press Guidance for identification of remains-Cambodia and Vietnam-August 94.
CAT    Casualty files
PAGE   11-15
REEL   512
REELC  1
FOLD   115-8
CNTRY  Cambodia
CNTRY  North Vietnam, pre-1975
KYWDS  Remains
KYWDS  Aircraft downed
KYWDS  Quang Tri Province
RSCHR  jego
COMM   The remains of three American servicemen previously unaccounted for in Indochina, have been identified.
```

We want it to end up looking like this:
```
PWMIA_4,PWMIA,4,,Casualty files,"Cambodia; North Vietnam, pre-1975; Cambodia; North Vietnam, pre-1975","The remains of three American servicemen previously unaccounted for in Indochina, have been identified.",,67 11 03,94 08 19,,115-8,191300zAUG94,,10/09/2003,Remains; Aircraft downed; Quang Tri Province; Remains; Aircraft downed; Quang Tri Province,"Egger, J.C. Jr",,SECDEF,11-15,,,,512,1,0890-0-01,,jego,,USAF,,Public Affairs-POW/MIA-Press Guidance for identification of remains-Cambodia and Vietnam-August 94.,,Message,
```

* The QuadraSTAR data is bad because it doesn't include the record ID in each line

In [19]:
!head -n 2 data.csv

ID,BATCH,REC,CAPT,CAT,CNTRY,COMM,COND,DOI,DOR,EDIT,FOLD,IIRN,IMAGE,IU,KYWDS,NAME,NU,ORG,PAGE,PDS,PHYS,PWC,REEL,REELC,REFNO,RPTNO,RSCHR,SEC,SERV,SRCNO,SUBJ,TTL,TYPE,VOL
"PWMIA_1


In [25]:
import csv
from pathlib import Path
import re


data_folder = "clean_star/"
header_file_name = "headers.txt"
data_file_name = "data.csv"

file_list = list(Path(data_folder).glob("*.txt"))



def process_line(line):
    return re.sub("[^\w]", " ",line).split()


for file in file_list:
    with open(file, "r") as f:
        fields = set([process_line(line)[0] for line in f])



with open(header_file_name, "w") as f:
    for field in sorted(fields):
        f.write(field+"\n")


with open(header_file_name, "r") as f:
    header_fields = [line.strip() for line in f]
    
    # move REC to beginning of list
    REC_index = header_fields.index('REC')
    header_fields.insert(0, header_fields.pop(REC_index))
    
    # add BATCH & ID columns
    header_fields.insert(0, "BATCH")
    header_fields.insert(0, "ID")
    
    print(header_fields)
    




with open(data_file_name, "w", newline='') as csvfile:
    
    # Create a DictWriter 
    writer = csv.DictWriter(csvfile, fieldnames=header_fields)
    
    # Write the headers to the file
    writer.writeheader()
    

    for file in file_list:
        print("processing", file)
        with open(file, "r") as f:
            
            # get the id from the file name
            file_name = file.stem
        
            # read the first line of the file 
            # which should be REC
            first_line = f.readline()
            
            # split the line based on fix field lengths
            header, value = first_line[0:6], first_line[7:]

            # handle the "--REC--" case and strip newlines
            header = header.replace("-", " ").strip()

            # remove newlines from the values
            value = value.strip()

            # Create a dictionary 
            row_dictionary = { field:"" for field in header_fields}

            row_dictionary[header] = value
            row_dictionary['ID'] = file_name + "_" + value
            row_dictionary['BATCH'] = file_name
            
            print(row_dictionary)
            
            # loop over lines in file
            for line in f:

                 # split the line based on fix field lengths
                header, value = line[0:6], line[7:]

                # handle the "--REC--" case and strip newlines
                header = header.replace("-", " ").strip()
                
                # remove newlines from the values
                value = value.strip()

                if header == "REC": # when we get to a new record

                    # save the old record to disk
                    writer.writerow(row_dictionary)
                    
                    # create a new empty row dictionary
                    row_dictionary = { field:"" for field in header_fields}
                    # put this REC in the new row dictionary
                    row_dictionary[header] = value
                    # add ID
                    row_dictionary['ID'] = file_name + "_" + value
                    # add Batch
                    row_dictionary['BATCH'] = file_name
              
                # Check to see if an entry already exists in the dictionary
                elif (header in row_dictionary) and row_dictionary[header] and (value != row_dictionary[header]):
                    
                    row_dictionary[header] = row_dictionary[header] + "; " + value
                
                # This isn't a new or existing record, populate the dictionary
                else: 
                    
                    # populate the row_dictionary with values 
                    row_dictionary[header] = value

            writer.writerow(row_dictionary)

    
  

['ID', 'BATCH', 'REC', 'CAPT', 'CAT', 'CNTRY', 'COMM', 'COND', 'DOI', 'DOR', 'EDIT', 'FOLD', 'IIRN', 'IMAGE', 'IU', 'KYWDS', 'NAME', 'NU', 'ORG', 'PAGE', 'PDS', 'PHYS', 'PWC', 'REEL', 'REELC', 'REFNO', 'RPTNO', 'RSCHR', 'SEC', 'SERV', 'SRCNO', 'SUBJ', 'TTL', 'TYPE', 'VOL']
processing clean_star/PWMIA.txt
{'ID': 'PWMIA_1', 'BATCH': 'PWMIA', 'REC': '1', 'CAPT': '', 'CAT': '', 'CNTRY': '', 'COMM': '', 'COND': '', 'DOI': '', 'DOR': '', 'EDIT': '', 'FOLD': '', 'IIRN': '', 'IMAGE': '', 'IU': '', 'KYWDS': '', 'NAME': '', 'NU': '', 'ORG': '', 'PAGE': '', 'PDS': '', 'PHYS': '', 'PWC': '', 'REEL': '', 'REELC': '', 'REFNO': '', 'RPTNO': '', 'RSCHR': '', 'SEC': '', 'SERV': '', 'SRCNO': '', 'SUBJ': '', 'TTL': '', 'TYPE': '', 'VOL': ''}
processing clean_star/PWMIA03.txt
{'ID': 'PWMIA03_1', 'BATCH': 'PWMIA03', 'REC': '1', 'CAPT': '', 'CAT': '', 'CNTRY': '', 'COMM': '', 'COND': '', 'DOI': '', 'DOR': '', 'EDIT': '', 'FOLD': '', 'IIRN': '', 'IMAGE': '', 'IU': '', 'KYWDS': '', 'NAME': '', 'NU': '', 'ORG'

In [17]:
row_dictionary

{'BATCH': 'PWMIA05',
 'CAPT': '',
 'CAT': '; MISC PW FILES\n; MISC PW FILES\n',
 'CNTRY': '; post-1975 Vietnam\n; post-1975 Vietnam\n',
 'COMM': '; He was very tall and very black and peered quickly down at me from under a coolie hat and quickly looked away.\n; He was very tall and very black and peered quickly down at me from under a coolie hat and quickly looked away.\n',
 'COND': '',
 'DOI': '; 95 01 01\n',
 'DOR': '; 02 07 31\n',
 'EDIT': '',
 'FOLD': '; 114-13\n; 114-13\n',
 'ID': 'PWMIA05_12221\n',
 'IIRN': '; 310038ZJUL02\n',
 'IMAGE': '',
 'IU': '; 05/09/2003\n',
 'KYWDS': '; Live sighting\n; Live sighting\n',
 'NAME': '',
 'NU': '',
 'ORG': '',
 'PAGE': '; 668-671\n; 668-671\n',
 'PDS': '',
 'PHYS': '',
 'PWC': '',
 'REC': '12221\n',
 'REEL': '; 509\n; 509\n',
 'REELC': '; 0\n; 0\n',
 'REFNO': '',
 'RPTNO': '',
 'RSCHR': '; dl\n; dl\n',
 'SEC': '',
 'SERV': '',
 'SRCNO': '',
 'SUBJ': '; IIR 6024019402/Sighting of a black man near Cam Tan village, Khanh Hoa Province, Vietnam.\n