# Merge text files 

Merge text files such that each observation consists of the identification number and the text of the corresponding essay.

In [10]:
import os
import re

In [97]:
# Folder that holds the individual essay files (one .txt file per essay)
directory = 'txt'

# Destination file that receives the merged, cleaned essays
output_file = 'essays_cleaned.txt'

# Identification number: "N", then 1 or 2, then one or more word characters
pattern = r'N[12]\w+'

# Boilerplate sections that are stripped from every essay:
#   heading_pattern  - the literal "NCDS8 Essay by " followed by the
#                      identification number (reuses `pattern` above)
#   sequence_pattern - three space-separated tokens prefixed "q-q-", "q-q-"
#                      and "q-", each followed by lowercase hex digits/hyphens
#   ending_pattern   - "National Child Development Study: Sweep <digits>, "
#                      followed by a "<digits>-<digits>" range
heading_pattern = rf'NCDS8 Essay by {pattern}'
sequence_pattern = r'q-q-[a-f0-9-]+ q-q-[a-f0-9-]+ q-[a-f0-9-]+'
ending_pattern = r'National Child Development Study: Sweep \d+, \d+-\d+'

In [98]:
# Build one record per essay file in `directory` and merge them into a single
# string, `merged_content`. Each record has the form
#     "Observation ID: <id>\n<cleaned essay text>\n\n"

# Compile the patterns once instead of looking them up on every iteration
id_regex = re.compile(pattern)
unwanted_regexes = [
    re.compile(p)
    for p in (heading_pattern, sequence_pattern, ending_pattern)
]

records = []
# os.listdir() returns entries in arbitrary order; sorting makes the order of
# the records in the merged file reproducible across runs and machines.
for filename in sorted(os.listdir(directory)):
    # Only essay text files are merged; skip everything else
    if not filename.endswith('.txt'):
        continue

    # Read the content of the essay file
    # NOTE(review): the file is decoded with the platform default encoding;
    # confirm the encoding of the essay files and pass `encoding=` explicitly
    # if the notebook has to produce the same result on other machines.
    file_path = os.path.join(directory, filename)
    with open(file_path, 'r') as f:
        essay_content = f.read().strip()

    # Extract the identification number from the essay text (not from the
    # filename). This must happen before the clean-up below, because the
    # heading that is removed there also contains an identification number.
    match = id_regex.search(essay_content)
    observation_id = match.group() if match else "N/A"

    # Remove the unwanted sections (heading, sequence, ending), in that order
    for regex in unwanted_regexes:
        essay_content = regex.sub('', essay_content)

    # Strip any leading or trailing whitespace left behind by the removals
    essay_content = essay_content.strip()

    records.append(f'Observation ID: {observation_id}\n{essay_content}\n\n')

# Join once at the end rather than growing a string inside the loop
merged_content = ''.join(records)

In [99]:
# Write all merged records to the output file in one call; opening with
# mode 'w' replaces any existing file of the same name.
with open(output_file, mode='w') as out_file:
    out_file.write(merged_content)