**Author**: Naomi Baes and ChatGPT

**Aim**: Get contexts to compute breadth measure

**Function:** This script filters lines from input files based on specified terms found in the first column, then generates samples from the filtered lines for each 5-year interval, saving them in an output directory.

- Sampling strategy: For each 5-year interval, draw 10 independent random samples of up to 50 sentences each from the lines filtered for the target term, for further analysis.
- Output example for generate_interval_samples(): mental_illness.1970-1974.psych.1; [...].2, etc.

# Filter contexts and get sample sentence contexts

In [13]:
import re
import os
import random

def filter_lines(input_file, term, output_file):
    """Copy lines whose first column contains ``term`` into ``output_file``.

    Columns are separated by the literal delimiter ``" ||||| "``. Only the
    text before the first delimiter is searched; matching lines are written
    out unchanged (all columns, original line ending).

    Args:
        input_file: Path of the UTF-8 text file to read.
        term: Literal term to look for as a whole word, case-insensitively.
        output_file: Path of the UTF-8 text file to create or overwrite.
    """
    # Escape the term so it is always matched literally: characters such as
    # "." or "+" in a term must not be interpreted as regex operators.
    term_pattern = re.compile(rf"\b{re.escape(term)}\b", re.IGNORECASE)
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Everything before the first delimiter is the first column.
            first_column = line.partition(" ||||| ")[0]
            if term_pattern.search(first_column):
                outfile.write(line)

def generate_samples(input_file, class_name, corpus):
    """Filter one corpus file for a term and write 5-year interval samples.

    The lines of ``input_file`` whose first column mentions ``class_name``
    are first saved to ``input/<class_name>.context.<corpus>``. That file is
    then read back and, for every 5-year interval starting at 1970, 1975,
    ..., 2015, sample files are written under ``output/5-year.cosine``.

    Args:
        input_file: Path of the corpus file to filter.
        class_name: Target term; used for filtering, for sentence
            extraction, and in the generated file names.
        corpus: Corpus label used in the generated file names.
    """
    input_dir = "input"
    output_dir = "output/5-year.cosine"
    for directory in (output_dir, input_dir):
        os.makedirs(directory, exist_ok=True)

    # A 4-digit year enclosed by pipe separators.
    pattern_year = re.compile(r'\|\|\|\| (\d{4}) \|\|\|\|')
    # First period-terminated stretch of text that contains the term.
    # NOTE(review): this pattern is case-sensitive, whereas filter_lines
    # matches the term case-insensitively -- confirm this is intended.
    pattern_mental = re.compile(r'([^.]*?\b' + re.escape(class_name) + r'\b[^.]*\.)')

    # Write the filtered contexts to disk, then load them back stripped.
    context_path = f"{input_dir}/{class_name}.context.{corpus}"
    filter_lines(input_file, class_name, context_path)
    with open(context_path, "r", encoding='utf-8') as handle:
        filtered_lines = [raw.strip() for raw in handle]

    # One sampling pass per 5-year interval: 1970-1974 through 2015-2019.
    for start_year in range(1970, 2020, 5):
        generate_interval_samples(
            start_year,
            filtered_lines,
            output_dir,
            class_name,
            corpus,
            pattern_year,
            pattern_mental,
        )

def generate_interval_samples(interval_start, filtered_lines, output_dir, class_name, corpus, pattern_year, pattern_mental):
    """Write 10 random sentence samples for one 5-year interval.

    A line contributes a candidate sentence when ``pattern_year`` finds a
    year within ``[interval_start, interval_start + 5)`` and
    ``pattern_mental`` matches; the candidate is the stripped first capture
    group of ``pattern_mental``. Ten files are then written, each holding an
    independent random draw (without replacement within a file) of at most
    50 candidates, one sentence per line. Files are created even when there
    are no candidates, in which case they are empty.

    Args:
        interval_start: First year of the interval (inclusive).
        filtered_lines: Iterable of text lines to draw sentences from.
        output_dir: Existing directory in which the sample files are created.
        class_name: Term label used in the output file names.
        corpus: Corpus label used in the output file names.
        pattern_year: Compiled regex whose group 1 is the line's year.
        pattern_mental: Compiled regex whose group 1 is the sentence to keep.
    """
    interval_end = interval_start + 5  # exclusive upper bound

    # The candidate pool does not depend on the draw number, so build it
    # once instead of rescanning every line for each of the 10 samples.
    candidates = []
    for line in filtered_lines:
        year_match = pattern_year.search(line)
        if not year_match:
            continue
        if not interval_start <= int(year_match.group(1)) < interval_end:
            continue
        sentence_match = pattern_mental.search(line)
        if sentence_match:
            candidates.append(sentence_match.group(1).strip())

    sample_size = min(50, len(candidates))
    for i in range(1, 11):
        samples = random.sample(candidates, sample_size)
        out_path = f"{output_dir}/{class_name}.{interval_start}-{interval_start+4}.{corpus}.{i}"
        # Explicit UTF-8 so output matches the UTF-8 input regardless of the
        # platform's default encoding (e.g. cp1252 on Windows).
        with open(out_path, "w", encoding="utf-8") as output_file:
            output_file.write("\n".join(samples))

# Corpus label -> path of the corpus file to filter and sample from.
input_files = dict(
    psych="C:/Users/naomi/OneDrive/COMP80004_PhDResearch/RESEARCH/DATA/CORPORA/Psychology/abstract_year_journal.csv.mental",
    cohacoca="C:/Users/naomi/OneDrive/COMP80004_PhDResearch/RESEARCH/DATA/CORPORA/COHACOCA/coha.coca.cleaned2.mental",
)

# Target terms; each is processed against every corpus.
terms = ["mental_illness", "mental_health", "perception"]

# Run the filter-and-sample pipeline for every (corpus, term) pair.
for corpus in input_files:
    input_file = input_files[corpus]
    for term in terms:
        generate_samples(input_file, term, corpus)

print("Generation completed.")


Generation completed.
