## Setup: Import Libraries and Configure Paths

In [None]:
# ============================================================
# CELL 1: Import Required Libraries
# ============================================================
import os
import sys
import pandas as pd
import numpy as np
import ast
# import matplotlib.pyplot as plt
from tqdm import tqdm

# Auto-reload modules during development.
# Mode 2 asks IPython to reload all imported modules (except those excluded
# via %aimport) before each cell runs, so edits to the package source are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Import ALIGN package components
# (LinguisticAlignment is the only name this cell pulls from align_test)
from align_test.alignment import LinguisticAlignment

# Reached only when every import and magic above ran without raising.
print("✓ All imports successful")

In [None]:
# ============================================================
# Configuration - Set Your Data Directories
# ============================================================
# All paths are relative to the notebook's working directory.

# --- Inputs: preprocessed .txt files written by the preprocessing notebook ---
INPUT_DIR_BASIC = './test_output_basic'        # NLTK-only preprocessing
INPUT_DIR_SPACY = './test_output_spacy'        # NLTK + spaCy preprocessing
INPUT_DIR_STANFORD = './test_output_stanford'  # NLTK + Stanford preprocessing (optional)

# --- Outputs: where alignment results are written ---
OUTPUT_DIR_ALIGNMENT = './test_alignment_results'
OUTPUT_DIR_BASELINE = './test_baseline_results'

# Ensure both output locations exist. exist_ok=True keeps re-running this
# cell harmless when the directories are already present.
_output_dirs = (OUTPUT_DIR_ALIGNMENT, OUTPUT_DIR_BASELINE)
for dir_path in _output_dirs:
    os.makedirs(dir_path, exist_ok=True)
    print(f"✓ Created directory: {dir_path}")

In [None]:
# ============================================================
# Verify Input Data Exists
# ============================================================
# Checks each preprocessed-input directory, reports how many .txt files it
# contains, and records the usable ones in `available_inputs`
# (label -> directory path). A directory is usable only if it exists AND
# holds at least one .txt file.

print("Verifying preprocessed input data...")
print("="*60)

input_dirs = {
    "NLTK-only (REQUIRED)": INPUT_DIR_BASIC,
    "spaCy-tagged (OPTIONAL)": INPUT_DIR_SPACY,
    "Stanford-tagged (OPTIONAL)": INPUT_DIR_STANFORD
}

available_inputs = {}

for label, path in input_dirs.items():
    # isdir rather than exists: a regular file at this path would pass an
    # existence check and then make os.listdir raise NotADirectoryError.
    if not os.path.isdir(path):
        print(f"✗ {label}: {path} (not found)")
        continue

    files = [f for f in os.listdir(path) if f.endswith('.txt')]
    if not files:
        # An empty directory carries no data, so it must not count as
        # available; otherwise the summary below would report "Ready"
        # with nothing to process.
        print(f"✗ {label}: {path} (no .txt files found)")
        continue

    print(f"✓ {label}: {path}")
    print(f"  Found {len(files)} files")
    available_inputs[label] = path

if "NLTK-only (REQUIRED)" not in available_inputs:
    print("\n❌ ERROR: Required NLTK-only preprocessing data not found!")
    print("Please run test_prepare_transcripts.ipynb first.")
else:
    print(f"\n✓ Ready to test with {len(available_inputs)} input types")

In [None]:
# ============================================================
# Inspect Sample Preprocessed File
# ============================================================
# Loads one preprocessed .txt file from INPUT_DIR_BASIC as a tab-separated
# table and shows its columns, row count and first rows.

print("Inspecting sample preprocessed file...")
print("="*60)

# Load a sample file.
# os.listdir returns entries in arbitrary order, so sort to make the chosen
# sample the same on every run and every machine.
sample_candidates = sorted(
    f for f in os.listdir(INPUT_DIR_BASIC) if f.endswith('.txt')
)
if not sample_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No .txt files found in {INPUT_DIR_BASIC}. "
        "Please run test_prepare_transcripts.ipynb first."
    )
sample_file = sample_candidates[0]
sample_path = os.path.join(INPUT_DIR_BASIC, sample_file)

print(f"\nReading: {sample_file}\n")

df_sample = pd.read_csv(sample_path, sep='\t', encoding='utf-8')

print(f"Columns: {df_sample.columns.tolist()}")
print(f"Rows: {len(df_sample)}")
print("\nFirst 3 rows:")
# Must stay the last top-level expression so the notebook renders it.
df_sample.head(3)

## TEST 1: Lexical-Syntactic Alignment (NLTK Tags Only)
