## Setup: Import Libraries and Configure Paths

In [1]:
# ============================================================
# CELL 1: Import Required Libraries
# ============================================================
import os
import sys
import pandas as pd
import numpy as np
import ast
# import matplotlib.pyplot as plt
from tqdm import tqdm

# Import ALIGN package components
from align_test.alignment import LinguisticAlignment

print("‚úì All imports successful")

  from .autonotebook import tqdm as notebook_tqdm


‚úì All imports successful


In [2]:
# ============================================================
# Configuration - Set Your Data Directories
# ============================================================

# INPUT DIRECTORIES (from preprocessing notebook)
# These should contain the preprocessed .txt files
INPUT_DIR_BASIC = './test_output_basic'      # NLTK-only preprocessing
INPUT_DIR_SPACY = './test_output_spacy'      # NLTK + spaCy preprocessing
INPUT_DIR_STANFORD = './test_output_stanford' # NLTK + Stanford preprocessing (optional)

# OUTPUT DIRECTORIES (for alignment results)
OUTPUT_DIR_ALIGNMENT = './test_alignment_results'
OUTPUT_DIR_BASELINE = './test_baseline_results'

# Create output directories
for dir_path in [OUTPUT_DIR_ALIGNMENT, OUTPUT_DIR_BASELINE]:
    os.makedirs(dir_path, exist_ok=True)
    print(f"‚úì Created directory: {dir_path}")

‚úì Created directory: ./test_alignment_results
‚úì Created directory: ./test_baseline_results


In [3]:
# ============================================================
# Verify Input Data Exists
# ============================================================

print("Verifying preprocessed input data...")
print("="*60)

input_dirs = {
    "NLTK-only (REQUIRED)": INPUT_DIR_BASIC,
    "spaCy-tagged (OPTIONAL)": INPUT_DIR_SPACY,
    "Stanford-tagged (OPTIONAL)": INPUT_DIR_STANFORD
}

available_inputs = {}

for label, path in input_dirs.items():
    if os.path.exists(path):
        files = [f for f in os.listdir(path) if f.endswith('.txt')]
        print(f"‚úì {label}: {path}")
        print(f"  Found {len(files)} files")
        available_inputs[label] = path
    else:
        print(f"‚úó {label}: {path} (not found)")

if "NLTK-only (REQUIRED)" not in available_inputs:
    print("\n‚ùå ERROR: Required NLTK-only preprocessing data not found!")
    print("Please run test_prepare_transcripts.ipynb first.")
else:
    print(f"\n‚úì Ready to test with {len(available_inputs)} input types")

Verifying preprocessed input data...
‚úì NLTK-only (REQUIRED): ./test_output_basic
  Found 21 files
‚úì spaCy-tagged (OPTIONAL): ./test_output_spacy
  Found 21 files
‚úì Stanford-tagged (OPTIONAL): ./test_output_stanford
  Found 21 files

‚úì Ready to test with 3 input types


In [4]:
# ============================================================
# Inspect Sample Preprocessed File
# ============================================================

print("Inspecting sample preprocessed file...")
print("="*60)

# Load a sample file
sample_file = [f for f in os.listdir(INPUT_DIR_BASIC) if f.endswith('.txt')][0]
sample_path = os.path.join(INPUT_DIR_BASIC, sample_file)

print(f"\nReading: {sample_file}\n")

df_sample = pd.read_csv(sample_path, sep='\t', encoding='utf-8')

print(f"Columns: {df_sample.columns.tolist()}")
print(f"Rows: {len(df_sample)}")
print(f"\nFirst 3 rows:")
df_sample.head(3)

Inspecting sample preprocessed file...

Reading: time197-cond1.txt

Columns: ['participant', 'content', 'token', 'lemma', 'tagged_token', 'tagged_lemma', 'file']
Rows: 76

First 3 rows:


Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,file
0,cgv,that was fun,"['that', 'was', 'fun']","['that', 'be', 'fun']","[('that', 'DT'), ('was', 'VBD'), ('fun', 'NN')]","[('that', 'DT'), ('be', 'VB'), ('fun', 'NN')]",time197-cond1.txt
1,kid,dad you should have climbed the cliffs with us,"['dad', 'you', 'should', 'have', 'climbed', 't...","['dad', 'you', 'should', 'have', 'climb', 'the...","[('dad', 'NN'), ('you', 'PRP'), ('should', 'MD...","[('dad', 'NN'), ('you', 'PRP'), ('should', 'MD...",time197-cond1.txt
2,cgv,next time i will,"['next', 'time', 'i', 'will']","['next', 'time', 'i', 'will']","[('next', 'JJ'), ('time', 'NN'), ('i', 'NN'), ...","[('next', 'JJ'), ('time', 'NN'), ('i', 'NN'), ...",time197-cond1.txt


## TEST 1: Lexical-Syntactic Alignment (NLTK Tags Only)


In [5]:
# Initialize the lexical-syntactic analyzer
print("\nInitializing LexicalSyntacticAlignment analyzer...")

analyzer_lexsyn = LinguisticAlignment(
    alignment_type="lexsyn",
    cache_dir=os.path.join(OUTPUT_DIR_ALIGNMENT, "cache")
)

print("‚úì Analyzer initialized")


Initializing LexicalSyntacticAlignment analyzer...
‚úì Analyzer initialized


In [6]:
# Run alignment analysis on NLTK-only preprocessed data
print("\nRunning lexical-syntactic alignment analysis...")
print(f"Input folder: {INPUT_DIR_BASIC}")
print(f"Output folder: {OUTPUT_DIR_ALIGNMENT}")

results_lexsyn_nltk = analyzer_lexsyn.analyze_folder(
    folder_path=INPUT_DIR_BASIC,
    output_directory=OUTPUT_DIR_ALIGNMENT,
    lag=1,
    max_ngram=2,
    ignore_duplicates=True,
    add_additional_tags=False  # NLTK tags only
)

print(f"\n‚úì Alignment analysis complete!")
print(f"Utterance pairs analyzed: {len(results_lexsyn_nltk)}")


Running lexical-syntactic alignment analysis...
Input folder: ./test_output_basic
Output folder: ./test_alignment_results
ANALYZE_FOLDER: Processing data from folder: ./test_output_basic with lag=1


AttributeError: 'LexicalSyntacticAlignment' object has no attribute 'analyze_folder'

In [7]:
# Check which version of alignment.py is being loaded
import align_test.alignment as align_module
print(f"Loading alignment.py from: {align_module.__file__}")

# Check if it has the corrected code
import inspect
source = inspect.getsource(align_module.LinguisticAlignment.__init__)
if 'add_additional_tags' in source:
    print("‚úì Using UPDATED alignment.py")
else:
    print("‚úó Using OLD alignment.py - needs to be replaced")

Loading alignment.py from: /Users/ndd697/Desktop/Github-Projects/llm-linguistic-alignment/src/align_test/alignment.py
‚úó Using OLD alignment.py - needs to be replaced


In [8]:
# from align_test.alignment import LinguisticAlignment

# Initialize with multiple alignment types
analyzer = LinguisticAlignment(
    alignment_types=["lexsyn"],  # Run one or multiple analyzers
    cache_dir=os.path.join(OUTPUT_DIR_ALIGNMENT, "cache")
)

# Configure parameters for FastText
fasttext_params = {
    "high_sd_cutoff": 3,    # Filter out words with frequency > mean + 3*std
    "low_n_cutoff": 2,      # Filter out words occurring < 2 times
    "save_vocab": True      # Save vocabulary lists to output directory
}

# Configure parameters for Lexical/Syntactic analysis
lexsyn_params = {
    "max_ngram": 3,         # Maximum n-gram size
    "ignore_duplicates": True,
    "add_additional_tags": True,
    "additional_tagger_type":'stanford'  # Include Stanford or Spacy POS tags if available
}

# Common parameters for all analyzers
common_params = {
    "lag": 1  # Number of turns to lag
}

# Analyze real conversations
real_results = analyzer.analyze_folder(
    folder_path=INPUT_DIR_BASIC,
    output_directory=OUTPUT_DIR_ALIGNMENT,
    **common_params,
    **fasttext_params,
    **lexsyn_params
)

ANALYZE_FOLDER: Processing data from folder: ./test_output_basic with lag=1


AttributeError: 'LexicalSyntacticAlignment' object has no attribute 'analyze_folder'

In [None]:
# DIAGNOSTIC: Check if the file on disk actually has the new code
import align_test.alignment as align_module

# Get the file path
file_path = align_module.__file__
print(f"Python is loading from: {file_path}")

# Read the file directly from disk
with open(file_path, 'r') as f:
    file_contents = f.read()

# Check what's actually in the file on disk
if 'add_additional_tags' in file_contents:
    print("‚úì File ON DISK contains 'add_additional_tags'")
else:
    print("‚úó File ON DISK does NOT contain 'add_additional_tags'")

# Check what Python is seeing in memory
import inspect
source = inspect.getsource(align_module.LinguisticAlignment.__init__)
if 'add_additional_tags' in source:
    print("‚úì Loaded IN MEMORY contains 'add_additional_tags'")
else:
    print("‚úó Loaded IN MEMORY does NOT contain 'add_additional_tags'")

# Check for .pyc cache files
import pathlib
pyc_files = list(pathlib.Path(file_path).parent.glob('__pycache__/*.pyc'))
print(f"\n.pyc cache files found: {len(pyc_files)}")
for pyc in pyc_files:
    print(f"  - {pyc}")

In [None]:
# NUCLEAR OPTION: Clear all Python cache
import shutil
import pathlib
import align_test.alignment as align_module

# Find the align_test directory
align_test_dir = pathlib.Path(align_module.__file__).parent

# Remove __pycache__ directory
pycache_dir = align_test_dir / '__pycache__'
if pycache_dir.exists():
    print(f"Removing cache: {pycache_dir}")
    shutil.rmtree(pycache_dir)
    print("‚úì Cache removed")

# Also check parent __pycache__
parent_pycache = align_test_dir.parent / '__pycache__'
if parent_pycache.exists():
    print(f"Removing parent cache: {parent_pycache}")
    shutil.rmtree(parent_pycache)
    print("‚úì Parent cache removed")

print("\nüîÑ Now RESTART KERNEL and re-import!")