In [6]:
import json
import os
from collections import defaultdict

# Run from the project root so the relative paths used in later cells resolve.
# The location can be overridden with the IBM_MODELS_ROOT environment variable
# on other machines; the default keeps the original hardcoded checkout path.
PROJECT_ROOT = os.environ.get(
    "IBM_MODELS_ROOT",
    "/Users/noracai/Documents/_coding_projects/IBM-Models",
)
os.chdir(PROJECT_ROOT)
print(os.getcwd())

/Users/noracai/Documents/_coding_projects/IBM-Models


In [None]:
def find_best_prediction(model_dir):
    """Find the prediction file with the best (lowest) AER score.

    Reads ``<model_dir>/AERs``, parsing one float per line. The score on
    line ``k`` (1-based) is associated with the file name ``prediction-k``.
    Blank lines (for example a trailing newline at the end of the file) are
    ignored without shifting the numbering of the remaining lines.

    Args:
        model_dir: Directory that contains the ``AERs`` score file.

    Returns:
        A tuple ``(prediction_name, best_aer)`` where ``prediction_name`` is
        ``"prediction-<k>"`` for the first line holding the lowest score and
        ``best_aer`` is that score as a float.

    Raises:
        FileNotFoundError: If ``<model_dir>/AERs`` does not exist.
        ValueError: If a non-blank line is not a valid float, or if the file
            contains no scores at all.
    """
    aer_file = os.path.join(model_dir, 'AERs')

    # Key scores by their line number so that skipping a blank line never
    # changes which prediction file a score refers to.
    scores_by_epoch = {}
    with open(aer_file, 'r', encoding='utf-8') as f:
        for epoch, line in enumerate(f, start=1):
            text = line.strip()
            if text:
                scores_by_epoch[epoch] = float(text)

    if not scores_by_epoch:
        raise ValueError(f"No AER scores found in {aer_file}")

    # Dicts preserve insertion (ascending line) order, so ties resolve to the
    # earliest line, matching list.index(min(...)) semantics.
    best_epoch = min(scores_by_epoch, key=scores_by_epoch.get)
    best_aer = scores_by_epoch[best_epoch]

    return f"prediction-{best_epoch}", best_aer


In [8]:
# Directory of the model run to inspect (relative to the current working
# directory); it must contain the 'AERs' file read by find_best_prediction.
model_dir = 'prediction/validation/IBM2/uniform-init'
best_prediction, best_aer = find_best_prediction(model_dir)

# Show which prediction file has the lowest AER and the score itself.
print(best_prediction, best_aer)

prediction-5 0.23988439306358378


In [9]:
def load_sentences(file_path):
    """Read a UTF-8 text file into a list of whitespace-split token lists.

    Each line of the file becomes one entry in the result, in file order.
    A blank line yields an empty list.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of lists of strings, one inner list per line.
    """
    with open(file_path, 'r', encoding='utf-8') as corpus:
        return [raw_line.strip().split() for raw_line in corpus]


In [10]:
# Paths (relative to the current working directory) of the two text files;
# each is read into a list of token lists, one per line.
english_file = 'jane-eyre/Fr_all.e'
french_file = 'jane-eyre/Fr_all.f'
english_sentences = load_sentences(english_file)
french_sentences = load_sentences(french_file)

In [12]:
# Sanity check: print the first 100 tokenised English sentences.
# NOTE(review): this produces a very large output cell; a smaller slice
# would keep the notebook readable.
print(english_sentences[:100])

[['CHAPTER', 'I', 'There', 'was', 'no', 'possibility', 'of', 'taking', 'a', 'walk', 'that', 'day.'], ['We', 'had', 'been', 'wandering,', 'indeed,', 'in', 'the', 'leafless', 'shrubbery', 'an', 'hour', 'in', 'the', 'morning;', 'but', 'since', 'dinner', 'the', 'cold', 'winter', 'wind', 'had', 'brought', 'with', 'it', 'clouds', 'so', 'sombre,', 'and', 'a', 'rain', 'so', 'penetrating,', 'that', 'further', 'out-door', 'exercise', 'was', 'now', 'out', 'of', 'the', 'question.'], ['I', 'was', 'glad', 'of', 'it:', 'I', 'never', 'liked', 'long', 'walks,', 'especially', 'on', 'chilly', 'afternoons:', 'dreadful', 'to', 'me', 'was', 'the', 'coming', 'home', 'in', 'the', 'raw', 'twilight,', 'with', 'nipped', 'fingers', 'and', 'toes,', 'and', 'a', 'heart', 'saddened', 'by', 'the', 'chidings', 'of', 'Bessie,', 'the', 'nurse,', 'and', 'humbled', 'by', 'the', 'consciousness', 'of', 'my', 'physical', 'inferiority', 'to', 'Eliza,', 'John,', 'and', 'Georgiana', 'Reed.'], ['The', 'said', 'Eliza,', 'John,', '

In [None]:
def get_context(sentence, word_index, context_size=4):
    """Return the window of words surrounding one position in a sentence.

    The window covers up to ``context_size`` items before ``word_index``,
    the item at ``word_index`` itself, and up to ``context_size`` items
    after it. It is truncated at the start and end of the sentence.

    Args:
        sentence: Sequence of words.
        word_index: 0-based position of the word of interest.
        context_size: Number of neighbours to include on each side.

    Returns:
        The slice of ``sentence`` forming the context window.
    """
    window_start = max(word_index - context_size, 0)
    window_stop = min(word_index + context_size + 1, len(sentence))
    return sentence[window_start:window_stop]

In [14]:
def process_alignments(alignment_file, english_file, french_file, output_file):
    """Process alignment file and extract context for each alignment.

    Each line of ``alignment_file`` with at least four whitespace-separated
    fields is read as ``<sentence id> <english pos> <french pos> ...``, all
    1-based integers; any further fields are required to be present but are
    not read. Lines with fewer than four fields are ignored.

    For every alignment whose sentence id and word positions fall inside the
    loaded corpora, a record with the aligned words, their context windows
    (see ``get_context``) and the full sentences is produced. All records are
    written to ``output_file`` as a JSON list and also returned.

    Alignments that point outside the data are skipped. This includes a
    sentence id or position of 0 or below in the file: after conversion to
    0-based indexing these become negative and would otherwise silently wrap
    around to the end of the sentence via Python's negative indexing.
    (Presumably position 0 denotes a NULL alignment; confirm against the
    tool that writes the prediction files.)

    Args:
        alignment_file: Path of the alignment/prediction file.
        english_file: Path of the English sentence file, one sentence per line.
        french_file: Path of the French sentence file, one sentence per line.
        output_file: Path the JSON results are written to (UTF-8).

    Returns:
        A list of dicts, ordered by sentence id and then by order of
        appearance in the alignment file. ``sentence_id`` is 1-based, while
        ``english_position`` and ``french_position`` are 0-based.

    Raises:
        ValueError: If one of the first three fields of a line with at least
            four fields is not an integer.
    """
    # Load sentences
    english_sentences = load_sentences(english_file)
    french_sentences = load_sentences(french_file)
    # A sentence id is usable only if it exists in both corpora.
    num_sentences = min(len(english_sentences), len(french_sentences))

    # Group alignments by sentence, converting every field to 0-based indexing
    sentence_alignments = defaultdict(list)
    with open(alignment_file, 'r') as f:
        for line in f:
            parts = line.split()
            if len(parts) < 4:
                continue
            sent_id, english_pos, french_pos = (int(p) - 1 for p in parts[:3])
            sentence_alignments[sent_id].append((english_pos, french_pos))

    # Process each sentence
    results = []
    for sent_id in sorted(sentence_alignments):
        # Reject ids on both sides: a negative id would index from the end.
        if not 0 <= sent_id < num_sentences:
            continue

        english_sent = english_sentences[sent_id]
        french_sent = french_sentences[sent_id]

        for english_pos, french_pos in sentence_alignments[sent_id]:
            # Skip positions outside the sentence, including negative ones
            # that would otherwise wrap around to the last words.
            if not (0 <= english_pos < len(english_sent)
                    and 0 <= french_pos < len(french_sent)):
                continue

            # Get context for both words
            english_context = get_context(english_sent, english_pos)
            french_context = get_context(french_sent, french_pos)

            results.append({
                'sentence_id': sent_id + 1,  # Convert back to 1-based for display
                'english_word': english_sent[english_pos],
                'french_word': french_sent[french_pos],
                'english_position': english_pos,
                'french_position': french_pos,
                'english_context': ' '.join(english_context),
                'french_context': ' '.join(french_context),
                'full_english_sentence': ' '.join(english_sent),
                'full_french_sentence': ' '.join(french_sent)
            })

    # Save results
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    return results


In [17]:
# Build the path of the lowest-AER prediction file inside the model directory
# and extract word contexts for its alignments; results are also written to
# the JSON file named below.
alignment_file = os.path.join(model_dir, best_prediction)
output_file = "context_output_best_model.json"
results = process_alignments(alignment_file, english_file, french_file, output_file)