In [None]:
import re
import numpy as np

In [41]:
def gale_church_align(english_sentences, french_sentences, merge_threshold=5):
    """
    Align English and French sentences with a length-based dynamic program.

    Each sentence is measured by its whitespace-separated word count. A DP
    table over (English prefix, French prefix) is filled using three moves:
    pair one English sentence with one French sentence, consume one extra
    English sentence, or consume one extra French sentence. A move costs the
    absolute word-count difference of the two sentences at that cell; moves
    along the table's first row / first column cost nothing.

    During path reconstruction, extra sentences consumed by a merge move are
    joined onto the preceding 1:1 pair. Extra sentences that come before the
    very first 1:1 pair are joined onto that first pair, so no input sentence
    is dropped. If the path contains no 1:1 pair at all (one side is empty),
    a single pair holding all remaining text is returned.

    Args:
        english_sentences: list of English sentence strings.
        french_sentences: list of French sentence strings.
        merge_threshold: accepted for interface compatibility; the alignment
            below does not use it.

    Returns:
        List of (english, french) string tuples in document order. Pairs whose
        two sides are both blank are removed.
    """
    eng_len = len(english_sentences)
    fr_len = len(french_sentences)

    # Convert sentences to word count lengths
    eng_lengths = np.array([len(sent.split()) for sent in english_sentences])
    fr_lengths = np.array([len(sent.split()) for sent in french_sentences])

    # cost_matrix[a, b] = |words in English a - words in French b|
    cost_matrix = np.abs(eng_lengths[:, None] - fr_lengths[None, :])

    # dp[i, j] = cheapest cost of aligning the first i English sentences
    # with the first j French sentences (INF until a move reaches the cell)
    dp = np.full((eng_len + 1, fr_len + 1), float('inf'))
    dp[0, 0] = 0

    # Back-pointers: 1 = pair both, 2 = consume English only, 3 = consume French only
    backtrace = np.zeros((eng_len + 1, fr_len + 1), dtype=int)

    for i in range(eng_len + 1):
        for j in range(fr_len + 1):
            if i > 0 and j > 0:  # Pair English i with French j
                cost = cost_matrix[i - 1, j - 1]
                if dp[i - 1, j - 1] + cost < dp[i, j]:
                    dp[i, j] = dp[i - 1, j - 1] + cost
                    backtrace[i, j] = 1

            if i > 0:  # Consume an extra English sentence (free in column 0)
                cost = cost_matrix[i - 1, j - 1] if j > 0 else 0
                if dp[i - 1, j] + cost < dp[i, j]:
                    dp[i, j] = dp[i - 1, j] + cost
                    backtrace[i, j] = 2

            if j > 0:  # Consume an extra French sentence (free in row 0)
                cost = cost_matrix[i - 1, j - 1] if i > 0 else 0
                if dp[i, j - 1] + cost < dp[i, j]:
                    dp[i, j] = dp[i, j - 1] + cost
                    backtrace[i, j] = 3

    # Walk the back-pointers from the end; pairs are collected last-to-first
    aligned_sentences = []
    i, j = eng_len, fr_len
    eng_buffer, fr_buffer = [], []

    while i > 0 or j > 0:
        if backtrace[i, j] == 1:  # 1:1 step: flush buffered extras into this pair
            eng_buffer.append(english_sentences[i - 1])
            fr_buffer.append(french_sentences[j - 1])
            aligned_sentences.append((" ".join(reversed(eng_buffer)), " ".join(reversed(fr_buffer))))
            eng_buffer, fr_buffer = [], []
            i -= 1
            j -= 1

        elif backtrace[i, j] == 2:  # Extra English sentence: hold until the next flush
            eng_buffer.append(english_sentences[i - 1])
            i -= 1

        elif backtrace[i, j] == 3:  # Extra French sentence: hold until the next flush
            fr_buffer.append(french_sentences[j - 1])
            j -= 1

    # Sentences still buffered here precede the first 1:1 pair. Without this
    # step they would be silently lost.
    if eng_buffer or fr_buffer:
        lead_eng = " ".join(reversed(eng_buffer))
        lead_fr = " ".join(reversed(fr_buffer))
        if aligned_sentences:
            # The list is still reversed, so the earliest pair is the last item
            first_eng, first_fr = aligned_sentences[-1]
            aligned_sentences[-1] = (
                " ".join(part for part in (lead_eng, first_eng) if part),
                " ".join(part for part in (lead_fr, first_fr) if part),
            )
        else:
            aligned_sentences.append((lead_eng, lead_fr))

    # Reverse since we constructed it backwards
    aligned_sentences.reverse()

    # Remove pairs where both sides are blank
    aligned_sentences = [(e, f) for e, f in aligned_sentences if e.strip() or f.strip()]

    return aligned_sentences

In [None]:
def clean_text(text):
    """
    Strip quotation characters from `text` and trim its ends.

    Every occurrence of «, » and the straight double quote (") is deleted,
    then leading and trailing whitespace is removed. Whitespace inside the
    text is left untouched.
    """
    without_quotes = re.sub(r'[«»"]', '', text)
    return without_quotes.strip()

def split_into_sentences(text):
    """
    Split `text` into a list of sentences.

    The text is split on whitespace that follows '.', '!' or '?'. A fragment
    that ends with one of the known abbreviations (e.g. "Mr.", "Dr.", "M.")
    is held back and glued to the next fragment, so the abbreviation's period
    does not end a sentence. Each finished sentence then has a leading em dash
    removed, and a sentence starting with a comma is appended to the previous
    sentence (when there is one).

    Sentences that end up empty (empty input, or a fragment consisting only of
    an em dash) are skipped, so the result never contains blank entries.

    Args:
        text: the text to split.

    Returns:
        List of sentence strings; empty when `text` has no content.
    """
    # NOTE: matching is a plain suffix test, so a fragment ending in a longer
    # token with the same tail (e.g. "A.M." ends with "M.") is also held back.
    abbreviations = ("Mr.", "Mrs.", "Ms.", "Dr.", "Jr.", "Sr.", "St.", "Prof.",
                     "Capt.", "Lt.", "Col.", "Gen.", "Sgt.", "Mt.", "M.")
    merged_sentences = []

    def _emit(sentence):
        """Post-process one finished sentence and add it to the result."""
        # Remove em dash if it's at the start of a sentence
        sentence = re.sub(r'^\s*—\s*', '', sentence)
        if not sentence.strip():
            return  # nothing left: do not record a blank sentence
        if sentence.startswith(",") and merged_sentences:
            merged_sentences[-1] += " " + sentence  # continuation of the previous sentence
        else:
            merged_sentences.append(sentence)

    pending = ""  # fragment held back because it ended with an abbreviation
    for fragment in re.split(r'(?<=[.!?])\s+', text.strip()):
        if pending:
            fragment = pending + " " + fragment
            pending = ""

        if fragment.endswith(abbreviations):
            pending = fragment  # wait for the next fragment before deciding
            continue

        _emit(fragment)

    # Text that ended on an abbreviation gets the same clean-up as any other sentence
    if pending:
        _emit(pending)

    return merged_sentences

In [None]:
def process_and_align(english_input, french_input, english_output, french_output, merge_threshold=50):
    """
    Read two text files, split each into sentences, align them, and write the
    aligned sentences to two output files with one sentence pair per line.

    Args:
        english_input: path of the English source text (UTF-8).
        french_input: path of the French source text (UTF-8).
        english_output: path to write the aligned English sentences to.
        french_output: path to write the aligned French sentences to.
        merge_threshold: forwarded to `gale_church_align`.

    Progress and a warning on unequal non-blank line counts are printed.
    Nothing is returned.
    """
    # Step 1: Merge all non-blank lines of each file into one big chunk
    with open(english_input, 'r', encoding='utf-8') as eng_in, \
         open(french_input, 'r', encoding='utf-8') as fr_in:
        english_text = ' '.join(line.strip() for line in eng_in if line.strip())
        french_text = ' '.join(line.strip() for line in fr_in if line.strip())

    # Step 2: Clean both texts (remove « », ")
    english_text = clean_text(english_text)
    french_text = clean_text(french_text)

    # Step 3: Split the cleaned texts into sentences
    english_sentences = split_into_sentences(english_text)
    french_sentences = split_into_sentences(french_text)

    print(f"Initial sentence count: English ({len(english_sentences)}), French ({len(french_sentences)})")

    # Step 4: Align the two sentence lists
    aligned_sentences = gale_church_align(
        english_sentences, french_sentences, merge_threshold=merge_threshold
    )

    num_aligned = len(aligned_sentences)
    print(f"Aligned sentence count: {num_aligned}")

    # Count the non-blank lines on each side to detect one-sided pairs
    eng_count = sum(1 for eng, _ in aligned_sentences if eng.strip())
    fr_count = sum(1 for _, fr in aligned_sentences if fr.strip())

    if eng_count != fr_count:
        print(f"Sentence mismatch after alignment! EN = {eng_count}, FR = {fr_count}")

    # Step 5: Save the aligned pairs, one line per pair in each output file
    with open(english_output, 'w', encoding='utf-8') as eng_out, \
         open(french_output, 'w', encoding='utf-8') as fr_out:

        for eng_sentence, fr_sentence in aligned_sentences:
            eng_out.write(eng_sentence.strip() + '\n')
            fr_out.write(fr_sentence.strip() + '\n')

    print("✅ Conversion Complete:")
    print(f"English file: {english_output}")
    print(f"French file: {french_output}")


In [None]:
# Run the full pipeline: read the two source texts, align them, and write the
# aligned sentence files 'prep-full.e' and 'prep-full.f'.
# NOTE(review): the first argument is bound to `english_input`, yet the file passed
# is named 'french.txt' — confirm the two input paths are not swapped.
process_and_align('french.txt', 'gutenburg.txt', 'prep-full.e','prep-full.f')

🔹 Initial sentence count: English (9930), French (9629)
🔹 Aligned sentence count: 9158
✅ Conversion Complete:
English file: prep-full.e
French file: prep-full.f


# Manual Alignment

In [None]:
from itertools import zip_longest

# Read both aligned chapter files; each line holds one sentence
with open('aligned-ch1.e', 'r', encoding='utf-8') as eng_file:
    english_sentences = eng_file.readlines()
with open('aligned-ch1.f', 'r', encoding='utf-8') as fr_file:
    french_sentences = fr_file.readlines()

# Walk the two files in lockstep; the shorter one is padded with empty strings
for idx, (eng_sent, fr_sent) in enumerate(
    zip_longest(english_sentences, french_sentences, fillvalue='')
):
    eng_sent, fr_sent = eng_sent.strip(), fr_sent.strip()
    eng_len, fr_len = len(eng_sent), len(fr_sent)

    # Nothing to report when both sides are blank
    if eng_len == 0 and fr_len == 0:
        continue

    # Flag a pair when one side has more than 1.5x the characters of the other
    if fr_len > 1.5 * eng_len:
        status = "⚠️ French significantly longer"
    elif eng_len > 1.5 * fr_len:
        status = "⚠️ English significantly longer"
    else:
        status = "✅ Aligned"

    # Show the verdict followed by both sentences and a separator rule
    print(f"Line {idx + 1}: {status}")
    print(f"EN: {eng_sent}")
    print(f"FR: {fr_sent}")
    print("-" * 50)

Line 1: ✅ Aligned
EN: There was no possibility of taking a walk that day.
FR: Impossible de nous promener ce jour-là.
--------------------------------------------------
Line 2: ✅ Aligned
EN: We had been wandering, indeed, in the leafless shrubbery an hour in the morning; but since dinner (Mrs. Reed, when there was no company, dined early) the cold winter wind had brought with it clouds so sombre, and a rain so penetrating, that further out-door exercise was now out of the question.
FR: À vrai dire, nous avions fait un tour le matin pendant une heure dans le bosquet dépouillé de ses feuilles ; mais depuis le dîner (Mme Reed dînait de bonne heure quand il n’y avait pas d’invités), le vent froid de l’hiver avait apporté des nuages si sombres et une pluie si pénétrante qu’il ne pouvait plus être question désormais de prendre de l’exercice au-dehors.
--------------------------------------------------
Line 3: ✅ Aligned
EN: I was glad of it: I never liked long walks, especially on chilly afte