In [87]:
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Module-level tokenizer and seq2seq model, both loaded from the same
# "Helsinki-NLP/opus-mt-zh-en" checkpoint name. translate_zh_to_en reads these
# two globals, so this cell must run before any translation is attempted.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")

def translate_zh_to_en(text):
    """Translate Chinese text to English using the module-level tokenizer/model.

    Args:
        text: The source text handed to the tokenizer (a string; a list of
            strings is tokenized as a batch).

    Returns:
        The list produced by ``tokenizer.batch_decode`` with special tokens
        stripped, i.e. one decoded string per generated sequence.
    """
    # Call the tokenizer directly instead of the deprecated
    # `prepare_seq2seq_batch`; padding/truncation are set explicitly to match
    # what that helper applied by default.
    encoded_inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    generated_ids = model.generate(**encoded_inputs)
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [200]:
def find_sentence_end(chinese_lines, similarities, max_range=3):
    """Pick the line index that best marks the end of a paragraph.

    Starts from the index with the highest similarity and, if that line does
    not end with the Chinese full stop "。", looks at neighbours up to
    ``max_range`` lines away (clamped to the list bounds) for one that does.

    Args:
        chinese_lines: Candidate lines (strings). May be longer than
            ``similarities``.
        similarities: Similarity score per line, aligned with the start of
            ``chinese_lines``. Must not be empty.
        max_range: Maximum distance from the best-scoring index to search.

    Returns:
        An index into ``chinese_lines``. Falls back to the best-scoring index
        when no line ends with "。" or none is found within ``max_range``.

    Raises:
        ValueError: If ``similarities`` is empty.
    """
    if not similarities:
        raise ValueError("similarities must not be empty")

    def ends_with_period(line_index):
        # str.endswith is safe for empty lines, unlike line[-1].
        return chinese_lines[line_index].endswith("。")

    best_index = similarities.index(max(similarities))

    if ends_with_period(best_index):
        return best_index

    # Nothing to snap to if no line at all ends a sentence.
    if not any(line.endswith("。") for line in chinese_lines):
        return best_index

    last_line_index = len(chinese_lines) - 1
    for offset in range(1, max_range + 1):
        left_index = max(best_index - offset, 0)
        right_index = min(best_index + offset, last_line_index)

        left_ends = ends_with_period(left_index)
        right_ends = ends_with_period(right_index)

        if left_ends and right_ends:
            # Compare the two candidates directly rather than searching the
            # score list by value, which could match an unrelated index.
            # The right neighbour may lie past the scored lines; without a
            # score it cannot beat the left one. Ties go to the left.
            right_is_scored = right_index < len(similarities)
            if right_is_scored and similarities[right_index] > similarities[left_index]:
                return right_index
            return left_index

        if left_ends:
            return left_index
        if right_ends:
            return right_index

    return best_index

In [214]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm


def find_chinese_paragraph_end_index(english_paragraph, remainder_chinese_lines, vectorizer):
    """Find where the Chinese counterpart of an English paragraph ends.

    Grows a prefix of ``remainder_chinese_lines`` one line at a time,
    translates the joined prefix to English, and scores it against
    ``english_paragraph`` by cosine similarity of their vectorized forms.
    Scanning stops early on a similarity of exactly 1 or after two consecutive
    decreases. The collected scores are then passed to ``find_sentence_end``.

    Args:
        english_paragraph: The English paragraph to match.
        remainder_chinese_lines: Chinese lines not yet assigned to a paragraph.
        vectorizer: A fitted vectorizer exposing ``transform``.

    Returns:
        The index, relative to ``remainder_chinese_lines``, chosen by
        ``find_sentence_end``.
    """
    paragraph_vector = vectorizer.transform([english_paragraph])

    previous_step_descended = False
    similarities = []
    for index in range(len(remainder_chinese_lines)):
        # Build the prefix from the lines passed in, not from the notebook
        # global `chinese_lines`; otherwise every paragraph after the first is
        # compared against text from the beginning of the document.
        chinese_prefix = "".join(remainder_chinese_lines[:index + 1])
        translated_prefix = translate_zh_to_en(chinese_prefix)
        prefix_vector = vectorizer.transform([translated_prefix[0]])

        similarity = cosine_similarity(paragraph_vector, prefix_vector)[0][0]
        similarities.append(similarity)

        # A perfect match cannot be improved on.
        if similarity == 1:
            break

        # Stop once the score has dropped on two consecutive steps.
        if len(similarities) > 1 and similarity < similarities[-2]:
            if previous_step_descended:
                break
            previous_step_descended = True
        else:
            previous_step_descended = False

    return find_sentence_end(remainder_chinese_lines, similarities)

In [215]:
def align_chinese_lines(english_paragraphs, chinese_lines):
    """Group Chinese lines into paragraphs matching the English paragraphs.

    Fits a TF-IDF vectorizer on ``english_paragraphs``, then walks the
    paragraphs in order; for each one it asks
    ``find_chinese_paragraph_end_index`` where the matching run of
    still-unassigned Chinese lines ends and joins that run into one string.

    Args:
        english_paragraphs: English paragraphs, in document order.
        chinese_lines: Chinese lines, in document order.

    Returns:
        A list with one joined Chinese string per English paragraph. A
        paragraph reached after every Chinese line has been consumed gets an
        empty string, so the result stays index-aligned with the input.
    """
    aligned_chinese = []
    vectorizer = TfidfVectorizer()
    vectorizer.fit(english_paragraphs)

    paragraph_start = 0
    for english_paragraph in tqdm(english_paragraphs):
        remaining_lines = chinese_lines[paragraph_start:]

        # With no lines left there is nothing to score; searching an empty
        # remainder would fail inside the similarity search.
        if len(remaining_lines) == 0:
            aligned_chinese.append("")
            continue

        end_offset = find_chinese_paragraph_end_index(english_paragraph, remaining_lines, vectorizer)

        # end_offset is relative to remaining_lines and inclusive.
        paragraph_stop = paragraph_start + end_offset + 1

        aligned_chinese.append("".join(chinese_lines[paragraph_start:paragraph_stop]))

        paragraph_start = paragraph_stop

    return aligned_chinese

In [213]:
# Demo input: two English paragraphs (global warming, education).
english_paragraphs = [
    "Global warming is a pressing issue that affects the entire planet. The increase in greenhouse gas emissions has led to rising temperatures and unpredictable weather patterns. In addition, the melting of polar ice caps and glaciers has contributed to the rise in sea levels, posing a threat to coastal communities. It is crucial for governments and individuals to take immediate action to reduce emissions and transition to sustainable energy sources.",
    "Education plays a vital role in shaping the future of individuals and societies. It empowers individuals with knowledge and skills, enabling them to pursue their aspirations and contribute to their communities. Moreover, education promotes social equality and reduces poverty by providing opportunities for marginalized groups. Investment in education is an investment in a brighter future for everyone."
]

# Chinese text for the same two paragraphs, split into 14 lines whose breaks
# often fall mid-sentence; only some lines end with the full stop "。".
chinese_lines = [
    "全球变暖是一个紧迫的问题，",
    "影响着整个地球。温室气体排放",
    "的增加导致气温上升和天气模式的不可预",
    "测性。此外，极地冰盖和冰川的融化导致",
    "海平面上升，对沿海社区构成威胁。",
    "政府和个人立即采取行动，减少排放并转",
    "向可持续能源是至关重要的。",
    "教育在塑造个人和社会",
    "未来方面起着至关重要的作用。它赋予",
    "个人知识和技能，使他们能够追求自己",
    "的愿望并为社区做出贡献。此外，教育通",
    "过为边缘群体提供机会，促进社会平等并减",
    "少贫困。对教育的投资是对每个人更加光",
    "明未来的投资。"
]

# Last expression of the cell: the returned list of joined Chinese paragraphs
# is shown as the cell output.
align_chinese_lines(english_paragraphs, chinese_lines)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:15<00:00,  7.91s/it]


['全球变暖是一个紧迫的问题，影响着整个地球。温室气体排放的增加导致气温上升和天气模式的不可预测性。此外，极地冰盖和冰川的融化导致海平面上升，对沿海社区构成威胁。政府和个人立即采取行动，减少排放并转向可持续能源是至关重要的。',
 '教育在塑造个人和社会未来方面起着至关重要的作用。它赋予个人知识和技能，使他们能够追求自己的愿望并为社区做出贡献。此外，教育通过为边缘群体提供机会，促进社会平等并减少贫困。对教育的投资是对每个人更加光明未来的投资。']