In [1]:
import random
from typing import List

from tqdm import tqdm

from analysis.attribute_retriving import split_text_into_chunks
from analysis.nlp_transformations import split_into_sentences
from analysis.nlp_transformations import preprocess_text
from dao.lab_report import DAOLabReport
from models.lab_report import LabReport
from models.lab_report import LabReportInDB

In [2]:
# Names handed to the DAO constructor: where the original reports are read
# from and where the mixed reports will be written to.
OG_REPORTS_NAME = 'lab_reports-no_toc_biblio_11-03-25'
MIXED_REPORTS_NAME = 'lab_reports-mixed'

dao_og_lab_reports: DAOLabReport = DAOLabReport(OG_REPORTS_NAME)
dao_mixed_lab_reports: DAOLabReport = DAOLabReport(MIXED_REPORTS_NAME)

# Only reports flagged as not generated are used as mixing material.
non_generated_query = {'is_generated': False}
og_lab_reports: List[LabReportInDB] = dao_og_lab_reports.find_many_by_query(non_generated_query)

In [3]:
def get_chunk_word_count(text, min_words=75, medium_max_words=150,
                         long_max_words=300, long_text_threshold=500,
                         rng=None):
    """
    Pick a target chunk size (in words) for ``text``.

    The size depends on the number of whitespace-separated words in ``text``:

      - fewer than ``min_words`` words: the word count itself is returned, so
        the whole text forms a single chunk (this is 0 for empty or
        whitespace-only text);
      - fewer than ``long_text_threshold`` words: a random integer in
        ``[min_words, medium_max_words]``;
      - otherwise: a random integer in
        ``[min_words, min(long_max_words, total_words)]``.

    Args:
        text: The text whose length drives the choice.
        min_words: Lower bound of every randomly drawn size, and the cut-off
            below which the text is kept as one chunk.
        medium_max_words: Upper bound of the draw for medium-length texts.
        long_max_words: Upper bound of the draw for long texts.
        long_text_threshold: Word count from which a text counts as long.
        rng: Optional ``random.Random`` instance to draw from. When omitted,
            the module-level ``random`` generator is used, so seeding it
            with ``random.seed`` makes the result reproducible.

    Returns:
        The chosen chunk size as an ``int``.

    Raises:
        ValueError: If ``min_words`` exceeds one of the upper bounds, which
            would leave an empty range to draw from.
    """
    if min_words > medium_max_words or min_words > long_max_words:
        raise ValueError(
            "min_words must not exceed medium_max_words or long_max_words"
        )

    randint = random.randint if rng is None else rng.randint
    total_words = len(text.split())

    if total_words < min_words:
        # Very short text: keep it as a single chunk.
        return total_words
    if total_words < long_text_threshold:
        return randint(min_words, medium_max_words)
    # Cap at the text length so a custom threshold cannot produce a chunk
    # size larger than the text; with the defaults the cap is long_max_words.
    return randint(min_words, min(long_max_words, total_words))

def mix_texts(text1, text2):
    """
    Build two mixed texts by swapping chunks between the two inputs.

    Each input is split into sentences and grouped into chunks; the chunk at
    position ``k`` of each text is then routed by the parity of ``k``:

      - even ``k``: text1's chunk goes to the first mix, text2's to the second;
      - odd ``k``:  text1's chunk goes to the second mix, text2's to the first.

    When one text runs out of chunks, the remaining chunks of the other text
    keep following the same parity routing.

    Returns a tuple ``(mix1, mix2)`` of space-joined strings.
    """
    # Sentence splitting ("pl" is handed to the splitter unchanged).
    sents_a = split_into_sentences(text1, "pl")
    sents_b = split_into_sentences(text2, "pl")

    # A chunk size is chosen separately for each text.
    size_a = get_chunk_word_count(text1)
    size_b = get_chunk_word_count(text2)

    parts_a = split_text_into_chunks(sents_a, size_a)
    parts_b = split_text_into_chunks(sents_b, size_b)

    first_mix, second_mix = [], []
    count_a, count_b = len(parts_a), len(parts_b)

    for pos in range(max(count_a, count_b)):
        # Parity decides which output receives which text's chunk.
        if pos % 2 == 0:
            dest_a, dest_b = first_mix, second_mix
        else:
            dest_a, dest_b = second_mix, first_mix

        # Each text contributes only while it still has a chunk at this position.
        if pos < count_a:
            dest_a.append(parts_a[pos])
        if pos < count_b:
            dest_b.append(parts_b[pos])

    return " ".join(first_mix), " ".join(second_mix)

In [4]:
# Seed the global RNG so the randomly drawn chunk sizes (and therefore the
# mixed texts) are identical on every Restart & Run All.
MIX_SEED = 42
random.seed(MIX_SEED)

mixed_texts: List[str] = []

# Walk the reports two at a time; each pair yields two mixed texts.
for i in tqdm(range(0, len(og_lab_reports), 2), desc="Mixing texts"):
    index1 = i
    # With an odd number of reports the last one has no partner, so it wraps
    # around and is paired with the first report (index 0).
    index2 = (i + 1) % len(og_lab_reports)

    text1 = preprocess_text(og_lab_reports[index1].plaintext_content)
    text2 = preprocess_text(og_lab_reports[index2].plaintext_content)
    mix1, mix2 = mix_texts(text1, text2)
    mixed_texts.extend([mix1, mix2])


Mixing texts: 100%|██████████| 1260/1260 [01:31<00:00, 13.74it/s]


In [11]:
# Sanity check: each iteration of the mixing loop adds two texts to the list.
len(mixed_texts)

2520

In [10]:
# Store every mixed text as a new LabReport through the mixed-reports DAO.
# NOTE(review): this cell has no duplicate guard — re-running it appears to
# insert the same texts again; confirm and clear the target before a re-run.
for mixed_text in mixed_texts:
    lab_report = LabReport(
        plaintext_content=mixed_text,
        is_generated=False,
        is_mixed=True,
        tag="None",  # NOTE(review): literal string "None", not the None object — confirm intended
    )
    dao_mixed_lab_reports.insert_one(lab_report)