In [None]:
# Install required packages
!pip install openai
!pip install nltk
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import openai
import nltk
from nltk.tokenize import sent_tokenize
import re
from google.colab import userdata
import pandas as pd
from tqdm import tqdm
import os
import PyPDF2
import string
import pdfplumber

# Download the NLTK tokenizer data used by `sent_tokenize`.
nltk.download('punkt')
nltk.download('punkt_tab')  # presumably needed by newer NLTK releases in addition to 'punkt' — TODO confirm


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Best Extractor

## Single-page extractor


In [None]:
def extract_sentences_from_pdf_page(pdf_path, page_number):
    """
    Extract the sentences of one page of a PDF.

    The page text is read with pdfplumber and then handed to
    `extract_sentences_from_text`, which holds the single copy of the
    Turkmen-specific splitting and cleanup rules. (This function used to carry
    a line-for-line duplicate of that pipeline.)

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file.
    page_number : int
        0-based page index; must satisfy 0 <= page_number < number of pages.

    Returns
    -------
    list of str
        The sentences of the page. On failure the list holds a single
        human-readable message instead of sentences:
        "Invalid page number. ..." for an out-of-range index, or
        "Error processing PDF: ..." when opening/reading the file raised.
    """
    # 1) load page text
    try:
        with pdfplumber.open(pdf_path) as pdf:
            page_count = len(pdf.pages)
            if not (0 <= page_number < page_count):
                return [f"Invalid page number. Document has {page_count} pages."]
            # extract_text() may return None for a page without a text layer
            text = pdf.pages[page_number].extract_text() or ""
    except Exception as e:
        # Errors are reported in-band (as a one-element list) rather than raised.
        return [f"Error processing PDF: {e}"]

    # 2) everything after loading is shared with the whole-document path.
    #    The helper is defined in a later cell; it only needs to exist by the
    #    time this function is called.
    return extract_sentences_from_text(text)


In [None]:
def extract_sentences_from_text(initial_text):
    """
    Split raw page text into cleaned sentences, handling Turkmen text patterns.

    The text is normalised (soft hyphens dropped, hyphenated line breaks
    re-joined, whitespace collapsed, a few known broken Turkmen words repaired),
    cut after every "." that is followed by whitespace and a capital letter or a
    digit, and at every asterisk marker. Each fragment then has trailing
    "(place, date)" citations removed and is guaranteed to end in a period.

    Parameters
    ----------
    initial_text : str or None
        Text of one page (for example the result of pdfplumber's
        `extract_text()`). `None` and "" yield an empty list.

    Returns
    -------
    list of str
        The sentences in reading order. Fragments made only of digits and dots,
        and fragments that are empty after citation removal, are dropped.
    """
    if not initial_text:
        return []
    text = initial_text

    # 2) drop soft-hyphens, re-attach hyphen-breaks, mark footnotes
    text = text.replace("\xad", "")
    # NOTE: this also removes a genuine hyphen that happens to sit at a line end.
    text = re.sub(r"-\s*\n\s*", "", text)
    text = re.sub(r"\*\s*\n\s*\*", "__PARA__", text)
    # Every "*" followed by any whitespace character becomes a paragraph marker.
    # The former "\*\n\s" and "\*\s\n\s" substitutions could never match after
    # this one (no "*" followed by whitespace is left), so they were removed.
    text = re.sub(r"\*\s", "__PARA__", text)
    text = re.sub(r"\*\s*\s*\*", "__PARA__", text)
    # Re-join a date whose day was separated from the rest: "15. 02.2007 ý."
    text = re.sub(r'(\d+)\.\s*(\d+\.\d+\s*ý\.)', r'\1.\2', text)
    # Also handle cases where the month is separated. Only the gap after the
    # day is closed here; a gap after the month is repaired by the merge pass
    # at the end of this function.
    text = re.sub(r'(\d+)\.\s*(\d+\.\s*\d+\s*ý\.)', r'\1.\2', text)

    # 3) collapse every whitespace run to a single space
    text = re.sub(r"\s+", " ", text)

    # 4) Turkmen-specific join fixes (words broken by the PDF text extraction)
    corrections = {
        "Türkmenist anyň":  "Türkmenistanyň",
        "ruhube lentlik":   "ruhubelentlik",
        "Pyr agynyň":      "Pyragynyň",
        "Pyr a gynyň":     "Pyragynyň",
        "ykba ly":          "ykbaly",
        "jaha na":          "jahana",
        "ideo logiýamyzyñ": "ideologiýamyzyň",
        "ke mala":          "kemala",
        "kä mil":           "kämil",
        "Aza dynyñ":        "Azadynyñ",
        "tary hyndaky":    "taryhyndaky",
    }
    for bad, good in corrections.items():
        text = re.sub(rf"\b{re.escape(bad)}\b", good, text)

    # 5) insert a sentence-boundary marker before any ". " that leads into
    #    either a capital Turkmen letter OR a digit
    text = re.sub(
        r"\.(?=\s+(?:[A-ZÄÖÜÝÇŞĞ]|\d))",
        ".__SENT__",
        text
    )

    # 6) protect dates, "ş.", initials, etc. behind placeholders.
    #    NOTE(review): the boundary markers were already inserted in step 5, so
    #    a match here can no longer span a split point and this step does not
    #    appear to prevent any split. Moving it before step 5 would change the
    #    output (the case-insensitive "initials" pattern matches ordinary
    #    sentence ends), so the order is left as is — confirm the intent.
    protected = [
        r"\d+\.\d+\.\d+\.",                                # pure numeric dates
        r"\d+\.\s*\d+\.\s*\d+\s*ý\.",                      # dates ending in ý.
        r"[A-Za-zÄäŇňÖöÜüÝýŽžÇçŞşĞğ]+\s+ş\.",              # "ş."
        r"[A-Za-zÄäŇňÖöÜüÝýŽžÇçŞşĞğ]\.\s+[A-Za-zÄäŇňÖöÜüÝýŽžÇçŞşĞğ]+",  # initials
        r"b\.e\.", r"d\.m\.", r"a\.d\.", r"t\.d\.",
    ]
    placeholders = {}
    for i, patt in enumerate(protected):
        # finditer scans the text as it was when this pattern started; the
        # replacements below rebind `text` without disturbing the iteration.
        for j, m in enumerate(re.finditer(patt, text, re.IGNORECASE)):
            ph = f"__PH{i}_{j}__"
            placeholders[ph] = m.group(0)
            text = text.replace(m.group(0), ph)

    # 7) split on our sentence marker OR paragraph marker
    raw = re.split(r"__SENT__|__PARA__", text)

    # 8) restore, trim, and drop pure-digit/dot fragments
    sentences = []
    for frag in raw:
        s = frag.strip()
        if not s:
            continue
        # restore placeholders
        for ph, orig in placeholders.items():
            s = s.replace(ph, orig)
        # ensure it ends in a period
        if not s.endswith("."):
            s += "."
        # drop if only digits and dots
        if re.fullmatch(r"[\d\.]+\.?", s):
            continue

        # FINAL COMPREHENSIVE CLEANUP:

        # 1. Match any phrase with location+date+page_number format at the end
        s = re.sub(r"\s*\([^()]+,\s*\d+\.\d+\.\d+\)(?:\s+\d+)?\.?$", "", s)

        # 2. Match date+page_number format at the end
        s = re.sub(r"\s*\(\d+\.\d+\.\d+\)(?:\s+\d+)?\.?$", "", s)

        # 3. Handle pattern .*. (asterisk between periods) - replace with single period
        s = re.sub(r"\.\*\.", ".", s)

        # 4. Fix double periods (..)
        s = re.sub(r"\.{2,}", ".", s)

        # 5. Handle special asterisk format at end
        s = re.sub(r"\*\.$", ".", s)

        # 6. Trailing citations with a leading space inside the parenthesis or
        #    with a "ý." year suffix
        s = re.sub(r"\s*\(\s*\d+\.\d+\.\d+\)(?:\s+\d+)?\.?$", "", s)
        s = re.sub(r"\s*\([^()]*\d+\.\s*\d+\s*ý\.\)\.?$", "", s)
        s = re.sub(r"\s*\([^()]*\d+\.\d+(\d{4})\s*ý\.\)\.?$", "", s)

        s = s.strip()

        # A fragment that consisted only of a citation is empty (or a bare
        # period) by now; emitting "." as a sentence would inflate the page's
        # sentence count, so skip it.
        if not s.rstrip("."):
            continue

        # Re-add period if it was removed by the pattern removal
        if not s.endswith("."):
            s += "."

        sentences.append(s)

    # 9) re-join a date that step 5 split in the middle ("15.02." | "2007 ý.)")
    i = 0
    while i < len(sentences) - 1:
        current = sentences[i]
        next_sent = sentences[i + 1]

        # Check for date fragments at end of current and beginning of next
        if (re.search(r'\d+\.\d+\.$', current) and
            re.match(r'\d+\s*ý\.\)', next_sent)):
            # The trailing period of `current` is the separator between month
            # and year, so it must be kept: "15.02." + "2007 ý.)" gives
            # "15.02.2007 ý.)" (dropping it produced "15.022007 ý.)").
            sentences[i] = current + next_sent
            sentences.pop(i + 1)  # Remove the next sentence that's now merged
            # `i` is not advanced so the merged sentence is checked again
        else:
            i += 1

    return sentences


In [None]:
# Spot-check the single-page extractor on the Turkmen PDF.
# `page` is a 0-based index (the function checks 0 <= page_number < page count),
# so 1 selects the second page of the document.
page = 1

sentences = extract_sentences_from_pdf_page("/content/rowachlyk_ruhy-tm.pdf", page)

sentences

['Türkmenistan Birleşen Milletler Guramasy tarapyndan ykrar edilen hemişelik Bitaraplygyň halkara-hukuk derejesine eýe bolan döwlet hökmünde dünýä bileleşiginiň doly hukukly agzasydyr.',
 'Ata Watan üçin serden geçen gerçeklerimiziň gahrymançylygy ölmez-ýitmezdir.',
 'Watan üçin janyny gurban eden gerçekleriň mukaddes ojaklaryny saklap oturan ýalňyz naçarlaryň merdi-merdan alygynyň öňünde tutuş adamzat baş egýär.',
 'HHR-iň «Sinhua» habarlar agentligine berlen interwýu (Aşgabat ş., 15.022007 ý.).',
 'Türkmenistanyň uruş weteranlaryna hem-de halkyna.']

In [None]:
# Second spot-check further into the document (0-based index 20).
page = 20

sentences = extract_sentences_from_pdf_page("/content/rowachlyk_ruhy-tm.pdf", page)

sentences


['Geçmişde dünýäniň çar künjeginde ýetmişden gowrak döwlet gurup, umumadamzat siwilizasiýasyna uly goşant goşan türkmen halky dünýäde parahatçylygy, ylalaşygy, abadançylygy hem-de gülläp ösüşi goldapdyr.',
 'Oguz han, Togrul beg, Alp Arslan, Soltan Sanjar ýaly türkmeniň adyny arşa çykaran gerçeklerimiz we şahslarymyz döwlet döretmegiň, ony dolandyrmagyň we goramagyň ajaýyp nusgasyny bize miras goýupdyrlar.',
 'Biziň Bitaraplygymyz dünýäde parahatçylygy, durnuklylygy we howpsuzlygy berkitmegiň, ählumumy ykdysady, syýasy we medeni ösüşiň täsirli guralyna öwrülýär.',
 'Türkmenistanyň Daşary işler ministrliginiň Halkara gatnaşyklary institutynyň döredilmeginiň bir ýyllygy mynasybetli onuň mugallymlaryna we talyplaryna gutlag.',
 'GDA ýurtlarynyň daşary işler ministrleriniň geňeşiniň mejlisinde edilen çykyş.']

## Dataset extraction

In [None]:
def extract_sentences_df(pdf_path):
    """
    Run the sentence extractor over every page of a PDF.

    Returns a DataFrame with one row per page and the columns:
      - page    : page number (1-based)
      - sentence: list of all sentences found on that page
      - count   : number of sentences on that page
    """
    records = []
    with pdfplumber.open(pdf_path) as pdf:
        # enumerate from 1 so the stored page number is 1-based
        for page_num, page in enumerate(pdf.pages, start=1):
            # a page without extractable text yields None -> treat as empty
            page_sentences = extract_sentences_from_text(page.extract_text() or "")
            records.append({
                "page": page_num,
                "sentence": page_sentences,
                "count": len(page_sentences),
            })

    return pd.DataFrame(records, columns=["page", "sentence", "count"])

# Example usage:
# df = extract_sentences_df("/mnt/data/page33.pdf")
# display(df)


In [None]:
# Build the per-page sentence table for the Turkmen edition.
df_turkmen = extract_sentences_df("/content/rowachlyk_ruhy-tm.pdf")

df_turkmen.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   page      63 non-null     int64 
 1   sentence  63 non-null     object
 2   count     63 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.6+ KB


In [None]:
# Build the per-page sentence table for the English edition with the same
# extractor (including its Turkmen-specific rules).
df_english = extract_sentences_df("/content/rowachlyk_ruhy-en.pdf")

df_english.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   page      61 non-null     int64 
 1   sentence  61 non-null     object
 2   count     61 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.6+ KB


In [None]:
# Preview the first 20 pages of the English table.
df_english.head(20)

Unnamed: 0,page,sentence,count
0,1,[I am sure that you will preserve traditions o...,5
1,2,"[The greatness of moral, cultural, social and ...",5
2,3,[The whole world bows before the courage of so...,6
3,4,[In all times Turkmen people were bearers of b...,6
4,5,"[Courage and bravery, heroism of our ancestors...",7
5,6,[I believe that each of you inspired by heroic...,4
6,7,[We built all conditions for proud and industr...,7
7,8,[Editorial board of the newspaper must conduct...,5
8,9,[Wide international cooperation aiming at obta...,5
9,10,[The Turkmen people in all their undertakings ...,7


## Merge both languages

In [None]:
# Pair the two editions page by page.
# NOTE(review): the info() outputs above report 63 Turkmen pages but only 61
# English pages, so joining on the raw page number presumably pairs pages that
# do not correspond once the two editions drift apart — confirm.
merged_df = pd.merge(
    df_english,
    df_turkmen,
    on='page',
    how='outer',  # Use 'outer' to keep all pages from both dataframes
    suffixes=('_en', '_tm')  # Add suffixes to distinguish column sources
)

In [None]:
# prompt: transform `count_en` column type to int also 0 if NA or empty

# Pages present in only one edition come out of the outer merge with a NaN
# count, which also turns the column into float. Treat a missing page as
# 0 sentences and restore the integer dtype. Both count columns get the same
# treatment so the later `count_en != count_tm` comparison is int vs int.
for count_col in ('count_en', 'count_tm'):
    if count_col in merged_df.columns:
        merged_df[count_col] = merged_df[count_col].fillna(0).astype(int)
    else:
        merged_df[count_col] = 0  # column absent: every page counts as 0 sentences

# info() prints the summary itself and returns None, so it must not be wrapped
# in print() (that added a stray "None" line to the output).
merged_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   page         63 non-null     int64 
 1   sentence_en  61 non-null     object
 2   count_en     63 non-null     int64 
 3   sentence_tm  63 non-null     object
 4   count_tm     63 non-null     int64 
dtypes: int64(3), object(2)
memory usage: 2.6+ KB
None


In [None]:
# Preview the merged page-level table.
merged_df.head()

Unnamed: 0,page,sentence_en,count_en,sentence_tm,count_tm
0,1,[I am sure that you will preserve traditions o...,5,"[Şir ýürekli türkmen gerçekleri, bäş müň ýylly...",5
1,2,"[The greatness of moral, cultural, social and ...",5,[Türkmenistan Birleşen Milletler Guramasy tara...,5
2,3,[The whole world bows before the courage of so...,6,[Watan ogullarynyň gahrymançylygy nesillerden ...,5
3,4,[In all times Turkmen people were bearers of b...,6,[Türkmen halky özüniň beýik şahyry Magtymguly ...,3
4,5,"[Courage and bravery, heroism of our ancestors...",7,"[Merdana Watan goragçylary, ýurt, il-gün bähbi...",4


In [None]:
# Persist the merged table. The sentence columns hold Python lists; the later
# unfolding step expects to read them back as "[...]" strings and parse them.
merged_df.to_csv('ochmejek_ruhy.csv', index=False)

## Unequal analysis

In [None]:
# prompt: find the rows whose `count_en` and `count_tm` values are not equal

# Pages on which the two editions were split into a different number of sentences.
unequal_rows = merged_df[merged_df['count_en'] != merged_df['count_tm']]
unequal_rows


Unnamed: 0,page,sentence_en,count_en,sentence_tm,count_tm
2,3,[The whole world bows before the courage of so...,6,[Watan ogullarynyň gahrymançylygy nesillerden ...,5
3,4,[In all times Turkmen people were bearers of b...,6,[Türkmen halky özüniň beýik şahyry Magtymguly ...,3
4,5,"[Courage and bravery, heroism of our ancestors...",7,"[Merdana Watan goragçylary, ýurt, il-gün bähbi...",4
5,6,[I believe that each of you inspired by heroic...,4,"[Ösüşiň özboluşly demokratik, hukuk, dünýewi ý...",5
6,7,[We built all conditions for proud and industr...,7,"[Biz ata Watanyny janyndan eý görýän merdana, ...",5
7,8,[Editorial board of the newspaper must conduct...,5,"[Watan goragçylarymyzda Garaşsyz, Bitarap Wata...",3
9,10,[The Turkmen people in all their undertakings ...,7,[Ata-babalarymyzyň owal-ahyr ganyna guýlan wat...,5
11,12,[Turkmenistan confidently going along the path...,6,"[Türkmenistan döwletimiz parahatçylyk söýüji, ...",4
13,14,[Today when glory of Turkmen people spread all...,5,"[Men ýurdumyzyň Garaşsyzlygyny goramakda, abad...",6
15,16,[Today Turkmenistan is the center of peace mak...,5,"[Konstitusiýamyzda ýurdumyzyň Garaşsyzlygy, he...",6


## Equalization

In [None]:
from openai import OpenAI
import json
import os
from google.colab import userdata

def _parse_json_list(reply):
    """Parse the JSON array embedded in a model reply (first '[' to last ']').

    Falls back to parsing the whole reply when no brackets are present.
    Raises json.JSONDecodeError when the selected text is not valid JSON.
    """
    json_start = reply.find('[')
    json_end = reply.rfind(']') + 1
    if json_start >= 0 and json_end > 0:
        return json.loads(reply[json_start:json_end])
    return json.loads(reply)


def fix_unequal_sentences(unequal_rows):
    """
    Ask the OpenAI model "o3-mini-2025-01-31" to re-split the English sentences
    of each page so that their number matches the Turkmen sentence count.

    Parameters:
    - unequal_rows: DataFrame with the columns 'page', 'sentence_en',
      'sentence_tm', 'count_en' and 'count_tm', one row per page to fix

    Returns:
    - Dictionary mapping page numbers to corrected English sentence lists.
      A page is left out when the reply could not be parsed as a JSON list.
      A reply whose length still differs from the Turkmen count is stored
      anyway (best effort) after printing a warning.

    One API request is made per row; an empty frame makes no request at all.
    """
    # Store corrected sentences
    corrected_sentences = {}

    if len(unequal_rows) == 0:
        return corrected_sentences

    # One client serves all requests (it used to be rebuilt for every row).
    client = OpenAI(
        api_key=userdata.get('OPENAI_API_KEY')
    )

    # Process each row with unequal counts
    for idx, row in unequal_rows.iterrows():
        page_num = row['page']
        en_sentences = row['sentence_en']
        tm_sentences = row['sentence_tm']
        en_count = row['count_en']
        tm_count = row['count_tm']

        # Create prompt for OpenAI
        prompt = f"""I have Turkmen and English sentences extracted from the same PDF page (page {page_num}),
                but they don't have the same count. Please edit the English sentences to match the Turkmen count while preserving meaning.

                Turkmen sentences ({tm_count}):
                {json.dumps(tm_sentences, ensure_ascii=False, indent=2)}

                English sentences ({en_count}):
                {json.dumps(en_sentences, ensure_ascii=False, indent=2)}

                Edit the English sentences to have exactly {tm_count} sentences. Only output the corrected English sentences as a JSON list.

                OUTPUT:
                Output new English sentences in the same exact format that you received them.
                """

        completion = client.chat.completions.create(
            model="o3-mini-2025-01-31",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that edits and aligns English sentences to match the count of corresponding Turkmen sentences while preserving meaning."},
                {"role": "user", "content": prompt}
            ],
            # temperature=0.2,  # Lower temperature for more deterministic output
        )

        # Extract response; a missing content field is handled like an
        # unparseable reply instead of raising AttributeError on None.
        assistant_message = completion.choices[0].message.content or ""

        # Extract JSON list from the response
        try:
            corrected_en_sentences = _parse_json_list(assistant_message)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON for page {page_num}: {e}")
            print(f"Response text: {assistant_message}")
            continue

        # Valid JSON that is not an array (object, string, number) cannot be
        # used as a sentence list; report it the same way and move on.
        if not isinstance(corrected_en_sentences, list):
            print(f"Error parsing JSON for page {page_num}: expected a JSON list")
            print(f"Response text: {assistant_message}")
            continue

        # Verify count is correct; the result is kept either way
        if len(corrected_en_sentences) != tm_count:
            print(f"Warning: Corrected sentences for page {page_num} still have unequal count: {len(corrected_en_sentences)} instead of {tm_count}")
        corrected_sentences[page_num] = corrected_en_sentences

    return corrected_sentences

# Now update the original dataframe with corrected sentences
def update_dataframe_with_corrections(df, unequal_rows, corrected_sentences=None):
    """
    Return a copy of `df` whose English sentences are replaced by corrections.

    Parameters:
    - df: page-level DataFrame with the columns 'page', 'sentence_en', 'count_en'
    - unequal_rows: rows of `df` whose sentence counts differ; sent to
      `fix_unequal_sentences` when no corrections are supplied
    - corrected_sentences: optional dict mapping page number to a list of
      English sentences. When given, no API call is made, so corrections saved
      from an earlier run can be re-applied without paying for new requests.

    Returns:
    - A new DataFrame; `df` itself is not modified. For every corrected page
      the first row with that page number gets the new 'sentence_en' list and
      'count_en' is set to its length. Pages not present in `df` are ignored.
    """
    if corrected_sentences is None:
        corrected_sentences = fix_unequal_sentences(unequal_rows)
    print("Corrected = ", corrected_sentences)

    # Create a copy to avoid modifying the original
    updated_df = df.copy()

    # Update each corrected page
    for page_num, sentences in corrected_sentences.items():
        # Find the row with this page number
        mask = updated_df['page'] == page_num
        if not mask.any():
            continue

        # Get the index label of the first matching row
        row_idx = updated_df.index[mask].tolist()[0]

        # .at stores the list as one cell value instead of trying to
        # broadcast it across rows
        updated_df.at[row_idx, 'sentence_en'] = sentences
        updated_df.at[row_idx, 'count_en'] = len(sentences)

    return updated_df

In [None]:
# Usage:
# 1. Find rows with unequal counts:
unequal_rows = merged_df[merged_df['count_en'] != merged_df['count_tm']]
# 2. Apply the correction (this sends one OpenAI request per unequal row,
#    so re-running the cell repeats those requests):
merged_df_corrected = update_dataframe_with_corrections(merged_df, unequal_rows)

merged_df_corrected.head()

Corrected =  {8: ["Magtymguly's lofty ideas on patriotism and his profoundly meaningful guidance fill the hearts of the happy Turkmen people with new inspiration, fresh vigor, and boundless enthusiasm.", "Dedicated to the participants of the International Scientific Conference 'Magtymguly and Spiritual-Cultural Values of the World.'"], 10: ['Magtymguly Pyragy harnessed the ancient cultural kernels of science, literature, and world civilization to create four masterpieces.', 'The great poet initiated significant transformations in the history of Turkmen poetry and the Turkmen language.', 'His vast treasury of poetic works, enriching the heritage of literature, art, and spirit, is now revived in many world languages.', "Dedicated to the participants of the International Scientific Conference 'Magtymguly and the Spiritual-Cultural Values of the World'."], 25: ['Magtymguly Pyragy is a great master of words who created fine works that told the world about his people’s philosophy formed duri

Unnamed: 0,page,sentence_en,count_en,sentence_tm,count_tm
0,1,[It is symbolic that we celebrate the day of a...,3,[Baş Kanunymyzyñ kabul edilen gün ü niñ türkme...,3
1,2,[Ideas of great Magtymguly realized in our tim...,4,[Halkymyz Magtymguly atam yzyñ paý hasyny durm...,4
2,3,[Magtymguly Pyragy having enlightened the way ...,2,[Ajaýyp şygyrlary bilen ynsan kalbyny ýagşylyk...,2
3,4,[At present in our country great attention is ...,3,[Ýurdumyzda onuñ pähimpaýhasly şy gyr laryny h...,3
4,5,[Great thinker who reflected dream of the unit...,3,[Beýik akyldar şahyrymyz Pyragy hem öz döwründ...,3


In [None]:
# Persist the corrected page-level table for the cleaning stage below.
merged_df_corrected.to_csv('ochmejek_ruhy_corrected.csv', index=False)

## Cleaning

In [None]:
# Reload the corrected table from disk. The sentence columns presumably come
# back as "[...]" strings rather than lists; the unfolding function below
# parses such strings.
df = pd.read_csv('/content/ochmejek_ruhy_corrected.csv')


['Constitution of Turkmenistan and wise calls of Magtymguly Pyragy are values comp limenting each other and providing unbreakabl e link of times and generations, unity and solidarity of our people.', 'Congratulations to the people of Turkmenistan on the Day of adoption of the Constitution of Turkmenistan and the Day of revival, unity and poetry of Magtymguly Pyragy (18.05.2012).']


In [None]:
# Check the schema and row count of the reloaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   page         71 non-null     int64 
 1   sentence_en  71 non-null     object
 2   count_en     71 non-null     int64 
 3   sentence_tm  71 non-null     object
 4   count_tm     71 non-null     int64 
dtypes: int64(3), object(2)
memory usage: 2.9+ KB


In [None]:
# Inspect the raw Turkmen cell of one row (positional row 30) as loaded from the CSV.
print(df.iloc[30]['sentence_tm'])
# print(type(df.iloc[30]['sentence_en']))

['Türkmenistanyñ Konstitusiýasy we Mag tymguly Pyragynyñ parasatly gara ýyş la ry döwürleriñ we nesilleriñ aýryl maz bagla ny şygyny, halkymyzyñ mizemez bitewüligini we agzybirligini üpjün edýän hemde birbi riniñ üstüni ýetirýän gymmat lykdyr.', 'Türkmenistanyň Konstitusiýasynyň kabul edilmegi hem-de Galkynyş, Agzybirlik we Magt ymguly Pyragynyň şygryýet güni mynasybetli Türkmenistanyň halkyna gutlagdan.']


In [None]:
def unfold_sentence_dataframe(df, clean_dates=True):
    """
    Unfold a DataFrame where sentence columns contain lists of sentences,
    creating a new row for each sentence pair. Optionally clean parenthetical
    dates and numbers from sentences.

    Parameters:
    -----------
    df : pandas.DataFrame
        Original DataFrame with columns 'page', 'sentence_en', and 'sentence_tm'.
        A sentence cell may be a list, the string form of a list (as read back
        from CSV), a single value, or missing (None / NaN).
    clean_dates : bool, default=True
        If True, remove parenthetical content containing digits like (12.05.2008)

    Returns:
    --------
    pandas.DataFrame
        Unfolded DataFrame with one sentence pair per row and the columns
        'page', 'sentence_en', 'sentence_tm'. Sentences are paired by position;
        when the two lists of a page differ in length the surplus sentences of
        the longer list are dropped, and a page with a missing cell on either
        side contributes no rows.
    """
    import pandas as pd
    import ast
    import re

    # Function to clean parenthetical dates
    def clean_text(text):
        if clean_dates:
            # First remove parenthetical content containing digits
            cleaned = re.sub(r'\([^)]*\d[^)]*\)', '', text)
            # Then fix any extra spaces before punctuation
            cleaned = re.sub(r'\s+([.,;:!?])', r'\1', cleaned)
            return cleaned.strip()
        return text

    # Normalise one cell to a list of sentences
    def as_sentence_list(cell):
        # A missing cell (None, pd.NA, or NaN as produced by read_csv / an
        # outer merge) means "no sentences". It used to become [nan] and then
        # crash inside re.sub.
        if cell is None or cell is pd.NA or (isinstance(cell, float) and cell != cell):
            return []
        # Convert string representations of lists to actual lists if needed
        if isinstance(cell, str) and cell.startswith('['):
            cell = ast.literal_eval(cell)
        # Ensure the value is treated as a list
        return cell if isinstance(cell, list) else [cell]

    # Create empty lists to store the data
    pages = []
    sentences_en = []
    sentences_tm = []

    # Iterate through each row of the original DataFrame
    for index, row in df.iterrows():
        page = row['page']
        sen_en = as_sentence_list(row['sentence_en'])
        sen_tm = as_sentence_list(row['sentence_tm'])

        # Pair by position up to the shorter list
        min_sentences = min(len(sen_en), len(sen_tm))

        # Add each sentence pair to the lists
        for i in range(min_sentences):
            pages.append(page)
            sentences_en.append(clean_text(sen_en[i]))
            sentences_tm.append(clean_text(sen_tm[i]))

    # Create the new DataFrame
    unfolded_df = pd.DataFrame({
        'page': pages,
        'sentence_en': sentences_en,
        'sentence_tm': sentences_tm
    })

    return unfolded_df

In [None]:
import pandas as pd

# Reload the corrected page-level table and expand it to one sentence pair per row.
# NOTE(review): `unfolded_df` is not written to disk in any cell shown here,
# while a later cell reads '/content/ochmejek_ruhy_final.csv' — confirm where
# that file comes from.
df = pd.read_csv('/content/ochmejek_ruhy_corrected.csv')
unfolded_df = unfold_sentence_dataframe(df)

In [None]:
# Spot-check one unfolded English sentence (positional row 50).
unfolded_df.iloc[50]['sentence_en']

'Congratulations to the people of Turkmenistan on the Day of adoption of the Constitution of the country and the Day of revival, unity and poetry of Magtymguly Pyragy.'

In [None]:
# Check the schema and the number of sentence pairs.
unfolded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220 entries, 0 to 219
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   page         220 non-null    int64 
 1   sentence_en  220 non-null    object
 2   sentence_tm  220 non-null    object
dtypes: int64(1), object(2)
memory usage: 5.3+ KB


In [None]:
# Install Java 17 (required by LanguageTool) and the Python packages.
# These are used by the grammar-correction cell further down.
!apt-get update -qq
!apt-get install -qq -y openjdk-17-jre-headless
!pip install pandas language_tool_python


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package openjdk-17-jre-headless:amd64.
(Reading database ... 126333 files and directories currently installed.)
Preparing to unpack .../openjdk-17-jre-headless_17.0.14+7-1~22.04.1_amd64.deb ...
Unpacking openjdk-17-jre-headless:amd64 (17.0.14+7-1~22.04.1) ...
Setting up openjdk-17-jre-headless:amd64 (17.0.14+7-1~22.04.1) ...
update-alternatives: using /usr/lib/jvm/java-17-openjdk-amd64/bin/java to provide /usr/bin/java (java) in auto mode
update-alternatives: using /usr/lib/jvm/java-17-openjdk-amd64/bin/jpackage to provide /usr/bin/jpackage (jpackage) in auto mode
update-alternatives: using /usr/lib/jvm/java-17-openjdk-amd64/bin/keytool to provide /usr/bin/keytool (keytool) in auto mode
update-alternatives: using /usr/lib/jvm/java-17-openjdk-amd64/bin/rmiregistry to pr

In [None]:
import pandas as pd

# If you already have df in your session, skip this.
# Otherwise, load from CSV (or however your data is stored):
# df = pd.read_csv('/path/to/your_file.csv')

# NOTE(review): no cell shown in this notebook writes 'ochmejek_ruhy_final.csv';
# presumably it is the unfolded sentence-pair table saved separately — confirm.
df = pd.read_csv('/content/ochmejek_ruhy_final.csv')

# Let's verify its schema. info() prints the summary itself and returns None,
# so wrapping it in print() only added a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220 entries, 0 to 219
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   page         220 non-null    int64 
 1   sentence_en  220 non-null    object
 2   sentence_tm  220 non-null    object
dtypes: int64(1), object(2)
memory usage: 5.3+ KB
None


In [None]:
import language_tool_python

# 1) Initialize LanguageTool (Java ≥17 required)
tool = language_tool_python.LanguageTool('en-US')

# 2) Only allow spacing/typos/punctuation fixes
ALLOWED = {'TYPOS', 'CASING', 'PUNCTUATION', 'WHITESPACE', 'TYPOGRAPHY'}

def correct_if_obvious(text: str) -> str:
    """Auto-correct `text` only when every LanguageTool finding is an "obvious" one.

    The sentence is checked with the module-level `tool`. If nothing is
    reported, or if at least one reported match belongs to a category outside
    the module-level `ALLOWED` set, the input is returned unchanged. Otherwise
    all suggested corrections are applied.

    Args:
        text: The sentence to check. Non-string values (e.g. NaN from an empty
            CSV cell) are passed through untouched instead of being sent to
            LanguageTool.

    Returns:
        The corrected sentence, or the original value when no safe correction
        applies.
    """
    if not isinstance(text, str):
        return text
    matches = tool.check(text)
    if not matches:
        return text
    # m.category is already a string like 'TYPOS'
    if all(m.category in ALLOWED for m in matches):
        # Apply the matches we already hold. tool.correct(text) would produce
        # the same result but re-check the sentence first, doubling the
        # LanguageTool work per row.
        return language_tool_python.utils.correct(text, matches)
    return text

# 3) Run the conservative corrector over every English sentence and keep the
#    result next to the original so the two can be compared afterwards.
df['sentence_en_corrected'] = df['sentence_en'].map(correct_if_obvious)


Downloading LanguageTool latest: 100%|██████████| 252M/252M [00:21<00:00, 11.9MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmpgom2brx2.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.


In [None]:
# Write the table (now carrying the `sentence_en_corrected` column) to the
# current working directory; index=False leaves the row index out of the file.
df.to_csv('ochmejek_ruhy_en_corrected.csv', index=False)

In [None]:
# Select the sentence pairs whose English text differs after correction
was_corrected = df['sentence_en'].ne(df['sentence_en_corrected'])
changed = df.loc[was_corrected, ['sentence_en', 'sentence_en_corrected']]

print(f"✅ Corrected {len(changed)} of {len(df)} sentences.")
changed.head(10)


In [None]:
# df_en and df_tm are both loaded from the same CSV (the next cell takes
# `sentence_en` from one and `sentence_tm` from the other), so parse the file
# once and give df_tm its own copy instead of reading it from disk twice.
# NOTE(review): if the Turkmen side was meant to come from a different file,
# replace the copy with the appropriate read_csv call.
df_en = pd.read_csv('/content/ochmejek_ruhy_en_corrected_EnglishClean.csv')
df_tm = df_en.copy()

In [None]:
# Two-column parallel corpus: English on the left, Turkmen on the right
english_side = df_en['sentence_en'].rename('English')
turkmen_side = df_tm['sentence_tm'].rename('Turkmen')
result_df = pd.concat([english_side, turkmen_side], axis=1)

In [None]:
# Preview the first rows of the paired corpus to eyeball the alignment
result_df.head()

Unnamed: 0,English,Turkmen
0,It is symbolic that we celebrate the day of ad...,Baş Kanunymyzyñ kabul edilen gün ü niñ türkmen...
1,"There is deep sense in it, because the idea of...","Çünki türkmen halkynyñ özbaşdak, berkarar döwl..."
2,Congratulations to the people of Turkmenistan ...,"Türkmenistanyň halkyna Galkynyş, Agzybirlik we..."
3,Ideas of great Magtymguly realized in our time...,Halkymyz Magtymguly atam yzyñ paý hasyny durmu...
4,That is why thoughts of the poet about honesty...,"Hut şuña görä, şahy ryñ ynsap, päklik, erkinl ..."


In [None]:
# Save the English/Turkmen pairs without the row index.
# NOTE(review): the file name spells "corpus" as "courpus"; it is left as-is
# here because other steps may already reference this exact name.
result_df.to_csv('ochmejek_ruhy_courpus.csv', index=False)

In [None]:
# Load the English/Turkmen sentence-pair table that is published further down.
# NOTE(review): no visible cell writes this file, so it is presumably prepared
# outside this section of the notebook — confirm its provenance.
turkmen_english_s500 = pd.read_csv('/content/turkmen_english_s500.csv')

In [None]:
# Preview the first rows of the loaded pairs
turkmen_english_s500.head()

Unnamed: 0,English,Turkmen
0,It is symbolic that we celebrate the day of ad...,Baş Kanunymyzyñ kabul edilen gününiñ türkmeniñ...
1,"There is deep sense in it, because the idea of...","Çünki türkmen halkynyñ özbaşdak, berkarar döwl..."
2,Congratulations to the people of Turkmenistan ...,"Türkmenistanyň halkyna Galkynyş, Agzybirlik we..."
3,Ideas of great Magtymguly realized in our time...,Halkymyz Magtymguly atamyzyñ paýhasyny durmuşy...
4,That is why thoughts of the poet about honesty...,"Hut şuña görä, şahyryñ ynsap, päklik, erkinlik..."


In [None]:
# Keep only the rows where both sides of the translation pair are present
pair_columns = ["Turkmen", "English"]
has_both_sides = turkmen_english_s500[pair_columns].notna().all(axis=1)
turkmen_english_s500 = turkmen_english_s500.loc[has_both_sides]
turkmen_english_s500.head()

Unnamed: 0,English,Turkmen
0,It is symbolic that we celebrate the day of ad...,Baş Kanunymyzyñ kabul edilen gününiñ türkmeniñ...
1,"There is deep sense in it, because the idea of...","Çünki türkmen halkynyñ özbaşdak, berkarar döwl..."
2,Congratulations to the people of Turkmenistan ...,"Türkmenistanyň halkyna Galkynyş, Agzybirlik we..."
3,Ideas of great Magtymguly realized in our time...,Halkymyz Magtymguly atamyzyñ paýhasyny durmuşy...
4,That is why thoughts of the poet about honesty...,"Hut şuña görä, şahyryñ ynsap, päklik, erkinlik..."


In [None]:
# Install the Hugging Face libraries used by the cells below
# (-q keeps pip's output quiet).
!pip install -q datasets huggingface_hub pandas

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from huggingface_hub import notebook_login, HfApi



# The sentence pairs are already in memory as `turkmen_english_s500`, so no
# file is read here. To start from a CSV instead:
# df = pd.read_csv("your_file.csv")

# `df` becomes a second name for the same frame.
# NOTE: this rebinds `df`, which earlier in the notebook held a different table.
df = turkmen_english_s500
print(df.shape)  # Should show (619, 2)
print(df.head())

(619, 2)
                                             English  \
0  It is symbolic that we celebrate the day of ad...   
1  There is deep sense in it, because the idea of...   
2  Congratulations to the people of Turkmenistan ...   
3  Ideas of great Magtymguly realized in our time...   
4  That is why thoughts of the poet about honesty...   

                                             Turkmen  
0  Baş Kanunymyzyñ kabul edilen gününiñ türkmeniñ...  
1  Çünki türkmen halkynyñ özbaşdak, berkarar döwl...  
2  Türkmenistanyň halkyna Galkynyş, Agzybirlik we...  
3  Halkymyz Magtymguly atamyzyñ paýhasyny durmuşy...  
4  Hut şuña görä, şahyryñ ynsap, päklik, erkinlik...  


In [None]:
# Turn the frame into a Hugging Face Dataset (dropping the pandas index) and
# shuffle it with a fixed seed so the split is reproducible
dataset = Dataset.from_pandas(df, preserve_index=False).shuffle(seed=42)

# First cut: 80% for training, 20% held out
train_test = dataset.train_test_split(test_size=0.2, seed=42)
# Second cut: halve the held-out part into validation and test (10% each)
val_test = train_test['test'].train_test_split(test_size=0.5, seed=42)

train_ds = train_test['train']    # 80% training data
temp_ds = train_test['test']      # the 20% that was split again above
val_ds = val_test['train']        # 10% validation data
test_ds = val_test['test']        # 10% test data

dataset_dict = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
    "test": test_ds,
})

In [None]:
# Reshape one row into the nested "translation" layout
def combine_to_translation(example):
    """Wrap a row's English/Turkmen texts under a single "translation" key.

    Args:
        example: Mapping with "English" and "Turkmen" entries.

    Returns:
        ``{"translation": {"en": <English text>, "tk": <Turkmen text>}}``
    """
    pair = {"en": example["English"], "tk": example["Turkmen"]}
    return {"translation": pair}

# Convert every split to the nested layout; the flat source columns are
# dropped in the same pass so only "translation" remains
source_columns = ["English", "Turkmen"]
dataset_dict = dataset_dict.map(
    combine_to_translation,
    remove_columns=source_columns,
)

# Sanity check: split sizes, then one sample record
print(dataset_dict)
print(dataset_dict['train'][0])  # Should show {"translation": {"en": "...", "tk": "..."}}


Map:   0%|          | 0/495 [00:00<?, ? examples/s]

Map:   0%|          | 0/62 [00:00<?, ? examples/s]

Map:   0%|          | 0/62 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 495
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 62
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 62
    })
})
{'translation': {'en': 'Philosophical sonorous poems of great Magtymguly, passed through centuries and reached our days, his appeals to humanism and love to native land, his Fatherland, his wise precepts enriched the spiritual life of whole humanity.', 'tk': 'Magtymguly Pyragynyñ asyrlar aşyp, biziñ döwrümize gelip ýeten müñ dürli öwüşginli şygyrlary, ene topragy, ata Watany ýürekden söýmäge we ynsanperwerlige çagyryşlary, parasatly sargytlary bütin adamzadyñ ruhy gymmatlygyna öwrüldi.'}}


In [None]:
# Login to Hugging Face
# Shows the interactive login widget in the notebook; must be completed before
# the upload cell is run.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the dataset to your repository
# repo_id is "<namespace>/<dataset name>" on the Hugging Face Hub; all splits
# held in dataset_dict (train / validation / test) are pushed in this one call.
repo_id = "XSkills/turkmen_english_s500"
dataset_dict.push_to_hub(repo_id)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.33k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/XSkills/turkmen_english_s500/commit/1f0d8e307f77f95813642fca87ad63946a53abd2', commit_message='Upload dataset', commit_description='', oid='1f0d8e307f77f95813642fca87ad63946a53abd2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/XSkills/turkmen_english_s500', endpoint='https://huggingface.co', repo_type='dataset', repo_id='XSkills/turkmen_english_s500'), pr_revision=None, pr_num=None)

In [None]:
from datasets import load_dataset

# Round-trip check: download the dataset from the Hub repository that the
# upload cell pushed to, under a separate name so dataset_dict stays available.
datasetHF = load_dataset("XSkills/turkmen_english_s500")


README.md:   0%|          | 0.00/6.55k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/101k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/19.2k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/17.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/495 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/62 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/62 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 495
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 62
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 62
    })
})
