In [23]:
%load_ext autoreload
%autoreload 2

import pandas as pd

# Widen pandas' display so frames are rendered in full
pd.options.display.max_columns = None   # no cap on the number of columns shown
pd.options.display.max_colwidth = None  # never truncate the contents of a cell

from IPython.display import display

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Read Messages

In [8]:
import json
import pandas as pd

# Location of the exported Telegram messages
file_path = "telegram_messages.json"  # Replace with the actual file path if different

# Parse the whole export into plain Python objects
with open(file_path, encoding="utf-8") as json_file:
    data = json.load(json_file)

# Flatten the records into a table (nested keys become dotted column names)
df = pd.json_normalize(data)

# Preview the first rows
df.head()

Unnamed: 0,channel,message_id,timestamp,message,media,metadata.sender_id,metadata.message_type
0,From_hebron,270263,2024-10-13 19:32:55,♦️إعلام العدو يتحدث عن حدث أمني صعب.,[],-1001478404564,Message
1,From_hebron,270262,2024-10-13 19:32:20,♦️ يسرائيل للو تسنزورا: إصابة قاعدة تدريب غولا...,[],-1001478404564,Message
2,From_hebron,270261,2024-10-13 19:31:11,محكمة الاحتلال تُمدد الحكم على الأسير سليم الر...,"[{'media_type': 'MessageMediaPhoto', 'media_id...",-1001478404564,Message
3,From_hebron,270260,2024-10-13 19:29:24,♦️المتحدثة باسم الشرطة الصهيونيه : المتظاهرون ...,"[{'media_type': 'MessageMediaDocument', 'media...",-1001478404564,Message
4,From_hebron,270259,2024-10-13 19:29:07,♦️ القسام يعلن تفجير عين نفق في قوة اسرائيلية ...,[],-1001478404564,Message


# Translate Messages

In [39]:
import pandas as pd
from typing import Optional, Dict
from pydantic import BaseModel
from openai_client import OpenAIClient
from tqdm import tqdm

class TranslationResponse(BaseModel):
    """Structured translation result: one Hebrew and one English rendering of a message.

    Passed as ``response_format`` to the client's ``chat`` call by
    ``TelegramMessageProcessor.translate_message``, which returns an instance
    of this model (or ``None``) to its caller.
    """
    # Hebrew translation of the source message
    hebrew: str
    # English translation of the source message
    english: str

class TelegramMessageProcessor:
    """Translate Telegram messages into Hebrew and English through a chat client.

    The only call made on the client is
    ``chat(system_message=..., user_message=..., response_format=...)``.
    """

    def __init__(self, client: OpenAIClient):
        """Keep a reference to the client used for every translation request."""
        self.client = client

    def translate_message(self, message: str) -> Optional[TranslationResponse]:
        """Translate a Telegram message into both Hebrew and English.

        Args:
            message: Text of the message to translate.

        Returns:
            The ``parsed`` attribute of the first choice's message, or ``None``
            when there is nothing to translate (``message`` is not a string, or
            is empty / whitespace-only) or when the response carries no choices
            or no ``parsed`` attribute.
        """
        # Nothing to translate: skip the request entirely. Media-only posts may
        # have empty text, and a missing value in a DataFrame column shows up as
        # None/NaN rather than a string.
        if not isinstance(message, str) or not message.strip():
            return None

        system_message="""
            Act as a highly accurate translator. Your task is to take an Arabic message from a Telegram group and translate it into both Hebrew and English.

            Ensure that:
            
            All emojis and decorative symbols (such as bullets, arrows, or icons) are removed from the original message before translation.
            The translations are precise and maintain the original meaning, tone, and nuance, so the user fully understands both the context and sentiment.
            Military, organizational, or specific terminology (e.g., names of groups, military actions) should be translated consistently without alteration or substitution.
            Prioritize clarity and natural phrasing in both Hebrew and English, avoiding overly literal translations, while maintaining fidelity to the original message.
            Retain punctuation and formatting (such as parentheses) accurately to ensure readability in both languages.
            Maintain urgency and emotional tone (e.g., in reports of military strikes, martyrdom) in a way that sounds natural and fluent.
        """

        response = self.client.chat(
            system_message=system_message,
            user_message=message,
            response_format=TranslationResponse
        )

        # No response or no choices means there is no translation to return.
        if not response or not response.choices:
            return None
        # A message without a `parsed` attribute is reported as "no translation".
        return getattr(response.choices[0].message, 'parsed', None)

class MainProcessor:
    """Run the translation pipeline over a DataFrame of messages.

    Works on a private copy of the input frame and adds two columns,
    ``hebrew_translation`` and ``english_translation``, both initialised to
    ``None``.
    """

    def __init__(self, processor: TelegramMessageProcessor, df: pd.DataFrame):
        """Store the processor and a copy of ``df`` extended with the result columns.

        Args:
            processor: Object whose ``translate_message(message)`` returns an
                object with ``hebrew`` and ``english`` attributes, or a falsy
                value when no translation is available.
            df: Frame with a ``message`` column; ``message_id`` is read only
                when a translation is missing.
        """
        self.processor = processor
        self.df = df.copy()  # Avoid modifying the original df

        # Prepare new columns for translations
        self.df['hebrew_translation'] = None
        self.df['english_translation'] = None

    def run(self) -> pd.DataFrame:
        """Translate every message and return the updated DataFrame.

        Results are written row by row, so translations obtained before an
        interruption remain in ``self.df``.

        Returns:
            ``self.df`` with the two translation columns filled in wherever a
            translation was produced; other rows keep ``None``.
        """
        # Address cells by position, not by index label: with duplicate labels
        # (e.g. a filtered or concatenated frame that was never reset) a
        # label-based write would overwrite every row sharing that label.
        hebrew_col = self.df.columns.get_loc('hebrew_translation')
        english_col = self.df.columns.get_loc('english_translation')

        # Snapshot the texts so iteration is independent of the writes below.
        messages = self.df['message'].tolist()

        progress = tqdm(messages, total=len(messages), desc="Translating messages")
        for position, message in enumerate(progress):
            translation = self.processor.translate_message(message)

            if translation:
                self.df.iat[position, hebrew_col] = translation.hebrew
                self.df.iat[position, english_col] = translation.english
            else:
                message_id = self.df['message_id'].iat[position]
                print(f"No translation found for message_id {message_id}.")

        return self.df

if __name__ == "__main__":
    # Load the exported messages and flatten them into a DataFrame
    file_path = "telegram_messages.json"  # Replace with the actual file path if different
    with open(file_path, encoding="utf-8") as json_file:
        data = json.load(json_file)
    df = pd.json_normalize(data)

    # Wire the pipeline together: client -> message processor -> main processor
    openai_client = OpenAIClient(show_prices=False)
    message_processor = TelegramMessageProcessor(openai_client)
    main_processor = MainProcessor(message_processor, df)

    # Translate every message; the result is a copy of df with two extra columns
    updated_df = main_processor.run()

    # Show each original message next to its two translations
    display(updated_df.loc[:, ['message', 'hebrew_translation', 'english_translation']])


Using closest match for encoding: 'gpt-4o' for input model 'gpt-4o-2024-08-06'
Resolved model for encoding: 'gpt-4o'
Using pricing model: 'gpt-4o-2024-08-06'


Translating messages: 100%|████████████████████████████████████████████████████| 25/25 [00:32<00:00,  1.28s/it]


Unnamed: 0,message,hebrew_translation,english_translation
0,♦️إعلام العدو يتحدث عن حدث أمني صعب.,התקשורת של האויב מדווחת על אירוע ביטחוני קשה.,Enemy media reports on a serious security incident.
1,♦️ يسرائيل للو تسنزورا: إصابة قاعدة تدريب غولاني \nبطائرة\n مسيرة تابعة للحزب في عكا قبل قليل,ישראל לא תצנזר: פגיעה בבסיס אימוני גולני על ידי כלי טיס בלתי מאויש של החזבאללה בעכו לפני זמן קצר,Israel will not censor: Golani training base was hit by a Hezbollah drone in Acre a short while ago
2,محكمة الاحتلال تُمدد الحكم على الأسير سليم الرجوب من سكان حي كريسة بمدينة دورا للمرة الثالثة على التوالي لمدة ستة أشهر إضافية.,"בית המשפט של הכיבוש מאריך את גזר הדין של האסיר סלים רג'וב, תושב שכונת כריסה בעיר דורה, בפעם השלישית ברציפות למשך שישה חודשים נוספים.","The occupation court extends the sentence of the prisoner Salim Al-Rajoub, a resident of the Karisa neighborhood in the city of Dura, for the third consecutive time for an additional six months."
3,♦️المتحدثة باسم الشرطة الصهيونيه : المتظاهرون ضد الحكومة عرضوا حياة ضباط الشرطة للخطر أمس عندما تعمدوا إشعال حريق كبير على بعد بوصات منهم.,דוברת המשטרה הציונית: המפגנים נגד הממשלה סיכונו את חיי שוטרי המשטרה אתמול כאשר בכוונה הציתו שרפה גדולה במרחק של סנטימטרים מהם.,The spokeswoman for the Zionist police: Protesters against the government endangered the lives of police officers yesterday when they deliberately started a large fire inches away from them.
4,♦️ القسام يعلن تفجير عين نفق في قوة اسرائيلية راجلة شمال مدينة بيت لاهيا شمال القطاع,הקסאם מודיע על פיצוץ פתח מנהרה בכוח ישראלי בצפון העיר בית להיה בצפון הרצועה.,Al-Qassam announces the detonation of a tunnel entrance in an Israeli foot patrol north of the city of Beit Lahia in the northern sector.
5,🛑اعتراض الطائرة بدون طيار في سماء كريات قبل قليل,יירוט כלי הטיס הבלתי מאויש בשמי קריית לפני זמן קצר,Interception of the drone in the skies of Kiryat a short while ago
6,"🚫🟠🟠 🟢 كتائب القسام تستهدف ناقلة جند صهيونية بقذيفة ""تاندوم"" شمال مدينة بيت لاهيا شمال القطاع","כִּתֵיבָּת אלקסאם תוקפת נגמ""ש ציוני בפגז ""טנדום"" צפונית לעיר בית לאהיה בצפון הרצועה.","The Al-Qassam Brigades target a Zionist armored personnel carrier with a ""Tandem"" shell north of Beit Lahia city in the northern part of the Gaza Strip."
7,♦️بالفيديو | كلمة سماحة الأمين العام الشهيد الي ( المجاهدين خلال إحدى المناورات العسكرية,"בסרטון | נאום מזכ""ל השהיד בפני הלוחמים במהלך אחד התרגילים הצבאיים",In the video | Speech of the martyr Secretary General to the fighters during one of the military maneuvers
8,♦️عاجل| حزب الله يشن هجوماً مركباً بالطائرات المسيّرة والصواريخ على شمال فلسطين المحتلة.,חיזבאללה פותח במתקפה מורכבת באמצעות מטוסים ללא טייס וטילים על צפון פלסטין הכבושה.,Hezbollah launches a complex attack using drones and missiles on northern occupied Palestine.
9,♦️التقارير الأولية دون رقابة: \nإصابة مستوطن نتيجة سقوط شظايا في منطقة كريات شمونة.,דיווחים ראשוניים ללא פיקוח: פציעה של מתנחל כתוצאה מנפילת רסיסים באזור קריית שמונה.,Initial unverified reports: An injury of a settler due to falling fragments in the Kiryat Shmona area.
