In [1]:
import pandas as pd
import os
import json


from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [2]:
df = pd.read_csv('../data/new/master_with_options_without_skipped.csv')


In [None]:
df.head()

In [3]:
unique_questions = df.drop_duplicates(subset=['question_id'])


In [4]:
columns_to_keep = [
    'question_id', 'options', 'question_title', 'correct_option', 'solution', 'hint',
    'difficulty', 'topic_id', 'topic_name', 'subject_id', 'subject_name', 'axis_id',
    'axis_name', 'guide_id', 'template_id', 'student_answer',
    'option_a', 'option_b', 'option_c', 'option_d', 'option_e'
]

unique_questions = unique_questions[columns_to_keep]


In [5]:
# Create dataframes with unique topic, subject and axis data
topics_df = unique_questions[['topic_id', 'topic_name']].drop_duplicates()
subjects_df = unique_questions[['subject_id', 'subject_name']].drop_duplicates()
axes_df = unique_questions[['axis_id', 'axis_name']].drop_duplicates()

print(f"Number of unique topics: {len(topics_df)}")
print(f"Number of unique subjects: {len(subjects_df)}")
print(f"Number of unique axes: {len(axes_df)}")

Number of unique topics: 232
Number of unique subjects: 42
Number of unique axes: 4


In [6]:
def translate_to_english(text: str, json_mode: bool = False) -> str:
    """
    Translates Spanish text to English using GPT-4
    
    Args:
        text (str): Text in Spanish to translate
        json_mode (bool): Whether to force JSON response format
        
    Returns:
        str: Translated text in English
    """
    if json_mode:
        prompt = f"""You will receive a JSON in Spanish. Translate only the text values to English, keeping all mathematical expressions, LaTeX code, symbols, numbers, and equations exactly as they are.

        Your response must be only the translated JSON, with no additional text.

        JSON to translate:
        {text}"""
    else:
        prompt = f"""Translate the following Spanish text to English. Keep all mathematical expressions, LaTeX code, symbols, numbers, and equations exactly as they are. Only translate the words.

        Your response should contain ONLY the translation, with no additional text or explanations.

        Text to translate:
        {text}"""

    try:
        completion_args = {
            "model": "gpt-4o-mini",
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "temperature": 0
        }
        
        if json_mode:
            completion_args["response_format"] = {"type": "json_object"}
            
        response = client.chat.completions.create(**completion_args)
        
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error during translation: {e}")
        return text


In [None]:
# Add English translation columns to all dataframes
axes_df['axis_name_en'] = axes_df['axis_name'].apply(translate_to_english)
subjects_df['subject_name_en'] = subjects_df['subject_name'].apply(translate_to_english)
topics_df['topic_name_en'] = topics_df['topic_name'].apply(translate_to_english)

# Display the original and translated names
print("\nAxis name translations:")
for _, row in axes_df.iterrows():
    print(f"\nSpanish: {row['axis_name']}")
    print(f"English: {row['axis_name_en']}")

print("\nSubject name translations:")
for _, row in subjects_df.iterrows():
    print(f"\nSpanish: {row['subject_name']}")
    print(f"English: {row['subject_name_en']}")

print("\nTopic name translations:")
for _, row in topics_df.iterrows():
    print(f"\nSpanish: {row['topic_name']}")
    print(f"English: {row['topic_name_en']}")


# Save translated dataframes to CSV files
print("\nSaving translations to CSV files...")

axes_df.to_csv('axes_translations.csv', index=False)
subjects_df.to_csv('subjects_translations.csv', index=False)
topics_df.to_csv('topics_translations.csv', index=False)

print("Translations saved successfully!")


In [21]:
# Add English translation columns for questions and options
print("\nTranslating questions and options...")

# Initialize new columns if they don't exist
if 'question_title_en' not in unique_questions.columns:
    unique_questions['question_title_en'] = None
    unique_questions['option_a_en'] = None 
    unique_questions['option_b_en'] = None
    unique_questions['option_c_en'] = None
    unique_questions['option_d_en'] = None
    unique_questions['option_e_en'] = None

# Check if there's an existing translation file
translation_file = 'question_translations.csv'

if os.path.exists(translation_file):
    print("Loading existing translations...")
    existing_translations = pd.read_csv(translation_file)
    # Update the dataframe with existing translations
    unique_questions.update(existing_translations)

# Get all untranslated rows
untranslated_mask = unique_questions['question_title_en'].isna()
untranslated_indices = unique_questions[untranslated_mask].index
total_untranslated = len(untranslated_indices)

print(f"Found {total_untranslated} questions that need translation")

# Translate each untranslated row
for count, idx in enumerate(untranslated_indices):
    print(f"\nProcessing question {count+1} of {total_untranslated} (ID: {idx})")
    row = unique_questions.loc[idx]
    
    # Build JSON with all question parts
    question_json = {
        "title": row['question_title'],
        "options": {
            "a": row['option_a'],
            "b": row['option_b'],
            "c": row['option_c'],
            "d": row['option_d'],
            "e": row['option_e']
        }
    }
    
    # Translate entire JSON
    translated_json_str = translate_to_english(json.dumps(question_json), json_mode=True)
    
    try:
        # Parse translated JSON
        translated_json = json.loads(translated_json_str)
        
        # Extract translations back to dataframe
        unique_questions.at[idx, 'question_title_en'] = translated_json['title']
        unique_questions.at[idx, 'option_a_en'] = translated_json['options']['a']
        unique_questions.at[idx, 'option_b_en'] = translated_json['options']['b']
        unique_questions.at[idx, 'option_c_en'] = translated_json['options']['c']
        unique_questions.at[idx, 'option_d_en'] = translated_json['options']['d']
        unique_questions.at[idx, 'option_e_en'] = translated_json['options']['e']
        
        print("Translated question and all options")
    except json.JSONDecodeError as e:
        print(f"Error parsing translated JSON for question {idx}: {e}")
        continue

    # Save progress every 50 questions
    if (count + 1) % 50 == 0:
        print(f"\nSaving progress after {count + 1} translations...")
        unique_questions.to_csv(translation_file, index=False)

# Save final results
unique_questions.to_csv(translation_file, index=False)
print("\nTranslation complete!")

# Display a few examples of translations
print("\nExample translations:")
sample_questions = unique_questions.sample(min(3, len(unique_questions)))
for _, row in sample_questions.iterrows():
    print("\nQuestion:")
    print(f"Spanish: {row['question_title']}")
    print(f"English: {row['question_title_en']}")
    print("\nOptions:")
    print(f"A - Spanish: {row['option_a']}")
    print(f"A - English: {row['option_a_en']}")
    print(f"B - Spanish: {row['option_b']}")
    print(f"B - English: {row['option_b_en']}")
    print(f"C - Spanish: {row['option_c']}")
    print(f"C - English: {row['option_c_en']}")
    print(f"D - Spanish: {row['option_d']}")
    print(f"D - English: {row['option_d_en']}")
    print(f"E - Spanish: {row['option_e']}")
    print(f"E - English: {row['option_e_en']}")



Translating questions and options...
Loading existing translations...
Found 1 questions that need translation

Processing question 1 of 1 (ID: 141952)


KeyboardInterrupt: 

In [9]:
unique_questions.head()

Unnamed: 0,question_id,options,question_title,correct_option,solution,hint,difficulty,topic_id,topic_name,subject_id,...,option_b,option_c,option_d,option_e,question_title_en,option_a_en,option_b_en,option_c_en,option_d_en,option_e_en
0,27281,"""---\n- '6'\n- '7'\n- '9'\n- '10'\n- '8'\n""",Daniela dio 56 vueltas en total la semana ante...,8,Paso 1<br>Como el enunciado dice que cada día ...,El promedio corresponde a la suma de todos los...,-1.446116,625,Comprensión e interpretación del promedio,135,...,7,9,10,8,Daniela made 56 laps in total last week. If we...,6,7,9,10,8
1,17311,"""---\n- \""\\\\begin{Bmatrix}-x - y = -2\\\\\\\...",<div>¿Cuál de los siguientes sistemas tiene po...,\begin{Bmatrix}x + y = 1\\ x - y = 1\end{Bmatrix},<div>Paso 1. Tomaremos el siguiente sistema y ...,"<div>1. Un punto es solución de un sistema, cu...",-1.358058,452,Resolución de sistemas de ecuaciones,124,...,\begin{Bmatrix}2x + 3y = 3\ -6x - 9y = -6\end{...,\begin{Bmatrix}x - 3y = 0\ x + 3y = 2\end{Bmat...,\begin{Bmatrix}2x + 3y = -2\ -2x - y = 2\end{B...,\begin{Bmatrix}x + y = 1\ x - y = 1\end{Bmatrix},<div>Which of the following systems has the so...,\begin{Bmatrix}-x - y = -2\ -x/2 - y/2 = -1\en...,\begin{Bmatrix}2x + 3y = 3\ -6x - 9y = -6\end{...,\begin{Bmatrix}x - 3y = 0\ x + 3y = 2\end{Bmat...,\begin{Bmatrix}2x + 3y = -2\ -2x - y = 2\end{B...,\begin{Bmatrix}x + y = 1\ x - y = 1\end{Bmatrix}
2,25537,"""---\n- \""`-71` celsius\""\n- \""`-70` celsius\""...",En La Seerena la temperatura a las 5 pm es 4 g...,`-68` celsius,Paso 1: Lo primero que debemos hacer es identi...,<div>Para resolver este problema deberás reali...,-0.027555,553,Operatoria combinada de números enteros,181,...,`-70` celsius,`-72` celsius,`-69` celsius,`-68` celsius,"In La Seerena, the temperature at 5 pm is 4 de...",`-71` celsius,`-70` celsius,`-72` celsius,`-69` celsius,`-68` celsius
3,26175,"""---\n- \""`4^3`\""\n- \""`8^3`\""\n- \""`27^3`\""\n...",¿Cuál de las alternativas es equivalente `(9^(...,`3^3`,Paso 1: Lo primero es identificar que las pote...,Recuerda que en la multiplicación y división d...,-1.062105,580,Propiedad: Multiplicación y división de una po...,116,...,`8^3`,`27^3`,`9^9`,`3^3`,Which of the alternatives is equivalent `(9^(3...,`4^3`,`8^3`,`27^3`,`9^9`,`3^3`
4,23834,"""---\n- \""`39/6`\""\n- \""`36/6`\""\n- \""`6/6`\""\...",<div>¿Cuál de las siguientes fracciones es equ...,`36/6`,Para saber si una fracción es equivalente a ot...,<div>Te recomendamos amplificar la fracción de...,-0.95657,604,"Fracciones propias, equivalentes y comparación...",113,...,`36/6`,`6/6`,`37/6`,`38/6`,<div>Which of the following fractions is equiv...,`39/6`,`36/6`,`6/6`,`37/6`,`38/6`
