In [1]:
!pip install deep-translator

import pandas as pd
import deep_translator

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep-translator
Successfully installed deep-translator-1.11.4


Load the dataset from the Google Drive link. The dataset consists of 400k+ rows and 2 columns: `text` and `label`.

Label classes: <br>
0 - Sad <br>
1 - Happy <br>
2 - Neutral <br>
3 - Anger <br>
4 - Scared

In [2]:
import pandas as pd
# Shareable Google Drive link to the dataset CSV.
share_link = 'https://drive.google.com/file/d/1roBNkutysp6l15N-brC_q7Gl8PkIRHkS/view?usp=sharing'
# The file id is the second-to-last path segment of the share link
# (".../d/<file id>/view?usp=sharing").
file_id = share_link.split('/')[-2]
# Rebuild the link in the "uc?id=" form, which is the URL handed to pandas.
url = 'https://drive.google.com/uc?id=' + file_id
df = pd.read_csv(url)

In [3]:
# Print the frame summary (index range, columns, non-null counts, dtypes)
# as a quick check of what was loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400217 entries, 0 to 400216
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    400217 non-null  object
 1   label   400217 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 6.1+ MB


Define the start row (inclusive) and the end row (exclusive) of the range you want to translate here.

In [4]:
# Row range to translate: START_ROW is inclusive, END_ROW is exclusive.
START_ROW = 60000
END_ROW = 100000

# Select by index *label* rather than by position. The loaded frame has a
# default 0..N-1 index, so labels equal positions on the first run, and
# re-running this cell on the already-sliced frame returns the same rows
# instead of an empty frame (which positional `iloc` would produce).
# `.loc` slices include the end label, hence END_ROW - 1.
# `.copy()` makes the selection an independent frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
# If you change the bounds to rows outside the current selection, re-run the
# dataset-loading cell first.
df = df.loc[START_ROW:END_ROW - 1].copy()

In [5]:
from deep_translator import GoogleTranslator
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Number of rows grouped into one batch (one translate_batch call).
BATCH_SIZE = 10
# Maximum number of worker threads used by the ThreadPoolExecutor below.
MAX_WORKERS = 5

# Module-level translator: auto-detected source language, target language code 'id'.
# NOTE(review): translate_batch creates its own translator per batch, so this
# instance appears unused in the visible cells — confirm before removing.
translator = deep_translator.GoogleTranslator(source='auto', target='id')

def safe_translate(text, translator, retries=3, retry_delay=2):
    """Translate ``text`` with ``translator``, retrying when a call fails.

    Args:
        text: Value to translate. ``None`` and NaN (missing cells) are treated
            as empty input; any other non-string value is converted with
            ``str`` before translating.
        translator: Object exposing a ``translate(text)`` method.
        retries: Maximum number of attempts. Values below 1 are treated as 1
            so the function always makes one attempt and never returns
            ``None`` implicitly.
        retry_delay: Seconds to wait between failed attempts.

    Returns:
        The translator's result, ``""`` for empty or missing input, or the
        sentinel string ``"Translation failed"`` when every attempt raised.

    Raises:
        RuntimeError: If an error message contains "too many requests"
            (case-insensitive). The original exception is attached as the
            cause. This is not retried, so the caller can stop the whole run.
    """
    # Missing values would otherwise crash on `.strip()` outside the try block.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    if not text.strip():
        return ""

    attempts = max(1, retries)
    for attempt in range(attempts):
        try:
            return translator.translate(text)
        except Exception as e:
            if "too many requests" in str(e).lower():
                print(f"Rate limit reached. Stopping translation. Last text: {text}")
                raise RuntimeError("API rate limit exceeded.") from e
            if attempt < attempts - 1:
                # Transient failure: pause, then try again.
                time.sleep(retry_delay)
            else:
                print(f"Translation failed for text: {text}. Error: {e}")
                return "Translation failed"

def translate_batch(batch, indices):
    """Translate one batch of texts and pair each result with its row index.

    A new GoogleTranslator (source 'auto', target 'id') is created on every
    call and passed to safe_translate for each text.

    Args:
        batch: Iterable of texts to translate.
        indices: Iterable of row indices, paired element-wise with ``batch``.

    Returns:
        A list of ``(index, translated_text)`` tuples in input order.
    """
    batch_translator = GoogleTranslator(source='auto', target='id')
    results = []
    for row_index, row_text in zip(indices, batch):
        results.append((row_index, safe_translate(row_text, batch_translator)))
    return results

In [6]:
# Split the selected rows into (texts, row labels) pairs of at most BATCH_SIZE
# rows each. `.iloc` makes the positional slicing of the text column explicit.
text_batches = [
    (df["text"].iloc[i:i + BATCH_SIZE].tolist(), df.index[i:i + BATCH_SIZE].tolist())
    for i in range(0, len(df), BATCH_SIZE)
]

translated_results = []

# Frame that the translations are joined onto. Dropping a "translated_text"
# column left over from an earlier run keeps the join below from failing on
# overlapping column names when this cell is re-run.
source_df = df.drop(columns=["translated_text"], errors="ignore")

try:
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(translate_batch, batch, indices): indices for batch, indices in text_batches}
        try:
            # Batches are handled in completion order, not submission order.
            for future in as_completed(futures):
                batch_result = future.result()
                translated_results.extend(batch_result)

                # Save progress: every row of the selection is written; rows not
                # translated yet have an empty translated_text.
                progress_df = pd.DataFrame(translated_results, columns=["index", "translated_text"]).set_index("index")
                progress_df = source_df.join(progress_df, how="left")
                progress_df.to_csv("translated_partial.csv", index=False)

                # NOTE(review): this is the last label of the joined frame (i.e. of
                # the whole selection), not of the most recently translated row.
                # It is not read in the visible cells — confirm before removing.
                last_translated_index = progress_df.index[-1]
                print(f"Progress saved: Translated {len(translated_results)} rows from index : {START_ROW}.")
        except RuntimeError as e:
            print(f"Stopping translation due to error: {e}")
            raise
        finally:
            # Leaving the `with` block waits for every submitted batch. Cancel the
            # ones that have not started so an error really stops the run;
            # cancelling a running or finished future is a harmless no-op.
            for pending in futures:
                pending.cancel()
except RuntimeError:
    print("Translation process stopped due to rate limit.")
except Exception as e:
    print(f"Unexpected error occurred: {e}")

if translated_results:
    final_df = pd.DataFrame(translated_results, columns=["index", "translated_text"]).set_index("index")
    # `assign` aligns on the row index and returns a new frame, which avoids
    # writing into a slice of the original data (SettingWithCopyWarning).
    df = source_df.assign(translated_text=final_df["translated_text"])
    df.to_csv("translated_final.csv", index=False)
    print("Final translation saved.")

Progress saved: Translated 10 rows from 60000-th row of original data.
Progress saved: Translated 20 rows from 60000-th row of original data.
Progress saved: Translated 30 rows from 60000-th row of original data.
Progress saved: Translated 40 rows from 60000-th row of original data.
Progress saved: Translated 50 rows from 60000-th row of original data.
Progress saved: Translated 60 rows from 60000-th row of original data.
Progress saved: Translated 70 rows from 60000-th row of original data.
Progress saved: Translated 80 rows from 60000-th row of original data.
Progress saved: Translated 90 rows from 60000-th row of original data.
Progress saved: Translated 100 rows from 60000-th row of original data.
Progress saved: Translated 110 rows from 60000-th row of original data.
Progress saved: Translated 120 rows from 60000-th row of original data.
Progress saved: Translated 130 rows from 60000-th row of original data.
Progress saved: Translated 140 rows from 60000-th row of original data.
P