## Translations to "en"

In [1]:
import pandas as pd
from tqdm import tqdm
from deep_translator import GoogleTranslator

In [2]:
df_translate = pd.read_csv("raw data\srilanka_all_text.csv")

In [3]:
translator = GoogleTranslator(source="auto", target="en")
tqdm.pandas()

In [4]:
try:
    # Translate title
    df_translate["post_title"] = df_translate["post_title"].progress_apply(
        lambda x: translator.translate(x) if isinstance(x, str) and x.strip() != "" and len(x) <= 5000 
        else x
    )
except Exception as e:
    print("Title Translation Error:", e)

100%|██████████| 42601/42601 [5:24:05<00:00,  2.19it/s]   


In [9]:
def safe_translate(text):
    try:
        if isinstance(text, str) and text.strip() != "" and len(text) <= 5000:
            return translator.translate(text)
        else:
            return text
    except Exception as e:
        print("Row translation error:", e)
        return text  # keep original instead of crashing

In [10]:
df_translate["body"] = df_translate["body"].progress_apply(safe_translate)

 27%|██▋       | 11685/42601 [1:23:12<4:36:46,  1.86it/s] 

Row translation error: Request exception can happen due to an api connection error. Please check your connection and try again


 35%|███▌      | 15022/42601 [1:48:47<3:04:44,  2.49it/s] 

Row translation error: Request exception can happen due to an api connection error. Please check your connection and try again


 49%|████▊     | 20666/42601 [2:34:52<3:59:21,  1.53it/s] 

Row translation error: Request exception can happen due to an api connection error. Please check your connection and try again


 79%|███████▉  | 33663/42601 [4:17:56<1:00:00,  2.48it/s] 

Row translation error: Request exception can happen due to an api connection error. Please check your connection and try again


 79%|███████▉  | 33665/42601 [4:17:56<51:56,  2.87it/s]  

Row translation error: Request exception can happen due to an api connection error. Please check your connection and try again


 79%|███████▉  | 33678/42601 [4:18:02<1:06:11,  2.25it/s]

Row translation error: Request exception can happen due to an api connection error. Please check your connection and try again


 79%|███████▉  | 33687/42601 [4:18:07<1:11:15,  2.08it/s]

Row translation error: Request exception can happen due to an api connection error. Please check your connection and try again


 79%|███████▉  | 33710/42601 [4:18:19<52:54,  2.80it/s]  

Row translation error: Request exception can happen due to an api connection error. Please check your connection and try again


 79%|███████▉  | 33731/42601 [4:18:32<1:57:35,  1.26it/s]

Row translation error: Request exception can happen due to an api connection error. Please check your connection and try again


 79%|███████▉  | 33732/42601 [4:18:32<1:45:55,  1.40it/s]

Row translation error: Request exception can happen due to an api connection error. Please check your connection and try again


 79%|███████▉  | 33735/42601 [4:18:34<1:38:48,  1.50it/s]

Row translation error: Request exception can happen due to an api connection error. Please check your connection and try again


 79%|███████▉  | 33755/42601 [4:18:40<49:20,  2.99it/s]  

Row translation error: Request exception can happen due to an api connection error. Please check your connection and try again


 79%|███████▉  | 33760/42601 [4:18:42<56:33,  2.61it/s]

Row translation error: Request exception can happen due to an api connection error. Please check your connection and try again


100%|██████████| 42601/42601 [5:24:40<00:00,  2.19it/s]  


In [11]:
df_translate.to_csv("raw data\srilanka_all_text_translated.csv", index=False, encoding="utf-8")

## Remove Duplicates

In [12]:
df_translate.duplicated(subset=["post_title", "body"]).sum()

np.int64(17311)

In [13]:
df_translate.drop_duplicates(subset=["post_title", "body"], inplace=True)
df_translate.reset_index(drop=True, inplace=True)

In [14]:
df_translate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25290 entries, 0 to 25289
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           25290 non-null  object 
 1   post_title   25289 non-null  object 
 2   type         25290 non-null  object 
 3   body         20950 non-null  object 
 4   score        25290 non-null  int64  
 5   url          25290 non-null  object 
 6   created_utc  25290 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 1.4+ MB


## Remove Highly Similar Posts

In [16]:
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
df_translate["full_text"] = df_translate["post_title"].fillna("") + " " + df_translate["body"].fillna("")

In [18]:
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(df_translate["full_text"].tolist(), batch_size=32, show_progress_bar=True)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 791/791 [06:05<00:00,  2.16it/s]


In [19]:
# fit NN model
nn = NearestNeighbors(n_neighbors=2, metric="cosine").fit(embeddings)

distances, indices = nn.kneighbors(embeddings)

# cosine similarity = 1 - distance
similarities = 1 - distances[:, 1]

# threshold for highly similar posts
threshold = 0.90  
to_remove = np.where(similarities > threshold)[0]
print("Highly similar posts found:", len(to_remove))

Highly similar posts found: 5354


In [20]:
df_clean = df_translate.drop(index=to_remove).reset_index(drop=True)

In [21]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19936 entries, 0 to 19935
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           19936 non-null  object 
 1   post_title   19935 non-null  object 
 2   type         19936 non-null  object 
 3   body         15818 non-null  object 
 4   score        19936 non-null  int64  
 5   url          19936 non-null  object 
 6   created_utc  19936 non-null  float64
 7   full_text    19936 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.2+ MB


In [22]:
df_clean.to_csv("processed data\sl_cleaned_txt.csv", index=False)