In [None]:
import pandas as pd
from tqdm import tqdm
from deep_translator import GoogleTranslator

In [None]:
# Load the raw scraped posts. A forward slash is used in the path because
# "\s" is not a valid string escape (newer Python versions warn about it) and
# a backslash separator only works on Windows; "/" works on every platform.
df = pd.read_csv("raw data/srilanka_all_text.csv")
df.info()

## Remove Duplicates

In [None]:
# Number of rows whose (post_title, body) pair repeats an earlier row.
df[["post_title", "body"]].duplicated().sum()

In [None]:
# Keep the first occurrence of every (post_title, body) pair and renumber
# the rows from 0 so the index is contiguous again.
df = (
    df
    .drop_duplicates(subset=["post_title", "body"])
    .reset_index(drop=True)
)
df.info()

In [None]:
# Persist the de-duplicated posts. "/" replaces the backslash separator, which
# formed the invalid escape "\s" and only resolved to a sub-folder on Windows.
df.to_csv("raw data/sl_all_text_drop_duplicates.csv", index=False, encoding="utf-8")

In [None]:
# Work on an independent copy so the translated columns do not overwrite `df`.
df_translate = df.copy(deep=True)

## Translate Text to English ("en")

In [None]:
# Enable `progress_apply` on pandas objects; the translation cells use it.
tqdm.pandas()

# Shared translator instance: source language auto-detected, target English.
translator = GoogleTranslator(source="auto", target="en")

def safe_translate(text):
    """Translate ``text`` with the module-level ``translator``, never raising.

    The input is returned unchanged, without calling the translator, when it
    is not a string, is empty or whitespace-only, or is longer than 5000
    characters (presumably the translator's per-request limit; confirm).
    If anything raises, the error is printed and the original value returned.
    """
    try:
        if not isinstance(text, str):
            return text
        if text.strip() == "" or len(text) > 5000:
            return text
        return translator.translate(text)
    except Exception as err:
        print("Translation error:", err)
        # Best effort: a failing row keeps its original text rather than
        # aborting the whole column.
        return text

### Title Translation

In [None]:
# Replace each title with the result of safe_translate, showing a progress bar.
df_translate = df_translate.assign(
    post_title=df_translate["post_title"].progress_apply(safe_translate)
)

### Body Translation

In [None]:
# Replace each body with the result of safe_translate, showing a progress bar.
df_translate = df_translate.assign(
    body=df_translate["body"].progress_apply(safe_translate)
)

In [None]:
# Persist the translated posts. "/" replaces the backslash separator, which
# formed the invalid escape "\s" and only resolved to a sub-folder on Windows.
df_translate.to_csv("raw data/sl_all_text_translated.csv", index=False, encoding="utf-8")

## Remove Duplicates (if any exist after translation)

In [None]:
# Number of rows whose translated (post_title, body) pair repeats an earlier row.
df_translate[["post_title", "body"]].duplicated().sum()

In [None]:
# Uncomment the lines below if the duplicate count for df_translate is non-zero.
# df_translate.drop_duplicates(subset=["post_title", "body"], inplace=True)
# df_translate.reset_index(drop=True, inplace=True)
# df_translate.info()

## Remove Highly Similar Posts

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [None]:
# Join title and body with a single space into one text field per post;
# missing values are treated as empty strings.
df_translate["full_text"] = (
    df_translate["post_title"].fillna("")
    .add(" ")
    .add(df_translate["body"].fillna(""))
)

In [None]:
# Encode the combined text of every post with the "all-MiniLM-L6-v2" model.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(
    list(df_translate["full_text"]),
    batch_size=32,
    show_progress_bar=True,
)

In [None]:
# Fit a cosine-distance nearest-neighbour index on the post embeddings.
nn = NearestNeighbors(n_neighbors=2, metric="cosine").fit(embeddings)

# With the fitted points as queries, column 0 is normally the post itself,
# so column 1 is taken as the closest *other* post.
distances, indices = nn.kneighbors(embeddings)

# cosine similarity = 1 - cosine distance
similarities = 1 - distances[:, 1]

# threshold for highly similar posts
threshold = 0.90

# Selecting every post whose nearest neighbour exceeds the threshold would
# drop *both* members of each near-duplicate pair, losing the content
# entirely. Instead, walk the posts in order and, for every post that is still
# kept, mark only its *later* near-duplicates for removal. One representative
# of each group survives and no two kept posts exceed the threshold.
neighbor_dists, neighbor_ids = nn.radius_neighbors(embeddings, radius=1 - threshold)
is_removed = np.zeros(len(neighbor_ids), dtype=bool)
for pos, (dists, ids) in enumerate(zip(neighbor_dists, neighbor_ids)):
    if is_removed[pos]:
        continue  # already a near-duplicate of an earlier kept post
    # `ids > pos` also excludes the post itself from its own neighbour list.
    later_duplicates = ids[(1 - dists > threshold) & (ids > pos)]
    is_removed[later_duplicates] = True

# Row positions (not index labels) of the posts to discard.
to_remove = np.flatnonzero(is_removed)
print("Highly similar posts found:", len(to_remove))

In [None]:
# `to_remove` holds row *positions*, whereas `DataFrame.drop(index=...)`
# matches index *labels*. Filtering with a positional boolean mask removes the
# intended rows even when the index is not a default 0..n-1 range.
keep_mask = np.ones(len(df_translate), dtype=bool)
keep_mask[to_remove] = False
df_clean = df_translate[keep_mask].reset_index(drop=True)
df_clean.info()

In [None]:
# Persist the cleaned posts. "/" replaces the backslash separator, which formed
# the invalid escape "\s" and only resolved to a sub-folder on Windows. The
# encoding is spelled out to match the other CSV exports in this notebook.
df_clean.to_csv("processed data/sl_cleaned_txt.csv", index=False, encoding="utf-8")