In [1]:
import pandas as pd
from tqdm import tqdm
from deep_translator import GoogleTranslator

In [2]:
df = pd.read_csv("raw data\srilanka_all_text.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57557 entries, 0 to 57556
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           57557 non-null  object 
 1   post_title   57557 non-null  object 
 2   type         57557 non-null  object 
 3   body         49597 non-null  object 
 4   score        57557 non-null  int64  
 5   url          57557 non-null  object 
 6   created_utc  57557 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 3.1+ MB


## Remove Duplicates

In [3]:
df.duplicated(subset=["post_title", "body"]).sum()

np.int64(24520)

In [4]:
df.drop_duplicates(subset=["post_title", "body"], inplace=True)
df.reset_index(drop=True, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33037 entries, 0 to 33036
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           33037 non-null  object 
 1   post_title   33037 non-null  object 
 2   type         33037 non-null  object 
 3   body         28286 non-null  object 
 4   score        33037 non-null  int64  
 5   url          33037 non-null  object 
 6   created_utc  33037 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 1.8+ MB


In [None]:
df.to_csv("raw data\sl_all_text_drop_duplicates.csv", index=False, encoding="utf-8")

In [None]:
df_translate = df.copy()

## Translations to "en"

In [None]:
translator = GoogleTranslator(source="auto", target="en")
tqdm.pandas()

def safe_translate(text):
    try:
        if isinstance(text, str) and text.strip() != "" and len(text) <= 5000:
            return translator.translate(text)
        else:
            return text
    except Exception as e:
        print("Translation error:", e)
        return text  # keep original instead of crashing

### Title Translation

In [None]:
df_translate["post_title"] = df_translate["post_title"].progress_apply(safe_translate)

### Body Translation

In [None]:
df_translate["body"] = df_translate["body"].progress_apply(safe_translate)

In [None]:
df_translate.to_csv("raw data\sl_all_text_translated.csv", index=False, encoding="utf-8")

## Remove Highly Similar Posts

In [5]:
from datasketch import MinHash, MinHashLSH
import pandas as pd
from tqdm import tqdm
import re

# Combine title + body for similarity
df["text"] = df["post_title"].fillna("") + " " + df["body"].fillna("")

# Simple tokenizer
def tokenize(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    return text.split()

# Create MinHash objects
def create_minhash(text, num_perm=128):
    tokens = tokenize(text)
    m = MinHash(num_perm=num_perm)
    for t in tokens:
        m.update(t.encode("utf8"))
    return m

# LSH for detecting near-duplicates
lsh = MinHashLSH(threshold=0.85, num_perm=128)   # adjust threshold if needed

minhashes = {}
to_remove = set()

print("Creating MinHashes and detecting similar posts...")

for idx in tqdm(df.index):

    if idx in to_remove:
        continue

    text = df.at[idx, "text"]

    m = create_minhash(text)
    minhashes[idx] = m

    # Query similar docs
    result = lsh.query(m)

    if result:
        # Mark all later similar posts for removal
        for r in result:
            if r != idx:
                to_remove.add(r)

    lsh.insert(idx, m)

print(f"Highly similar posts detected: {len(to_remove)}")

# Remove similar posts
df_simclean = df.drop(index=list(to_remove)).reset_index(drop=True)

print(f"Before: {df.shape[0]}, After: {df_simclean.shape[0]}, Removed: {df.shape[0] - df_simclean.shape[0]}")

# Save
df_simclean.to_csv("reddit_translated_simclean.csv", index=False)
df_simclean.head()


Creating MinHashes and detecting similar posts...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 33037/33037 [01:06<00:00, 493.19it/s]


Highly similar posts detected: 5981
Before: 33037, After: 27056, Removed: 5981


Unnamed: 0,id,post_title,type,body,score,url,created_utc,text
0,1owpu40,Feel so ashamed as a Sri LankanðŸ˜–,text,Felt so ashamed watching this. People on touri...,385,https://i.redd.it/r032jkou961g1.jpeg,1763104000.0,Feel so ashamed as a Sri LankanðŸ˜– Felt so asham...
1,1o39qaf,Whatâ€™s happening Sri lanka!,text,Why these people are allowed inside Sri lanka?...,722,https://v.redd.it/z2ph50al0cuf1,1760123000.0,Whatâ€™s happening Sri lanka! Why these people a...
2,1nw4v0y,"This is insane, shows how many people in Sri L...",text,â€œTeachers and principals will resign from scho...,470,https://i.redd.it/mvnrxyglhpsf1.jpeg,1759414000.0,"This is insane, shows how many people in Sri L..."
3,1nne3va,What is something that happens in Sri Lanka bu...,text,,378,https://i.redd.it/2vplg98pjnqf1.jpeg,1758519000.0,What is something that happens in Sri Lanka bu...
4,1o9rh3m,Homophobia in Sri Lanka Is at an All Time High,text,Homophobia is crazy high in Sri Lanka these da...,265,https://www.reddit.com/gallery/1o9rh3m,1760780000.0,Homophobia in Sri Lanka Is at an All Time High...


In [6]:
df_simclean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27056 entries, 0 to 27055
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           27056 non-null  object 
 1   post_title   27056 non-null  object 
 2   type         27056 non-null  object 
 3   body         22451 non-null  object 
 4   score        27056 non-null  int64  
 5   url          27056 non-null  object 
 6   created_utc  27056 non-null  float64
 7   text         27056 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.7+ MB


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [None]:
df_translate["full_text"] = df_translate["post_title"].fillna("") + " " + df_translate["body"].fillna("")

In [None]:
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(df_translate["full_text"].tolist(), batch_size=32, show_progress_bar=True)

In [None]:
# fit NN model
nn = NearestNeighbors(n_neighbors=2, metric="cosine").fit(embeddings)

distances, indices = nn.kneighbors(embeddings)

# cosine similarity = 1 - distance
similarities = 1 - distances[:, 1]

# threshold for highly similar posts
threshold = 0.90  
to_remove = np.where(similarities > threshold)[0]
print("Highly similar posts found:", len(to_remove))

In [None]:
df_clean = df_translate.drop(index=to_remove).reset_index(drop=True)
df_clean.info()

In [None]:
df_clean.to_csv("processed data\sl_cleaned_txt.csv", index=False)