In [1]:
import pandas as pd
from tqdm import tqdm
from deep_translator import GoogleTranslator
from langdetect import detect_langs
import re

In [2]:
df = pd.read_csv("raw data\srilanka_all_text.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73029 entries, 0 to 73028
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           73029 non-null  object 
 1   post_title   73029 non-null  object 
 2   type         73029 non-null  object 
 3   body         62981 non-null  object 
 4   score        73029 non-null  int64  
 5   url          73029 non-null  object 
 6   created_utc  73029 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 3.9+ MB


## Remove Duplicates

In [3]:
df.duplicated(subset=["post_title", "body"]).sum()

np.int64(34795)

In [4]:
df.drop_duplicates(subset=["post_title", "body"], inplace=True)
df.reset_index(drop=True, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38234 entries, 0 to 38233
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           38234 non-null  object 
 1   post_title   38234 non-null  object 
 2   type         38234 non-null  object 
 3   body         32619 non-null  object 
 4   score        38234 non-null  int64  
 5   url          38234 non-null  object 
 6   created_utc  38234 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 2.0+ MB


## Translations to "en"

In [5]:
translator = GoogleTranslator(source="auto", target="en")
tqdm.pandas()

def safe_translate(text):
    try:
        if not isinstance(text, str) or text.strip() == "":
            return text

        if len(text) > 5000:
            return text

        # Skip texts with no alphabetic/Unicode language characters
        if not re.search(r"[A-Za-z\u0D80-\u0DFF\u0B80-\u0BFF]", text):
            return text

        # Language detection
        try:
            langs = [str(l).split(':')[0] for l in detect_langs(text)]
        except:
            return text

        if re.search(r'[\u0D80-\u0DFF]', text) and "si" not in langs:
            langs.append("si")

        if re.search(r'[\u0B80-\u0BFF]', text) and "ta" not in langs:
            langs.append("ta")

        if "si" in langs or "ta" in langs:
            return translator.translate(text)

        return text

    except Exception as e:
        print("Translation error:", e)
        return text

In [6]:
## Title Translation
df["post_title"] = df["post_title"].progress_apply(safe_translate)

100%|██████████| 38234/38234 [06:36<00:00, 96.32it/s]  


In [7]:
## Body Translation
df["body"] = df["body"].progress_apply(safe_translate)

100%|██████████| 38234/38234 [06:07<00:00, 104.14it/s]


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38234 entries, 0 to 38233
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           38234 non-null  object 
 1   post_title   38234 non-null  object 
 2   type         38234 non-null  object 
 3   body         32619 non-null  object 
 4   score        38234 non-null  int64  
 5   url          38234 non-null  object 
 6   created_utc  38234 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 2.0+ MB


In [9]:
df.to_csv("raw data\sl_all_text_translated.csv", index=False, encoding="utf-8")

In [10]:
from datasketch import MinHash, MinHashLSH

In [11]:
df["text"] = df["post_title"].fillna("") + " " + df["body"].fillna("")

## Remove Highly Similar Posts

In [12]:
# Simple tokenizer
def tokenize(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    return text.split()

# Create MinHash objects
def create_minhash(text, num_perm=128):
    tokens = tokenize(text)
    m = MinHash(num_perm=num_perm)
    for t in tokens:
        m.update(t.encode("utf8"))
    return m

# LSH for detecting near-duplicates
lsh = MinHashLSH(threshold=0.90, num_perm=128)

minhashes = {}
to_remove = set()

In [13]:
for idx in tqdm(df.index):

    if idx in to_remove:
        continue

    text = df.at[idx, "text"]

    m = create_minhash(text)
    minhashes[idx] = m

    # Query similar docs
    result = lsh.query(m)

    if result:
        # Mark all later similar posts for removal
        for r in result:
            if r != idx:
                to_remove.add(r)

    lsh.insert(idx, m)

print(f"Highly similar posts detected: {len(to_remove)}")

100%|██████████| 38234/38234 [01:10<00:00, 538.64it/s]

Highly similar posts detected: 5507





In [14]:
df_simclean = df.drop(index=list(to_remove)).reset_index(drop=True)
print(f"Before: {df.shape[0]}")
print(f"After: {df_simclean.shape[0]}")
print(f"Removed: {df.shape[0] - df_simclean.shape[0]}")

Before: 38234
After: 32727
Removed: 5507


In [15]:
df_simclean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32727 entries, 0 to 32726
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           32727 non-null  object 
 1   post_title   32727 non-null  object 
 2   type         32727 non-null  object 
 3   body         27217 non-null  object 
 4   score        32727 non-null  int64  
 5   url          32727 non-null  object 
 6   created_utc  32727 non-null  float64
 7   text         32727 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 2.0+ MB


In [16]:
if not df_simclean.empty:
    df_simclean.to_csv("processed data\sl_all_text_simclean.csv", index=False, encoding="utf-8")
    print("Data collection complete. Saved to processed data\sl_all_text_simclean.csv")
else:
    print("\nNo text posts were collected.")

Data collection complete. Saved to processed data\sl_all_text_simclean.csv
