In [1]:
!pip install pandas emoji langdetect indic-transliteration openpyxl



In [2]:
import pandas as pd
import re
import emoji
from langdetect import detect, DetectorFactory
from indic_transliteration.sanscript import transliterate
from indic_transliteration import sanscript

In [3]:
# Pin langdetect's seed so that repeated runs of this notebook assign the same
# language code to the same comment. This has to execute before the first
# call to detect().
DetectorFactory.seed = 0

In [4]:
# Load the Excel file.
# The path is relative, so the workbook must sit in the kernel's working directory.
file_path = 'Indo-HateSpeech_Dataset-dc1.xlsx'
# Read only the named worksheet; the processing cell further down reads its 'Comment' column.
df = pd.read_excel(file_path, sheet_name='PostID Edited Comments Hindi')

In [5]:
# Normalise one raw comment: strip mentions, spell out emojis, tidy whitespace
def clean_comment(comment):
    """Return a cleaned copy of ``comment``.

    Anything that is not a ``str`` yields the empty string. For strings:

    1. every ``@`` followed by non-whitespace characters is removed,
    2. emojis are replaced by their names, delimited by spaces,
    3. whitespace runs collapse to one space and the ends are trimmed.
    """
    if isinstance(comment, str):
        # Mentions such as @user.name_123 (everything up to the next whitespace)
        without_mentions = re.sub(r'@\S+', '', comment)
        # Space delimiters keep each emoji name separated from neighbouring text
        with_emoji_names = emoji.demojize(without_mentions, delimiters=(" ", " "))
        return re.sub(r'\s+', ' ', with_emoji_names).strip()
    return ''

In [6]:
# Detect language
def get_language(text):
    """Return the language code reported by ``detect`` for ``text``.

    Returns ``'unknown'`` when ``text`` is not a string, is empty or
    whitespace-only, or when detection fails for any other reason.
    """
    # Nothing to classify: skip the detector call entirely.
    if not isinstance(text, str) or not text.strip():
        return 'unknown'
    try:
        return detect(text)
    except Exception:
        # Best-effort: a comment that cannot be classified must not abort the
        # whole run. `Exception` (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate so a long run can still be interrupted.
        return 'unknown'

In [7]:
# Custom fix to make transliteration more readable
def fix_transliteration(text):
    """Apply a fixed sequence of substring rewrites, then lowercase ``text``.

    The rewrites run in the listed order over the whole string:
    doubled lowercase vowels are collapsed (``aa`` -> ``a`` ...), capital
    ``A``/``I``/``U`` are expanded to ``aa``/``ee``/``oo``, capital ``M``
    becomes ``n`` and ``shh`` becomes ``sh``. Any remaining capitals are
    lowercased at the end.

    Digraphs such as ``kh``, ``gh``, ``th``, ``dh``, ``ph``, ``bh``, ``chh``
    and the letter ``v`` are intentionally left as they are.
    """
    # Order matters: the doubled-vowel collapse runs before the capital-vowel
    # expansion, so the 'aa' produced from 'A' is not collapsed again.
    replacements = (
        ('aa', 'a'), ('ii', 'i'), ('uu', 'u'),
        ('A', 'aa'), ('I', 'ee'), ('U', 'oo'),
        ('M', 'n'), ('shh', 'sh'),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text.lower()

In [8]:
# Transliterate if not English
def transliterate_if_needed(comment, lang):
    """Romanize the Devanagari portions of ``comment`` unless ``lang`` is 'en'.

    Only runs of Devanagari characters (optionally joined by whitespace) are
    passed through ``transliterate`` and ``fix_transliteration``. Text that is
    already in Latin script (romanized Hindi, emoji names, English words
    inside a mixed comment) is returned untouched, because the readability
    rewrites in ``fix_transliteration`` would otherwise corrupt it
    (for example turning "MODI" into "nodee").

    Returns ``comment`` unchanged when ``lang`` is 'en', when ``comment`` is
    not a string, when it contains no Devanagari, or when transliteration
    raises.
    """
    if lang == 'en' or not isinstance(comment, str):
        return comment

    def _romanize(match):
        raw = transliterate(match.group(0), sanscript.DEVANAGARI, sanscript.ITRANS)
        return fix_transliteration(raw)

    try:
        # U+0900-U+097F is the Devanagari Unicode block. Whitespace between
        # adjacent Devanagari words is kept inside one match so a fully
        # Devanagari comment is transliterated in a single call.
        return re.sub(
            r'[\u0900-\u097F]+(?:\s+[\u0900-\u097F]+)*', _romanize, comment
        )
    except Exception:
        # Best-effort: fall back to the cleaned comment rather than failing
        # the whole run; KeyboardInterrupt/SystemExit still propagate.
        return comment

In [9]:
# Apply cleaning
df['Cleaned_Comment'] = df['Comment'].apply(clean_comment)
# Language is detected on the cleaned text, i.e. after mentions are removed
# and emojis are replaced by their names.
df['Lang'] = df['Cleaned_Comment'].apply(get_language)
# Walk the two columns in lockstep instead of using a row-wise
# DataFrame.apply(axis=1): this avoids building a Series per row and always
# yields exactly one value per row, including when the sheet has no rows.
df['Transliterated_Comment'] = [
    transliterate_if_needed(cleaned, lang)
    for cleaned, lang in zip(df['Cleaned_Comment'], df['Lang'])
]

In [10]:
# Save to CSV
# Keep the raw, cleaned and transliterated text columns (the detected language
# is not exported) and expose the raw column as 'Original_Comment'.
final_df = df[['Comment', 'Cleaned_Comment', 'Transliterated_Comment']].rename(
    columns={'Comment': 'Original_Comment'}
)
# index=False: do not write the DataFrame's index as an extra column.
final_df.to_csv('Transliterated_Comments.csv', index=False)