In [1]:
!pip install pandas emoji langdetect indic-transliteration openpyxl



In [2]:
import pandas as pd
import re
import emoji
from langdetect import detect, DetectorFactory
from indic_transliteration.sanscript import transliterate
from indic_transliteration import sanscript
from datetime import datetime

In [3]:
# Ensure reproducibility for langdetect: its detection is non-deterministic by
# default, so the seed is fixed once here, before the first detect() call.
DetectorFactory.seed = 0

In [4]:
# Load the Excel file
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# running the notebook anywhere other than the original author's machine.
file_path = '/Users/hemanthnagulapalli/Documents/GitHub/cs521/English-Hindi code-mixed parallel corpus.xlsx'
# No sheet_name is passed, so pandas reads the first worksheet of the workbook.
# NOTE(review): later cells read df['Comment'] — confirm that sheet has that column.
df = pd.read_excel(file_path)

ValueError: Worksheet named 'PostID Edited Comments Hindi' not found

In [None]:
# Function to clean comment
def clean_comment(comment):
    """Normalise one raw comment before language detection.

    Steps, in order: drop @mentions, replace emojis with their text names,
    drop link tokens, collapse whitespace.

    Args:
        comment: The raw cell value. Anything that is not a ``str`` (for
            example NaN from an empty spreadsheet cell) counts as "no comment".

    Returns:
        The cleaned comment, or ``''`` when ``comment`` is not a string.
    """
    if not isinstance(comment, str):
        return ''
    # Remove @mentions: '@' plus every following non-whitespace character
    # (covers names with '.', '_' and digits, e.g. @user.name_123).
    comment = re.sub(r'@\S+', '', comment)
    # Replace emojis with descriptive text; the space delimiters make each
    # emoji name its own whitespace-separated token.
    comment = emoji.demojize(comment, delimiters=(" ", " "))
    # Remove links. Besides tokens containing 'https:', also drop plain
    # 'http:' links and bare 'www.' links, regardless of letter case.
    comment = ' '.join(
        word for word in comment.split()
        if not re.search(r'https?:|^www\.', word, flags=re.IGNORECASE)
    )
    # Remove extra whitespace
    comment = re.sub(r'\s+', ' ', comment).strip()
    return comment

In [None]:
# Detect language
def get_language(text):
    """Return the language code langdetect assigns to ``text``.

    Args:
        text: The (already cleaned) comment.

    Returns:
        The code returned by ``detect``, or ``'unknown'`` when there is
        nothing to detect or detection fails.
    """
    # Empty, whitespace-only or non-string input has no detectable language.
    if not isinstance(text, str) or not text.strip():
        return 'unknown'
    try:
        return detect(text)
    except Exception:
        # Best effort: one undetectable comment must not abort the whole run.
        # `Exception` (not a bare except) lets Ctrl-C / kernel interrupts through.
        return 'unknown'

In [None]:
# Custom fix to make transliteration more readable
def fix_transliteration(text):
    """Rewrite a romanised string with a fixed list of substitutions, then lower-case it.

    The substitutions run one after another in the order listed, each over the
    result of the previous one, so the order matters (for instance 'aa' is
    shortened before 'A' is expanded to 'aa').

    Args:
        text: The romanised string to post-process.

    Returns:
        The substituted string, converted to lower case.
    """
    # (search, replacement) pairs; pairs whose two sides are equal leave the
    # text unchanged.
    substitutions = (
        ('aa', 'a'), ('ii', 'i'), ('uu', 'u'),
        ('A', 'aa'), ('I', 'ee'), ('U', 'oo'),
        ('M', 'n'), ('shh', 'sh'), ('chh', 'chh'),
        ('kh', 'kh'), ('gh', 'gh'), ('th', 'th'), ('dh', 'dh'),
        ('ph', 'ph'), ('bh', 'bh'), ('v', 'v'),
    )
    result = text
    for search, replacement in substitutions:
        result = result.replace(search, replacement)
    return result.lower()

In [None]:
# Transliterate if not English
def transliterate_if_needed(comment, lang):
    """Romanise the Devanagari parts of a comment that was not detected as English.

    Only runs of Devanagari characters (U+0900-U+097F) are transliterated to
    ITRANS and passed through ``fix_transliteration``. Text already in Latin
    script is left exactly as it is; previously the whole comment went through
    the ITRANS clean-up, which rewrote ordinary Latin words (e.g. 'M' -> 'n',
    'I' -> 'ee') and lower-cased them.

    Args:
        comment: The cleaned comment text.
        lang: The language code assigned to ``comment``; ``'en'`` means
            "leave the comment untouched".

    Returns:
        The comment with Devanagari runs romanised, or the unchanged comment
        when ``lang`` is ``'en'``, when it contains no Devanagari, or when
        transliteration fails.
    """
    if lang == 'en':
        return comment

    def _romanise(match):
        # Transliterate one Devanagari run and tidy the raw ITRANS output.
        raw = transliterate(match.group(0), sanscript.DEVANAGARI, sanscript.ITRANS)
        return fix_transliteration(raw)

    try:
        return re.sub(r'[\u0900-\u097F]+', _romanise, comment)
    except Exception:
        # Best effort: keep the original text rather than failing the row.
        return comment

In [None]:
# Apply cleaning: normalise each raw comment, label its language, then
# romanise the comments that were not detected as English.
if 'Comment' not in df.columns:
    raise KeyError(
        f"Expected a 'Comment' column in the loaded sheet, found: {list(df.columns)}"
    )
df['Cleaned_Comment'] = df['Comment'].map(clean_comment)
df['Lang'] = df['Cleaned_Comment'].map(get_language)
# Pair each cleaned comment with its language directly instead of a row-wise
# df.apply(axis=1): same values, but faster and it also works on an empty frame.
df['Cleaned_Comment'] = [
    transliterate_if_needed(text, lang)
    for text, lang in zip(df['Cleaned_Comment'], df['Lang'])
]

In [None]:
# Select the raw and cleaned text for export and give the raw column its
# output name. Nothing is written here; the file is saved in the timestamped
# export step.
final_df = df[['Comment', 'Cleaned_Comment']].rename(
    columns={'Comment': 'Original_Comment'}
)

In [None]:
# Name the export after the current local time (to the second) so each run
# writes its own file, then write the frame without the index column.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
output_filename = "Cleaned_Comments_{}.csv".format(timestamp)
final_df.to_csv(output_filename, index=False)