In [1]:
!pip install pandas openpyxl langdetect emoji



In [2]:
import pandas as pd
import re
import csv
from langdetect import detect  # For language detection
import emoji  # For emoji handling

In [3]:
# Strip @mentions out of a piece of text
def remove_mentions(text):
    """Return `text` with every @mention deleted and outer whitespace trimmed.

    A mention is an '@' followed by one or more ASCII letters, digits,
    underscores or dots. Whitespace left behind in the middle of the string is
    not collapsed; only leading and trailing whitespace is removed.
    """
    mention_pattern = r'@{1}[a-zA-Z0-9_.]+'
    without_mentions = re.sub(mention_pattern, '', text)
    return without_mentions.strip()

In [4]:
# Function to detect if text is predominantly English
def is_english(text):
    """Return True when the cleaned text is detected as English.

    Emojis and @mentions are removed before detection so they do not influence
    the language guess.

    Args:
        text: Raw comment text.

    Returns:
        True only if `detect` reports the language code 'en'. False when the
        text is empty after cleaning, is detected as another language, or when
        cleaning/detection raises an error.
    """
    try:
        # Remove emojis and mentions first for cleaner detection
        text_no_emoji = emoji.replace_emoji(text, replace='')
        text_clean = remove_mentions(text_no_emoji)
        if not text_clean:  # Nothing left to classify
            return False
        return detect(text_clean) == 'en'
    except Exception:
        # Best-effort: any detection failure counts as "not English".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long-running loop can still be interrupted.
        # NOTE(review): langdetect results may vary between runs unless its
        # seed is fixed — confirm against the langdetect documentation.
        return False

In [5]:
# Swap each emoji for its English description
def replace_emojis(text):
    """Return `text` with every emoji replaced by a textual description.

    For each emoji, the 'en' entry of the emoji's data dictionary is taken,
    its colons are turned into spaces, and surrounding whitespace is stripped.
    (Presumably the 'en' entry is a shortcode such as ':red_heart:' — verify
    against the installed `emoji` version.)
    """
    def _describe(chars, data_dict):
        # `chars` (the matched emoji itself) is not needed for the description.
        english_name = data_dict['en']
        return english_name.replace(':', ' ').strip()

    return emoji.replace_emoji(text, replace=_describe)

In [7]:
# Read the Excel file
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that exact file exists. Consider a path relative to a
# configurable data directory.
input_file = "/Users/hemanthnagulapalli/Desktop/hinglish_data/Indo-HateSpeech_Dataset-dc1.xlsx"  
# Load the workbook into `df` using pandas' default read_excel options.
df = pd.read_excel(input_file)

In [8]:
# Guard: stop immediately if the loaded sheet has no "Comment" column
if not ("Comment" in df.columns):
    raise ValueError(
        "The Excel file must contain a column named 'Comment'"
    )

In [9]:
# Process the data
# Builds `cleaned_data`: one [original comment, cleaned comment] pair per
# non-empty row. Rows not recognised as English keep their original text but
# get the placeholder "[Removed: Non-English]" as the cleaned value.
cleaned_data = []
# dropna() skips missing cells up front. Without it, str(NaN) yields the
# literal text "nan", which passes the emptiness check below and would be
# written out as a bogus comment.
for comment in df["Comment"].dropna():
    actual_comment = str(comment).strip()  # Cells may hold non-string values; normalise to text
    if not actual_comment:  # Skip empty / whitespace-only rows
        continue

    # Check if the row is English
    if is_english(actual_comment):
        # Clean the comment: drop @mentions, then spell out emojis as words
        cleaned_comment = remove_mentions(actual_comment)
        cleaned_comment = replace_emojis(cleaned_comment)
        cleaned_data.append([actual_comment, cleaned_comment])
    else:
        cleaned_data.append([actual_comment, "[Removed: Non-English]"])

In [10]:
# Dump the collected rows to a CSV file (header first, then the data rows)
output_file = "cleaned_data.csv"
# newline='' leaves line-ending handling to the csv module.
with open(output_file, mode='w', encoding='utf-8', newline='') as out_handle:
    csv_out = csv.writer(out_handle)
    csv_out.writerows([["Actual Comment", "Cleaned Comment"], *cleaned_data])

print(f"CSV file '{output_file}' has been created successfully!")

CSV file 'cleaned_data.csv' has been created successfully!
