In [None]:
!pip install googletrans==4.0.0-rc1

In [1]:
import re
import time
from collections import Counter

import googletrans
import numpy as nm
import pandas as pd
from googletrans import Translator
from sklearn.model_selection import train_test_split as tts

**DATASET**

In [2]:
# Load the raw comment dataset from the CSV file in the working directory
df = pd.read_csv("dataset.csv")

In [3]:
# Display the loaded DataFrame
df

Unnamed: 0,comment,Category,Gender,comment react number,label
0,ওই হালার পুত এখন কি মদ খাওয়ার সময় রাতের বেলা...,Actor,Female,1.0,sexual
1,ঘরে বসে শুট করতে কেমন লেগেছে? ক্যামেরাতে কে ছি...,Singer,Male,2.0,not bully
2,"অরে বাবা, এই টা কোন পাগল????",Actor,Female,2.0,not bully
3,ক্যাপ্টেন অফ বাংলাদেশ,Sports,Male,0.0,not bully
4,পটকা মাছ,Politician,Male,0.0,troll
...,...,...,...,...,...
43996,"হিরো আলম,, এগিয়ে যাও",Social,Male,0.0,not bully
43997,হিরো আলমকে সাপোর্ট দেওয়ার জন্য অসংখ্য ধন্যবাদ...,Social,Male,1.0,not bully
43998,হিরো ভাই তুমি এগিয়ে য়াও,Social,Male,0.0,not bully
43999,হুম ভাও তোমরা এগিয়ে যাও আমরা তোমাদের পিছনে আছি,Actor,Female,1.0,not bully


In [5]:
# Count how many comments carry each value of the 'label' column
df['label'].value_counts()

not bully    15340
troll        10462
sexual        8928
religious     7577
threat        1694
Name: label, dtype: int64

In [6]:
# Integer code for every text label; codes are assigned 1..5 in the order listed
label_names = ['not bully', 'troll', 'sexual', 'religious', 'threat']
mapping = {name: code for code, name in enumerate(label_names, start=1)}

In [8]:
# Append 'nlabel': the integer code looked up in `mapping` for each row's 'label'
df = df.assign(nlabel=df['label'].map(mapping))
df

Unnamed: 0,comment,Category,Gender,comment react number,label,nlabel
0,ওই হালার পুত এখন কি মদ খাওয়ার সময় রাতের বেলা...,Actor,Female,1.0,sexual,3
1,ঘরে বসে শুট করতে কেমন লেগেছে? ক্যামেরাতে কে ছি...,Singer,Male,2.0,not bully,1
2,"অরে বাবা, এই টা কোন পাগল????",Actor,Female,2.0,not bully,1
3,ক্যাপ্টেন অফ বাংলাদেশ,Sports,Male,0.0,not bully,1
4,পটকা মাছ,Politician,Male,0.0,troll,2
...,...,...,...,...,...,...
43996,"হিরো আলম,, এগিয়ে যাও",Social,Male,0.0,not bully,1
43997,হিরো আলমকে সাপোর্ট দেওয়ার জন্য অসংখ্য ধন্যবাদ...,Social,Male,1.0,not bully,1
43998,হিরো ভাই তুমি এগিয়ে য়াও,Social,Male,0.0,not bully,1
43999,হুম ভাও তোমরা এগিয়ে যাও আমরা তোমাদের পিছনে আছি,Actor,Female,1.0,not bully,1


**SPLITTING**

In [9]:
# Split the dataframe into train (80%) and test (20%) parts, stratified on 'nlabel';
# the fixed random_state makes the split repeatable across runs
train, test = tts(df, test_size = 0.2, stratify = df['nlabel'], random_state = 42)

In [10]:
# Count the comments per 'label' value in the test split
test['label'].value_counts()

not bully    3068
troll        2093
sexual       1786
religious    1515
threat        339
Name: label, dtype: int64

In [11]:
# Write both splits to CSV files; index=False leaves the DataFrame index out of the files
train.to_csv('train_dataset.csv', index = False)
test.to_csv('test_dataset.csv', index = False)

**First preprocessing**

In [None]:
# Read the two split files back from disk
train_df = pd.read_csv("train_dataset.csv")
test_df = pd.read_csv("test_dataset.csv")

In [None]:
# Remove emojis, special characters and extra spaces, but keep asterisks ("*"),
# as they hold special meaning in our dataset.

# Both patterns are compiled once here instead of on every call of the function.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # Emoticons
    u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # Transport & map symbols
    u"\U0001F700-\U0001F77F"  # Alchemical symbols
    u"\U0001F780-\U0001F7FF"  # Geometric shapes extended
    u"\U0001F800-\U0001F8FF"  # Supplemental arrows-C
    u"\U0001F900-\U0001F9FF"  # Supplemental symbols and pictographs
    u"\U0001FA00-\U0001FA6F"  # Chess symbols
    u"\U0001FA70-\U0001FAFF"  # Symbols and pictographs extended-A
    u"\U00002702-\U000027B0"  # Dingbats
    u"\U000024C2-\U0001F251"  # Enclosed characters (a very wide range; it starts above the Bangla block U+0980-U+09FF)
    "]+", flags=re.UNICODE
)

# Matches every character that is NOT Bangla script, an ASCII letter or digit,
# a space, a comma, a danda / double danda, '*', '.' or '?'
_DISALLOWED_PATTERN = re.compile(r"[^\u0980-\u09FF\u0964\u0965a-zA-Z0-9 ,।॥*.?]")


def clean_bangla_text(text):
    """Clean one text value.

    Each run of emoji characters and every disallowed character is replaced by
    a space, then whitespace is collapsed to single spaces and trimmed at both ends.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or `text` unchanged when it is not a string
        (for example NaN for a missing cell).
    """
    # Non-string values (e.g. NaN) pass through untouched
    if not isinstance(text, str):
        return text

    text = _EMOJI_PATTERN.sub(" ", text)
    text = _DISALLOWED_PATTERN.sub(" ", text)

    # split()/join collapses repeated whitespace and strips both ends
    return " ".join(text.split())

In [None]:
# Clean every object-dtype (text) column of a copy of the training frame,
# leaving the original `train_df` untouched
cleaned_train_df = train_df.copy()
train_text_columns = cleaned_train_df.select_dtypes(include=['object']).columns
for column in train_text_columns:
    cleaned_train_df[column] = cleaned_train_df[column].map(clean_bangla_text)

In [None]:
# Clean every object-dtype (text) column of a copy of the testing frame,
# leaving the original `test_df` untouched
cleaned_test_df = test_df.copy()
test_text_columns = cleaned_test_df.select_dtypes(include=['object']).columns
for column in test_text_columns:
    cleaned_test_df[column] = cleaned_test_df[column].map(clean_bangla_text)

In [None]:
# Display the cleaned training DataFrame
cleaned_train_df

**Multiclass to Binaryclass**

In [None]:
# Create the binary label: codes 3 (sexual), 4 (religious) and 5 (threat) are folded
# into code 2, so 'nlabel' ends up as 1 (not bully) or 2 (any kind of bullying).
cleaned_train_df['nlabel'] = train_df['nlabel'].replace([3, 4, 5], 2)
# The test labels must come from test_df; taking them from train_df would copy the
# training labels into the test frame row by row through index alignment.
cleaned_test_df['nlabel'] = test_df['nlabel'].replace([3, 4, 5], 2)

In [None]:
# Save the cleaned frames (now carrying the binary 'nlabel') to CSV without the index
cleaned_train_df.to_csv('cleaned_train_df.csv', index=False)
cleaned_test_df.to_csv('cleaned_test_df.csv', index=False)

**Augmentation on Train Set by Back Translation**

In [None]:
# Show the installed googletrans version to confirm the pinned release is in use
print(googletrans.__version__)  # Should print '4.0.0-rc1'

In [None]:
# Load the cleaned training set and pick the rows labelled 1 for augmentation
df = pd.read_csv("cleaned_train_df.csv")
# .copy() gives `temp` its own data, so the columns assigned to it in later cells
# are not written to a slice of `df` (which triggers pandas' SettingWithCopyWarning)
temp = df[df['nlabel'] == 1].copy()

In [None]:
# Initialize the translator; this single module-level instance is used by back_translate
translator = Translator()

# Define a function for back-translation
def back_translate(text, src_lang='bn', target_lang='en'):
    """Translate `text` to `target_lang` and back to `src_lang`.

    Uses the module-level `translator`. This is best effort: whenever the round
    trip cannot be completed, the original `text` is returned so the caller
    always gets a usable value.

    Args:
        text: The text to back-translate.
        src_lang: Language code of `text` (default 'bn').
        target_lang: Language code of the intermediate translation (default 'en').

    Returns:
        The back-translated string, or `text` unchanged when it is not a
        non-blank string, when translation raises, or when the result is empty.
    """
    # Missing (NaN/None) or blank values have nothing to translate; skip the two requests
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        # Translate to the target language (e.g., English)
        translated = translator.translate(text, src=src_lang, dest=target_lang).text
        # Translate back to the source language (e.g., Bangla)
        back_translated = translator.translate(translated, src=target_lang, dest=src_lang).text
    except Exception as e:
        # Deliberately broad: report the failure and keep the original text
        print(f"Error during translation: {e}")
        return text

    # Fall back to the original text if no usable result came back
    return back_translated if back_translated else text

# Back-translate the selected rows batch by batch, pausing after every batch
batch_size = 50  # rows handled before each pause
temp['backtranslated'] = None  # placeholder column that receives the back-translated text

for batch_number, start in enumerate(range(0, len(temp), batch_size), start=1):
    print(f"Processing batch {batch_number}")

    # Walk this batch's 'comment' values together with their index labels
    for index, comment in temp['comment'].iloc[start:start + batch_size].items():
        temp.at[index, 'backtranslated'] = back_translate(comment)

    # Delay between batches to avoid throttling; adjust to the API's rate limit
    time.sleep(5)

In [None]:
# Overwrite 'comment' with the back-translated text, then discard the helper column
temp = temp.assign(comment=temp['backtranslated']).drop(columns=['backtranslated'])

In [None]:
# Stack the original training rows and the back-translated rows into one frame
concatenated_df = pd.concat([df, temp], ignore_index=True)  # `ignore_index=True` reindexes the rows
concatenated_df

In [None]:
# Count the rows per 'nlabel' value after augmentation
concatenated_df['nlabel'].value_counts()

In [None]:
# Save the augmented training set to CSV without the index
concatenated_df.to_csv("augmented_training_set.csv", index=False)

**Final Preprocessing - removing stopwords, keeping only comments and nlabel**

Stopword list sources:
https://www.kaggle.com/datasets/shohanursobuj/bangla-stopwords
https://github.com/stopwords-iso/stopwords-bn/blob/master/stopwords-bn.txt

In [None]:
# Load the augmented training set saved earlier
df = pd.read_csv("augmented_training_set.csv")

In [None]:
# Load Bangla stopwords from the .txt file (one word per line)
with open("stopwords-bn.txt", encoding="utf-8") as f:
    txt_stopwords = set(f.read().splitlines())

# Load the second list from the Excel file; its words sit under the 'word_list'
# header. Empty cells (NaN) are dropped because they can never match a word.
stopwords_df = pd.read_excel("bangla_stopwords.xlsx")
xlsx_stopwords = set(stopwords_df["word_list"].dropna())

# One combined set: filtering once against the union gives the same result as
# filtering against each list in turn.
bangla_stopwords = txt_stopwords | xlsx_stopwords


def remove_bangla_stopwords(text):
    """Remove every word found in `bangla_stopwords` from `text`.

    Words are obtained by splitting on whitespace and re-joined with single spaces.

    Args:
        text: The value to filter.

    Returns:
        The filtered string, or `text` unchanged when it is not a string
        (e.g. NaN for an empty comment read back from CSV).
    """
    if not isinstance(text, str):
        return text
    return " ".join(word for word in text.split() if word not in bangla_stopwords)


# Apply the function to the text column
df['cleaned_comment'] = df['comment'].apply(remove_bangla_stopwords)

In [None]:
# Show the first comment before stopword removal
df['comment'].iloc[0]

In [None]:
# Show the same first comment after stopword removal
df['cleaned_comment'].iloc[0]

In [None]:
# Overwrite 'comment' with the stopword-free text, then discard the helper column
df = df.assign(comment=df['cleaned_comment']).drop(columns=['cleaned_comment'])

In [None]:
# Save the stopword-free training set to CSV without the index
df.to_csv("final_train_set.csv", index = False)

In [None]:
# NOTE(review): this reads the raw split file, so the test set skips the text
# cleaning and the binary 'nlabel' applied to the training set before this stage
# — confirm whether "cleaned_test_df.csv" was intended here.
df = pd.read_csv("test_dataset.csv")

# Load Bangla stopwords from the .txt file (one word per line)
with open("stopwords-bn.txt", encoding="utf-8") as f:
    txt_stopwords = set(f.read().splitlines())

# Load the second list from the Excel file; its words sit under the 'word_list'
# header. Empty cells (NaN) are dropped because they can never match a word.
stopwords_df = pd.read_excel("bangla_stopwords.xlsx")
xlsx_stopwords = set(stopwords_df["word_list"].dropna())

# One combined set: filtering once against the union gives the same result as
# filtering against each list in turn.
bangla_stopwords = txt_stopwords | xlsx_stopwords


def remove_bangla_stopwords(text):
    """Remove every word found in `bangla_stopwords` from `text`.

    Words are obtained by splitting on whitespace and re-joined with single spaces.

    Args:
        text: The value to filter.

    Returns:
        The filtered string, or `text` unchanged when it is not a string
        (e.g. NaN for a missing comment).
    """
    if not isinstance(text, str):
        return text
    return " ".join(word for word in text.split() if word not in bangla_stopwords)


# Apply the function to the text column
df['cleaned_comment'] = df['comment'].apply(remove_bangla_stopwords)

In [None]:
# Show the first test comment before stopword removal
df['comment'].iloc[0]

In [None]:
# Show the same first test comment after stopword removal
df['cleaned_comment'].iloc[0]

In [None]:
# Overwrite 'comment' with the stopword-free text, drop the helper column together
# with the metadata columns, and save the result
df = (
    df.assign(comment=df['cleaned_comment'])
      .drop(columns=['cleaned_comment'])
      .drop(columns=['Category', 'Gender', 'comment react number', 'label'])
)
df.to_csv("final_test_set.csv", index = False)

**Additional Preprocessing**

In [None]:
# Load the stopword-free training set saved earlier
df = pd.read_csv("final_train_set.csv")

In [None]:
#Remove punctuations


# Punctuation characters (Bangla and ASCII) to strip; note that '*' is included here
bangla_punctuations = "।!?,ঃ;‘’“”-()[]{}.…—!”#$%&’()*+,-./:;<=>?@[]^_`{|}~"

# Character class matching any single listed character; escaped and compiled once
# instead of on every call
_BANGLA_PUNCTUATION_RE = re.compile(f"[{re.escape(bangla_punctuations)}]")


def remove_bangla_punctuations(text):
    """Replace every character listed in `bangla_punctuations` with a space.

    Args:
        text: The value to process.

    Returns:
        The string with each punctuation character replaced by one space
        (spaces are not collapsed), or `text` unchanged when it is not a string.
    """
    if isinstance(text, str):  # Ensure the input is valid text
        return _BANGLA_PUNCTUATION_RE.sub(" ", text)
    return text  # Return as-is if the input is not text

# Apply the function to the 'comment' column
# Store the punctuation-free version of 'comment' in a new 'cleaned_comment' column
df['cleaned_comment'] = df['comment'].apply(remove_bangla_punctuations)

# Display the updated DataFrame
df


In [None]:
# Look for additional stopword candidates: count how often each word occurs
# overall and within each of the two labels.

# Missing comments become empty strings so that every row can be split
df['comment'] = df['comment'].fillna('').astype(str)

# One frequency table overall plus one per label value
all_word_counts = Counter()
label_1_word_counts = Counter()
label_2_word_counts = Counter()
counts_by_label = {1: label_1_word_counts, 2: label_2_word_counts}

# Walk the comments in row order together with their labels
for comment, nlabel in zip(df['comment'], df['nlabel']):
    words = comment.split()  # whitespace tokenization
    all_word_counts.update(words)
    label_counter = counts_by_label.get(nlabel)  # None for any other label value
    if label_counter is not None:
        label_counter.update(words)

# One row per distinct word, with its overall and per-label counts
output = pd.DataFrame({
    'word': all_word_counts.keys(),
    'total_count': all_word_counts.values(),
    'label_1_count': [label_1_word_counts[token] for token in all_word_counts],
    'label_2_count': [label_2_word_counts[token] for token in all_word_counts],
})

# Most frequent words first, with a fresh 0..n-1 index
output = output.sort_values(by='total_count', ascending=False).reset_index(drop=True)

In [None]:
# `top_100_words` was never assigned anywhere in the notebook, so this cell raised
# NameError on a fresh kernel; take the first 100 rows of the frequency table,
# which is already sorted by total count in descending order.
top_100_words = output.head(100)

# Temporarily display all rows
with pd.option_context('display.max_rows', None):
    print(top_100_words)

In [None]:
# Extra stopwords for this dataset, removed in addition to the two loaded stopword lists
custom_stopwords = [
    "আমি", "তুমি", "সে", "এই", "ওই", "এবং", "তাহলে", "যখন", "কারণ", "কিন্তু", 
    "কিছু", "কেন", "আমাদের", "তাদের", "আপনার", "যে", "তা", "নেই", "হয়", "করতে", "যদি", "লোক", "সবাইকে", "কর", "আসলে","লাগে", "টাকা", "দেখি", "করো",
    "জাহিদ", "হবে", "করুক", "আপনারা", "আল্লাহ্", "তোকে", "এতো", "রে", "কেমন", "দিয়া", "তোরে", "করুন", "তুই", "একটু", "জায়েদ", 
    "যায়", "তোমাকে", "তর", "না", "খান", "আল্লাহর", "সাথে", "তোর", "আপনাকে", "আলম", "হিরো", "আল্লাহ", "যাহা", "হলেন", "নাহ", "নাই" 
]

In [None]:
def remove_stopwords(text, stopwords):
    """Return `text` without the words contained in `stopwords`.

    The text is split on whitespace and the remaining words are joined with
    single spaces. Values that are not strings are returned unchanged.
    """
    # Anything that is not a string (e.g. NaN) passes through as-is
    if not isinstance(text, str):
        return text
    kept_tokens = (token for token in text.split() if token not in stopwords)
    return ' '.join(kept_tokens)

# Strip the custom stopwords from 'cleaned_comment'; `args` hands the stopword
# list to remove_stopwords as its second argument
df['cleaned_comment'] = df['cleaned_comment'].apply(remove_stopwords, args=(custom_stopwords,))

df


In [None]:
# Function to remove short words (by default, words of 1 or 2 characters)
def remove_short_words(text, min_length=3):
    """Drop whitespace-separated words shorter than `min_length` characters.

    Length is measured with len(), i.e. in Unicode code points. The default of
    3 keeps the existing behaviour of removing words of one or two characters.

    Args:
        text: The value to filter.
        min_length: Smallest word length that is kept (default 3).

    Returns:
        The remaining words joined by single spaces, or `text` unchanged when
        it is not a string.
    """
    if isinstance(text, str):  # Ensure the input is valid text
        return " ".join(word for word in text.split() if len(word) >= min_length)
    return text  # Return as-is if the input is not text

# Apply the function to the 'comment' column
# Drop the short words from 'cleaned_comment', then display the frame
df['cleaned_comment'] = df['cleaned_comment'].apply(remove_short_words)

df


In [None]:
# Save the training set with the additional preprocessing to CSV without the index
df.to_csv("final_train_set_with_additional_preprocessing.csv", index = False)

In [None]:
# Run the same three extra steps on the test set (punctuation, custom stopwords,
# short words) and save the result
df = pd.read_csv("final_test_set.csv")
df['cleaned_comment'] = (
    df['comment']
    .apply(remove_bangla_punctuations)
    .apply(lambda x: remove_stopwords(x, custom_stopwords))
    .apply(remove_short_words)
)
df.to_csv("final_test_set_with_additional_preprocessing.csv", index = False)