In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import re
import unicodedata
import html
import string
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from scipy.sparse import hstack
import joblib

In [2]:
# Prepare folders
# if the folder structure not exists, then i will create
DATA_RAW = Path("data/raw"); DATA_RAW.mkdir(parents=True, exist_ok=True)
DATA_PROCESSED = Path("data/processed"); DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

In [3]:
CSV_PATH = DATA_RAW / "dataset.csv"
assert CSV_PATH.exists(), "Expected file not found: data/raw/dataset.csv"
print("Found dataset at:", CSV_PATH.resolve())

Found dataset at: /Users/forhad/Library/CloudStorage/GoogleDrive-forhad14@cse.pstu.ac.bd/My Drive/Research/ecom_review_research/ecom-nlp/data/raw/dataset.csv


In [4]:
# Robust load (try a few encodings)

try:
    df_raw = pd.read_csv(CSV_PATH, encoding="utf-8")
    print("Read OK with encoding=utf-8")
except Exception:
    try:
        df_raw = pd.read_csv(CSV_PATH, encoding="utf-8-sig")
        print("Read OK with encoding=utf-8-sig")
    except Exception:
        df_raw = pd.read_csv(CSV_PATH, encoding="cp1252")
        print("Read OK with encoding=cp1252")

Read OK with encoding=utf-8


In [5]:
# Sanity checks
print("\n=== Columns ===")
print(df_raw.columns.tolist())

expected_cols = ["Rating","Review","Product Name","Product Category","Emotion","Data Source","Sentiment"]
missing = [c for c in expected_cols if c not in df_raw.columns]
if missing:
    print("[Warning] Missing expected columns:", missing)

print("\n=== Head (5) ===")
print(df_raw.head(5))

print("\nShape:", df_raw.shape)
print("\nNulls per column:")
print(df_raw.isna().sum())

if "Sentiment" in df_raw.columns:
    print("\nSentiment distribution:")
    print(df_raw["Sentiment"].value_counts(dropna=False))

if "Emotion" in df_raw.columns:
    print("\nEmotion distribution:")
    print(df_raw["Emotion"].value_counts(dropna=False))

if "Product Name" in df_raw.columns:
    print("\nUnique Product Names:", df_raw["Product Name"].nunique())
if "Product Category" in df_raw.columns:
    print("Unique Product Categories:", df_raw["Product Category"].nunique())
if "Review" in df_raw.columns:
    print("\nUnique review distribution:", df_raw["Review"].nunique())


=== Columns ===
['Rating', 'Review', 'Product Name', 'Product Category', 'Emotion', 'Data Source', 'Sentiment']

=== Head (5) ===
   Rating                                             Review  \
0     5.0  ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶´‡ßã‡¶®‡•§‡¶Ö‡¶®‡ßá‡¶ï ‡¶™‡¶õ‡¶®‡ßç‡¶¶ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá‡•§‡¶è‡¶ï‡¶¶‡¶Æ ‡¶Ö‡¶•‡ßá‡¶®‡¶ü‡¶ø‡¶ï ‡¶∂‡¶æ‡¶ì...   
1     5.0  Phone is good according to my uses, Upgraded f...   
2     5.0                  ‡¶Ö‡¶≤‡ßç‡¶™ ‡¶¶‡¶æ‡¶Æ‡ßá ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶® ‡¶è‡¶ï‡¶ü‡¶æ ‡¶∏‡ßç‡¶Æ‡¶æ‡¶∞‡ßç‡¶ü‡¶´‡ßã‡¶® üíô   
3     5.0            Super Fast Delivery ,11200 TK te pailam   
4     5.0                    Delay Delivery... Good Product.   

          Product Name Product Category Emotion Data Source Sentiment  
0  Redmi 12C (4/128GB)     Smart Phones   Happy       Daraz  Positive  
1  Redmi 12C (4/128GB)     Smart Phones   Happy       Daraz  Positive  
2  Redmi 12C (4/128GB)     Smart Phones    Love       Daraz  Positive  
3  Redmi 12C (4/128GB)     Smart Phones   Happy       Dar

In [6]:
df = df_raw.copy()

# Ensure text type
df["Review"] = df["Review"].astype(str)

# 2.1 HTML unescape (e.g., &amp; -> &)
clean = df["Review"].map(html.unescape)

# 2.2 Unicode normalize to NFKC (compatibility form)
clean = clean.map(lambda t: unicodedata.normalize("NFKC", t))

# 2.3 Remove URLs
URL_RE = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
clean = clean.map(lambda t: URL_RE.sub(" ", t))

# 2.4 Remove raw HTML tags (if any)
HTML_TAG_RE = re.compile(r"<[^>]+>")
clean = clean.map(lambda t: HTML_TAG_RE.sub(" ", t))

# 2.5 Remove ASCII control characters
CONTROL_CHARS_RE = re.compile(r"[\u0000-\u0008\u000B-\u000C\u000E-\u001F]")
clean = clean.map(lambda t: CONTROL_CHARS_RE.sub(" ", t))

# 2.6 Normalize whitespace
MULTI_WS_RE = re.compile(r"\s+")
clean = clean.map(lambda t: MULTI_WS_RE.sub(" ", t).strip())

# 2.7 Lowercase (only changes English; Bangla unaffected)
clean = clean.str.lower()

# Attach to DF
df["clean_text"] = clean

print("Preview of cleaning:")
print(df[["Review","clean_text"]].head(8))

Preview of cleaning:
                                              Review  \
0  ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶´‡ßã‡¶®‡•§‡¶Ö‡¶®‡ßá‡¶ï ‡¶™‡¶õ‡¶®‡ßç‡¶¶ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá‡•§‡¶è‡¶ï‡¶¶‡¶Æ ‡¶Ö‡¶•‡ßá‡¶®‡¶ü‡¶ø‡¶ï ‡¶∂‡¶æ‡¶ì...   
1  Phone is good according to my uses, Upgraded f...   
2                  ‡¶Ö‡¶≤‡ßç‡¶™ ‡¶¶‡¶æ‡¶Æ‡ßá ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶® ‡¶è‡¶ï‡¶ü‡¶æ ‡¶∏‡ßç‡¶Æ‡¶æ‡¶∞‡ßç‡¶ü‡¶´‡ßã‡¶® üíô   
3            Super Fast Delivery ,11200 TK te pailam   
4                    Delay Delivery... Good Product.   
5                    Delay Delivery... Good Product.   
6                               Poor seller response   
7                                  authentic product   

                                          clean_text  
0  ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶´‡ßã‡¶®‡•§‡¶Ö‡¶®‡ßá‡¶ï ‡¶™‡¶õ‡¶®‡ßç‡¶¶ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá‡•§‡¶è‡¶ï‡¶¶‡¶Æ ‡¶Ö‡¶•‡ßá‡¶®‡¶ü‡¶ø‡¶ï ‡¶∂‡¶æ‡¶ì...  
1  phone is good according to my uses, upgraded f...  
2                  ‡¶Ö‡¶≤‡ßç‡¶™ ‡¶¶‡¶æ‡¶Æ‡ßá ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶® ‡¶è‡¶ï‡¶ü‡¶æ ‡¶∏‡ßç‡¶Æ‡¶æ‡¶

In [7]:
# Bangla Unicode block
BN_CHAR_RE = re.compile(r"[\u0980-\u09FF]")

# 3.1 Count Bangla characters per row
bn_counts = df["clean_text"].map(lambda t: len(BN_CHAR_RE.findall(t)) if isinstance(t, str) else 0)

# 3.2 Total character count per row
tot_counts = df["clean_text"].map(lambda t: len(t) if isinstance(t, str) else 0)

# 3.3 Ratio = bn_chars / total_chars (safe divide)
df["bn_count"] = bn_counts      # issue
df["tot_count"] = tot_counts        # issue
df["bn_ratio"] = np.where(tot_counts > 0, bn_counts / tot_counts, 0.0)      # issue

In [8]:
df

Unnamed: 0,Rating,Review,Product Name,Product Category,Emotion,Data Source,Sentiment,clean_text,bn_count,tot_count,bn_ratio
0,5.0,‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶´‡ßã‡¶®‡•§‡¶Ö‡¶®‡ßá‡¶ï ‡¶™‡¶õ‡¶®‡ßç‡¶¶ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá‡•§‡¶è‡¶ï‡¶¶‡¶Æ ‡¶Ö‡¶•‡ßá‡¶®‡¶ü‡¶ø‡¶ï ‡¶∂‡¶æ‡¶ì...,Redmi 12C (4/128GB),Smart Phones,Happy,Daraz,Positive,‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶´‡ßã‡¶®‡•§‡¶Ö‡¶®‡ßá‡¶ï ‡¶™‡¶õ‡¶®‡ßç‡¶¶ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá‡•§‡¶è‡¶ï‡¶¶‡¶Æ ‡¶Ö‡¶•‡ßá‡¶®‡¶ü‡¶ø‡¶ï ‡¶∂‡¶æ‡¶ì...,222,284,0.781690
1,5.0,"Phone is good according to my uses, Upgraded f...",Redmi 12C (4/128GB),Smart Phones,Happy,Daraz,Positive,"phone is good according to my uses, upgraded f...",0,135,0.000000
2,5.0,‡¶Ö‡¶≤‡ßç‡¶™ ‡¶¶‡¶æ‡¶Æ‡ßá ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶® ‡¶è‡¶ï‡¶ü‡¶æ ‡¶∏‡ßç‡¶Æ‡¶æ‡¶∞‡ßç‡¶ü‡¶´‡ßã‡¶® üíô,Redmi 12C (4/128GB),Smart Phones,Love,Daraz,Positive,‡¶Ö‡¶≤‡ßç‡¶™ ‡¶¶‡¶æ‡¶Æ‡ßá ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶® ‡¶è‡¶ï‡¶ü‡¶æ ‡¶∏‡ßç‡¶Æ‡¶æ‡¶∞‡ßç‡¶ü‡¶´‡ßã‡¶® üíô,27,33,0.818182
3,5.0,"Super Fast Delivery ,11200 TK te pailam",Redmi 12C (4/128GB),Smart Phones,Happy,Daraz,Positive,"super fast delivery ,11200 tk te pailam",0,39,0.000000
4,5.0,Delay Delivery... Good Product.,Redmi 12C (4/128GB),Smart Phones,Happy,Daraz,Positive,delay delivery... good product.,0,31,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
78125,5.0,A good one,Baseus Bipow Digital Display 20W 10000mAh Powe...,Power Bank,Happy,Pickaboo,Positive,a good one,0,10,0.000000
78126,4.7,"Overall the product was good. Thanks, Pickaboo...",Baseus Bipow Digital Display 20W 10000mAh Powe...,Power Bank,Happy,Pickaboo,Positive,"overall the product was good. thanks, pickaboo...",0,73,0.000000
78127,5.0,This is a very good powerbank in this price po...,Baseus Bipow Digital Display 15W 10000mAh Powe...,Power Bank,Love,Pickaboo,Positive,this is a very good powerbank in this price po...,0,88,0.000000
78128,4.0,good for long lasting but after 2 year it's fa...,Baseus Bipow Digital Display 15W 10000mAh Powe...,Power Bank,Happy,Pickaboo,Positive,good for long lasting but after 2 year it's fa...,0,161,0.000000


In [9]:
df["lang_bucket"] = np.where(
    df["bn_ratio"] >= 0.8, "BN",
    np.where(df["bn_ratio"] <= 0.2, "EN", "MIX")
)

print("Language bucket distribution:")
print(df["lang_bucket"].value_counts(dropna=False))
print("\nPreview with language features:")
print(df[["clean_text","bn_ratio","lang_bucket"]].head(10))

Language bucket distribution:
lang_bucket
EN     49277
BN     19435
MIX     9418
Name: count, dtype: int64

Preview with language features:
                                          clean_text  bn_ratio lang_bucket
0  ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶´‡ßã‡¶®‡•§‡¶Ö‡¶®‡ßá‡¶ï ‡¶™‡¶õ‡¶®‡ßç‡¶¶ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá‡•§‡¶è‡¶ï‡¶¶‡¶Æ ‡¶Ö‡¶•‡ßá‡¶®‡¶ü‡¶ø‡¶ï ‡¶∂‡¶æ‡¶ì...  0.781690         MIX
1  phone is good according to my uses, upgraded f...  0.000000          EN
2                  ‡¶Ö‡¶≤‡ßç‡¶™ ‡¶¶‡¶æ‡¶Æ‡ßá ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶® ‡¶è‡¶ï‡¶ü‡¶æ ‡¶∏‡ßç‡¶Æ‡¶æ‡¶∞‡ßç‡¶ü‡¶´‡ßã‡¶® üíô  0.818182          BN
3            super fast delivery ,11200 tk te pailam  0.000000          EN
4                    delay delivery... good product.  0.000000          EN
5                    delay delivery... good product.  0.000000          EN
6                               poor seller response  0.000000          EN
7                                  authentic product  0.000000          EN
8  bought this phone from mi official store of

In [10]:
rows_before = len(df)
rows_before

78130

In [11]:
if "Product Name" in df.columns:
    df = df.drop_duplicates(subset=["Product Name","clean_text"], keep="first")
else:
    df = df.drop_duplicates(subset=["clean_text"], keep="first")

In [12]:
rows_after_exact = len(df)
rows_after_exact

73292

In [13]:
# Build a translation table to drop ascii punctuation/digits/whitespace
remove_ascii = string.punctuation + string.digits + string.whitespace
trans_table = str.maketrans("", "", remove_ascii)

In [14]:
# Common non-ASCII punctuation we also drop
extra_puncts = "‡•§‚Äì‚Äî‚Ä¢‚Ä¶‚Äú‚Äù‚Äò‚Äô‚Äé‚ÄãÔªø"

In [15]:
# Normalize + strip step by step
tmp_norm = df["clean_text"].map(lambda t: unicodedata.normalize("NFKC", t) if isinstance(t, str) else "")

In [16]:
# Lowercase (already lower, but safe)
tmp_norm = tmp_norm.str.lower()

# Remove ascii punc/digits/whitespace
tmp_no_ascii = tmp_norm.map(lambda t: t.translate(trans_table))

# Remove extra punctuation chars
for ch in extra_puncts:
    tmp_no_ascii = tmp_no_ascii.str.replace(ch, "", regex=False)


In [17]:
# Collapse repeated characters of length >=3 down to 2
REPEAT_RE = re.compile(r"(.)\1{2,}")
tmp_fingerprint = tmp_no_ascii.map(lambda t: REPEAT_RE.sub(r"\1\1", t))

In [18]:
# Truncate very long strings (stability)
tmp_fingerprint = tmp_fingerprint.map(lambda t: t[:300] if isinstance(t, str) else "")
df = df.copy()
df["fp"] = tmp_fingerprint

In [19]:
# Drop duplicates on fingerprint within same product if possible
if "Product Name" in df.columns:
    df = df.drop_duplicates(subset=["Product Name","fp"], keep="first")
else:
    df = df.drop_duplicates(subset=["fp"], keep="first")

In [20]:
rows_after_near = len(df)
rows_after_near

72592

In [21]:
print("=== Deduplication Summary ===")
print(f"Rows before: {rows_before}")
print(f"After exact-duplicate drop: {rows_after_exact} (removed {rows_before - rows_after_exact})")
print(f"After near-duplicate drop:  {rows_after_near} (removed {rows_after_exact - rows_after_near})")
print(f"Total removed: {rows_before - rows_after_near}")

=== Deduplication Summary ===
Rows before: 78130
After exact-duplicate drop: 73292 (removed 4838)
After near-duplicate drop:  72592 (removed 700)
Total removed: 5538


In [22]:
# STEP 5 ‚Äî Save outputs (CSV only; no extra libs needed)

out_full = DATA_PROCESSED / "clean_reviews.csv"

df.to_csv(out_full, index=False, encoding="utf-8-sig")

print("Saved processed CSV:", out_full.resolve())

# Quick post-clean distributions
if "Sentiment" in df.columns:
    print("\nSentiment distribution (post-clean):")
    print(df["Sentiment"].value_counts(dropna=False))

if "Emotion" in df.columns:
    print("\nEmotion distribution (post-clean):")
    print(df["Emotion"].value_counts(dropna=False))

print("\nLanguage bucket distribution (post-clean):")
print(df["lang_bucket"].value_counts(dropna=False))

if "Product Name" in df.columns:
    print("\nUnique Product Names (post-clean):", df["Product Name"].nunique())
if "Product Category" in df.columns:
    print("Unique Product Categories (post-clean):", df["Product Category"].nunique())

Saved processed CSV: /Users/forhad/Library/CloudStorage/GoogleDrive-forhad14@cse.pstu.ac.bd/My Drive/Research/ecom_review_research/ecom-nlp/data/processed/clean_reviews.csv

Sentiment distribution (post-clean):
Sentiment
Positive    61887
Negative    10705
Name: count, dtype: int64

Emotion distribution (post-clean):
Emotion
Happy      42590
Love       19297
Sadness     6443
Anger       3121
Fear        1141
Name: count, dtype: int64

Language bucket distribution (post-clean):
lang_bucket
EN     44734
BN     18640
MIX     9218
Name: count, dtype: int64

Unique Product Names (post-clean): 2476
Unique Product Categories (post-clean): 149


In [23]:
# Phase-2

In [24]:
# Positive/Negative masks
is_pos = (df["Sentiment"].str.lower() == "positive")
is_neg = (df["Sentiment"].str.lower() == "negative")

In [25]:
# Aggregations per product
g = df.groupby("Product Name", as_index=False)

In [26]:
# Most frequent category per product
cat_mode = (
    df.groupby("Product Name")["Product Category"]
      .agg(lambda s: s.value_counts(dropna=False).idxmax())
      .rename("category_mode")
      .reset_index()
)

In [27]:
# Sentiment counts per product
pos_counts = g["Sentiment"].apply(lambda s: (s.str.lower() == "positive").sum()).rename(columns={"Sentiment":"pos_count"})
neg_counts = g["Sentiment"].apply(lambda s: (s.str.lower() == "negative").sum()).rename(columns={"Sentiment":"neg_count"})

prod = cat_mode.merge(pos_counts, on="Product Name").merge(neg_counts, on="Product Name")
prod["tot_count"] = prod["pos_count"] + prod["neg_count"]

In [28]:
prod

Unnamed: 0,Product Name,category_mode,pos_count,neg_count,tot_count
0,"""Olevs 3605 RoseGold Ceramics Watchstrap Analo...",Watches,6,0,6
1,"""Olevs 5563 Silver & Golden Two Tone Stainless...",Watches,26,6,32
2,"""Olevs 5563 Silver & Golden Two Tone Stainless...",Watches,93,7,100
3,"""Olevs 5872 Black Stainless Steel Analoge Wris...",Watches,2,0,2
4,"""Olevs 5872 White Stainless Steel Analoge Wris...",Watches,2,2,4
...,...,...,...,...,...
2471,‡¶π‡¶ü ‡¶π‡ßÅ‡¶á‡¶≤‡¶∏ ‡¶∞‡ßá‡¶ó‡ßÅ‡¶≤‡¶æ‡¶∞-lb ‡¶∏‡ßÅ‡¶™‡¶æ‡¶∞ ‡¶∏‡¶ø‡¶≤‡ßÅ‡¶Ø‡¶º‡ßá‡¶ü ‡¶®‡¶ø‡¶∂‡¶æ‡¶® ‡¶∏‡¶ø‡¶≤‡¶≠‡¶ø...,Remote Control & Play Vehicles,2,0,2
2472,‡¶π‡¶æ‡¶§‡ßá ‡¶§‡ßà‡¶∞‡¶ø dulna ‡¶™‡ßç‡¶∞‡¶æ‡¶™‡ßç‡¶§‡¶¨‡¶Ø‡¶º‡¶∏‡ßç‡¶ï ‡¶¶‡¶°‡¶º‡¶ø ‡¶¨‡¶ø‡¶õ‡¶æ‡¶®‡¶æ ‡¶ù‡ßÅ‡¶≤‡¶®...,"Tools, DIY & Outdoor",888,220,1108
2473,‚ÄòSelleys‚Äô - RP7 150G/211ML Multi -Purpose Anti...,Oils & Fluids,14,6,20
2474,‚ÄòSelleys‚Äô - RP7 300G/422ML Multi -Purpose Anti...,Oils & Fluids,14,3,17


In [29]:
# Sentiment dominance bin
prod["pos_frac"] = np.where(prod["tot_count"] > 0, prod["pos_count"] / prod["tot_count"], np.nan)
prod["sent_bin"] = np.where(
    prod["tot_count"] == 0, "MIXED",
    np.where(prod["pos_frac"] >= 0.60, "POS_DOM",
             np.where(prod["pos_frac"] <= 0.40, "NEG_DOM", "MIXED"))
)

print("Product-level preview:")
print(prod.head(10))
print("\n#Products:", len(prod))
print("Category counts (mode):")
print(prod["category_mode"].value_counts().head(15))
print("\nSentiment bins:")
print(prod["sent_bin"].value_counts())

Product-level preview:
                                        Product Name  \
0  "Olevs 3605 RoseGold Ceramics Watchstrap Analo...   
1  "Olevs 5563 Silver & Golden Two Tone Stainless...   
2  "Olevs 5563 Silver & Golden Two Tone Stainless...   
3  "Olevs 5872 Black Stainless Steel Analoge Wris...   
4  "Olevs 5872 White Stainless Steel Analoge Wris...   
5  0.28 inch Mini Digital Voltmeter Ammeter DC 10...   
6  05 Clips Cable Organizer Silicone USB Cable Wi...   
7  06 Pcs | Set Aluminium Crochet Hooks | 6 SIZE ...   
8  09 Future flashlights for kids and music car t...   
9  1 Pair Fashion suit shirt collar wheat three-d...   

                    category_mode  pos_count  neg_count  tot_count  pos_frac  \
0                         Watches          6          0          6  1.000000   
1                         Watches         26          6         32  0.812500   
2                         Watches         93          7        100  0.930000   
3                         Watches       

In [30]:
prod

Unnamed: 0,Product Name,category_mode,pos_count,neg_count,tot_count,pos_frac,sent_bin
0,"""Olevs 3605 RoseGold Ceramics Watchstrap Analo...",Watches,6,0,6,1.000000,POS_DOM
1,"""Olevs 5563 Silver & Golden Two Tone Stainless...",Watches,26,6,32,0.812500,POS_DOM
2,"""Olevs 5563 Silver & Golden Two Tone Stainless...",Watches,93,7,100,0.930000,POS_DOM
3,"""Olevs 5872 Black Stainless Steel Analoge Wris...",Watches,2,0,2,1.000000,POS_DOM
4,"""Olevs 5872 White Stainless Steel Analoge Wris...",Watches,2,2,4,0.500000,MIXED
...,...,...,...,...,...,...,...
2471,‡¶π‡¶ü ‡¶π‡ßÅ‡¶á‡¶≤‡¶∏ ‡¶∞‡ßá‡¶ó‡ßÅ‡¶≤‡¶æ‡¶∞-lb ‡¶∏‡ßÅ‡¶™‡¶æ‡¶∞ ‡¶∏‡¶ø‡¶≤‡ßÅ‡¶Ø‡¶º‡ßá‡¶ü ‡¶®‡¶ø‡¶∂‡¶æ‡¶® ‡¶∏‡¶ø‡¶≤‡¶≠‡¶ø...,Remote Control & Play Vehicles,2,0,2,1.000000,POS_DOM
2472,‡¶π‡¶æ‡¶§‡ßá ‡¶§‡ßà‡¶∞‡¶ø dulna ‡¶™‡ßç‡¶∞‡¶æ‡¶™‡ßç‡¶§‡¶¨‡¶Ø‡¶º‡¶∏‡ßç‡¶ï ‡¶¶‡¶°‡¶º‡¶ø ‡¶¨‡¶ø‡¶õ‡¶æ‡¶®‡¶æ ‡¶ù‡ßÅ‡¶≤‡¶®...,"Tools, DIY & Outdoor",888,220,1108,0.801444,POS_DOM
2473,‚ÄòSelleys‚Äô - RP7 150G/211ML Multi -Purpose Anti...,Oils & Fluids,14,6,20,0.700000,POS_DOM
2474,‚ÄòSelleys‚Äô - RP7 300G/422ML Multi -Purpose Anti...,Oils & Fluids,14,3,17,0.823529,POS_DOM


In [31]:
# STEP 3 ‚Äî Handle VERY RARE categories by collapsing into "Other" to make stratification stable

min_products_per_category = 5  # you can raise/lower based on your dataset size

cat_counts = prod["category_mode"].value_counts()
rare_cats = set(cat_counts[cat_counts < min_products_per_category].index)

prod["category_stable"] = np.where(prod["category_mode"].isin(rare_cats), "Other", prod["category_mode"])

print("Categories collapsed to 'Other':", len(rare_cats))
print("Stable category distribution:")
print(prod["category_stable"].value_counts().head(20))

Categories collapsed to 'Other': 36
Stable category distribution:
category_stable
Watches                           149
Men's Watches                     102
Other                              77
Remote Control & Play Vehicles     70
Men's Fragrances                   69
True Wireless Earbuds              64
Stationery & Craft                 63
Keyboard & Mouse                   59
Smart Phones                       58
Tools, DIY & Outdoor               57
Cookware                           51
Traditional_Clothing               49
Clothing                           49
Smart Watch                        47
Smartphones                        44
Wired Headphone                    42
Women's Watches                    40
Shaver & Trimmer                   38
Maternity Care                     37
Data Cable                         31
Name: count, dtype: int64


In [32]:
# STEP 4 ‚Äî Define a stratification label.
# First try using category + sentiment bin combo.
# For combos with < 2 products, fall back to category-only to avoid stratification errors.

prod["strata_combo"] = prod["category_stable"] + " | " + prod["sent_bin"]

combo_counts = prod["strata_combo"].value_counts()
too_small = set(combo_counts[combo_counts < 2].index)

# Final strata: use combo if sufficient size, else just category
prod["strata_final"] = np.where(prod["strata_combo"].isin(too_small), prod["category_stable"], prod["strata_combo"])

print("Example of final strata labels:")
print(prod[["Product Name","category_mode","category_stable","sent_bin","strata_final"]].head(12))
print("\nFinal strata distribution (top 20):")
print(prod["strata_final"].value_counts().head(20))

Example of final strata labels:
                                         Product Name  \
0   "Olevs 3605 RoseGold Ceramics Watchstrap Analo...   
1   "Olevs 5563 Silver & Golden Two Tone Stainless...   
2   "Olevs 5563 Silver & Golden Two Tone Stainless...   
3   "Olevs 5872 Black Stainless Steel Analoge Wris...   
4   "Olevs 5872 White Stainless Steel Analoge Wris...   
5   0.28 inch Mini Digital Voltmeter Ammeter DC 10...   
6   05 Clips Cable Organizer Silicone USB Cable Wi...   
7   06 Pcs | Set Aluminium Crochet Hooks | 6 SIZE ...   
8   09 Future flashlights for kids and music car t...   
9   1 Pair Fashion suit shirt collar wheat three-d...   
10  1 Pc Phone Holder Non Slip Sticky Anti Slide D...   
11  1 Pices Grip Oven Pot Holder Baking BBQ Cook T...   

                     category_mode                 category_stable sent_bin  \
0                          Watches                         Watches  POS_DOM   
1                          Watches                         Watches  

In [33]:
random_state = 42
test_size = 0.20
dev_size_within_temp = 0.50  # half of TEMP goes to DEV

# --- Prepare X, y ---
X = prod[["Product Name"]].copy()
y = prod["strata_final"].copy()

# --- Diagnose class counts ---
counts = y.value_counts().sort_values()
print("Label counts (ascending):")
print(counts)
print("Min per class:", counts.min())

# --- Identify singleton classes (count == 1) ---
singletons_mask = y.map(counts) == 1
idx_singletons = np.flatnonzero(singletons_mask.values)
idx_regular    = np.flatnonzero(~singletons_mask.values)
print(f"\nSingleton samples: {len(idx_singletons)} (these will be forced into TRAIN)")

if len(idx_regular) == 0:
    # If everything is singleton (very rare), just do a non-stratified split
    print("All classes are singletons; using non-stratified split.")
    rs = ShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_rel, temp_rel = next(rs.split(X))
    train_idx = train_rel
    temp_idx  = temp_rel
else:
    # Stratify only on classes with >=2 samples
    sss1 = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_reg_rel, temp_reg_rel = next(sss1.split(X.iloc[idx_regular], y.iloc[idx_regular]))
    train_idx = idx_regular[train_reg_rel]
    temp_idx  = idx_regular[temp_reg_rel]

# Add all singletons to TRAIN so stratification won't break
train_idx = np.concatenate([train_idx, idx_singletons])

# Optional: deterministic shuffle of indices
rng = np.random.RandomState(random_state)
train_idx = rng.permutation(train_idx)
temp_idx  = rng.permutation(temp_idx)

prod_train = prod.iloc[train_idx].copy()
prod_temp  = prod.iloc[temp_idx].copy()

print("\nSizes after first split:")
print("Train products:", len(prod_train), "Temp products:", len(prod_temp),
      f"(Singletons forced to train: {len(idx_singletons)})")

# --- Second split: TEMP -> DEV/TEST ---
X_temp = prod_temp[["Product Name"]].copy()
y_temp = prod_temp["strata_final"].copy()

counts_temp = y_temp.value_counts()
print("\nTemp label counts (ascending):")
print(counts_temp.sort_values())
print("Min per class in TEMP:", counts_temp.min())

if len(counts_temp) > 0 and counts_temp.min() >= 2:
    # OK to stratify
    sss2 = StratifiedShuffleSplit(n_splits=1, test_size=dev_size_within_temp, random_state=random_state)
    dev_rel, test_rel = next(sss2.split(X_temp, y_temp))
else:
    # Fallback: non-stratified split for DEV/TEST
    print("Warning: TEMP contains singleton classes; using non-stratified split for DEV/TEST.")
    rs2 = ShuffleSplit(n_splits=1, test_size=dev_size_within_temp, random_state=random_state)
    dev_rel, test_rel = next(rs2.split(X_temp))

prod_dev  = prod_temp.iloc[dev_rel].copy()
prod_test = prod_temp.iloc[test_rel].copy()

# --- Sanity proportions ---
def show_props(name, s):
    props = (s.value_counts(normalize=True).sort_index() * 100).round(2)
    print(f"\n{name} size={len(s)} label %:\n{props}")

show_props("ALL",   y)
show_props("TRAIN", prod_train["strata_final"])
show_props("DEV",   prod_dev["strata_final"])
show_props("TEST",  prod_test["strata_final"])

print("\nFinal split sizes ‚Äî Train:", len(prod_train), "Dev:", len(prod_dev), "Test:", len(prod_test))

Label counts (ascending):
strata_final
Baby Personal Care                1
Men's Jewelry                     1
Media & Music                     1
Wearable                          1
Dairy & Eggs                      1
                               ... 
Stationery & Craft | POS_DOM     59
Men's Fragrances | POS_DOM       60
Other | POS_DOM                  72
Men's Watches | POS_DOM          93
Watches | POS_DOM               140
Name: count, Length: 191, dtype: int64
Min per class: 1

Singleton samples: 34 (these will be forced into TRAIN)

Sizes after first split:
Train products: 1987 Temp products: 489 (Singletons forced to train: 34)

Temp label counts (ascending):
strata_final
Traditional_Clothing | MIXED                 1
Engine Oil | POS_DOM                         1
Remote Control & Play Vehicles | NEG_DOM     1
Keyboard & Mouse | NEG_DOM                   1
Watches | NEG_DOM                            1
                                            ..
Stationery & Craft | POS_D

In [34]:
# %%
# STEP 6 ‚Äî Sanity: No product overlaps across splits; Quick distributions

set_train = set(prod_train["Product Name"])
set_dev   = set(prod_dev["Product Name"])
set_test  = set(prod_test["Product Name"])

print("Overlap train‚à©dev:", len(set_train & set_dev))
print("Overlap train‚à©test:", len(set_train & set_test))
print("Overlap dev‚à©test:", len(set_dev & set_test))

print("\nCategory (mode) per split:")
print("Train:\n", prod_train["category_mode"].value_counts().head(15))
print("\nDev:\n", prod_dev["category_mode"].value_counts().head(15))
print("\nTest:\n", prod_test["category_mode"].value_counts().head(15))

print("\nSent-bin per split:")
print("Train:\n", prod_train["sent_bin"].value_counts())
print("\nDev:\n", prod_dev["sent_bin"].value_counts())
print("\nTest:\n", prod_test["sent_bin"].value_counts())


Overlap train‚à©dev: 0
Overlap train‚à©test: 0
Overlap dev‚à©test: 0

Category (mode) per split:
Train:
 category_mode
Watches                           119
Men's Watches                      82
Remote Control & Play Vehicles     56
Men's Fragrances                   55
True Wireless Earbuds              51
Stationery & Craft                 50
Keyboard & Mouse                   47
Tools, DIY & Outdoor               46
Smart Phones                       46
Cookware                           41
Clothing                           39
Traditional_Clothing               39
Smart Watch                        38
Smartphones                        35
Wired Headphone                    34
Name: count, dtype: int64

Dev:
 category_mode
Watches                           16
Men's Watches                     10
Remote Control & Play Vehicles     8
Men's Fragrances                   7
True Wireless Earbuds              7
Smart Phones                       7
Keyboard & Mouse                   7
Tools

In [35]:
# %%
# STEP 7 ‚Äî Build ROW-LEVEL splits by filtering original df with product-name lists

df_train = df[df["Product Name"].isin(set_train)].copy()
df_dev   = df[df["Product Name"].isin(set_dev)].copy()
df_test  = df[df["Product Name"].isin(set_test)].copy()

print("Row counts ‚Äî Train:", len(df_train), "Dev:", len(df_dev), "Test:", len(df_test))

# Quick check: distributions (optional)
print("\n[Rows] Category distribution (top 15) ‚Äî Train")
print(df_train["Product Category"].value_counts().head(15))
print("\n[Rows] Sentiment distribution ‚Äî Train")
print(df_train["Sentiment"].value_counts())

print("\n[Rows] Sentiment distribution ‚Äî Dev")
print(df_dev["Sentiment"].value_counts())
print("\n[Rows] Sentiment distribution ‚Äî Test")
print(df_test["Sentiment"].value_counts())


Row counts ‚Äî Train: 57412 Dev: 6731 Test: 8449

[Rows] Category distribution (top 15) ‚Äî Train
Product Category
Tools, DIY & Outdoor     5055
Watches                  4530
Stationery & Craft       2147
Fruits, Meat & Frozen    1698
Breakfast                1399
Laundry & Household      1398
Fans                     1366
Hair Care                1357
Cookware                 1254
Televisions              1216
Men's Care               1067
Skin Care                 997
Bags                      947
Clothing                  924
Dairy & Eggs              911
Name: count, dtype: int64

[Rows] Sentiment distribution ‚Äî Train
Sentiment
Positive    48843
Negative     8569
Name: count, dtype: int64

[Rows] Sentiment distribution ‚Äî Dev
Sentiment
Positive    5743
Negative     988
Name: count, dtype: int64

[Rows] Sentiment distribution ‚Äî Test
Sentiment
Positive    7301
Negative    1148
Name: count, dtype: int64


In [36]:
import json

# %%
# STEP 8 ‚Äî Save product-name ID lists and row-level CSVs

SPLITS_DIR = DATA_PROCESSED / "splits"; SPLITS_DIR.mkdir(parents=True, exist_ok=True)

train_ids_path = SPLITS_DIR / "train_product_names.json"
dev_ids_path   = SPLITS_DIR / "dev_product_names.json"
test_ids_path  = SPLITS_DIR / "test_product_names.json"

with open(train_ids_path, "w", encoding="utf-8") as f:
    json.dump(sorted(list(set_train)), f, ensure_ascii=False, indent=2)
with open(dev_ids_path, "w", encoding="utf-8") as f:
    json.dump(sorted(list(set_dev)), f, ensure_ascii=False, indent=2)
with open(test_ids_path, "w", encoding="utf-8") as f:
    json.dump(sorted(list(set_test)), f, ensure_ascii=False, indent=2)

train_csv = SPLITS_DIR / "train.csv"
dev_csv   = SPLITS_DIR / "dev.csv"
test_csv  = SPLITS_DIR / "test.csv"

df_train.to_csv(train_csv, index=False, encoding="utf-8-sig")
df_dev.to_csv(dev_csv, index=False, encoding="utf-8-sig")
df_test.to_csv(test_csv, index=False, encoding="utf-8-sig")

print("Saved product-name lists:")
print("  ", train_ids_path.resolve())
print("  ", dev_ids_path.resolve())
print("  ", test_ids_path.resolve())

print("Saved row-level CSVs:")
print("  ", train_csv.resolve())
print("  ", dev_csv.resolve())
print("  ", test_csv.resolve())


Saved product-name lists:
   /Users/forhad/Library/CloudStorage/GoogleDrive-forhad14@cse.pstu.ac.bd/My Drive/Research/ecom_review_research/ecom-nlp/data/processed/splits/train_product_names.json
   /Users/forhad/Library/CloudStorage/GoogleDrive-forhad14@cse.pstu.ac.bd/My Drive/Research/ecom_review_research/ecom-nlp/data/processed/splits/dev_product_names.json
   /Users/forhad/Library/CloudStorage/GoogleDrive-forhad14@cse.pstu.ac.bd/My Drive/Research/ecom_review_research/ecom-nlp/data/processed/splits/test_product_names.json
Saved row-level CSVs:
   /Users/forhad/Library/CloudStorage/GoogleDrive-forhad14@cse.pstu.ac.bd/My Drive/Research/ecom_review_research/ecom-nlp/data/processed/splits/train.csv
   /Users/forhad/Library/CloudStorage/GoogleDrive-forhad14@cse.pstu.ac.bd/My Drive/Research/ecom_review_research/ecom-nlp/data/processed/splits/dev.csv
   /Users/forhad/Library/CloudStorage/GoogleDrive-forhad14@cse.pstu.ac.bd/My Drive/Research/ecom_review_research/ecom-nlp/data/processed/split

In [37]:
# %%
# STEP 9 ‚Äî (Optional) Crosstab views to confirm stratification quality

print("Category x Split (row-level)")
cat_split = (
    pd.concat([
        df_train.assign(SPLIT="TRAIN"),
        df_dev.assign(SPLIT="DEV"),
        df_test.assign(SPLIT="TEST"),
    ])
    .pivot_table(index="Product Category", columns="SPLIT", values="Review", aggfunc="count", fill_value=0)
    .sort_values(by="TRAIN", ascending=False)
)
display(cat_split.head(20))

print("\nSentiment x Split (row-level)")
sent_split = (
    pd.concat([
        df_train.assign(SPLIT="TRAIN"),
        df_dev.assign(SPLIT="DEV"),
        df_test.assign(SPLIT="TEST"),
    ])
    .pivot_table(index="Sentiment", columns="SPLIT", values="Review", aggfunc="count", fill_value=0)
)
display(sent_split)


Category x Split (row-level)


SPLIT,DEV,TEST,TRAIN
Product Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Tools, DIY & Outdoor",670,147,5055
Watches,442,573,4530
Stationery & Craft,226,218,2147
"Fruits, Meat & Frozen",0,258,1698
Breakfast,0,221,1399
Laundry & Household,16,0,1398
Fans,100,275,1366
Hair Care,0,455,1357
Cookware,33,122,1254
Televisions,79,99,1216



Sentiment x Split (row-level)


SPLIT,DEV,TEST,TRAIN
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Negative,988,1148,8569
Positive,5743,7301,48843


In [38]:
# Phase-3

In [39]:
train = pd.read_csv(train_csv)
dev   = pd.read_csv(dev_csv)
test  = pd.read_csv(test_csv)
## duplicate needs to be removed (count and distinct count differ)

In [40]:
dev

Unnamed: 0,Rating,Review,Product Name,Product Category,Emotion,Data Source,Sentiment,clean_text,bn_count,tot_count,bn_ratio,lang_bucket,fp
0,2.0,I have ordered selecting Sunshower color but i...,Realme C55 - (8GB/256GB),Smart Phones,Fear,Daraz,Negative,i have ordered selecting sunshower color but i...,0,419,0.000000,EN,ihaveorderedselectingsunshowercolorbutitwasmid...
1,1.0,‡¶≠‡¶æ‡¶≤‡¶á ‡¶ö‡¶≤‡¶õ‡¶ø‡¶≤ ‡¶ï‡¶ø‡¶®‡ßç‡¶§‡ßÅ ‡¶°‡ßá‡¶≤‡¶ø‡¶≠‡¶æ‡¶∞‡¶ø‡¶∞ 10 ‡¶¶‡¶ø‡¶® ‡¶Ø‡ßá‡¶§‡ßá ‡¶®‡¶æ ‡¶Ø‡ßá‡¶§...,Realme C55 - (8GB/256GB),Smart Phones,Sadness,Daraz,Negative,‡¶≠‡¶æ‡¶≤‡¶á ‡¶ö‡¶≤‡¶õ‡¶ø‡¶≤ ‡¶ï‡¶ø‡¶®‡ßç‡¶§‡ßÅ ‡¶°‡ßá‡¶≤‡¶ø‡¶≠‡¶æ‡¶∞‡¶ø‡¶∞ 10 ‡¶¶‡¶ø‡¶® ‡¶Ø‡ßá‡¶§‡ßá ‡¶®‡¶æ ‡¶Ø‡ßá‡¶§...,178,223,0.798206,MIX,‡¶≠‡¶æ‡¶≤‡¶á‡¶ö‡¶≤‡¶õ‡¶ø‡¶≤‡¶ï‡¶ø‡¶®‡ßç‡¶§‡ßÅ‡¶°‡ßá‡¶≤‡¶ø‡¶≠‡¶æ‡¶∞‡¶ø‡¶∞‡¶¶‡¶ø‡¶®‡¶Ø‡ßá‡¶§‡ßá‡¶®‡¶æ‡¶Ø‡ßá‡¶§‡ßá‡¶á‡¶´‡ßã‡¶®‡¶ö‡¶æ‡¶≤‡¶æ‡¶§...
2,5.0,loved it Soo much\nIts very safely delivered‚Ä¶a...,Realme C55 - (8GB/256GB),Smart Phones,Love,Daraz,Positive,loved it soo much its very safely delivered......,0,288,0.000000,EN,loveditsoomuchitsverysafelydeliveredanditsgoin...
3,5.0,"‡¶ì‡ßü‡¶æ‡¶∞‡ßá‡¶®‡ßç‡¶ü‡¶ø‡¶∞ ‡¶ï‡ßã‡¶® ‡¶ï‡¶æ‡¶ó‡¶ú ‡¶Ü‡¶∏‡ßá‡¶®‡¶ø,‡¶ï‡¶ø‡¶≠‡¶æ‡¶¨‡ßá ‡¶™‡¶æ‡¶¨ ‡¶¨‡¶æ ‡¶ï‡¶ø‡¶≠‡¶æ‡¶¨‡ßá...",Realme C55 - (8GB/256GB),Smart Phones,Sadness,Daraz,Negative,"‡¶ì‡¶Ø‡¶º‡¶æ‡¶∞‡ßá‡¶®‡ßç‡¶ü‡¶ø‡¶∞ ‡¶ï‡ßã‡¶® ‡¶ï‡¶æ‡¶ó‡¶ú ‡¶Ü‡¶∏‡ßá‡¶®‡¶ø,‡¶ï‡¶ø‡¶≠‡¶æ‡¶¨‡ßá ‡¶™‡¶æ‡¶¨ ‡¶¨‡¶æ ‡¶ï‡¶ø‡¶≠‡¶æ‡¶¨...",49,59,0.830508,BN,‡¶ì‡¶Ø‡¶º‡¶æ‡¶∞‡ßá‡¶®‡ßç‡¶ü‡¶ø‡¶∞‡¶ï‡ßã‡¶®‡¶ï‡¶æ‡¶ó‡¶ú‡¶Ü‡¶∏‡ßá‡¶®‡¶ø‡¶ï‡¶ø‡¶≠‡¶æ‡¶¨‡ßá‡¶™‡¶æ‡¶¨‡¶¨‡¶æ‡¶ï‡¶ø‡¶≠‡¶æ‡¶¨‡ßá‡¶ï‡ßç‡¶≤‡ßá‡¶á‡¶Æ‡¶ï‡¶∞‡¶¨
4,2.0,1 month er modde spreker kaj korche na. kothay...,Realme C55 - (8GB/256GB),Smart Phones,Sadness,Daraz,Negative,1 month er modde spreker kaj korche na. kothay...,0,66,0.000000,EN,monthermoddesprekerkajkorchenakothayjogajogkor...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6726,5.0,Good Product.,Anobik 10.5W SmartCharge Neo Fast Charger Adap...,Charger & Adapter,Happy,Pickaboo,Positive,good product.,0,13,0.000000,EN,goodproduct
6727,4.0,Using .good product.,Anobik 10.5W SmartCharge Neo Fast Charger Adap...,Charger & Adapter,Happy,Pickaboo,Positive,using .good product.,0,20,0.000000,EN,usinggoodproduct
6728,5.0,This is a very good powerbank in this price po...,Baseus Bipow Digital Display 15W 10000mAh Powe...,Power Bank,Love,Pickaboo,Positive,this is a very good powerbank in this price po...,0,88,0.000000,EN,thisisaverygoodpowerbankinthispricepointwellbu...
6729,4.0,good for long lasting but after 2 year it's fa...,Baseus Bipow Digital Display 15W 10000mAh Powe...,Power Bank,Happy,Pickaboo,Positive,good for long lasting but after 2 year it's fa...,0,161,0.000000,EN,goodforlonglastingbutafteryearitsfastchargingo...


In [41]:
# Prefer 'clean_text' if present; else fallback to raw 'Review'
text_col = "clean_text" if "clean_text" in train.columns else "Review"

print("Using text column:", text_col)
for df, name in [(train,"train"), (dev,"dev"), (test,"test")]:
    print(name, len(df), "rows")

Using text column: clean_text
train 57412 rows
dev 6731 rows
test 8449 rows


In [42]:
# %% [3] Prepare text & labels for Sentiment
X_train_sent = train[text_col].astype(str).tolist()
y_train_sent = train["Sentiment"].astype(str).str.title().tolist()  # 'Positive'/'Negative'

X_dev_sent = dev[text_col].astype(str).tolist()
y_dev_sent = dev["Sentiment"].astype(str).str.title().tolist()

X_test_sent = test[text_col].astype(str).tolist()
y_test_sent = test["Sentiment"].astype(str).str.title().tolist()

In [43]:
# %% [4] Char TF-IDF (3-5) ‚Äî fit on TRAIN only
tfidf_char = TfidfVectorizer(analyzer="char", ngram_range=(3,5), min_df=5, max_features=300000)
Xtr_char = tfidf_char.fit_transform(X_train_sent)
Xdv_char = tfidf_char.transform(X_dev_sent)
Xte_char = tfidf_char.transform(X_test_sent)

In [44]:
print("Shapes (char-level):", Xtr_char.shape, Xdv_char.shape, Xte_char.shape)

# (Optional) Word-level TF-IDF to stack with char-level (helps a bit sometimes).
# NOTE: scikit's default token pattern ignores Bangla by default. We rely on char-ngrams primarily.
# If you want to add word-level too, uncomment below.
# tfidf_word = TfidfVectorizer(analyzer="word", ngram_range=(1,2), min_df=5, max_features=300000)
# Xtr_word = tfidf_word.fit_transform(X_train_sent)
# Xdv_word = tfidf_word.transform(X_dev_sent)
# Xte_word = tfidf_word.transform(X_test_sent)
# Xtr = hstack([Xtr_char, Xtr_word], format="csr")
# Xdv = hstack([Xdv_char, Xdv_word], format="csr")
# Xte = hstack([Xte_char, Xte_word], format="csr")

Xtr = Xtr_char; Xdv = Xdv_char; Xte = Xte_char  # use only char-level for now

Shapes (char-level): (57412, 136873) (6731, 136873) (8449, 136873)


In [45]:
from sklearn.metrics import f1_score

results_sent = {}

In [46]:
# 5a) LinearSVC
svc = LinearSVC(class_weight="balanced", random_state=42)
svc.fit(Xtr, y_train_sent)
pred_dv = svc.predict(Xdv)
f1_dv = f1_score(y_dev_sent, pred_dv, average="macro")
results_sent["LinearSVC"] = (svc, f1_dv)
print("[Sentiment] Dev macro-F1 (LinearSVC):", f1_dv)

[Sentiment] Dev macro-F1 (LinearSVC): 0.8485190275915807


In [47]:
# 5b) LogisticRegression
logr = LogisticRegression(max_iter=3000, class_weight="balanced", n_jobs=-1)
logr.fit(Xtr, y_train_sent)
pred_dv = logr.predict(Xdv)
f1_dv2 = f1_score(y_dev_sent, pred_dv, average="macro")
results_sent["LogReg"] = (logr, f1_dv2)
print("[Sentiment] Dev macro-F1 (LogReg):", f1_dv2)

[Sentiment] Dev macro-F1 (LogReg): 0.8364405486336757


In [48]:
best_name_sent = max(results_sent, key=lambda k: results_sent[k][1])
best_model_sent = results_sent[best_name_sent][0]
print("Best sentiment model:", best_name_sent)

Best sentiment model: LinearSVC


In [49]:
# %% [6] Evaluate best Sentiment model on Test
pred_te = best_model_sent.predict(Xte)
print("\n[Sentiment] TEST report (best =", best_name_sent, ")")
print(classification_report(y_test_sent, pred_te, digits=4))

print("[Sentiment] Confusion matrix (TEST):")
print(confusion_matrix(y_test_sent, pred_te, labels=["Negative","Positive"]))


[Sentiment] TEST report (best = LinearSVC )
              precision    recall  f1-score   support

    Negative     0.6653    0.8415    0.7431      1148
    Positive     0.9740    0.9334    0.9533      7301

    accuracy                         0.9209      8449
   macro avg     0.8196    0.8874    0.8482      8449
weighted avg     0.9320    0.9209    0.9247      8449

[Sentiment] Confusion matrix (TEST):
[[ 966  182]
 [ 486 6815]]


In [50]:
# %% [7] Save sentiment artifacts
MODELS_DIR = Path("models")
joblib.dump(tfidf_char, MODELS_DIR / "tfidf_char.joblib")
joblib.dump(best_model_sent, MODELS_DIR / f"sentiment_{best_name_sent}.joblib")
print("Saved:", MODELS_DIR / "tfidf_char.joblib")
print("Saved:", MODELS_DIR / f"sentiment_{best_name_sent}.joblib")

Saved: models/tfidf_char.joblib
Saved: models/sentiment_LinearSVC.joblib


In [51]:
# %% [8] Emotion ‚Äî 5 classes
X_train_emo = X_train_sent
y_train_emo = train["Emotion"].astype(str).str.title().tolist()

X_dev_emo = X_dev_sent
y_dev_emo = dev["Emotion"].astype(str).str.title().tolist()

X_test_emo = X_test_sent
y_test_emo = test["Emotion"].astype(str).str.title().tolist()

# Reuse the same char TF-IDF already fitted on train
Xtr_e = Xtr_char
Xdv_e = Xdv_char
Xte_e = Xte_char

In [52]:
# Try LinearSVC and LogisticRegression; pick best on Dev
results_emo = {}

svc_e = LinearSVC(class_weight="balanced", random_state=42)
svc_e.fit(Xtr_e, y_train_emo)
pred_dv_e = svc_e.predict(Xdv_e)
f1_dv_e = f1_score(y_dev_emo, pred_dv_e, average="macro")
results_emo["LinearSVC"] = (svc_e, f1_dv_e)
print("\n[Emotion] Dev macro-F1 (LinearSVC):", f1_dv_e)

logr_e = LogisticRegression(max_iter=3000, class_weight="balanced", n_jobs=-1, multi_class="auto")
logr_e.fit(Xtr_e, y_train_emo)
pred_dv_e2 = logr_e.predict(Xdv_e)
f1_dv_e2 = f1_score(y_dev_emo, pred_dv_e2, average="macro")
results_emo["LogReg"] = (logr_e, f1_dv_e2)
print("[Emotion] Dev macro-F1 (LogReg):", f1_dv_e2)


[Emotion] Dev macro-F1 (LinearSVC): 0.47884090414411756




[Emotion] Dev macro-F1 (LogReg): 0.4918709251605029


In [55]:
best_name_emo = max(results_emo, key=lambda k: results_emo[k][1])
best_model_emo = results_emo[best_name_emo][0]
print("Best emotion model:", best_name_emo)

Best emotion model: LogReg


In [53]:
# DEV per-class report & confusion
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_dev_emo, pred_dv_e, digits=4))
emo_labels = sorted(set(y_train_emo) | set(y_dev_emo))
print(confusion_matrix(y_dev_emo, pred_dv_e, labels=emo_labels))

              precision    recall  f1-score   support

       Anger     0.5017    0.5068    0.5042       296
        Fear     0.1136    0.1020    0.1075        98
       Happy     0.7745    0.7525    0.7634      4109
        Love     0.5343    0.5288    0.5315      1634
     Sadness     0.4408    0.5455    0.4876       594

    accuracy                         0.6596      6731
   macro avg     0.4730    0.4871    0.4788      6731
weighted avg     0.6652    0.6596    0.6618      6731

[[ 150    9   27    0  110]
 [  20   10   28    4   36]
 [  43   28 3092  727  219]
 [  12    3  709  864   46]
 [  74   38  136   22  324]]


In [54]:
# TEST per-class report using the best emotion model you pick
pred_te_e = results_emo[max(results_emo, key=lambda k: results_emo[k][1])][0].predict(Xte_e)
print(classification_report(y_test_emo, pred_te_e, digits=4))
emo_labels = sorted(set(y_train_emo) | set(y_test_emo))
print(confusion_matrix(y_test_emo, pred_te_e, labels=emo_labels))

              precision    recall  f1-score   support

       Anger     0.5077    0.5412    0.5239       425
        Fear     0.2037    0.3165    0.2479       139
       Happy     0.7911    0.6323    0.7028      5156
        Love     0.4976    0.5790    0.5352      2145
     Sadness     0.3319    0.6610    0.4419       584

    accuracy                         0.6110      8449
   macro avg     0.4664    0.5460    0.4903      8449
weighted avg     0.6609    0.6110    0.6258      8449

[[ 230   24   11    5  155]
 [  25   44   25    2   43]
 [  81   86 3260 1240  489]
 [  18   20  775 1242   90]
 [  99   42   50    7  386]]


In [56]:
# Train POS/NEG sub-classifiers (reuse char TF-IDF: Xtr_char/Xdv_char/Xte_char)
pos_mask_tr = (pd.Series(y_train_sent) == "Positive").values
neg_mask_tr = ~pos_mask_tr

# Positive sub-model: Happy vs Love
y_pos = pd.Series(y_train_emo).values[pos_mask_tr]
X_pos = Xtr_char[pos_mask_tr]
pos_clf = LinearSVC(class_weight="balanced", random_state=42)
pos_clf.fit(X_pos, y_pos)

# Negative sub-model: Sadness vs Anger vs Fear
y_neg = pd.Series(y_train_emo).values[neg_mask_tr]
X_neg = Xtr_char[neg_mask_tr]
neg_clf = LinearSVC(class_weight="balanced", random_state=42)
neg_clf.fit(X_neg, y_neg)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


In [57]:
# Route DEV via predicted sentiment (no peeking)
sent_dev_pred = best_model_sent.predict(Xdv_char)

y_pred_emo_dev = []
for i, sp in enumerate(sent_dev_pred):
    if sp == "Positive":
        y_pred_emo_dev.append(pos_clf.predict(Xdv_char[i]).item())
    else:
        y_pred_emo_dev.append(neg_clf.predict(Xdv_char[i]).item())

from sklearn.metrics import classification_report, f1_score
print("[Hier-Emotion] DEV report")
print(classification_report(y_dev_emo, y_pred_emo_dev, digits=4))
print("Macro-F1:", f1_score(y_dev_emo, y_pred_emo_dev, average="macro"))

[Hier-Emotion] DEV report
              precision    recall  f1-score   support

       Anger     0.5228    0.5034    0.5129       296
        Fear     0.1531    0.1531    0.1531        98
       Happy     0.7882    0.6729    0.7260      4109
        Love     0.4853    0.5967    0.5353      1634
     Sadness     0.4489    0.6279    0.5235       594

    accuracy                         0.6354      6731
   macro avg     0.4796    0.5108    0.4902      6731
weighted avg     0.6638    0.6354    0.6441      6731

Macro-F1: 0.49015190309582124


In [58]:
# Route TEST via predicted sentiment and evaluate
sent_test_pred = best_model_sent.predict(Xte_char)

y_pred_emo_test = []
for i, sp in enumerate(sent_test_pred):
    if sp == "Positive":
        y_pred_emo_test.append(pos_clf.predict(Xte_char[i]).item())
    else:
        y_pred_emo_test.append(neg_clf.predict(Xte_char[i]).item())

print("[Hier-Emotion] TEST report")
print(classification_report(y_test_emo, y_pred_emo_test, digits=4))

[Hier-Emotion] TEST report
              precision    recall  f1-score   support

       Anger     0.5478    0.4447    0.4909       425
        Fear     0.2843    0.2086    0.2407       139
       Happy     0.7724    0.6784    0.7224      5156
        Love     0.4785    0.5506    0.5120      2145
     Sadness     0.3662    0.6301    0.4632       584

    accuracy                         0.6232      8449
   macro avg     0.4898    0.5025    0.4858      8449
weighted avg     0.6504    0.6232    0.6315      8449

