<a href="https://colab.research.google.com/github/lochana-d/Customer-Sentiment-NeuroSymbolic-Model/blob/main/RICEAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the packages this notebook lists as dependencies; -q keeps pip's output short.
!pip install -q torch transformers datasets scikit-learn pandas tqdm fastapi gradio


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import argparse
import os
from typing import List, Dict

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from tqdm.auto import tqdm

In [None]:
# CSV header names read by main(); edit these if the file uses different headers.
TEXT_COL = "Customer sentiment"    # column with the free-text statement fed to the tokenizer
LABEL_COL = "Motivation factor"  # column with the class name for each row
# Allowed class names. main() assigns integer ids by position in this list,
# so the order here fixes the meaning of each output index of the model.
LABELS = ["Reward", "Ideology", "Coercion", "Ego"]

In [None]:
import pandas as pd
# Load the raw CSV and switch to short column names for exploration.
column_aliases = {'Customer sentiment': 'text', 'Motivation factor': 'label'}
df = pd.read_csv("/content/motivation_dataset_corporate_10k.csv").rename(columns=column_aliases)

# Sanity checks: header, first rows, then class balance.
for summary in (df.columns, df.head(), df['label'].value_counts()):
    print(summary)

Index(['text', 'label'], dtype='object')
                                                text     label
0  Buying from ethical sources matters to me. As ...  Ideology
1  There was a constant reminder that the offer w...  Coercion
2  Social media ads made it feel urgent to buy. T...  Coercion
3  I wanted something that reflects my success. P...       Ego
4  It came bundled with freebies, so I grabbed it...    Reward
label
Ego         3863
Reward      3816
Ideology    3809
Coercion    3700
Name: count, dtype: int64


In [None]:
import pandas as pd
# Reload the raw CSV, dropping rows that lack either the text or the label.
df = pd.read_csv("/content/motivation_dataset_corporate_10k.csv").dropna(
    subset=['Customer sentiment', 'Motivation factor']
)
print("Total rows:", len(df))
print(df['Motivation factor'].value_counts())

# Print up to three randomly chosen example texts for every class.
for lab in df['Motivation factor'].unique():
    lab_texts = df.loc[df['Motivation factor'] == lab, 'Customer sentiment']
    print(f"\n--- samples for {lab} ---")
    # Series.sample(3) raises ValueError when fewer than 3 rows exist,
    # so never ask for more rows than the class actually has.
    print(lab_texts.sample(min(3, len(lab_texts))).tolist())

Total rows: 15188
Motivation factor
Ego         3863
Reward      3816
Ideology    3809
Coercion    3700
Name: count, dtype: int64

--- samples for Ideology ---
['I value companies that prioritize sustainability. As an organization deeply committed to corporate social responsibility, we ensure that our purchasing decisions reflect our environmental and ethical standards.', 'I prefer brands that support fair trade. As an organization deeply committed to corporate social responsibility, we ensure that our purchasing decisions reflect our environmental and ethical standards.', 'I went for this enterprise software suite because it is digital accessibility for all users, and I want to support ethical businesses.']

--- samples for Coercion ---
['I felt pressured by limited stock notifications. The aggressive promotional tactics and scarcity messaging created a perceived urgency that influenced our decision-making process under time constraints.', 'I felt pressured by limited stock notificati

In [None]:
# Load the dataset for interactive use in the notebook.
# NOTE(review): in the code visible here, main() re-reads the CSV itself and
# works with TEXT_COL / LABEL_COL, so this renamed frame is not what training consumes.
df = pd.read_csv("/content/motivation_dataset_corporate_10k.csv")

# Rename columns to the short names 'text' and 'label'
df.rename(columns={
    "Customer sentiment": "text",
    "Motivation factor": "label"
}, inplace=True)
# Rule keywords: one list of trigger words/phrases per label. Start with simple
# lists and expand as needed. extract_rule_features() matches them as
# case-insensitive substrings, so e.g. "sale" also fires inside "wholesale".
RULE_KEYWORDS = {
    "Reward": ["discount", "coupon", "reward", "bonus", "cashback", "sale", "promo", "points", "loyalty", "deal"],
    "Ideology": ["supports", "ethical", "sustainable", "green", "values", "cause", "charity", "community", "mission"],
    "Coercion": ["required", "forced", "mandatory", "had to", "no choice", "pressure", "threat", "penalty"],
    "Ego": ["fit my style", "status", "impress", "look good", "prestige", "image", "brag", "confidence", "personal brand"]
}

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ----------------------------
# Rule feature extraction
# ----------------------------
def extract_rule_features(text: str, keywords: Dict[str, List[str]] = RULE_KEYWORDS) -> List[int]:
    """
    Build one 0/1 flag per label telling whether any of its keywords occurs in `text`.

    Matching is a case-insensitive substring test, so a keyword also matches
    when it appears inside a longer word. Flags come back in the order of
    LABELS; a label without an entry in `keywords` always yields 0.
    """
    lowered = text.lower()
    return [
        int(any(kw.lower() in lowered for kw in keywords.get(label, [])))
        for label in LABELS
    ]

# ----------------------------
# Dataset class
# ----------------------------
class MotivationDataset(Dataset):
    """Dataset that yields, per text, the tokenizer output plus rule flags and label id."""

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

        # The keyword flags depend only on the text, so compute them once up front
        # instead of on every __getitem__ call.
        self.rule_feats = list(map(extract_rule_features, texts))

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt",
        )
        # squeeze(0) drops the leading size-1 axis so the DataLoader can stack samples.
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = tensor.squeeze(0)
        sample['rule_feats'] = torch.tensor(self.rule_feats[idx], dtype=torch.float)
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# ----------------------------
# Model: BERT + rule-feat concatenation
# ----------------------------
class NeurosymbolicModel(nn.Module):
    """
    Transformer encoder whose pooled sentence vector is concatenated with
    rule features and classified by a small two-layer head.

    Args:
        model_name: Checkpoint name or path handed to AutoModel.from_pretrained.
        rule_feat_dim: Number of rule features appended to the pooled vector.
        n_labels: Number of output classes (width of the returned logits).
        dropout: Dropout probability applied to the concatenated vector and
            inside the classifier head.
    """

    def __init__(self, model_name="bert-base-uncased", rule_feat_dim=4, n_labels=len(LABELS), dropout=0.2):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        hidden_size = self.transformer.config.hidden_size
        # Kept so forward() can build a zero vector of the right width when no
        # rule features are passed. Plain attribute: it does not enter state_dict,
        # so previously saved weights still load.
        self.rule_feat_dim = rule_feat_dim
        # head will take [CLS pooled] concat rule-features
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size + rule_feat_dim, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 2, n_labels)
        )

    def forward(self, input_ids, attention_mask, token_type_ids=None, rule_feats=None):
        """
        Return class logits for a batch.

        `rule_feats` is concatenated to the pooled encoder output along dim 1;
        when omitted, a zero vector of width `rule_feat_dim` is used instead.
        """
        # Only forward token_type_ids when the caller supplied them: some
        # encoders do not accept that argument at all.
        encoder_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if token_type_ids is not None:
            encoder_inputs["token_type_ids"] = token_type_ids
        out = self.transformer(**encoder_inputs)

        # Prefer the pooled output; fall back to the first-token hidden state
        # when the attribute is missing or present but None.
        pooled = getattr(out, "pooler_output", None)
        if pooled is None:
            pooled = out.last_hidden_state[:, 0, :]

        # concat rule features
        if rule_feats is None:
            rule_feats = torch.zeros(
                pooled.shape[0], self.rule_feat_dim, dtype=pooled.dtype, device=pooled.device
            )
        x = torch.cat([pooled, rule_feats], dim=1)
        x = self.dropout(x)
        logits = self.classifier(x)
        return logits

# ----------------------------
# Training & Eval helpers
# ----------------------------
def train_epoch(model, dataloader, optimizer, scheduler, loss_fn):
    """
    Run one optimisation pass over `dataloader` and return the mean per-batch loss.

    The scheduler, when given, is stepped once per batch right after the optimizer.
    Tensors are moved to the module-level `device` before the forward pass.
    """
    model.train()
    running_loss = 0.0
    for batch in tqdm(dataloader, desc="Train"):
        optimizer.zero_grad()

        features = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
        }
        # token_type_ids is optional in the batch; pass None through when absent.
        segment_ids = batch.get('token_type_ids', None)
        features['token_type_ids'] = segment_ids.to(device) if segment_ids is not None else None
        features['rule_feats'] = batch['rule_feats'].to(device)
        targets = batch['labels'].to(device)

        loss = loss_fn(model(**features), targets)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        running_loss += loss.item()
    return running_loss / len(dataloader)

def eval_model(model, dataloader):
    """
    Predict a class for every sample in `dataloader` without tracking gradients.

    Returns:
        (preds, gold): two parallel lists of integer class ids, the argmax of
        the model's logits and the labels taken from the batches.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Eval"):
            features = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
            }
            # token_type_ids is optional in the batch; pass None through when absent.
            segment_ids = batch.get('token_type_ids', None)
            features['token_type_ids'] = segment_ids.to(device) if segment_ids is not None else None
            features['rule_feats'] = batch['rule_feats'].to(device)
            targets = batch['labels'].to(device)

            scores = model(**features)
            predicted += torch.argmax(scores, dim=1).detach().cpu().numpy().tolist()
            expected += targets.detach().cpu().numpy().tolist()
    return predicted, expected

# ----------------------------
# Main
# ----------------------------
def main(args):
    """
    Train the neurosymbolic classifier on a CSV file and save the artefacts.

    Args:
        args: Namespace providing `data`, `out_dir`, `model_name`, `max_len`,
            `batch_size`, `epochs`, `lr`, `dropout` and `test_size`.

    Raises:
        ValueError: If the CSV lacks TEXT_COL / LABEL_COL, or contains a label
            that is not listed in LABELS.

    Side effects:
        Writes `neurosymbolic_model.pt`, the tokenizer files, `label_map.json`
        and `rule_keywords.json` into `args.out_dir`.
    """
    import json  # only needed for the small metadata dumps at the end

    # load CSV
    df = pd.read_csv(args.data)
    # Basic data cleaning - ensure columns exist
    if TEXT_COL not in df.columns or LABEL_COL not in df.columns:
        raise ValueError(f"CSV must contain columns '{TEXT_COL}' and '{LABEL_COL}'. Found: {df.columns.tolist()}")
    df = df[[TEXT_COL, LABEL_COL]].dropna().reset_index(drop=True)
    # The rule extractor calls .lower() on each text, so make sure every cell is a str.
    df[TEXT_COL] = df[TEXT_COL].astype(str)

    # map labels to ints
    label2id = {l: i for i, l in enumerate(LABELS)}
    df['label_id'] = df[LABEL_COL].map(label2id)
    if df['label_id'].isnull().any():
        bad = df[df['label_id'].isnull()][LABEL_COL].unique()
        raise ValueError(f"Found unexpected labels: {bad}. Allowed: {LABELS}")
    df['label_id'] = df['label_id'].astype(int)

    # Exact-duplicate texts would otherwise land on both sides of the split,
    # letting the model be scored on sentences it was trained on and inflating
    # every reported metric. Keep one copy of each text before splitting.
    n_before = len(df)
    df = df.drop_duplicates(subset=[TEXT_COL]).reset_index(drop=True)
    if len(df) < n_before:
        print(f"Dropped {n_before - len(df)} duplicate texts before splitting ({len(df)} unique rows remain).")

    # split (stratified so both parts keep the class proportions)
    train_df, test_df = train_test_split(df, test_size=args.test_size, stratify=df['label_id'], random_state=42)
    train_texts = train_df[TEXT_COL].tolist()
    train_labels = train_df['label_id'].tolist()
    test_texts = test_df[TEXT_COL].tolist()
    test_labels = test_df['label_id'].tolist()

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    train_dataset = MotivationDataset(train_texts, train_labels, tokenizer, max_len=args.max_len)
    test_dataset = MotivationDataset(test_texts, test_labels, tokenizer, max_len=args.max_len)

    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=2)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=2)

    model = NeurosymbolicModel(model_name=args.model_name, rule_feat_dim=len(LABELS), n_labels=len(LABELS), dropout=args.dropout)
    model.to(device)

    # optimizer & scheduler (linear decay with the first 10% of steps as warm-up)
    optimizer = AdamW(model.parameters(), lr=args.lr)
    total_steps = len(train_loader) * args.epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1*total_steps), num_training_steps=total_steps)
    loss_fn = nn.CrossEntropyLoss()

    # Passing the full id list keeps the report rows and matrix axes aligned
    # with LABELS even when a class is missing from gold and predictions.
    label_ids = list(range(len(LABELS)))

    # training loop; the held-out split is scored after every epoch
    for epoch in range(1, args.epochs + 1):
        print(f"Epoch {epoch}/{args.epochs}")
        train_loss = train_epoch(model, train_loader, optimizer, scheduler, loss_fn)
        print(f"Train loss: {train_loss:.4f}")
        preds, gold = eval_model(model, test_loader)
        print(classification_report(gold, preds, labels=label_ids, target_names=LABELS, zero_division=0))
        cm = confusion_matrix(gold, preds, labels=label_ids)
        print("Confusion matrix:\n", cm)

    # save model & tokenizer
    os.makedirs(args.out_dir, exist_ok=True)
    print("Saving model to", args.out_dir)
    torch.save(model.state_dict(), os.path.join(args.out_dir, "neurosymbolic_model.pt"))
    tokenizer.save_pretrained(args.out_dir)
    # also save label map and RULE_KEYWORDS for reproducibility
    with open(os.path.join(args.out_dir, "label_map.json"), "w", encoding="utf-8") as f:
        json.dump(label2id, f, indent=2)
    with open(os.path.join(args.out_dir, "rule_keywords.json"), "w", encoding="utf-8") as f:
        json.dump(RULE_KEYWORDS, f, indent=2)
    print("Done.")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Paths and model choice.
    parser.add_argument("--data", type=str, required=True, help="CSV file path")
    parser.add_argument("--out_dir", type=str, default="./model_output", help="where to save model")
    parser.add_argument("--model_name", type=str, default="bert-base-uncased", help="transformer model")

    # Numeric hyper-parameters, registered in this order: (flag, type, default).
    for flag, kind, default in (
        ("--max_len", int, 128),
        ("--batch_size", int, 16),
        ("--epochs", int, 3),
        ("--lr", float, 2e-5),
        ("--dropout", float, 0.2),
        ("--test_size", float, 0.15),
    ):
        parser.add_argument(flag, type=kind, default=default)

    # The argument list is given explicitly instead of being read from sys.argv,
    # so every option other than --data takes its default.
    args = parser.parse_args(['--data', '/content/motivation_dataset_corporate_10k.csv'])
    main(args)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/3


Train:   0%|          | 0/807 [00:00<?, ?it/s]

Train loss: 0.2270


Eval:   0%|          | 0/143 [00:00<?, ?it/s]

              precision    recall  f1-score   support

      Reward       1.00      1.00      1.00       573
    Ideology       1.00      1.00      1.00       571
    Coercion       1.00      1.00      1.00       555
         Ego       1.00      1.00      1.00       580

    accuracy                           1.00      2279
   macro avg       1.00      1.00      1.00      2279
weighted avg       1.00      1.00      1.00      2279

Confusion matrix:
 [[573   0   0   0]
 [  0 571   0   0]
 [  0   0 555   0]
 [  0   0   0 580]]
Epoch 2/3


Train:   0%|          | 0/807 [00:00<?, ?it/s]

Train loss: 0.0032


Eval:   0%|          | 0/143 [00:00<?, ?it/s]

              precision    recall  f1-score   support

      Reward       1.00      1.00      1.00       573
    Ideology       1.00      1.00      1.00       571
    Coercion       1.00      1.00      1.00       555
         Ego       1.00      1.00      1.00       580

    accuracy                           1.00      2279
   macro avg       1.00      1.00      1.00      2279
weighted avg       1.00      1.00      1.00      2279

Confusion matrix:
 [[573   0   0   0]
 [  0 571   0   0]
 [  0   0 555   0]
 [  0   0   0 580]]
Epoch 3/3


Train:   0%|          | 0/807 [00:00<?, ?it/s]

Train loss: 0.0016


Eval:   0%|          | 0/143 [00:00<?, ?it/s]

              precision    recall  f1-score   support

      Reward       1.00      1.00      1.00       573
    Ideology       1.00      1.00      1.00       571
    Coercion       1.00      1.00      1.00       555
         Ego       1.00      1.00      1.00       580

    accuracy                           1.00      2279
   macro avg       1.00      1.00      1.00      2279
weighted avg       1.00      1.00      1.00      2279

Confusion matrix:
 [[573   0   0   0]
 [  0 571   0   0]
 [  0   0 555   0]
 [  0   0   0 580]]
Saving model to ./model_output
Done.


In [None]:
# Mount Google Drive at /content/drive (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
from torch import nn

# --- 1. Define the same LABELS list you used for training ---
# Order matters: the predicted class index is used to index this list below,
# so it must match the label-to-id mapping the weights were trained with.
LABELS = ["Reward", "Ideology", "Coercion", "Ego"]

# ----------------------------
# Model: BERT + rule-feat concatenation (copied from training cell)
# ----------------------------
class NeurosymbolicModel(nn.Module):
    """
    Transformer encoder whose pooled sentence vector is concatenated with
    rule features and classified by a small two-layer head.

    Args:
        model_name: Checkpoint name or path handed to AutoModel.from_pretrained.
        rule_feat_dim: Number of rule features appended to the pooled vector.
        n_labels: Number of output classes (width of the returned logits).
        dropout: Dropout probability applied to the concatenated vector and
            inside the classifier head.
    """

    def __init__(self, model_name="bert-base-uncased", rule_feat_dim=4, n_labels=len(LABELS), dropout=0.2):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        hidden_size = self.transformer.config.hidden_size
        # Kept so forward() can build a zero vector of the right width when no
        # rule features are passed. Plain attribute: it does not enter state_dict,
        # so previously saved weights still load.
        self.rule_feat_dim = rule_feat_dim
        # head will take [CLS pooled] concat rule-features
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size + rule_feat_dim, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 2, n_labels)
        )

    def forward(self, input_ids, attention_mask, token_type_ids=None, rule_feats=None):
        """
        Return class logits for a batch.

        `rule_feats` is concatenated to the pooled encoder output along dim 1;
        when omitted, a zero vector of width `rule_feat_dim` is used instead.
        """
        # Only forward token_type_ids when the caller supplied them: some
        # encoders do not accept that argument at all.
        encoder_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if token_type_ids is not None:
            encoder_inputs["token_type_ids"] = token_type_ids
        out = self.transformer(**encoder_inputs)

        # Prefer the pooled output; fall back to the first-token hidden state
        # when the attribute is missing or present but None.
        pooled = getattr(out, "pooler_output", None)
        if pooled is None:
            pooled = out.last_hidden_state[:, 0, :]

        # concat rule features
        if rule_feats is None:
            rule_feats = torch.zeros(
                pooled.shape[0], self.rule_feat_dim, dtype=pooled.dtype, device=pooled.device
            )
        x = torch.cat([pooled, rule_feats], dim=1)
        x = self.dropout(x)
        logits = self.classifier(x)
        return logits


# --- 2. Load your trained model ---
# Directory holding the saved tokenizer files and neurosymbolic_model.pt;
# change it if you saved the training output somewhere else.
model_path = "/content/model_output"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# The model class is defined in this cell. If you keep it in a separate module,
# import it from there instead:
# from model_file import NeurosymbolicModel  # change to your actual file name

# Rebuild the architecture, then load the trained weights onto the CPU.
# NOTE(review): model_name and rule_feat_dim must match the values used for
# training, otherwise load_state_dict will report mismatched keys or shapes.
model = NeurosymbolicModel(model_name="bert-base-uncased", n_labels=len(LABELS))
model.load_state_dict(torch.load(f"{model_path}/neurosymbolic_model.pt", map_location=torch.device('cpu')))
model.eval()  # inference mode: turns dropout off

# --- 3. Dummy corporate-style customer sentiments ---
# Hand-written probe texts for a quick smoke test of the loaded model.
# Only the first three carry an expected-label comment; the rest are unlabeled.
dummy_texts = [
    "The vendor's subscription model includes unlimited user licenses and free quarterly upgrades, which provides exceptional value compared to per-seat pricing alternatives. Their customer success program also includes dedicated account management and priority technical support, resulting in 40% lower total cost of ownership over three years.",   # Reward
    "We selected this cloud infrastructure provider specifically because they operate entirely on renewable energy and have committed to achieving negative carbon emissions by 2026. Their data centers are powered by wind and solar installations, and they provide detailed carbon footprint reporting for all our workloads, helping us meet our science-based climate targets.",  # Ideology
    "When the European data protection authorities announced enhanced privacy regulations with substantial fines for non-compliance, we had 120 days to implement comprehensive data governance systems or face penalties up to 4% of global revenue. The regulatory timeline was non-negotiable, forcing us to prioritize compliance over other strategic technology investments.",  # Coercion
    "Our decision to lease this flagship office space in the financial district was driven by the need to project success and attract institutional investors during our Series C fundraising. The building houses several unicorn startups and major investment firms, and the association reinforces our credibility as a serious market player worthy of premium valuations." ,
    "The equipment manufacturer's innovative lease-to-own program eliminates large capital expenditures while providing predictable monthly costs that include maintenance, insurance, and technology refresh cycles. By the end of the lease term, we own the equipment outright, and the tax advantages of operating expenses versus capital purchases improve our cash flow significantly.",
    "Our procurement committee specifically required suppliers to demonstrate measurable progress on diversity, equity, and inclusion initiatives. This vendor's workforce is 52% women and 38% people of color, they maintain transparent pay equity reporting, and their supplier diversity program actively supports minority and women-owned businesses throughout their value chain.",
    "We partnered with this logistics provider because they've committed to converting their entire delivery fleet to electric vehicles by 2027 and they operate carbon-neutral distribution centers powered by renewable energy. Their transparent sustainability reporting and third-party environmental audits align with our corporate climate commitments and help us achieve Scope 3 emissions reductions throughout our supply chain.",
    "The equipment leasing program's flexible upgrade options and comprehensive maintenance coverage provide significant operational advantages while reducing our technology obsolescence risk. We can upgrade to newer models at any time during the lease term, all maintenance and repairs are included under warranty, and the predictable monthly costs make budget planning much easier compared to capital purchase alternatives.",
    "When our primary telecommunications provider experienced a network outage that disrupted operations for 16 hours, we realized we needed immediate backup connectivity to prevent future business interruptions. The downtime cost us approximately $400,000 in lost productivity and customer penalties, forcing us to implement redundant communication systems despite the additional monthly expenses to ensure business continuity.",
    "The implementation of enhanced anti-money laundering systems became mandatory when banking regulators strengthened compliance requirements following several high-profile enforcement actions. The penalties for inadequate AML controls could include license revocation and criminal prosecution of executives, making the substantial compliance technology investment non-negotiable despite competing budget priorities for growth initiatives.",
    "The vendor's performance-based pricing structure aligns their success with our business outcomes, as we only pay based on measurable results achieved rather than traditional licensing fees. Their risk-sharing model includes guaranteed cost savings compared to our previous solution, plus they provide detailed ROI reporting and quarterly business reviews to ensure we're maximizing value from our investment.",
    "Our decision to establish headquarters in this prestigious technology corridor was driven by the networking opportunities and talent pipeline it provides for our artificial intelligence startup. The location is home to several successful AI companies and major research institutions, creating valuable opportunities for collaboration while signaling to investors and potential employees that we're serious players in the competitive AI market."
]

# --- 4. Run predictions ---
import json

# The classifier was trained with keyword-based rule features next to the
# text, so inference has to supply the same features; feeding zeros gives the
# model inputs that differ from what it saw in training. Training stores the
# keyword lists beside the weights, so reload them from there.
try:
    with open(f"{model_path}/rule_keywords.json", encoding="utf-8") as f:
        rule_keywords = json.load(f)
except FileNotFoundError:
    rule_keywords = {}
    print("Warning: rule_keywords.json not found in model_path; rule features will be all zeros.")

for text in dummy_texts:
    # Tokenize
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # One 0/1 flag per label, in LABELS order: does any keyword of that label
    # occur (case-insensitive substring) in the text? Shape (1, len(LABELS)).
    lowered = text.lower()
    rule_feats = torch.tensor(
        [[float(any(kw.lower() in lowered for kw in rule_keywords.get(lab, []))) for lab in LABELS]]
    )

    # Forward pass
    with torch.no_grad():
        logits = model(
            inputs["input_ids"],
            inputs["attention_mask"],
            token_type_ids=inputs.get("token_type_ids"),  # passed during training as well
            rule_feats=rule_feats,
        )
        probs = F.softmax(logits, dim=1)
        # Convert the argmax to a plain int before indexing the label list.
        pred_label = LABELS[int(torch.argmax(probs, dim=1).item())]

    print(f"Text: {text}")
    print(f"Predicted Motivation Factor: {pred_label}")
    print("-" * 60)

Text: The vendor's subscription model includes unlimited user licenses and free quarterly upgrades, which provides exceptional value compared to per-seat pricing alternatives. Their customer success program also includes dedicated account management and priority technical support, resulting in 40% lower total cost of ownership over three years.
Predicted Motivation Factor: Reward
------------------------------------------------------------
Text: We selected this cloud infrastructure provider specifically because they operate entirely on renewable energy and have committed to achieving negative carbon emissions by 2026. Their data centers are powered by wind and solar installations, and they provide detailed carbon footprint reporting for all our workloads, helping us meet our science-based climate targets.
Predicted Motivation Factor: Ideology
------------------------------------------------------------
Text: When the European data protection authorities announced enhanced privacy regu