# BERT Classification

In [None]:
import pandas as pd
import re

# Load the anonymized survey export; header=1 uses the second spreadsheet row as column names.
# NOTE(review): machine-specific absolute path — consider a configurable data directory.
raw_survey_path = 'C:/Users/ming/Desktop/master project/Copy of Data_All_JAN_2024_Anonymized__For_Analysis_V1.2.xlsx'
df = pd.read_excel(raw_survey_path, header=1)

# Data Cleaning
def clean_text(text):
    """Normalize a free-text survey answer.

    Lower-cases the input, removes every character that is neither a word
    character nor whitespace, collapses whitespace runs to a single space and
    trims both ends.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The normalized string (possibly empty).
    """
    text = str(text).lower()
    text = re.sub(r"[^\w\s]", "", text)
    # Trim last: removing punctuation can expose new leading/trailing spaces
    # (e.g. "none ." -> "none "), which would defeat exact-match filters.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Short column name -> survey question whose answers it holds. Missing answers
# become "" before cleaning.
_question_columns = {
    'discomfort': 'Please describe the discomfort that you are experiencing',
    'cause': 'What do you believe is causing this discomfort?',
    'suggestion': 'What do you believe can be done to improve the wearing comfort for your hearing aids?',
}
for _short_name, _question in _question_columns.items():
    df[_short_name] = df[_question].fillna("").apply(clean_text)

# Filter invalid data
invalid_responses = [
    "no idea", "none", "not sure", "dont know", "dont now", "n/a", "unsure", "nothing", "?", "no",
    # clean_text() strips punctuation, so "n/a" reaches this filter as "na".
    "na",
]


def _is_uninformative(answer):
    """Return True for a blank/one-character answer or a known non-answer."""
    return len(answer) < 2 or answer in invalid_responses


def is_valid(row):
    """Decide whether a survey row carries usable content.

    Args:
        row: Mapping with cleaned 'discomfort' and 'cause' strings.

    Returns:
        False when both fields are uninformative, True otherwise.
    """
    # Judge each field on its own so mixed cases (e.g. empty discomfort with
    # cause "none") are rejected as well.
    return not (_is_uninformative(row['discomfort']) and _is_uninformative(row['cause']))

# Keep only rows where discomfort/cause carry information; copy so the column
# assignments below act on an independent frame.
df = df[df.apply(is_valid, axis=1)].copy()

# Merge discomfort and cause
df['merged_input'] = df.apply(
    lambda row: f"Discomfort: {row['discomfort']}. Cause: {row['cause']}", axis=1
)

# Valid Suggestion
# A blank suggestion is not a valid one, so flag it 0 together with the known
# non-answers (previously "" was flagged 1).
df['has_valid_suggestion'] = df['suggestion'].apply(
    lambda x: 0 if (not x.strip() or x.strip() in invalid_responses) else 1
)

print(df[['merged_input', 'suggestion', 'has_valid_suggestion']].head())
print("last_sample:", len(df))

# Persist the cleaned frame; later cells reload it from this file.
df.to_csv("preprocessed_feedback.csv", index=False)


In [None]:
from sentence_transformers import SentenceTransformer
import hdbscan
import pandas as pd
from sklearn.preprocessing import normalize

# Embed every merged "Discomfort ... Cause ..." text with a sentence-transformer model.
texts = df['merged_input'].tolist()

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(texts)

# Cluster the embeddings with HDBSCAN and store the label of each row.
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean')
cluster_labels = clusterer.fit_predict(embeddings)
df['cluster'] = cluster_labels

# Show up to ten example texts per cluster label.
for c in set(cluster_labels):
    print(f"\nCluster {c}:")
    for t in df.loc[df['cluster'] == c, 'merged_input'].head(10):
        print(f"- {t}")


In [None]:
# Final: 8 clusters
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


# Reload the cleaned data written by the preprocessing cell.
df = pd.read_csv("preprocessed_feedback.csv")  
texts = df['merged_input'].tolist()


# Sentence embeddings for every merged text.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(texts, show_progress_bar=True)


# KMeans with a fixed cluster count and seed; one cluster id per row.
n_clusters = 8
print(f"KMeans Cluster: {n_clusters}...")
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df['cluster'] = kmeans.fit_predict(embeddings)


# Silhouette score of the assignment on the embeddings.
score = silhouette_score(embeddings, df['cluster'])
print(f"Silhouette Score: {score:.4f}")

grouped = df.groupby('cluster')

# Print the cluster size and up to 20 example texts for manual inspection.
for cluster_id, group in grouped:
    print(f"\n---- Cluster {cluster_id} ({len(group)} in total)")
    for text in group['merged_input'].head(20): 
        print(f"- {text}")
    print("------------------")


# Save text + cluster id; later cells read this file to attach labels.
df[['merged_input', 'cluster']].to_csv("kmeans_clustered_output.csv", index=False)


# Per-cluster row counts (ordered by cluster id) and their share of all rows.
total_samples = len(df)
cluster_counts = df['cluster'].value_counts().sort_index()
_proportions = (cluster_counts.values / total_samples * 100).round(2)

cluster_stats = pd.DataFrame({
    'Cluster': cluster_counts.index,
    'Count': cluster_counts.values,
    'Proportion (%)': _proportions
})

print("\nCluster values and percentage：")
print(cluster_stats)

# save the table
# cluster_stats.to_csv("cluster_statistics.csv", index=False)


In [None]:
from sklearn.metrics import silhouette_score

# Recompute the silhouette score for the cluster ids currently stored in df['cluster'].
# Relies on `embeddings` and `df` left in the kernel by an earlier cell.
score = silhouette_score(embeddings, df['cluster'])
print(f"\n📈 Silhouette Score: {score:.4f}")


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Silhouette score of KMeans for each candidate K (2..14) on the current `embeddings`.
scores = []
ks = list(range(2, 15))  

for k in ks:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    score = silhouette_score(embeddings, labels)
    scores.append(score)
    print(f"K={k}: Silhouette Score = {score:.4f}")
# NOTE: after the loop `kmeans`, `labels` and `score` hold the K=14 results,
# replacing any earlier values bound to those names in the kernel.

# Plot score against K.
import matplotlib.pyplot as plt
plt.plot(ks, scores, marker='o')
plt.title('Silhouette Score for Different K')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Silhouette Score')
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer

# Reload the cleaned data; missing suggestions are replaced by "" and the text is lower-cased.
df = pd.read_csv("preprocessed_feedback.csv")
df = df.assign(suggestion=df['suggestion'].fillna("").str.lower())

invalid_responses = [
    "don't know", "none", "not sure", "nothing", "n/a", "no",
    # The stored suggestions were punctuation-stripped by clean_text(), so the
    # entries with punctuation above can only ever match in these forms.
    "dont know", "na",
]

def label_suggestion(text, invalid=None):
    """Assign a rule-based category to a lower-cased suggestion.

    Rules are checked in order and the first match wins.

    Args:
        text: Suggestion text (already lower-cased).
        invalid: Optional collection of non-answers; defaults to the
            module-level ``invalid_responses``.

    Returns:
        One of "Smaller Dome", "Custom Earpiece", "Better Material",
        "Improve Ventilation", "No Suggestion" or "Other".
    """
    # Match "vent" only at the start of a word so that "prevent" or "event"
    # are not mistaken for ventilation requests.
    mentions_vent = any(word.startswith("vent") for word in text.split())

    if "smaller dome" in text or "7mm" in text:
        return "Smaller Dome"
    elif "custom" in text:  # also covers "custom earpiece"
        return "Custom Earpiece"
    elif "material" in text or "comfortable earpiece" in text or "better earpiece" in text:
        return "Better Material"
    elif mentions_vent or "air circulation" in text:
        return "Improve Ventilation"
    elif text == "" or text in (invalid_responses if invalid is None else invalid):
        return "No Suggestion"
    else:
        return "Other"

# Label every suggestion, then keep the rows that fell through to "Other".
df['suggestion_label'] = df['suggestion'].map(label_suggestion)
other_df = df[df['suggestion_label'].eq("Other")]

# Filler and domain words that would otherwise dominate the keyword counts.
_domain_fillers = [
    "know", "just", "think", "really", "maybe", "sure", "need", "don’t", "don", "make", "get",
    "would", "could", "wear", "wearing", "aid", "aids", "hearing", "time", "use", "ear", "ears",
    "fit", "like", "feel", "piece", "help", "one", "two", "see", "thing", "good", "better",
]

# Function words, short phrases and other low-information terms.
_function_words = [
    "in", "on", "at", "for", "with", "by", "to", "from", "into", "onto", "about", "around", "near",
    "in the", "on the", "at the", "for me", "with my", "to my", "by the", "from the", "in my", "on my", "and", "are", "as",
    "be", "but", "can", "dont", "Donat", "have", "it", "is", "if", "more", "much", "so", "this", "that", "they", "them", "the",
    "not", "my", "of", "mine", "donât", "nothing", "no", "out", "or", "idea", "when", "me", "very", "am", "do", "all", "an", "im",
    "too", "you", "will", "was", "there", "were", " ", "anything", "comfort", "comfortable", "different", "discomfort", "fine", "had",
    "sometimes",
]

# Full stop-word list handed to the vectorizer: fillers first, then function words.
custom_stopwords = _domain_fillers + _function_words

# Count the 30 most frequent non-stop-word terms in the "Other" suggestions.
vectorizer = CountVectorizer(stop_words=custom_stopwords, max_features=30)
X = vectorizer.fit_transform(other_df['suggestion'].dropna())

keywords = vectorizer.get_feature_names_out()
word_counts = X.toarray().sum(axis=0)
# term -> total number of occurrences
word_freq = {term: count for term, count in zip(keywords, word_counts)}

# Top10
# Rank the terms once by descending frequency; reuse the ranking for the
# printout and the bar chart.
_ranked_terms = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)

print("\n Filtered Top10 Keywords：")
for word, freq in _ranked_terms[:10]:
    print(f"{word}: {freq}")

sorted_word_freq = dict(_ranked_terms)

# Bar chart of all counted keywords, most frequent first.
plt.figure(figsize=(12, 6))
plt.bar(sorted_word_freq.keys(), sorted_word_freq.values())
plt.xticks(rotation=45)
plt.title("Top Keywords in 'Other' Suggestions (Cleaned)")
plt.show()

# Word cloud built from the same frequencies.
wordcloud = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(word_freq)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("WordCloud of 'Other' Suggestions (Cleaned)")
plt.show()



In [None]:
# UMAP
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import umap.umap_ as umap

# Data
# Reload the cleaned data and take the merged texts as clustering input.
df = pd.read_csv("preprocessed_feedback.csv")
texts = df['merged_input'].tolist()

# Sentence embeddings (all-MiniLM-L6-v2)
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(texts, show_progress_bar=True)

# KMeans with the same cluster count and seed as the "Final: 8 clusters" cell
n_clusters = 8
print(f"KMeans Cluster: {n_clusters}...")
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df['cluster'] = kmeans.fit_predict(embeddings)

# UMAP -> 2D projection of the embeddings, used for plotting only
umap_reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)
umap_embeddings = umap_reducer.fit_transform(embeddings)


# One scatter series per cluster so each gets its own colour and legend entry.
plt.figure(figsize=(10, 8))
for cluster_id in range(n_clusters):
    in_cluster = (df['cluster'] == cluster_id).to_numpy()
    cluster_points = umap_embeddings[in_cluster]
    plt.scatter(
        cluster_points[:, 0],
        cluster_points[:, 1],
        label=f'Cluster {cluster_id}',
        alpha=0.6
    )

plt.legend()
plt.title("KMeans + UMAP clustering visualization")
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")
plt.tight_layout()
plt.show()


## TF-IDF keywords and generated cluster labels

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
from collections import defaultdict

# Read the clustered texts written by the KMeans cell. Use the same relative
# path as the writer and the other readers of this file instead of a
# machine-specific absolute path.
df = pd.read_csv("kmeans_clustered_output.csv")

# 'google/flan-t5-base' text-to-text model used to propose a short label per cluster
summarizer = pipeline("text2text-generation", model="google/flan-t5-base", max_length=15)

# cluster id -> list of its merged texts
clustered_texts = df.groupby('cluster')['merged_input'].apply(list)

# cluster id -> {"tfidf_keywords": str, "generated_label": str}
cluster_labels = {}

for cluster_id, texts in clustered_texts.items():
    print(f"\nProcessing Cluster {cluster_id}")

    # Keyword extraction. NOTE(review): max_features=5 limits the vocabulary to
    # five terms, listed in the vectorizer's feature order; the TF-IDF weights
    # in X are not used for ranking — confirm this is the intended selection.
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5)
    X = vectorizer.fit_transform(texts)
    top_words = vectorizer.get_feature_names_out()
    keyword_summary = ", ".join(top_words)

    print(f"Top keywords (TF-IDF): {keyword_summary}")

    # Ask the text-to-text model for a label based on the first 20 texts.
    top_samples = texts[:20]
    prompt = "Generate a short label for the following user complaints:\n" + "\n".join(top_samples)

    try:
        label = summarizer(prompt)[0]['generated_text']
    except Exception as e:
        # Best effort: continue with a placeholder, but report why generation
        # failed instead of hiding the error.
        print(f"Label generation failed for cluster {cluster_id}: {e!r}")
        label = "Label Generation Failed"

    cluster_labels[cluster_id] = {
        "tfidf_keywords": keyword_summary,
        "generated_label": label
    }

print("\n Final Cluster Labels ")
for cid, content in cluster_labels.items():
    print(f"Cluster {cid}:")
    print(f"  TF-IDF keywords: {content['tfidf_keywords']}")
    print(f"  Generated label: {content['generated_label']}")


## Supervised model

In [None]:
import matplotlib.pyplot as plt

# Human-assigned cluster names; position i belongs to cluster_counts[i].
cluster_names = [
    "Interfere with Glasses",
    "No Discomfort",
    "Stability Issue and Wearing Scenario Discomfort",
    "General Itching (Unspecified Itching Cause)",
    "Severe Itching and Foreign Body Sensation",
    "Dome-Wire Irritation",
    "Improper Fit (Ear Canal Pressure)",
    "Dome Size Issue"
]

# Hard-coded number of responses per cluster, in the same order as the names.
cluster_counts = [1281, 1270, 2561, 2221, 1553, 1831, 2833, 655]

# Order the (count, name) pairs from the largest cluster to the smallest and
# split them back into two parallel tuples.
sorted_pairs = sorted(zip(cluster_counts, cluster_names), reverse=True)
sorted_counts = tuple(count for count, _ in sorted_pairs)
sorted_names = tuple(name for _, name in sorted_pairs)

# Bar chart of responses per labelled cluster, largest first.
plt.figure(figsize=(10, 7))
bars = plt.bar(sorted_names, sorted_counts, color="royalblue")

# Write each bar's value slightly above its top, centred horizontally.
for bar in bars:
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2
    plt.text(label_x, height + 30, f"{height}", ha='center', fontsize=10)

plt.xticks(rotation=45, ha='right')
plt.ylabel("Number of Responses")
plt.title("Cluster Distribution After Human-Aided Labeling ")
plt.tight_layout()
plt.show()

In [None]:
# Attach the human-assigned label to every clustered text and export the pairs.
df = pd.read_csv("kmeans_clustered_output.csv")

# KMeans cluster id (0-7) -> label name.
cluster_to_label = dict(enumerate([
    "Interfere with Glasses",
    "No Discomfort",
    "Stability Issue and Wearing Scenario Discomfort",
    "General Itching (Unspecified Itching Cause)",
    "Severe Itching and Foreign Body Sensation",
    "Dome-Wire Irritation",
    "Improper Fit (Ear Canal Pressure)",
    "Dome Size Issue",
]))

df['normalized_label'] = df['cluster'].map(cluster_to_label)

df_ready = df[['merged_input', 'normalized_label']]
df_ready.to_csv("training_data_labeled.csv", index=False)

print("training_data_labeled.csv saved")
print(df_ready.head())


# Discomfort Reason Classifier

In [None]:
# BERT Fine-tuning (Final Version)

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
from datasets import Dataset

# Load the clustered texts and attach the human-assigned label of each cluster id.
df = pd.read_csv("kmeans_clustered_output.csv")

# KMeans cluster id -> label name (same mapping as in the export cell).
cluster_to_label = {
    0: "Interfere with Glasses",
    1: "No Discomfort",
    2: "Stability Issue and Wearing Scenario Discomfort",
    3: "General Itching (Unspecified Itching Cause)",
    4: "Severe Itching and Foreign Body Sensation",
    5: "Dome-Wire Irritation",
    6: "Improper Fit (Ear Canal Pressure)",
    7: "Dome Size Issue"
}
df['label'] = df['cluster'].map(cluster_to_label)

# Encode label names as integer ids, then make a stratified 80/20 split with a fixed seed.
le = LabelEncoder()
df['label_id'] = le.fit_transform(df['label'])
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label_id'])
print(f"train_dataset: {len(train_df)}，test_dataset: {len(test_df)}")


TARGET_COUNT = 2000

# Resample every class of the training split to exactly TARGET_COUNT rows:
# classes smaller than the target are drawn with replacement, the others without.
balanced_train_dfs = [
    group.sample(TARGET_COUNT, replace=len(group) < TARGET_COUNT, random_state=42)
    for _, group in train_df.groupby('label')
]

balanced_train_df = pd.concat(balanced_train_dfs).reset_index(drop=True)
print(f"Balanced_train_sample: {len(balanced_train_df)}")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    """Tokenize a batch of merged texts and attach their label ids.

    Args:
        examples: Batch with "merged_input" and "label_id" entries.

    Returns:
        The tokenizer output (padded/truncated to 64 tokens) with an extra
        "labels" entry copied from "label_id".
    """
    encoded = tokenizer(
        examples["merged_input"],
        padding="max_length",
        truncation=True,
        max_length=64,
    )
    encoded["labels"] = examples["label_id"]
    return encoded

# Wrap the text/label columns of both splits as datasets, then tokenize them in batches.
_model_columns = ['merged_input', 'label_id']
train_dataset = Dataset.from_pandas(balanced_train_df[_model_columns])
test_dataset = Dataset.from_pandas(test_df[_model_columns])

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Load Bert Model with one output per encoded class
_num_classes = len(le.classes_)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=_num_classes)

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions`` (class scores).

    Returns:
        Dict with 'accuracy', 'precision', 'recall' and 'f1'.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)  # highest-scoring class per row
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_hat, average='weighted')
    return dict(
        accuracy=accuracy_score(y_true, y_hat),
        precision=precision,
        recall=recall,
        f1=f1,
    )

# Training configuration: evaluate and log once per epoch, keep no checkpoints.
training_args = TrainingArguments(
    output_dir="./bert_feedback_temp",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="no",
    logging_strategy="epoch",
    dataloader_num_workers=0,
    report_to="none",
    fp16=False,
)  # this closing parenthesis was missing, which made the whole cell a SyntaxError

# Trainer fine-tunes on the balanced training split and evaluates on the held-out split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model with the configured trainer.
trainer.train()

# Evaluation on the trainer's eval_dataset
metrics = trainer.evaluate()
print("test_dataset: ")
for key, value in metrics.items():
    print(f"{key}: {value:.4f}")

# Confusion matrix on the held-out split (tokenized_test)
preds = trainer.predict(tokenized_test)
y_true = preds.label_ids
y_pred = preds.predictions.argmax(-1)  # predicted class id per row

cm = confusion_matrix(y_true, y_pred)
# Axis tick labels come from the label encoder's class names.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
disp.plot(cmap='Blues', xticks_rotation=45)
plt.title("Confusion Matrix on Validation Set")
plt.show()


In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Predict on the held-out split of the discomfort classifier. The original
# referenced `tokenized_datasets['test']`, which is only created later in the
# sentiment cell, so this cell failed (or used the wrong data) on a fresh run.
predictions = trainer.predict(tokenized_test)

logits = predictions.predictions
labels = predictions.label_ids
preds = np.argmax(logits, axis=-1)  # predicted class id per row


print("\n Classification Report ")
print(classification_report(labels, preds, target_names=le.classes_))


In [None]:
# Save model and Tokenizer
# Directory that receives both the fine-tuned model and its tokenizer;
# the inference cell loads them back from the same path.
SAVE_DIR = "./bert_feedback_final_model"

trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print(f"Model saved to {SAVE_DIR}")


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import numpy as np

# Load the model
# Load the fine-tuned discomfort classifier and its tokenizer from disk.
model_path = "./bert_feedback_final_model"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Class names indexed by predicted class id.
# NOTE(review): listed alphabetically, presumably to match the LabelEncoder
# used during training — verify against le.classes_.
class_names = [
    'Dome Size Issue',
    'Dome-Wire Irritation',
    'General Itching (Unspecified Itching Cause)',
    'Improper Fit (Ear Canal Pressure)',
    'Interfere with Glasses',
    'No Discomfort',
    'Severe Itching and Foreign Body Sensation',
    'Stability Issue and Wearing Scenario Discomfort',
]

def predict_text(text):
    """Print the most probable discomfort classes for one text.

    Uses the module-level ``model``, ``tokenizer`` and ``class_names``.

    Args:
        text: Free-text description to classify.

    Returns:
        None; the (up to) three best classes and their probabilities are printed.
    """
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
    # Inference only: skip autograd bookkeeping, as the suggestion predictors
    # in this notebook do.
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1).detach().cpu().numpy()[0]

    # Never index past the available classes if the model has fewer than three.
    top_k = min(3, len(probs), len(class_names))
    top_idx = np.argsort(probs)[::-1][:top_k]

    print(f"\n Input Text: {text}")
    for rank, idx in enumerate(top_idx, start=1):
        print(f"Predict Top {rank}: {class_names[idx]} ( {probs[idx]*100:.2f}%)")
    print("-" * 50)

# Random sample sentences for test
# Smoke test: classify a handful of hand-written sentences.
if __name__ == "__main__":
    examples = [
        "After wearing for a few hours, my ear starts to itch badly.",
        "The dome feels too big and causes discomfort.",
        "I have no discomfort at all, wearing experience is great.",
        "When wearing glasses, the hearing aid gets pushed and feels weird.",
        "The dome easily falls out when I am walking fast.",
        "There is constant irritation where the wire touches my skin.",
        "Wearing the hearing aid while exercising feels unstable.",
        "Slight itching happens after prolonged wear, but acceptable.",
    ]

    for text in iter(examples):
        predict_text(text)


# Suggestion Classifier

In [None]:
## updated version
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
from sklearn.model_selection import train_test_split

import re

df = pd.read_csv("preprocessed_feedback.csv")
df['suggestion'] = df['suggestion'].fillna("").str.lower()
invalid_responses = ["don't know", "none", "not sure", "nothing", "n/a", "no"]

# The stored suggestions were punctuation-stripped during preprocessing, so
# compare against the stripped form of each phrase ("don't know" -> "dont know",
# "n/a" -> "na"); phrases that strip to nothing are skipped.
_invalid_phrases = [re.sub(r"[^\w\s]", "", phrase).strip() for phrase in invalid_responses]
_invalid_phrases = [phrase for phrase in _invalid_phrases if phrase]

# Match whole words/phrases only. A plain substring test flags every text that
# merely contains "no" (e.g. "know", "not", "another") as invalid.
_invalid_pattern = re.compile(r"\b(?:" + "|".join(map(re.escape, _invalid_phrases)) + r")\b")

# 1 = usable suggestion, 0 = contains a known non-answer.
df['valid'] = df['suggestion'].apply(lambda x: 0 if _invalid_pattern.search(x) else 1)


def label_suggestion(text, invalid=None):
    """Assign a rule-based category to a lower-cased suggestion.

    Rules are checked in order and the first match wins.

    Args:
        text: Suggestion text (already lower-cased).
        invalid: Optional collection of non-answers; defaults to the
            module-level ``invalid_responses``.

    Returns:
        One of "Smaller Dome", "Custom Earpiece", "Better Material",
        "Improve Ventilation", "No Suggestion" or "Other".
    """
    # Match "vent" only at the start of a word so that "prevent" or "event"
    # are not mistaken for ventilation requests.
    mentions_vent = any(word.startswith("vent") for word in text.split())

    if "smaller dome" in text or "7mm" in text:
        return "Smaller Dome"
    elif "custom" in text:  # also covers "custom earpiece"
        return "Custom Earpiece"
    elif "material" in text or "comfortable earpiece" in text or "better earpiece" in text:
        return "Better Material"
    elif mentions_vent or "air circulation" in text:
        return "Improve Ventilation"
    elif text == "" or text in (invalid_responses if invalid is None else invalid):
        return "No Suggestion"
    else:
        return "Other"

# Label every suggestion, then keep valid rows whose suggestion has at least three words.
df['suggestion_label'] = df['suggestion'].map(label_suggestion)
df_valid = df.loc[df['valid'] == 1].copy()
_has_three_words = df_valid['suggestion'].str.split().apply(len) >= 3
df_valid = df_valid[_has_three_words]

TARGET_COUNT = 200   # rows per non-"Other" class in the training split
OTHER_CAP = 1000     # upper bound on "Other" rows in the training split

# Encode labels on the full valid set so the ids are shared by both splits.
le = LabelEncoder()
df_labeled = df_valid.assign(label_id=le.fit_transform(df_valid['suggestion_label']))

# Split BEFORE resampling. Oversampling with replacement first (as the
# original did) puts copies of the same row into both train and test, which
# leaks training data into the evaluation and inflates the metrics.
# NOTE: stratification needs at least two rows per class.
train_raw_df, test_df = train_test_split(
    df_labeled, test_size=0.1, stratify=df_labeled['label_id'], random_state=42
)

# Balance the training split only: cap "Other", resample every other class to
# TARGET_COUNT (with replacement when the class is smaller than the target).
df_balanced = []

for label, group in train_raw_df.groupby('suggestion_label'):
    if label == "Other":
        sampled = group.sample(n=OTHER_CAP, random_state=42) if len(group) > OTHER_CAP else group
    else:
        sampled = group.sample(n=TARGET_COUNT, replace=len(group) < TARGET_COUNT, random_state=42)
    df_balanced.append(sampled)

# df_final keeps its role as the balanced frame; it is now the training split.
df_final = pd.concat(df_balanced).reset_index(drop=True)
train_df = df_final

print("Balanced Distribution：")
print(df_final['suggestion_label'].value_counts())

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    """Tokenize a batch of suggestions and attach their label ids.

    Args:
        examples: Batch with "suggestion" and "label_id" entries.

    Returns:
        The tokenizer output (padded/truncated to 64 tokens) with an extra
        "labels" entry copied from "label_id".
    """
    encoded = tokenizer(
        examples["suggestion"],
        padding="max_length",
        truncation=True,
        max_length=64,
    )
    encoded["labels"] = examples["label_id"]
    return encoded

# Wrap the text/label columns of both splits as datasets, then tokenize them in batches.
_model_columns = ['suggestion', 'label_id']
train_dataset = Dataset.from_pandas(train_df[_model_columns])
test_dataset = Dataset.from_pandas(test_df[_model_columns])

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)


# BERT classifier with one output per encoded suggestion class.
_num_classes = len(le.classes_)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=_num_classes)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with 'accuracy', 'precision', 'recall' and 'f1'.
    """
    logits, y_true = eval_pred
    y_hat = np.argmax(logits, axis=-1)  # highest-scoring class per row
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_hat, average='weighted')
    return dict(
        accuracy=accuracy_score(y_true, y_hat),
        precision=precision,
        recall=recall,
        f1=f1,
    )

# Training configuration: evaluate, log and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir="./bert_suggestion_model",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    report_to="none"
)

# Trainer fine-tunes on tokenized_train and evaluates on tokenized_test.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train, then store the model and tokenizer in the directory also used as output_dir.
trainer.train()
trainer.save_model("./bert_suggestion_model")
tokenizer.save_pretrained("./bert_suggestion_model")

print("Model saved to ./bert_suggestion_model")


In [None]:
# Evaluate the current `trainer` on its eval_dataset and show the metric dict.
metrics = trainer.evaluate()
print(metrics)


In [None]:
# Test

from transformers import BertTokenizer, BertForSequenceClassification
import torch
import numpy as np
import joblib

# Load model & tokenizer
# Load model & tokenizer of the rule-labelled suggestion classifier.
model_path = "./bert_suggestion_model"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Class names indexed by predicted class id.
# NOTE(review): listed alphabetically, presumably to match the LabelEncoder
# used during training — verify against le.classes_.
class_names = [
    "Better Material",
    "Custom Earpiece",
    "Improve Ventilation",
    "Other",
    "Smaller Dome",
]

def predict_suggestion(text):
    """Print the most probable suggestion classes for one text.

    Uses the module-level ``model``, ``tokenizer`` and ``class_names``.

    Args:
        text: Suggestion text to classify.

    Returns:
        None; the (up to) three best classes and their probabilities are printed.
    """
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
    with torch.no_grad():
        outputs = model(**inputs)
        # squeeze(0) drops only the batch axis, so probs stays one-dimensional.
        probs = torch.softmax(outputs.logits, dim=-1).squeeze(0).cpu().numpy()

    # Never index past the available classes if the model has fewer than three.
    top_k = min(3, len(probs), len(class_names))
    top3 = np.argsort(probs)[::-1][:top_k]
    print(f"\nInput Suggestion：{text}")
    for rank, idx in enumerate(top3, start=1):
        # Fixed the "Presicted" typo in the printed label.
        print(f"Predicted Top {rank}: {class_names[idx]} ( {probs[idx]*100:.2f}%)")

# Smoke test: classify a few hand-written suggestions.
if __name__ == "__main__":
    examples = [
        "Use a softer dome or better material.",
        "I wish the dome was smaller.",
        "Consider adding more vents.",
        "Custom fit earpiece would be more comfortable.",
        "It's okay, no issues really.",
    ]
    for text in iter(examples):
        predict_suggestion(text)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Display names indexed by class id (used for the report and the matrix axes).
class_names = [
    "Better Material",
    "Custom Earpiece",
    "Improve Ventilation",
    "Other",
    "Smaller Dome",
]

# Predictions of the current trainer on the held-out split.
preds = trainer.predict(tokenized_test)
y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=1)

# classification report
report = classification_report(y_true, y_pred, target_names=class_names, digits=4)
print("\n classification report：")
print(report)

# confusion matrix
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap="Blues", xticks_rotation=45)
plt.title("Suggestion Classifier - Confusion Matrix")
plt.tight_layout()
plt.show()


# Sentiment Classifier

In [None]:
# Auto-label TF-IDF+K-MEANS
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the cleaned data; missing suggestions become "" and text is lower-cased.
df = pd.read_csv("preprocessed_feedback.csv")
df['suggestion'] = df['suggestion'].fillna("").str.lower()

# NOTE(review): this is a substring test, so any suggestion containing e.g. "no"
# (as in "know" or "not") is flagged invalid. Left unchanged here because the
# hand-assigned cluster labels depend on this exact filtering.
invalid_responses = ["don't know", "none", "not sure", "nothing", "n/a", "no"]
df['valid'] = df['suggestion'].apply(lambda x: 0 if any(stop in x for stop in invalid_responses) else 1)
df_valid = df[df['valid'] == 1]
# Keep suggestions with at least three whitespace-separated words.
df_valid = df_valid[df_valid['suggestion'].str.split().apply(len) >= 3].reset_index(drop=True)

# TF-IDF features (at most 1000 terms, English stop words removed)
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df_valid['suggestion'])

# KMeans on the suggestion texts
# NOTE(review): 8 clusters here, while the label mapping and the training cell use 5 — confirm.
n_clusters = 8  
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df_valid['cluster'] = kmeans.fit_predict(X)

# 2-D t-SNE projection of the TF-IDF matrix, stored as plot coordinates.
tsne = TSNE(n_components=2, random_state=42, perplexity=30, n_iter=1000)
X_2d = tsne.fit_transform(X.toarray())
df_valid['x'] = X_2d[:, 0]
df_valid['y'] = X_2d[:, 1]

# Scatter plot of the 2-D projection, coloured by cluster id.
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df_valid, x="x", y="y", hue="cluster", palette="tab10")
plt.title("Suggestion Clustering (TF-IDF + KMeans + TSNE)")
plt.tight_layout()
plt.show()

# Print the first five suggestions of every cluster, in cluster-id order.
for c in sorted(df_valid['cluster'].unique()):
    print(f"\n Cluster {c} Suggestion Example：")
    _cluster_examples = df_valid.loc[df_valid['cluster'] == c, 'suggestion'].head(5)
    print(_cluster_examples.to_string(index=False))


In [None]:
# Hand-assigned label for each suggestion cluster id (0-4).
cluster_to_label = dict(enumerate([
    "Smaller Dome",
    "Better Dome Material",
    "Custom Fit / Wire Design",
    "No Suggestion / Neutral",
    "General / Other",
]))


In [None]:
# Train BERT Classifier based on clustered labels
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np

# Reload the cleaned data; missing suggestions become "" and text is lower-cased.
df = pd.read_csv("preprocessed_feedback.csv")
df['suggestion'] = df['suggestion'].fillna("").str.lower()
# NOTE(review): substring test — any suggestion containing e.g. "no" (as in
# "know" or "not") is flagged invalid. Left unchanged because the hand-assigned
# cluster labels below depend on this exact filtering.
invalid_responses = ["don't know", "none", "not sure", "nothing", "n/a", "no"]
df['valid'] = df['suggestion'].apply(lambda x: 0 if any(stop in x for stop in invalid_responses) else 1)
df_valid = df[df['valid'] == 1].copy()
# Keep suggestions with at least three whitespace-separated words.
df_valid = df_valid[df_valid['suggestion'].str.split().apply(len) >= 3].reset_index(drop=True)

# Cluster_to_label
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# TF-IDF features (at most 1000 terms, English stop words removed).
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df_valid['suggestion'])

# Five clusters with a fixed seed; the cluster ids feed the mapping below.
kmeans = KMeans(n_clusters=5, random_state=42)
df_valid['cluster'] = kmeans.fit_predict(X)

# Hand-assigned label per cluster id.
# NOTE(review): only meaningful while KMeans reproduces the same cluster ids —
# re-check the mapping whenever the data or the filtering changes.
cluster_to_label = {
    0: "Smaller Dome",
    1: "Better Dome Material",
    2: "Custom Fit / Wire Design",
    3: "No Suggestion / Neutral",
    4: "General / Other"
}
df_valid['suggestion_label'] = df_valid['cluster'].map(cluster_to_label)

# Label Encoder: label name -> integer id
le = LabelEncoder()
df_valid['label_id'] = le.fit_transform(df_valid['suggestion_label'])

# Stratified 90/10 split with a fixed seed.
train_df, test_df = train_test_split(
    df_valid, test_size=0.1, stratify=df_valid['label_id'], random_state=42
)


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    """Tokenize a batch of suggestions and attach their label ids.

    Args:
        examples: Batch with "suggestion" and "label_id" entries.

    Returns:
        The tokenizer output (padded/truncated to 64 tokens) with an extra
        "labels" entry copied from "label_id".
    """
    encoded = tokenizer(
        examples["suggestion"],
        padding="max_length",
        truncation=True,
        max_length=64,
    )
    encoded["labels"] = examples["label_id"]
    return encoded

# Wrap the text/label columns of both splits as datasets, then tokenize them in batches.
_model_columns = ['suggestion', 'label_id']
train_dataset = Dataset.from_pandas(train_df[_model_columns])
test_dataset = Dataset.from_pandas(test_df[_model_columns])
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)


# BERT classifier with one output per encoded suggestion class.
_num_classes = len(le.classes_)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=_num_classes)


def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with 'accuracy', 'precision', 'recall' and 'f1'.
    """
    logits, y_true = eval_pred
    y_hat = np.argmax(logits, axis=-1)  # highest-scoring class per row
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_hat, average='weighted')
    return dict(
        accuracy=accuracy_score(y_true, y_hat),
        precision=precision,
        recall=recall,
        f1=f1,
    )

# Training configuration: evaluate, log and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir="./bert_suggestion_final_model",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    report_to="none"
)


# Trainer fine-tunes on tokenized_train and evaluates on tokenized_test.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train, then store the model and tokenizer in the directory also used as output_dir.
trainer.train()
trainer.save_model("./bert_suggestion_final_model")
tokenizer.save_pretrained("./bert_suggestion_final_model")

print("Final suggestion classifier Model saved to ./bert_suggestion_final_model")


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the cluster-labelled suggestion classifier and switch it to eval mode.
model_path = "./bert_suggestion_final_model"
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()
tokenizer = BertTokenizer.from_pretrained(model_path)

# Label names indexed by predicted class id.
# NOTE(review): listed alphabetically, presumably to match the LabelEncoder
# used during training — verify against le.classes_.
label_names = [
    "Better Dome Material",
    "Custom Fit / Wire Design",
    "General / Other",
    "No Suggestion / Neutral",
    "Smaller Dome",
]

def predict_suggestion(text):
    """Print the most probable suggestion labels for one text.

    Uses the module-level ``model``, ``tokenizer`` and ``label_names``.

    Args:
        text: Suggestion text to classify.

    Returns:
        None; the (up to) three best labels and their probabilities are printed.
    """
    # This cell imports only transformers and torch, so import numpy here
    # instead of relying on `np` left in the kernel by an earlier cell.
    import numpy as np

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
    with torch.no_grad():
        outputs = model(**inputs)
        # squeeze(0) drops only the batch axis, so probs stays one-dimensional.
        probs = torch.softmax(outputs.logits, dim=-1).squeeze(0).cpu().numpy()
    # Never index past the available labels if the model has fewer than three.
    top_k = min(3, len(probs), len(label_names))
    top3 = np.argsort(probs)[::-1][:top_k]

    print(f"\nInput suggestion：{text}")
    for i, idx in enumerate(top3):
        print(f"Top {i+1}: {label_names[idx]} （ {probs[idx]*100:.2f}％）")


In [None]:
# Test
# Try the classifier on three hand-written suggestions, in this order.
for _sample in (
    "make the dome smaller",
    "I would like a custom fit",
    "It's already perfect",
):
    predict_suggestion(_sample)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Predictions of the current trainer on the held-out split.
preds = trainer.predict(tokenized_test)
y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=1)

# Display names come from the fitted label encoder.
label_names = list(le.classes_)

print("\n Classification Report：")
print(classification_report(y_true, y_pred, target_names=label_names, digits=4))

cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)

# disp.plot() opens its own figure unless it is given an Axes, so a bare
# plt.figure(figsize=...) stayed empty and the size was never applied.
# Create the sized Axes first and draw into it.
fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(ax=ax, cmap="Blues", xticks_rotation=45)
ax.set_title("Suggestion Classifier - Confusion Matrix")
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset


# Reload the cleaned data; the 'merged_input' column is the text used for sentiment labelling.
df = pd.read_csv("preprocessed_feedback.csv")

#  rule-based for sentiment
def label_sentiment(text):
    """Assign a rule-based sentiment label to a merged feedback text.

    Args:
        text: Text such as "Discomfort: <d>. Cause: <c>" (any casing) or
            plain free text.

    Returns:
        "Satisfied", "Neutral" or "Dissatisfied".
    """
    import re

    text = text.lower()
    # Judge satisfaction from the discomfort part only; otherwise a trailing
    # "Cause: none" marks a real complaint as Satisfied.
    discomfort_part = text.split(". cause:", 1)[0]
    # Whole-word match so "dissatisfied" does not count as "satisfied".
    if re.search(r"\b(?:no discomfort|none|satisfied|very little discomfort)\b", discomfort_part):
        return "Satisfied"
    elif any(word in text for word in ["minor", "slightly uncomfortable", "not bad", "awkward"]):
        return "Neutral"
    elif any(word in text for word in ["hurts", "itching", "pressure", "pain", "tightness", "irritation"]):
        return "Dissatisfied"
    else:
        return "Neutral"

# Rule-based sentiment label for every merged text.
df['sentiment_label'] = df['merged_input'].map(label_sentiment)

# label encode: sentiment name -> integer id
le_sentiment = LabelEncoder()
_sentiment_ids = le_sentiment.fit_transform(df['sentiment_label'])
df['sentiment_id'] = _sentiment_ids

# Tokenize
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_sentiment(examples):
    """Tokenize a batch of `merged_input` texts and attach `sentiment_id` as `labels`.

    Every text is truncated/padded to exactly 128 tokens with the module-level
    `tokenizer`.
    """
    encoded = tokenizer(
        examples["merged_input"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    encoded["labels"] = examples["sentiment_id"]
    return encoded

# Hold out 10% of the rows for evaluation. The seed pins the split so that a
# Restart & Run All reproduces the same train/test partition.
dataset = Dataset.from_pandas(df[['merged_input', 'sentiment_id']])
dataset = dataset.train_test_split(test_size=0.1, seed=42)
tokenized_datasets = dataset.map(tokenize_sentiment, batched=True)

# BERT sentiment model
# Store the id <-> label mapping in the model config so the saved checkpoint
# carries its own class names instead of relying on a hand-written list.
sentiment_id2label = {int(i): str(name) for i, name in enumerate(le_sentiment.classes_)}
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(le_sentiment.classes_),
    id2label=sentiment_id2label,
    label2id={name: i for i, name in sentiment_id2label.items()},
)

training_args = TrainingArguments(
    output_dir="./bert_sentiment_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    dataloader_num_workers=0
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

trainer.train()
# Save model and tokenizer side by side so the directory can be reloaded on its own.
trainer.save_model("./bert_sentiment_model")
tokenizer.save_pretrained("./bert_sentiment_model")

print("Done")


# Final Model: demographic + sentiment + discomfort + image

In [None]:
from transformers import BertForSequenceClassification

# Reload the sentiment classifier from the directory written by the training cell (3 output labels).
model_sentiment = BertForSequenceClassification.from_pretrained("./bert_sentiment_model", num_labels=3)



In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification

df = pd.read_csv("preprocessed_feedback.csv")
# Empty cells come back from the csv as NaN; the tokenizers need real strings.
df['merged_input'] = df['merged_input'].fillna("").astype(str)
df['suggestion'] = df['suggestion'].fillna("").astype(str)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _load_classifier(model_dir, num_labels):
    """Load a saved BERT classifier and its tokenizer from `model_dir`.

    The model is moved to `device` and switched to eval mode.

    Returns:
        (model, tokenizer)
    """
    model = BertForSequenceClassification.from_pretrained(model_dir, num_labels=num_labels).to(device)
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    model.eval()
    return model, tokenizer


def _predict_label(text, tokenizer, model, class_names, max_length):
    """Classify one text and return the name of the highest-probability class.

    Args:
        text: Input string.
        tokenizer: Tokenizer paired with `model`.
        model: Sequence classifier already placed on `device`.
        class_names: Class names indexed by the model's output id.
        max_length: Truncation length in tokens.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1).squeeze().cpu().numpy()
    return class_names[int(np.argmax(probs))]


# Sentiment/Satisfaction prediction model
sentiment_classes = ["Dissatisfied", "Neutral", "Satisfied"]
model_sentiment, tokenizer_sentiment = _load_classifier("./bert_sentiment_model", 3)

def predict_sentiment(text):
    """Return the predicted sentiment class name for `text`."""
    return _predict_label(text, tokenizer_sentiment, model_sentiment, sentiment_classes, 128)

df['sentiment_label'] = df['merged_input'].apply(predict_sentiment)

# Discomfort Reason Prediction Model
reason_classes = [
    'Dome Size Issue',
    'Dome-Wire Irritation',
    'General Itching (Unspecified Itching Cause)',
    'Improper Fit (Ear Canal Pressure)',
    'Interfere with Glasses',
    'No Discomfort',
    'Severe Itching and Foreign Body Sensation',
    'Stability Issue and Wearing Scenario Discomfort'
]
model_reason, tokenizer_reason = _load_classifier("./bert_feedback_final_model", len(reason_classes))

def predict_reason(text):
    """Return the predicted discomfort-reason class name for `text`."""
    return _predict_label(text, tokenizer_reason, model_reason, reason_classes, 128)

df['normalized_label'] = df['merged_input'].apply(predict_reason)

# Suggestion Classification Prediction Model
# NOTE(review): unlike the two lists above, this one is not in sorted order. If the
# suggestion model's label ids were produced by a LabelEncoder (which sorts class
# names), these names will be attached to the wrong ids — confirm against the
# cell that trained ./bert_suggestion_final_model.
suggestion_classes = ["Smaller Dome", "Better Dome Material", "Custom Fit / Wire Design", "No Suggestion / Neutral", "General / Other"]
model_suggestion, tokenizer_suggestion = _load_classifier("./bert_suggestion_final_model", len(suggestion_classes))

# NOTE(review): an earlier cell appears to define a printing function with this
# same name; after this cell runs, predict_suggestion returns a label instead.
def predict_suggestion(text):
    """Return the predicted suggestion class name for `text` (inputs truncated to 64 tokens)."""
    return _predict_label(text, tokenizer_suggestion, model_suggestion, suggestion_classes, 64)

df['suggestion_label'] = df['suggestion'].apply(predict_suggestion)


df.to_csv("dashboard_ready_data.csv", index=False)
print("Predicted saved to dashboard_ready_data.csv")


In [None]:
# Image Data Processing

import pandas as pd 
from sklearn.cluster import KMeans

# Read Data
df = pd.read_csv("preprocessed_feedback.csv")

# Coordinate Cleaning: anything that is not a number becomes NaN
for axis_col in ('X-coordinate', 'Y-coordinate'):
    df[axis_col] = pd.to_numeric(df[axis_col], errors='coerce')

# Keep only rows that have both coordinates and fall inside the accepted ranges
df_coords = df.dropna(subset=['X-coordinate', 'Y-coordinate'])
in_bounds = (
    df_coords['X-coordinate'].between(0, 100)
    & df_coords['Y-coordinate'].between(-100, 100)
)
df_coords = df_coords[in_bounds].copy()

# Clusters: six KMeans zones fitted on the valid points
kmeans = KMeans(n_clusters=6, random_state=42)
df_coords['zone_cluster'] = kmeans.fit_predict(df_coords[['X-coordinate', 'Y-coordinate']])

# Rows without a usable coordinate keep the placeholder zone -1
df['zone_cluster'] = -1
df.loc[df_coords.index, 'zone_cluster'] = df_coords['zone_cluster'].astype(int)

# One-hot Encoded
zone_dummies = pd.get_dummies(df['zone_cluster'], prefix="zone")
base_cols = ['merged_input', 'X-coordinate', 'Y-coordinate', 'zone_cluster']
df = pd.concat([df[base_cols], zone_dummies], axis=1)


df.to_csv("image_features_ready.csv", index=False)
print("Saved to image_features_ready.csv")

# The saved file only includes the coordinates, zone_cluster and the zone one-hot columns.

In [None]:
import pandas as pd

# main data
df_main = pd.read_csv("preprocessed_feedback.csv")

# Sentiment + reason + suggestion
df_text = pd.read_csv("dashboard_ready_data.csv")

# Image_feature
df_image = pd.read_csv("image_features_ready.csv")

# Both feature files were written row-for-row from preprocessed_feedback.csv, so
# they are combined by row position. Merging on the free-text `merged_input`
# column is unsafe: identical texts from different respondents match each other
# and multiply the rows on every merge.
for source_name, frame in (("dashboard_ready_data.csv", df_text),
                           ("image_features_ready.csv", df_image)):
    if len(frame) != len(df_main) or not frame["merged_input"].equals(df_main["merged_input"]):
        raise ValueError(
            f"{source_name} is not row-aligned with preprocessed_feedback.csv; "
            "re-run the cell that produces it."
        )

df_main = df_main.join(df_text[["sentiment_label", "normalized_label", "suggestion_label"]])

# Coordinate, zone cluster, zone cluster one-hot.
# "zone_cluster" itself starts with "zone_", so exclude it from the prefix scan
# to avoid selecting the same column twice.
image_feature_cols = ["X-coordinate", "Y-coordinate", "zone_cluster"] + [
    col for col in df_image.columns if col.startswith("zone_") and col != "zone_cluster"
]

# Columns present on both sides get the same _x (main) / _y (image) suffixes
# that DataFrame.merge applies by default.
df_main = df_main.join(df_image[image_feature_cols], lsuffix="_x", rsuffix="_y")

df_main = df_main.drop(columns=["Ear region marked", "has_region", 
                                "region_inner_ear", "region_behind_the_ear", "region_upper_front_part_of_ear"],
                       errors="ignore")

df_main.to_csv("final_model_dataset.csv", index=False)
print("final_model_dataset.csv saved (includes structured + text + simplified image features)")

In [None]:
# Check
# Summary statistics of the cleaned X coordinates; df_coords comes from the image-processing cell.
print(df_coords["X-coordinate"].describe())


In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Re-apply the coordinate bounds and re-fit the same seeded KMeans for plotting.
coord_ok = (
    df_coords["X-coordinate"].between(0, 100)
    & df_coords["Y-coordinate"].between(-100, 100)
)
df_clean = df_coords[coord_ok].copy()

kmeans = KMeans(n_clusters=6, random_state=42)
df_clean["zone_cluster"] = kmeans.fit_predict(df_clean[["X-coordinate", "Y-coordinate"]])

# One scatter series per cluster id so each zone gets its own colour and legend entry.
plt.figure(figsize=(6, 6))
for zone_id in range(6):
    members = df_clean.loc[df_clean["zone_cluster"] == zone_id]
    plt.scatter(
        members["X-coordinate"],
        members["Y-coordinate"],
        label=f"Cluster {zone_id}",
        alpha=0.6,
    )

plt.xlabel("X")
plt.ylabel("Y")
plt.title("Coordinate Clusters")
plt.legend(loc='upper right')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Full workflow - One-hot encoded + Standarized + MLP Training

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout


df = pd.read_csv("final_model_dataset.csv")
# Drop survey bookkeeping columns: any column whose name contains one of these keywords.
drop_cols = [col for col in df.columns if any(keyword in col for keyword in [
    "Respondent ID", "Collector ID", "Survey Collector", "comparison INDEX", "Comfort Deviation",
    "Plot", "Start Date", "End Date", "Language"
])]
df = df.drop(columns=drop_cols, errors="ignore")

selected_columns = [
    # structured data
    "Style", "Platform", "Earpiece Configuration", "What is your gender?",
    "What is your current age?", "How long have you been using hearing aids?",

    # label
    "sentiment_label", "normalized_label",

    # image region
    "zone_cluster",

    # discomfort item columns
    "Slip Out", "Annoying", "Change Position", "Too tight", "Itchiness",
    "Soreness", "Take off hearing aids", "Painful",

    # Target variable
    "Average comfort score", "Satisfaction"
]

df = df[selected_columns].copy()

# A row without a target cannot be used for training or scoring: a NaN comfort
# score turns the regression loss into NaN, and a NaN satisfaction score cannot
# be mapped to a class. Both models share the same feature matrix, so drop such
# rows once here, before the targets and features are derived.
df = df.dropna(subset=["Average comfort score", "Satisfaction"]).reset_index(drop=True)

target_comfort = df["Average comfort score"]

# Satisfaction Classification
def map_satisfaction(score):
    """Bucket a numeric satisfaction score into a three-way class label.

    Args:
        score: Numeric satisfaction rating.

    Returns:
        "Dissatisfied" for score <= 2, "Neutral" for score == 3,
        "Satisfied" for anything else (4 or 5).

    Raises:
        ValueError: If `score` is missing (NaN/None). Every comparison with NaN
            is False, so a missing score would otherwise fall through to the
            final branch and be labelled "Satisfied".
    """
    if pd.isna(score):
        raise ValueError(
            "Satisfaction score is missing; drop or impute such rows before mapping."
        )
    if score <= 2:
        return "Dissatisfied"
    elif score == 3:
        return "Neutral"
    else:  # 4 or 5
        return "Satisfied"


# Derive the three-way class from the raw score, then encode it as integer ids.
df["satisfaction_class"] = df["Satisfaction"].apply(map_satisfaction)
le = LabelEncoder()
y_satisfaction_class = le.fit_transform(df["satisfaction_class"])

# Remove both targets (and the derived class) so they cannot leak into the features.
df = df.drop(columns=["Average comfort score", "Satisfaction", "satisfaction_class"])


# Clean Earpiece Configuration 
def simplify_config(val):
    """Collapse a free-form earpiece configuration value into one of four groups.

    The value is stringified and lower-cased, then classified by whether it
    mentions "custom" and/or "dome":
    both -> "Custom/Dome", only custom -> "Custom", only dome -> "Domes",
    neither -> "Other".
    """
    label = str(val).lower()
    mentions_dome = "dome" in label
    mentions_custom = "custom" in label
    if mentions_custom:
        return "Custom/Dome" if mentions_dome else "Custom"
    return "Domes" if mentions_dome else "Other"

df["Earpiece Configuration"] = df["Earpiece Configuration"].apply(simplify_config)

# One hot encoded
categorical_cols = [
    "Style", "Platform", "Earpiece Configuration", "What is your gender?",
    "What is your current age?", "How long have you been using hearing aids?",
    "sentiment_label", "normalized_label"
]
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

X = df_encoded.fillna(0)

# Fit the scaler on the training rows only, so that test-set statistics do not
# leak into the features. train_test_split picks rows from the number of samples,
# test_size and random_state alone, so splitting the row positions with the same
# settings yields exactly the training rows of the two splits made from X_scaled
# (this one and the satisfaction split, as long as they keep these settings).
train_rows, test_rows = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)
scaler = StandardScaler()
scaler.fit(X.iloc[train_rows])
X_scaled = scaler.transform(X)

# Comfort Regression Model
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_scaled, target_comfort, test_size=0.2, random_state=42
)

def build_mlp(input_dim):
    """Build and compile the comfort-score regression MLP.

    Layers: Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu) -> Dropout(0.3)
    -> Dense(1). Compiled with Adam, mean-squared-error loss and an MAE metric.

    Args:
        input_dim: Number of input features.
    """
    regressor = Sequential()
    regressor.add(Dense(128, activation='relu', input_dim=input_dim))
    regressor.add(Dropout(0.3))
    regressor.add(Dense(64, activation='relu'))
    regressor.add(Dropout(0.3))
    regressor.add(Dense(1))
    regressor.compile(
        optimizer='adam',
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=['mae'],
    )
    return regressor

# Train the comfort regressor and score it on the held-out rows.
model_c = build_mlp(X_train_c.shape[1])
model_c.fit(X_train_c, y_train_c, epochs=10, batch_size=32, verbose=1)
pred_c = model_c.predict(X_test_c).flatten()  # predictions as a 1-D array

print("\nComfort Model:")
print("MSE:", mean_squared_error(y_test_c, pred_c))
print("R²:", r2_score(y_test_c, pred_c))

# Satisfaction Classification Model
# Same test_size and random_state as the comfort split above.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_scaled, y_satisfaction_class, test_size=0.2, random_state=42
)

def build_classification_mlp(input_dim, num_classes):
    """Build and compile the satisfaction classification MLP.

    Same hidden stack as the regression network (128 and 64 relu units, each
    followed by Dropout(0.3)) with a softmax output of `num_classes` units.
    Compiled with Adam, sparse categorical cross-entropy and an accuracy metric.

    Args:
        input_dim: Number of input features.
        num_classes: Number of output classes.
    """
    classifier = Sequential()
    classifier.add(Dense(128, activation='relu', input_dim=input_dim))
    classifier.add(Dropout(0.3))
    classifier.add(Dense(64, activation='relu'))
    classifier.add(Dropout(0.3))
    classifier.add(Dense(num_classes, activation='softmax'))
    classifier.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

# class_weight_dict was used below without ever being defined in this notebook
# flow, which raises NameError on a fresh kernel. Compute balanced weights from
# the training labels so under-represented classes count more in the loss.
from sklearn.utils.class_weight import compute_class_weight

train_classes = np.unique(y_train_s)
balanced_weights = compute_class_weight(class_weight="balanced", classes=train_classes, y=y_train_s)
# Keras expects a {class_id: weight} mapping with plain Python ints/floats.
class_weight_dict = dict(zip(train_classes.tolist(), balanced_weights.tolist()))

model_s = build_classification_mlp(X_train_s.shape[1], num_classes=3)
model_s.fit(
    X_train_s, y_train_s,
    epochs=10,
    batch_size=32,
    verbose=1,
    class_weight=class_weight_dict
)


pred_probs = model_s.predict(X_test_s)
pred_classes = np.argmax(pred_probs, axis=1)  # most probable class id per row

print("\nSatisfaction Classification Report:")
print(classification_report(y_test_s, pred_classes, target_names=le.classes_))

sns.heatmap(confusion_matrix(y_test_s, pred_classes), annot=True, fmt='d',
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.title("Satisfaction Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()


In [None]:
# Use Comfort MLP Model
# Check the most important 10 features 

from sklearn.inspection import permutation_importance
from scikeras.wrappers import KerasRegressor  

def build_model():
    """Build and compile a fresh comfort regressor for the scikit-learn wrapper.

    The input width is read from the global `X_train_c`. Architecture: 128 and
    64 relu units, each followed by Dropout(0.3), then a single linear output;
    compiled with Adam, 'mse' loss and an MAE metric.
    """
    n_features = X_train_c.shape[1]
    net = Sequential()
    net.add(tf.keras.Input(shape=(n_features,)))
    net.add(Dense(128, activation='relu'))
    net.add(Dropout(0.3))
    net.add(Dense(64, activation='relu'))
    net.add(Dropout(0.3))
    net.add(Dense(1))
    net.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return net

# Wrap the Keras builder in a scikit-learn style estimator and train a separate
# copy of the comfort model (5 epochs) for the importance analysis.
wrapped_model = KerasRegressor(model=build_model, epochs=5, batch_size=32, verbose=0)
wrapped_model.fit(X_train_c, y_train_c)
# Shuffle each feature 5 times on the test split and record the change in R².
result = permutation_importance(
    wrapped_model, X_test_c, y_test_c,
    n_repeats=5, scoring='r2', random_state=42
)

# Pair each importance with its column name (X_scaled keeps df_encoded's column order).
importance_df = pd.DataFrame({
    'feature': df_encoded.columns,
    'importance_mean': result.importances_mean,
    'importance_std': result.importances_std
}).sort_values(by="importance_mean", ascending=False)

print(importance_df.head(10))


In [None]:
# SHAP For Comfort Model
import shap

# Explain at most the first 1000 test rows; column names are restored so the plot shows feature names.
X_sample = pd.DataFrame(X_test_c[:1000], columns=df_encoded.columns)

# The same sample is passed to the explainer at construction and is then explained.
explainer = shap.Explainer(model_c, X_sample)
shap_values = explainer(X_sample)

shap.plots.bar(shap_values)


In [None]:
# SHAP For Comfort Model
import pandas as pd

# Recomputes the comfort-model SHAP values on at most the first 1000 test rows.
# NOTE(review): `shap` is not imported in this cell; it relies on an earlier cell's import.
X_sample = pd.DataFrame(X_test_c[:1000], columns=df_encoded.columns)
explainer = shap.Explainer(model_c, X_sample)
shap_values = explainer(X_sample)

# Per-row distribution of SHAP values for each feature.
shap.plots.beeswarm(shap_values)


In [None]:
# Just for check
# Distribution of the comfort score for each value of the "Slip Out" column.
# NOTE(review): `sns` and `plt` are not imported in this cell; they come from earlier cells.
df_plot = pd.read_csv("final_model_dataset.csv")

sns.boxplot(x=df_plot['Slip Out'], y=df_plot['Average comfort score'])
plt.title("Comfort Score vs Slip Out")
plt.xlabel("Slip Out (0 = No, 1 = Yes)")
plt.ylabel("Comfort Score")
plt.show()


In [None]:
# SHAP For Comfort Model - 2
import shap
import matplotlib.pyplot as plt

# SHAP values of the comfort model on the first 100 test rows
X_sample = pd.DataFrame(X_test_c[:100], columns=df_encoded.columns)
explainer = shap.Explainer(model_c, X_sample)
shap_values = explainer(X_sample)

# Rank features by mean absolute SHAP value and keep the strongest ones
top_n = 20
mean_abs_shap = np.mean(np.abs(shap_values.values), axis=0)
descending_order = np.argsort(mean_abs_shap)[::-1]
top_indices = descending_order[:top_n]

top_features = X_sample.columns[top_indices]
top_importances = mean_abs_shap[top_indices]

# Horizontal bars; the reversed order puts the most important feature at the top
bar_positions = range(top_n)
plt.figure(figsize=(8, 6))
plt.barh(bar_positions, top_importances[::-1], color="#80cfff")
plt.yticks(bar_positions, top_features[::-1])
plt.xlabel("Mean |SHAP value|")
plt.title("Top 20 SHAP Feature Importances (Comfort Model)")
plt.tight_layout()
plt.show()


In [None]:
# Satisfaction Model Shapley Additive Explanations - satisfied
import shap
import matplotlib.pyplot as plt
import numpy as np

# SHAP values of the satisfaction classifier on the first 100 test rows
X_sample_s = pd.DataFrame(X_test_s[:100], columns=df_encoded.columns)

explainer_s = shap.Explainer(model_s, X_sample_s)
shap_values_s = explainer_s(X_sample_s)

# Select the output for one class, then rank features by mean absolute SHAP value
class_idx = 2  # "Satisfied"
top_n = 20
class_shap = shap_values_s.values[:, :, class_idx]
mean_abs_shap = np.mean(np.abs(class_shap), axis=0)
descending_order = np.argsort(mean_abs_shap)[::-1]
top_indices = descending_order[:top_n]

top_features = X_sample_s.columns[top_indices]
top_importances = mean_abs_shap[top_indices]

# Horizontal bars; the reversed order puts the most important feature at the top
bar_positions = range(top_n)
plt.figure(figsize=(8, 6))
plt.barh(bar_positions, top_importances[::-1], color="#80cfff")
plt.yticks(bar_positions, top_features[::-1])
plt.xlabel("Mean |SHAP value|")
plt.title("Top 20 SHAP Feature Importances (Satisfaction Class: Satisfied)")
plt.tight_layout()
plt.show()


In [None]:
# Satisfaction Model Shapley Additive Explanations - dissatisfied
# Beeswarm for output index 0; relies on shap_values_s computed in an earlier cell.
shap.plots.beeswarm(shap_values_s[..., 0])

In [None]:
# Summary for the Satisfied class (output index 2): top 10 features by SHAP bar plot
shap.plots.bar(shap_values_s[..., 2], max_display=10)


In [None]:
# Summary for the Dissatisfied class (output index 0): top 10 features by SHAP bar plot
shap.plots.bar(shap_values_s[..., 0], max_display=10)


In [None]:
# Dissatisfied
import shap
import matplotlib.pyplot as plt
import numpy as np

# SHAP values of the satisfaction classifier on the first 100 test rows
X_sample_s = pd.DataFrame(X_test_s[:100], columns=df_encoded.columns)

explainer_s = shap.Explainer(model_s, X_sample_s)
shap_values_s = explainer_s(X_sample_s)

class_idx = 0  # Dissatisfied

# Select the output for that class, then rank features by mean absolute SHAP value
top_n = 20
class_shap = shap_values_s.values[:, :, class_idx]
mean_abs_shap = np.mean(np.abs(class_shap), axis=0)
descending_order = np.argsort(mean_abs_shap)[::-1]
top_indices = descending_order[:top_n]
top_features = X_sample_s.columns[top_indices]
top_importances = mean_abs_shap[top_indices]

# Horizontal bars; the reversed order puts the most important feature at the top
bar_positions = range(top_n)
plt.figure(figsize=(8, 6))
plt.barh(bar_positions, top_importances[::-1], color="#80cfff")
plt.yticks(bar_positions, top_features[::-1])
plt.xlabel("Mean |SHAP value|")
plt.title("Top 20 SHAP Feature Importances (Satisfaction Class: Dissatisfied)")
plt.tight_layout()
plt.show()


In [None]:
# Satisfied
import shap
import pandas as pd

# Recompute SHAP values for the satisfaction classifier on the first 100 test rows.
X_sample_s = pd.DataFrame(X_test_s[:100], columns=df_encoded.columns)

explainer_s = shap.Explainer(model_s, X_sample_s)
shap_values_s = explainer_s(X_sample_s)


# Beeswarm for output index 2 (the "Satisfied" class).
shap.plots.beeswarm(shap_values_s[..., 2]) 



In [None]:
# Just for test
# Mean comfort score for every (age group, usage duration) combination, shown as a heatmap.
# NOTE(review): df_raw is not defined in any cell visible in this part of the
# notebook — confirm an earlier cell creates it, otherwise this raises NameError
# on a fresh kernel.
pivot_table = df_raw.pivot_table(values='Average comfort score',
                             index='What is your current age?',
                             columns='How long have you been using hearing aids?',
                             aggfunc='mean')

sns.heatmap(pivot_table, annot=True, cmap="YlGnBu", fmt=".2f")
plt.title("Mean Comfort Score by Age & HA Usage Duration")
plt.xlabel("HA Usage Duration")
plt.ylabel("Age Group")
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Comfort: regression predictions against the true scores
plt.scatter(y_test_c, pred_c, alpha=0.5)
plt.xlabel("True Comfort Score")
plt.ylabel("Predicted Comfort Score")
plt.title("Comfort Score: True vs Predicted")
plt.grid(True)
# Draw the y = x reference line across the observed value range instead of a
# hard-coded [0, 1], so it is meaningful whatever scale the score uses.
comfort_lo = min(y_test_c.min(), pred_c.min())
comfort_hi = max(y_test_c.max(), pred_c.max())
plt.plot([comfort_lo, comfort_hi], [comfort_lo, comfort_hi], color='red', linestyle='--')
plt.show()

# Satisfaction: model_s is a classifier, so compare encoded class ids.
# The original code referenced `pred_s`, which is never defined; the predicted
# class ids live in `pred_classes`.
n_classes = len(le.classes_)
plt.scatter(y_test_s, pred_classes, alpha=0.5)
plt.xticks(range(n_classes), le.classes_)
plt.yticks(range(n_classes), le.classes_)
plt.xlabel("True Satisfaction Class")
plt.ylabel("Predicted Satisfaction Class")
plt.title("Satisfaction Class: True vs Predicted")
plt.grid(True)
plt.plot([0, n_classes - 1], [0, n_classes - 1], color='red', linestyle='--')
plt.show()


In [None]:
# Save MLP Model
# Persist both trained networks in the native .keras format.
model_c.save("mlp_comfort_model.keras")
model_s.save("mlp_satisfaction_model.keras")

# Save the fitted StandardScaler so new inputs can be scaled the same way.
import joblib
joblib.dump(scaler, "scaler.save")
print("Scaler saved as 'scaler.save'")

# Header-only csv recording the one-hot column names and their order.
columns_template = pd.DataFrame(columns=df_encoded.columns)
columns_template.to_csv("onehot_template.csv", index=False)
print("One-hot template saved as 'onehot_template.csv'")

