In [54]:
import pandas as pd
df = pd.read_csv('reply_classification_dataset.csv')

In [55]:
conflicts = df[df.duplicated(subset='reply', keep=False)].sort_values('reply')
print(conflicts.head(20))

                          reply    label
1276  Can u provide case study?  neutral
673   Can u provide case study?  neutral
1540  Can u provide case study?  neutral
1108  Can u provide case study?  neutral
709   Can u provide case study?  neutral
1612  Can u provide case study?  neutral
463   Can u provide case study?  neutral
745   Can u provide case study?  neutral
1144  Can u provide case study?  neutral
259   Can u provide case study?  neutral
784   Can u provide case study?  neutral
1042  Can u provide case study?  neutral
1672  Can u provide case study?  neutral
403   Can u provide case study?  neutral
1756  Can u provide case study?  neutral
964   Can u provide case study?  neutral
1792  Can u provide case study?  neutral
1642  Can u provide case study?  neutral
1492  Can u provide case study?  neutral
1576  Can u provide case study?  neutral


In [56]:
df = df.drop_duplicates(subset='reply', keep='first')

In [57]:
print(df.shape)   

(321, 2)


In [58]:
print(df['label'].value_counts())

label
neutral     116
positive     73
POSITIVE     40
negative     32
NEGATIVE     27
Negative     27
Neutral       3
NEUTRAL       2
Positive      1
Name: count, dtype: int64


In [59]:
df['label'] = df['label'].str.lower()
print(df['label'].value_counts())

label
neutral     121
positive    114
negative     86
Name: count, dtype: int64


In [60]:

label_map = {
    "positive": 1,
    "neutral": 0,
    "negative": -1
}


df['label_encoded'] = df['label'].map(label_map)

print(df[['label', 'label_encoded']].head())

      label  label_encoded
0   neutral              0
1  positive              1
2  negative             -1
3   neutral              0
4  positive              1


In [61]:
import re
import string

def clean_text_custom(text):

    text = text.lower()

    text = text.translate(str.maketrans('', '', string.punctuation.replace("?", "").replace("!", "")))

    text = text.replace("?", "").replace("!", "")

    text = re.sub(r'\d+', '', text)

    text = re.sub(r'\bu\b', 'you', text)
    text = re.sub(r'\bplz\b', 'please', text)
    text = re.sub(r'\bw/\b', 'with', text)
    text = re.sub(r'\bw\b', 'with', text)   

    text = re.sub(r"\bill\b", "i'll", text)

    text = text.replace('schdule', 'schedule')
    text = text.replace('intrsted', 'interested')
    text = text.replace('alredy', 'already')
    text = text.replace('oppurtunity', 'opportunity')
    text = text.replace('intrest', 'interest')
    text = text.replace('commited', 'committed')
    text = text.replace('lets', 'let us')

    text = " ".join(text.split())

    return text


df['reply'] = df['reply'].apply(clean_text_custom)

print(df['reply'].head(10))


0                               can we discuss pricing
1    im excited to explore this further please send...
2                     we not looking for new solutions
3                  could you clarify features included
4             let us schedule a meeting to dive deeper
5                           please remove me from list
6                      this looks promising send specs
7                      i'll need to check with my team
8                   were already using similar product
9                              looking forward to demo
Name: reply, dtype: object


In [62]:
df.to_excel("preprocessed2.xlsx", index=False)

In [71]:
#logistic regression
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report


df = pd.read_excel('preprocessed2.xlsx')
X = df['reply']
y = df['label_encoded']

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000, ngram_range=(1,2), stop_words='english')),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


print("Training model...")
pipeline.fit(X_train, y_train)


y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))


joblib.dump(pipeline, 'sentiment1_model.pkl')
print("Model saved as 'sentiment1_model.pkl'")


loaded_model = joblib.load('sentiment1_model.pkl')
test_prediction = loaded_model.predict(["this looks great!"])[0]
test_proba = loaded_model.predict_proba(["this looks great!"])[0]
print(f"Test prediction: {test_prediction}, probabilities: {test_proba}")

Training model...
Model accuracy: 0.9692
              precision    recall  f1-score   support

    negative       1.00      0.94      0.97        17
     neutral       1.00      0.96      0.98        25
    positive       0.92      1.00      0.96        23

    accuracy                           0.97        65
   macro avg       0.97      0.97      0.97        65
weighted avg       0.97      0.97      0.97        65

Model saved as 'sentiment1_model.pkl'
Test prediction: 1, probabilities: [0.13758398 0.12183695 0.74057907]


In [73]:
#naive bayes 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
import joblib

df = pd.read_excel('preprocessed2.xlsx')
print(f"Dataset shape: {df.shape}")
print(f"Class distribution:\n{df['label'].value_counts()}")

X = df['reply'] 
y = df['label_encoded']  

print(f"Sample texts: {X.head().tolist()}")
print(f"Sample labels: {y.head().tolist()}")

Dataset shape: (321, 3)
Class distribution:
label
neutral     121
positive    114
negative     86
Name: count, dtype: int64
Sample texts: ['can we discuss pricing', 'im excited to explore this further please send contract', 'we not looking for new solutions', 'could you clarify features included', 'let us schedule a meeting to dive deeper']
Sample labels: [0, 1, -1, 0, 1]


In [74]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=1000,
    min_df=1,
    max_df=0.95,
    ngram_range=(1, 2),
    lowercase=True,
    stop_words='english'
)

In [75]:
models = {
    'Multinomial NB + TF-IDF': Pipeline([
        ('vectorizer', tfidf_vectorizer),
        ('classifier', MultinomialNB(alpha=1.0))
    ])
}


results = {}
for name, pipeline in models.items():
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
    results[name] = {
        'mean_accuracy': cv_scores.mean(),
        'std_accuracy': cv_scores.std(),
        'scores': cv_scores
    }
    print(f"{name}:")
    print(f"  Mean CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print()


best_model_name = max(results.keys(), key=lambda k: results[k]['mean_accuracy'])
print(f"Best model: {best_model_name}")

Multinomial NB + TF-IDF:
  Mean CV Accuracy: 0.9597 (+/- 0.0868)

Best model: Multinomial NB + TF-IDF


In [76]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,
    random_state=42,
    stratify=y
)


best_pipeline = models[best_model_name]
best_pipeline.fit(X_train, y_train)


y_pred = best_pipeline.predict(X_test)
y_pred_proba = best_pipeline.predict_proba(X_test)


accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
target_names = ['negative', 'neutral', 'positive']
print(classification_report(y_test, y_pred, target_names=target_names))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)


cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100
print("\nConfusion Matrix (Percentages):")
print(cm_percentage.round(1))

Test Accuracy: 0.9077

Classification Report:
              precision    recall  f1-score   support

    negative       1.00      0.71      0.83        17
     neutral       0.96      0.96      0.96        25
    positive       0.82      1.00      0.90        23

    accuracy                           0.91        65
   macro avg       0.93      0.89      0.90        65
weighted avg       0.92      0.91      0.90        65


Confusion Matrix:
[[12  1  4]
 [ 0 24  1]
 [ 0  0 23]]

Confusion Matrix (Percentages):
[[ 70.6   5.9  23.5]
 [  0.   96.    4. ]
 [  0.    0.  100. ]]


In [2]:

pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


The value specified in an AutoRun registry key could not be parsed.

[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    DistilBertTokenizer, 
    DistilBertForSequenceClassification,
    get_linear_schedule_with_warmup
)
import torch.nn.functional as F
from torch.optim import AdamW



df = pd.read_excel('preprocessed2.xlsx')

label_map = {
    "positive": 2,    
    "neutral": 1,     
    "negative": 0     
}

df['label_encoded'] = df['label'].map(label_map)

print("Dataset loaded:")
print(f"Shape: {df.shape}")
print(f"Label distribution:\n{df['label'].value_counts()}")

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['reply'].tolist(),
    df['label_encoded'].tolist(),  
    test_size=0.2,
    random_state=42,
    stratify=df['label_encoded']
)

print(f"\nTrain size: {len(train_texts)}")
print(f"Validation size: {len(val_texts)}")

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class CustomerResponseDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

train_dataset = CustomerResponseDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomerResponseDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3  
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 3
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

def train_epoch(model, data_loader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
    
    return total_loss / len(data_loader)

def evaluate_model(model, data_loader, device):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    
    return total_loss / len(data_loader), predictions, true_labels

print("\nStarting training...")
for epoch in range(3):
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    val_loss, val_preds, val_labels = evaluate_model(model, val_loader, device)
    
    print(f"Epoch {epoch+1}/3:")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Val Loss: {val_loss:.4f}")

print("\nFinal Evaluation:")
_, predictions, true_labels = evaluate_model(model, val_loader, device)

reverse_label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
pred_labels = [reverse_label_map[pred] for pred in predictions]
true_label_names = [reverse_label_map[label] for label in true_labels]

print("\nClassification Report:")
print(classification_report(true_label_names, pred_labels))

print("\nConfusion Matrix:")
print(confusion_matrix(true_label_names, pred_labels))

def predict_sentiment(text, model, tokenizer, device):
    model.eval()
    
    encoding = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probabilities = F.softmax(outputs.logits, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
    
    label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
    confidence = probabilities[0][predicted_class].item()
    
    return {
        'predicted_label': label_map[predicted_class],
        'confidence': confidence,
        'probabilities': {
            'negative': probabilities[0][0].item(),
            'neutral': probabilities[0][1].item(),
            'positive': probabilities[0][2].item()
        }
    }

print("\nTesting predictions on sample texts:")
test_texts = [
    "im excited explore plz send contract",
    "not looking new solution",
    "can we discuss pricing"
]

for text in test_texts:
    result = predict_sentiment(text, model, tokenizer, device)
    print(f"\nText: '{text}'")
    print(f"Prediction: {result['predicted_label']} (confidence: {result['confidence']:.3f})")

model.save_pretrained('./customer_sentiment_model')
tokenizer.save_pretrained('./customer_sentiment_model')
print("\nModel saved to './customer_sentiment_model'")

Dataset loaded:
Shape: (321, 3)
Label distribution:
label
neutral     121
positive    114
negative     86
Name: count, dtype: int64

Train size: 256
Validation size: 65


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training...
Epoch 1/3:
  Train Loss: 0.9937
  Val Loss: 0.8520
Epoch 2/3:
  Train Loss: 0.6831
  Val Loss: 0.5989
Epoch 3/3:
  Train Loss: 0.5050
  Val Loss: 0.5036

Final Evaluation:

Classification Report:
              precision    recall  f1-score   support

    negative       1.00      0.94      0.97        17
     neutral       0.95      0.84      0.89        25
    positive       0.85      1.00      0.92        23

    accuracy                           0.92        65
   macro avg       0.94      0.93      0.93        65
weighted avg       0.93      0.92      0.92        65


Confusion Matrix:
[[16  1  0]
 [ 0 21  4]
 [ 0  0 23]]

Testing predictions on sample texts:

Text: 'im excited explore plz send contract'
Prediction: positive (confidence: 0.552)

Text: 'not looking new solution'
Prediction: negative (confidence: 0.623)

Text: 'can discus pricing'
Prediction: neutral (confidence: 0.524)

Model saved to './customer_sentiment_model'


In [70]:
print("\nTesting predictions on sample texts:")
test_texts = [
    "im excited explore please send contract",
    "we not looking for new solution",
    "can we discuss pricing"
]

for text in test_texts:
    result = predict_sentiment(text, model, tokenizer, device)
    print(f"\nText: '{text}'")
    print(f"Prediction: {result['predicted_label']} (confidence: {result['confidence']:.3f})")


Testing predictions on sample texts:

Text: 'im excited explore please send contract'
Prediction: positive (confidence: 0.573)

Text: 'we not looking for new solution'
Prediction: negative (confidence: 0.517)

Text: 'can we discuss pricing'
Prediction: neutral (confidence: 0.417)


In [78]:
import pandas as pd


data = {
    "Model": ["Logistic Regression", "DistilBERT", "Naive Bayes"],
    "Accuracy": ["96.92%", "92.31%", "90.77%"],
    "F1-Score": [0.97, 0.93, 0.90]
}


df = pd.DataFrame(data)


df


Unnamed: 0,Model,Accuracy,F1-Score
0,Logistic Regression,96.92%,0.97
1,DistilBERT,92.31%,0.93
2,Naive Bayes,90.77%,0.9


Logistic Regression demonstrated superior performance on this small dataset, achieving a high accuracy of 96%. This model proved to be highly effective for making predictions, unlike Naive Bayes and DistilBERT, which both yielded lower accuracy scores. Their performance could likely be improved with a larger dataset for training.