In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the dataset CSVs are read from it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

**Load Dataset**

In [None]:
# Load the train / validation / test splits from Google Drive
import pandas as pd

train_df = pd.read_csv('/content/drive/My Drive/new_train.csv')
val_df = pd.read_csv('/content/drive/My Drive/new_val.csv')
test_df = pd.read_csv('/content/drive/My Drive/new_test.csv')

# Preview every split the same way: heading, first rows, then the row count.
split_previews = (
    ("Train DataFrame head:", train_df),
    ("\nVal DataFrame head:", val_df),
    ("\nTest DataFrame head:", test_df),
)
for heading, frame in split_previews:
    print(heading)
    print(frame.head())
    print(len(frame))

**Data Pre-Processing**

In [None]:
# Text cleaning and normalization (lowercasing, noise removal, stop-word filtering, lemmatization)
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora used below: the stop-word list and WordNet.
for corpus_name in ("stopwords", "wordnet"):
    nltk.download(corpus_name)

# Shared helpers consumed by clean_text.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def clean_text(text, remove_stopwords=True):
    """Normalize one raw text value into a space-joined string of lemmas.

    Steps: lowercase and strip, drop URLs and e-mail addresses, drop every
    character that is not a-z or whitespace, split on whitespace, optionally
    filter out stop words, then lemmatize each remaining token.

    Args:
        text: Raw input value. Anything that is not a ``str`` (for example
            the float NaN pandas uses for a missing cell, or ``None``) is
            treated as empty text instead of raising ``AttributeError``.
        remove_stopwords: When True (default), tokens found in the
            module-level ``stop_words`` set are dropped before lemmatizing.
            Pass False to keep them.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when nothing
        survives cleaning or the input is not a string.
    """
    # Missing values reach .apply() as non-strings; .lower() would crash on them.
    if not isinstance(text, str):
        return ""

    text = text.lower().strip()                        # lowercase and strip
    text = re.sub(r'http\S+|www\S+', '', text)         # remove urls
    text = re.sub(r'\S+@\S+', '', text)                # remove emails
    # One pass removes digits, punctuation and symbols alike: none of them
    # are in [a-z\s], so a separate digit-stripping step is not needed.
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()                              # tokenize

    if remove_stopwords:
        tokens = [w for w in tokens if w not in stop_words]

    return " ".join(lemmatizer.lemmatize(w) for w in tokens)

# Add a "clean_text" column to every split.
for frame in (train_df, val_df, test_df):
    frame["clean_text"] = frame["text"].apply(clean_text)

# Show the first cleaned rows of each split.
cleaned_previews = (
    ("Sample Cleaned Training Data:\n", train_df),
    ("Sample Cleaned Validation Data:\n", val_df),
    ("Sample Cleaned Testing Data:\n", test_df),
)
for heading, frame in cleaned_previews:
    print(heading, frame["clean_text"].head())

# Persist the cleaned training split next to the notebook.
# NOTE(review): to_csv writes the row index as an extra column by default;
# pass index=False if that column is not wanted when the file is re-read.
train_df.to_csv("train_cleaned.csv")

In [None]:
# Check the training labels for class imbalance
from collections import Counter

print("Training set class distribution:")
train_label_counts = Counter(train_df["label"])
print(train_label_counts)


**Feature Extraction and Model Selection**

In [None]:
# Vectorization (Pipeline)
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: cap the vocabulary at 5000 features, use unigrams and
# bigrams, and apply scikit-learn's built-in English stop-word list.
tfidf_options = {
    "max_features": 5000,
    "ngram_range": (1, 2),
    "stop_words": "english",
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit on the training split only, then reuse the fitted vectorizer for
# validation and test.
X_train = vectorizer.fit_transform(train_df["clean_text"])
X_val, X_test = (
    vectorizer.transform(frame["clean_text"]) for frame in (val_df, test_df)
)

# Target labels for each split.
y_train, y_val, y_test = (
    frame["label"] for frame in (train_df, val_df, test_df)
)

for heading, matrix in (
    ("Train shape:", X_train),
    ("Validation shape:", X_val),
    ("Test shape:", X_test),
):
    print(heading, matrix.shape)


**Model Development & Optimization**

In [None]:
# logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score

# Base estimator, allowed up to 500 solver iterations.
log_reg = LogisticRegression(max_iter=500)

# Search space: regularization strength C, L2 penalty, two solvers.
param_grid = {
    'C': [0.01, 1, 5],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear'],
}

# Exhaustive search scored by accuracy under 5-fold cross-validation,
# using all available cores.
grid = GridSearchCV(
    log_reg,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Keep the refitted winner and report its settings.
best_log_reg = grid.best_estimator_
print("Best Parameters:", grid.best_params_)

# Check the tuned model on the validation split.
y_val_pred = best_log_reg.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
val_report = classification_report(y_val, y_val_pred)
print(val_report)


**Comprehensive Evaluation**

In [None]:
# Core Metrics
from sklearn.metrics import classification_report, accuracy_score

# Score the tuned model on the held-out test split.
y_test_pred = best_log_reg.predict(X_test)

# Overall accuracy.
acc = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", acc)

# Per-class precision, recall and F1-score.
test_report = classification_report(y_test, y_test_pred)
print(test_report)


In [None]:
# save model
import joblib

# Persist the fitted vectorizer and the tuned classifier; both are needed
# together to score new text.
VECTORIZER_PATH = "tfidf_vectorizer.pkl"
MODEL_PATH = "intent_classifier.pkl"

joblib.dump(vectorizer, VECTORIZER_PATH)
joblib.dump(best_log_reg, MODEL_PATH)

In [None]:
# confusion matrix
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix with rows/columns ordered like the classifier's classes.
class_labels = best_log_reg.classes_
cm = confusion_matrix(y_test, y_test_pred, labels=class_labels)

# Annotated heatmap: rows are true labels, columns are predictions.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# classification performance chart
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# Per-class precision / recall / F1 (and support), in classes_ order.
precision, recall, f1, support = precision_recall_fscore_support(
    y_test, y_test_pred, labels=best_log_reg.classes_
)

# One group of three bars per class, centred on the integer positions in x.
x = np.arange(len(best_log_reg.classes_))
width = 0.2

fig, ax = plt.subplots(figsize=(10, 6))
metric_bars = (
    (x - width, precision, 'Precision', 'skyblue'),
    (x, recall, 'Recall', 'lightgreen'),
    (x + width, f1, 'F1-score', 'salmon'),
)
for positions, scores, label, color in metric_bars:
    ax.bar(positions, scores, width, label=label, color=color)

ax.set_xticks(x)
ax.set_xticklabels(best_log_reg.classes_)
ax.set_ylim(0, 1.1)
ax.set_ylabel("Score")
ax.set_title("Classification Performance per Class")
ax.legend()
plt.show()

In [None]:
# learning curves
from sklearn.model_selection import learning_curve
import numpy as np

# Cross-validated accuracy at ten training-set fractions from 30% to 100%.
train_sizes, train_scores, val_scores = learning_curve(
    best_log_reg,
    X_train,
    y_train,
    train_sizes=np.linspace(0.3, 1.0, 10),
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Average over the CV folds (axis 1) to get one score per training size.
train_mean = train_scores.mean(axis=1)
val_mean = val_scores.mean(axis=1)

fig, ax = plt.subplots(figsize=(8, 6))
for fold_mean, label in (
    (train_mean, "Training Accuracy"),
    (val_mean, "Validation Accuracy"),
):
    ax.plot(train_sizes, fold_mean, 'o-', label=label)
ax.set_xlabel("Training Set Size")
ax.set_ylabel("Accuracy")
ax.set_title("Learning Curve")
ax.legend()
plt.show()

In [None]:
# Feature Importance
import pandas as pd
import numpy as np

feature_names = vectorizer.get_feature_names_out()
coef = best_log_reg.coef_
TOP_K = 10  # number of features listed per class

# Pair every class with the weight vector that scores it.
# A binary LogisticRegression stores a single coefficient row that scores
# classes_[1]; indexing coef[1] would raise IndexError, so the negated row
# is used for classes_[0]. Multi-class models have one row per class.
if coef.shape[0] == 1 and len(best_log_reg.classes_) == 2:
    class_weights = [
        (best_log_reg.classes_[0], -coef[0]),
        (best_log_reg.classes_[1], coef[0]),
    ]
else:
    class_weights = list(zip(best_log_reg.classes_, coef))

for class_label, weights in class_weights:
    # argsort is ascending: take the last TOP_K and reverse so the
    # strongest feature is printed first.
    top_features = np.argsort(weights)[-TOP_K:][::-1]
    print(f"Top features for class '{class_label}':")
    print([feature_names[j] for j in top_features])
    print()

In [None]:
# Prediction Confidence & Thresholding
import matplotlib.pyplot as plt

# Class probabilities for every test sample, and the highest one per sample.
y_proba = best_log_reg.predict_proba(X_test)
max_conf = y_proba.max(axis=1)

# Histogram of how confident the model is in its top prediction.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(max_conf, bins=20, color='skyblue', edgecolor='black')
ax.set_xlabel("Prediction Confidence")
ax.set_ylabel("Number of Samples")
ax.set_title("Prediction Confidence Distribution")
plt.show()

# Thresholding example: keep the predicted class only when its probability
# reaches the threshold, otherwise label the sample "uncertain".
threshold = 0.8
y_pred_thresh = [
    best_log_reg.classes_[top_idx] if top_conf >= threshold else "uncertain"
    for top_idx, top_conf in zip(y_proba.argmax(axis=1), max_conf)
]


**Error Analysis & Insights**

In [None]:
# 1. Analyze Misclassified Examples
import pandas as pd

# Predictions and class probabilities for the test split.
y_test_pred = best_log_reg.predict(X_test)
y_test_proba = best_log_reg.predict_proba(X_test)

# Positions (not index labels) of the misclassified samples.
true_labels = y_test.to_numpy()
misclassified_idx = np.flatnonzero(true_labels != y_test_pred)

# Build the inspection table with positional indexing throughout.
# `y_test[misclassified_idx]` would be a label lookup on the Series and is
# only correct while the index is the default 0..n-1 range.
# The original text is included so each error can actually be read.
error_df = pd.DataFrame(
    {
        "Text": test_df["text"].to_numpy()[misclassified_idx],
        "True_Label": true_labels[misclassified_idx],
        "Predicted_Label": y_test_pred[misclassified_idx],
        "Confidence": y_test_proba[misclassified_idx].max(axis=1),
    },
    # Keep the test-set index labels so rows can be traced back to test_df.
    index=y_test.index[misclassified_idx],
)

print("Misclassified examples:")
print(error_df.head(10))

In [None]:
# Confidence-based prediction filtering: keep a prediction only when its
# top class probability reaches the threshold, otherwise mark it "uncertain".
threshold = 0.8
filtered_preds = [
    predicted if confidence >= threshold else "uncertain"
    for predicted, confidence in zip(y_test_pred, y_test_proba.max(axis=1))
]
print(filtered_preds)

**Error Types**

- **Systematic Errors**: Model confuses two classes (e.g., web_search vs general_chat).

- **Ambiguity Errors**: The input text is genuinely vague (e.g., "remind me later" could be calendar or chat).

- **Data Quality Errors**: Typos, very short inputs, etc.

- **Imbalanced Class Errors**: One class has more training data than the others; in this case the **web_search** class is the largest.

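- **Numeric Inputs**: When the input is numeric, the model predicts the wrong intent (digits are stripped during text cleaning, so a purely numeric input reaches the model as empty text).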

**Improvements**

- We need to add more diverse training data for confusing classes.

- Try other vectorization techniques.

- Consider class-weight adjustment in Logistic Regression to balance misclassification.

- Try alternative models (SVM, Random Forest, deep learning, ...).

- For vague and numeric inputs, add input validation and a confidence threshold: if the confidence score is below a chosen threshold (e.g., 0.8), return "uncertain" instead of a possibly wrong label.

- For the **general_chat** and **knowledge_query** classes, the model is overfitting; techniques such as reviewing the dataset could help.

**Model Limitations & Edge Cases**

The model struggles with short, vague queries.

Some queries belong to multiple classes at once (multi-label issue).

Limited generalization outside the training dataset.

It returns the wrong intent when given numeric input.

If the model isn't confident enough to predict, we can implement confidence-based filtering to return "unknown".