#### Intent Classifier Module
Train a classifier on a predefined set of questions labeled with intents, then use it to predict the intent of new questions.

In [None]:
# import necessary libraries
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Make sure the NLTK corpora used by the classifier are installed.
# Each corpus is checked on its own so that only the missing one is fetched;
# a single shared try-block would re-download both whenever either is absent.
for resource in ('stopwords', 'wordnet'):
    try:
        nltk.data.find(f'corpora/{resource}')
    except LookupError:
        nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mgmgk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mgmgk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# intent classifier class
class IntentClassifier:
    """
    A class for classifying the intent of user questions.

    The model is a scikit-learn pipeline: a bag-of-words vectorizer (TF-IDF or
    raw counts) followed by one of four classifiers. Every text is normalised
    by ``preprocess_text`` before it is vectorized.
    """

    # Matches one ASCII punctuation character. re.escape is required here:
    # string.punctuation contains the sequence '\]', which an unescaped
    # character class reads as "escaped ]" and therefore never matches a
    # literal backslash.
    _PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')

    _MODEL_TYPES = ('naive_bayes', 'logistic', 'random_forest', 'svm')
    _VECTORIZER_TYPES = ('tfidf', 'count')
    _REQUIRED_COLUMNS = ('Question', 'Intent')

    def __init__(self, model_type='logistic', vectorizer_type='tfidf'):
        """
        Initialize the IntentClassifier with the specified model type.

        The names are stored as given; they are validated when the pipeline
        is built in ``create_model``.

        Args:
            model_type (str): The type of model to use for classification.
                Options: 'naive_bayes', 'logistic', 'random_forest', 'svm'
            vectorizer_type (str): The type of vectorizer to use.
                Options: 'tfidf', 'count'
        """
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.model_type = model_type
        self.vectorizer_type = vectorizer_type
        self.pipeline = None
        self.intent_labels = None

    def preprocess_text(self, text):
        """
        Preprocess the text by converting to lowercase, replacing punctuation
        with spaces, removing stopwords, and lemmatizing.

        Args:
            text (str): The text to preprocess.

        Returns:
            str: The remaining lemmatized tokens joined by single spaces.
        """
        # Punctuation becomes a space (not an empty string) so that
        # "reset/unlock" yields two tokens instead of one fused token.
        text = self._PUNCTUATION_RE.sub(' ', text.lower())

        # split() without arguments tokenizes on runs of whitespace and
        # ignores leading/trailing whitespace, so no separate cleanup pass
        # is needed.
        tokens = [
            self.lemmatizer.lemmatize(token)
            for token in text.split()
            if token not in self.stop_words
        ]

        return ' '.join(tokens)

    def create_model(self):
        """
        Create a pipeline with vectorizer and the selected classifier.

        Returns:
            sklearn.pipeline.Pipeline: The created (unfitted) pipeline.

        Raises:
            ValueError: If ``vectorizer_type`` or ``model_type`` is not one of
                the supported options.
        """
        # Validate both choices up front so a typo is reported instead of
        # silently falling back to a different vectorizer.
        if self.vectorizer_type not in self._VECTORIZER_TYPES:
            raise ValueError(f"Unknown vectorizer type: {self.vectorizer_type}")
        if self.model_type not in self._MODEL_TYPES:
            raise ValueError(f"Unknown model type: {self.model_type}")

        # Both vectorizers share the same configuration.
        if self.vectorizer_type == 'tfidf':
            vectorizer_cls = TfidfVectorizer
        else:
            vectorizer_cls = CountVectorizer
        vectorizer = vectorizer_cls(
            preprocessor=self.preprocess_text,
            ngram_range=(1, 2),  # Use both unigrams and bigrams
            min_df=2,  # Minimum document frequency
            max_df=0.9  # Maximum document frequency
        )

        # Select classifier
        if self.model_type == 'naive_bayes':
            classifier = MultinomialNB(alpha=0.1)
        elif self.model_type == 'logistic':
            classifier = LogisticRegression(
                C=10.0,
                max_iter=1000,
                class_weight='balanced',
                random_state=42
            )
        elif self.model_type == 'random_forest':
            classifier = RandomForestClassifier(
                n_estimators=100,
                max_depth=None,
                min_samples_split=2,
                class_weight='balanced',
                random_state=42
            )
        else:  # 'svm' (guaranteed by the validation above)
            classifier = SVC(
                kernel='linear',
                C=1.0,
                probability=True,  # needed so predict_proba() is available
                class_weight='balanced',
                random_state=42
            )

        return Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', classifier)
        ])

    @classmethod
    def _load_dataset(cls, data_path):
        """Read the training CSV, strip whitespace, and drop incomplete rows."""
        df = pd.read_csv(data_path)

        # Clean column names (remove leading/trailing whitespace)
        df.columns = df.columns.str.strip()

        missing = [col for col in cls._REQUIRED_COLUMNS if col not in df.columns]
        if missing:
            raise KeyError(f"Training data is missing required column(s): {missing}")

        # Strip every string value. Checking the values rather than
        # `dtype == 'object'` also covers pandas' dedicated string dtype and
        # leaves non-string entries (including NaN) untouched.
        for col in df.columns:
            if not pd.api.types.is_numeric_dtype(df[col]):
                df[col] = df[col].map(
                    lambda value: value.strip() if isinstance(value, str) else value
                )

        # Rows without a question or a label cannot be used for training and
        # would otherwise fail later with an unrelated-looking error.
        complete = df.dropna(subset=list(cls._REQUIRED_COLUMNS))
        dropped = len(df) - len(complete)
        if dropped:
            print(f"Warning: Dropped {dropped} row(s) with a missing Question or Intent.")
        if complete.empty:
            raise ValueError("Training data contains no rows with both a Question and an Intent.")

        return complete

    def _param_grid(self):
        """Return the GridSearchCV parameter grid for the configured model type."""
        grids = {
            'naive_bayes': {
                'classifier__alpha': [0.01, 0.1, 0.5, 1.0, 2.0]
            },
            'logistic': {
                'classifier__C': [0.1, 1.0, 10.0, 100.0],
                'classifier__solver': ['liblinear', 'saga']
            },
            'random_forest': {
                'classifier__n_estimators': [50, 100, 200],
                'classifier__max_depth': [None, 10, 20, 30]
            },
            'svm': {
                'classifier__C': [0.1, 1.0, 10.0],
                'classifier__kernel': ['linear', 'rbf']
            },
        }
        return grids[self.model_type]

    def train(self, data_path, test_size=0.2, random_state=42, optimize=False):
        """
        Train the intent classification model using the provided data.

        The CSV must contain a 'Question' column (input text) and an 'Intent'
        column (label). Rows missing either value are dropped with a notice.

        Args:
            data_path (str): Path to the CSV file containing the training data.
            test_size (float): Proportion of the data to use for testing.
            random_state (int): Random seed for reproducibility.
            optimize (bool): Whether to perform hyperparameter optimization
                (5-fold grid search scored on accuracy).

        Returns:
            dict: A dictionary containing the evaluation metrics:
                'accuracy', 'classification_report' (as a dict), and
                'test_data' as the tuple (X_test, y_test, y_pred).

        Raises:
            KeyError: If the 'Question' or 'Intent' column is missing.
            ValueError: If no usable rows remain, or if the configured
                model/vectorizer type is unknown.
        """
        df = self._load_dataset(data_path)

        # Store the intent labels
        self.intent_labels = sorted(df['Intent'].unique())

        # Print dataset statistics
        print(f"Dataset size: {len(df)} questions")
        print(f"Intent categories: {len(self.intent_labels)}")
        print("Intent distribution:")
        intent_counts = df['Intent'].value_counts()
        for intent, count in intent_counts.items():
            print(f"  {intent}: {count} ({count/len(df)*100:.1f}%)")

        # Stratify only if every class has at least 2 samples; otherwise the
        # stratified splitter cannot place each class on both sides.
        if intent_counts.min() >= 2:
            stratify = df['Intent']
        else:
            print("Warning: Some classes have too few samples for stratified splitting. Using regular train-test split.")
            stratify = None

        X_train, X_test, y_train, y_test = train_test_split(
            df['Question'], df['Intent'], test_size=test_size,
            random_state=random_state, stratify=stratify
        )

        # Create the pipeline
        self.pipeline = self.create_model()

        if optimize:
            print("\nPerforming hyperparameter optimization...")
            grid_search = GridSearchCV(
                self.pipeline, self._param_grid(), cv=5, scoring='accuracy', n_jobs=-1
            )
            grid_search.fit(X_train, y_train)

            print(f"Best parameters: {grid_search.best_params_}")
            self.pipeline = grid_search.best_estimator_
        else:
            # Train the pipeline with its default hyperparameters
            self.pipeline.fit(X_train, y_train)

        # Evaluate the model on the held-out split
        y_pred = self.pipeline.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

        return {
            'accuracy': accuracy,
            'classification_report': report,
            'test_data': (X_test, y_test, y_pred)
        }

    def predict(self, question):
        """
        Predict the intent of a given question.

        Args:
            question (str): The question to classify.

        Returns:
            str: The predicted intent.

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.pipeline is None:
            raise ValueError("Model not trained. Call train() first.")

        # The pipeline expects an iterable of documents, hence the list.
        return self.pipeline.predict([question])[0]

    def predict_proba(self, question):
        """
        Predict the probability of each intent for a given question.

        Args:
            question (str): The question to classify.

        Returns:
            dict: A dictionary mapping intent labels to their probabilities.

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.pipeline is None:
            raise ValueError("Model not trained. Call train() first.")

        probabilities = self.pipeline.predict_proba([question])[0]
        return dict(zip(self.pipeline.classes_, probabilities))

    def save_model(self, file_path):
        """
        Save the trained model to a file.

        Only the fitted pipeline is written. Its vectorizer holds a reference
        to this object's ``preprocess_text`` method, so this class must be
        defined before the file is loaded again.

        Args:
            file_path (str): Path to save the model.

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.pipeline is None:
            raise ValueError("Model not trained. Call train() first.")

        import joblib
        joblib.dump(self.pipeline, file_path)
        print(f"Model saved to {file_path}")

    def load_model(self, file_path):
        """
        Load a trained model from a file.

        Also restores ``intent_labels`` from the loaded pipeline's classes
        when they are available.

        Args:
            file_path (str): Path to the saved model.
        """
        import joblib
        # SECURITY: joblib files are pickles; loading one can execute
        # arbitrary code. Only load model files from a trusted source.
        self.pipeline = joblib.load(file_path)

        classes = getattr(self.pipeline, 'classes_', None)
        if classes is not None:
            self.intent_labels = list(classes)

        print(f"Model loaded from {file_path}")

In [4]:
# model evaluation function
def evaluate_models(data_path, test_size=0.2, random_state=42):
    """
    Evaluate different models and vectorizers to find the best combination.

    Every model type is trained once with every vectorizer type (without
    hyperparameter optimization) and the combinations are ranked by test
    accuracy. On a tie, the combination evaluated first wins.

    Args:
        data_path (str): Path to the CSV file containing the training data.
        test_size (float): Proportion of the data to use for testing.
        random_state (int): Random seed for reproducibility.

    Returns:
        tuple: The best model type and vectorizer type.
    """
    models = ['naive_bayes', 'logistic', 'random_forest', 'svm']
    vectorizers = ['tfidf', 'count']

    # Keyed by (model_type, vectorizer_type). A tuple key means the winner
    # never has to be recovered by splitting a joined string, which cannot
    # work because names such as 'naive_bayes' contain '_' themselves.
    results = {}

    print("Evaluating different model and vectorizer combinations:")
    for model_type in models:
        for vectorizer_type in vectorizers:
            print(f"\nTesting {model_type} with {vectorizer_type} vectorizer...")
            classifier = IntentClassifier(model_type=model_type, vectorizer_type=vectorizer_type)
            metrics = classifier.train(data_path, test_size=test_size, random_state=random_state)

            results[(model_type, vectorizer_type)] = metrics['accuracy']
            print(f"Accuracy: {metrics['accuracy']:.4f}")

    # Find the best combination
    best_key = max(results, key=results.get)
    best_model, best_vectorizer = best_key

    print("\nResults summary:")
    for (model_type, vectorizer_type), accuracy in sorted(
        results.items(), key=lambda item: item[1], reverse=True
    ):
        print(f"{model_type}_{vectorizer_type}: {accuracy:.4f}")

    print(f"\nBest combination: {best_model} with {best_vectorizer} vectorizer (Accuracy: {results[best_key]:.4f})")

    return best_model, best_vectorizer

In [None]:
# Training data: exactly one of the following paths is active.
# data_path = 'it_faqs_and_logs.csv'         # short set
# data_path = 'it_support_questions.csv'      # 100 samples
data_path = 'it_support_questions_500.csv'  # 500 samples

classifier = IntentClassifier(model_type='logistic', vectorizer_type='tfidf')

# Fit with the default hyperparameters (grid search is left off for small datasets)
print("Training the model...")
metrics = classifier.train(data_path, optimize=False)

# Overall accuracy first, then one entry per intent
print(f"\nModel Accuracy: {metrics['accuracy']:.4f}")
print("\nClassification Report:")

# The report also carries aggregate rows; keep only the per-intent ones.
per_intent_report = {
    label: scores
    for label, scores in metrics['classification_report'].items()
    if label not in ('accuracy', 'macro avg', 'weighted avg')
}
for intent, values in per_intent_report.items():
    print(f"Intent: {intent}")
    print(f"  Precision: {values['precision']:.4f}")
    print(f"  Recall: {values['recall']:.4f}")
    print(f"  F1-score: {values['f1-score']:.4f}")
    print(f"  Support: {values['support']}")

# Smoke-test the trained model on a few hand-written questions
print("\nTesting with example questions:")
test_questions = [
    "I need to change my password for Windows",
    "My computer won't connect to the internet",
    "How do I install a new software?",
    "I can't access my email account",
    "My printer is not working properly",
    "How do I connect to the company VPN?",
    "My laptop battery is draining too quickly"
]

for question in test_questions:
    intent = classifier.predict(question)
    probabilities = classifier.predict_proba(question)

    # Rank intents from most to least likely and keep the three best
    ranked = sorted(probabilities.items(), key=lambda pair: pair[1], reverse=True)
    top_intents = ranked[:3]

    print(f"\nQuestion: {question}")
    print(f"Predicted Intent: {intent}")
    print("Top 3 Probabilities:")
    for intent_label, prob in top_intents:
        print(f"  {intent_label}: {prob:.4f}")

# Persist the fitted model (uncomment to save)
# classifier.save_model('intent_classifier_model.joblib')

Training the model...
Dataset size: 500 questions
Intent categories: 7
Intent distribution:
  password: 75 (15.0%)
  network: 75 (15.0%)
  application: 75 (15.0%)
  email: 75 (15.0%)
  internet: 75 (15.0%)
  hardware: 75 (15.0%)
  device: 50 (10.0%)

Model Accuracy: 1.0000

Classification Report:
Intent: application
  Precision: 1.0000
  Recall: 1.0000
  F1-score: 1.0000
  Support: 15.0
Intent: device
  Precision: 1.0000
  Recall: 1.0000
  F1-score: 1.0000
  Support: 10.0
Intent: email
  Precision: 1.0000
  Recall: 1.0000
  F1-score: 1.0000
  Support: 15.0
Intent: hardware
  Precision: 1.0000
  Recall: 1.0000
  F1-score: 1.0000
  Support: 15.0
Intent: internet
  Precision: 1.0000
  Recall: 1.0000
  F1-score: 1.0000
  Support: 15.0
Intent: network
  Precision: 1.0000
  Recall: 1.0000
  F1-score: 1.0000
  Support: 15.0
Intent: password
  Precision: 1.0000
  Recall: 1.0000
  F1-score: 1.0000
  Support: 15.0

Testing with example questions:

Question: I need to change my password for Windo

In [13]:
# Ask the user for a single question
user_question = input("Enter your question: ")

# Run it through the trained classifier: best label plus the full distribution
predicted_intent = classifier.predict(user_question)
predicted_probabilities = classifier.predict_proba(user_question)

# Report the winning intent followed by the three most likely candidates
print(f"\nPredicted Intent: {predicted_intent}")
print("Top Probabilities:")
ranked_intents = sorted(
    predicted_probabilities.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
for intent_label, prob in ranked_intents[:3]:
    print(f"  {intent_label}: {prob:.4f}")


Predicted Intent: internet
Top Probabilities:
  internet: 0.7240
  application: 0.1035
  hardware: 0.0394
