In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier


# Download necessary NLTK data
try:
    stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop_words = stopwords.words('english')

try:
    lemmatizer = WordNetLemmatizer()
except LookupError:
    nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')


class TicketAutoTagger:
    """
    A high-level class for an NLP-based model to automatically tag support tickets
    with priority levels and departments.

    This class demonstrates the workflow from data preprocessing to model training
    and prediction using a multi-class classification approach.
    """

    def __init__(self):
        self.priority_model = None
        self.department_model = None
        self.priority_encoder = LabelEncoder()
        self.department_encoder = LabelEncoder()
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stop_words)

    def preprocess_text(self, text):
        """
        Cleans and preprocesses text data.
        Steps include:
        1. Lowercasing
        2. Removing special characters and numbers
        3. Tokenization
        4. Removing stopwords
        5. Lemmatization
        """
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text)
        tokens = nltk.word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens if word not in self.stopwords]
        return ' '.join(tokens)

    def load_and_preprocess_data(self, filepath):
        """
        Loads the dataset and applies text preprocessing.

        Args:
            filepath (str): Path to the CSV file containing ticket data.
                            Expected columns: 'Category' for issue description,
                            'Priority' for priority level, and 'Category' for department.

        Returns:
            pd.DataFrame: Preprocessed DataFrame with 'cleaned_description' column.
        """
        print(f"Loading data from {filepath}...")
        try:
            df = pd.read_csv(filepath)
        except FileNotFoundError:
            print(f"Error: File not found at {filepath}. Please provide a valid path.")
            return None

        required_columns = ['Category', 'Priority']
        if not all(col in df.columns for col in required_columns):
            print(f"Error: DataFrame must contain all of the following columns: {required_columns}.")
            return None

        print("Applying text preprocessing...")
        df['cleaned_description'] = df['Category'].astype(str).apply(self.preprocess_text)
        print("Data preprocessing complete.")
        return df

    def train_models(self, df):
        """
        Trains the NLP models for priority and department classification.

        Args:
            df (pd.DataFrame): Preprocessed DataFrame with 'cleaned_description',
                               'Priority', and 'Category' columns (from ITSM_data.csv).
        """
        if df is None or df.empty:
            print("No data to train models. Exiting training.")
            return

        X = df['cleaned_description']
        y_priority = self.priority_encoder.fit_transform(df['Priority'])
        y_department = self.department_encoder.fit_transform(df['Category'])

        X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
            X, y_priority, test_size=0.2, random_state=42, stratify=y_priority
        )

        X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
            X, y_department, test_size=0.2, random_state=42, stratify=y_department
        )

        print("\nTraining Priority Classification Model...")
        self.priority_model = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
            ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear', random_state=42)))
        ])
        self.priority_model.fit(X_train_p, y_train_p)
        y_pred_p = self.priority_model.predict(X_test_p)
        print("Priority Model Evaluation:")
        print(f"Accuracy: {accuracy_score(y_test_p, y_pred_p):.4f}")
        print(classification_report(y_test_p, y_pred_p, target_names=self.priority_encoder.classes_))

        print("\nTraining Department Classification Model...")
        self.department_model = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
            ('clf', OneVsRestClassifier(MultinomialNB()))
        ])
        self.department_model.fit(X_train_d, y_train_d)
        y_pred_d = self.department_model.predict(X_test_d)
        print("Department Model Evaluation:")
        print(f"Accuracy: {accuracy_score(y_test_d, y_pred_d):.4f}")
        print(classification_report(y_test_d, y_pred_d, target_names=self.department_encoder.classes_))
        print("Model training complete.")

    def predict_ticket_tags(self, issue_description):
        """
        Predicts the priority and department for a new issue description.

        Args:
            issue_description (str): The raw text description of the issue (e.g., from 'Category' column).

        Returns:
            tuple: A tuple containing the predicted priority and department.
                   Returns (None, None) if models are not trained.
        """
        if self.priority_model is None or self.department_model is None:
            print("Models are not trained. Please train models first.")
            return None, None

        cleaned_description = self.preprocess_text(issue_description)
        
        predicted_priority_encoded = self.priority_model.predict([cleaned_description])
        predicted_priority = self.priority_encoder.inverse_transform(predicted_priority_encoded)[0]

        predicted_department_encoded = self.department_model.predict([cleaned_description])
        predicted_department = self.department_encoder.inverse_transform(predicted_department_encoded)[0]

        return predicted_priority, predicted_department

if __name__ == "__main__":
    data_filepath = "ITSM_data.csv"

    tagger = TicketAutoTagger()

    processed_df = tagger.load_and_preprocess_data(data_filepath)

    tagger.train_models(processed_df)

    new_ticket_description_1 = "My email is not working"
    new_ticket_description_2 = "printer is out of paper"
    new_ticket_description_3 = "cannot connect to vpn"

    priority_1, department_1 = tagger.predict_ticket_tags(new_ticket_description_1)
    print(f"\nNew Ticket: '{new_ticket_description_1}'")
    print(f"Predicted Priority: {priority_1}, Predicted Department: {department_1}")

    priority_2, department_2 = tagger.predict_ticket_tags(new_ticket_description_2)
    print(f"\nNew Ticket: '{new_ticket_description_2}'")
    print(f"Predicted Priority: {priority_2}, Predicted Department: {department_2}")

    priority_3, department_3 = tagger.predict_ticket_tags(new_ticket_description_3)
    print(f"\nNew Ticket: '{new_ticket_description_3}'")
    print(f"Predicted Priority: {priority_3}, Predicted Department: {department_3}")

Loading data from ITSM_data.csv...
Applying text preprocessing...


  df = pd.read_csv(filepath)


LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - 'C:\\Users\\Ashutosh/nltk_data'
    - 'c:\\Users\\Ashutosh\\anaconda3\\nltk_data'
    - 'c:\\Users\\Ashutosh\\anaconda3\\share\\nltk_data'
    - 'c:\\Users\\Ashutosh\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Ashutosh\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
