In [29]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler # This import is no longer used but kept for context if user wants to re-add it
from collections import Counter

import warnings
warnings.filterwarnings('ignore')


In [3]:

# Set the option to display the full content of columns
pd.set_option('display.max_colwidth', None)


In [4]:
# --- Configuration ---
N_FEATURES_CHI2 = 5000 # Number of top features to select with Chi-Square (adjust based on vocab size)


In [5]:
# Ensure NLTK data is downloaded (run these once)
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4') # Open Multilingual Wordnet (required for WordNetLemmatizer)


### Load The dataset

In [6]:
df = pd.read_csv('It Support Ticket Data.csv',index_col=0)
df.head()


Unnamed: 0,Body,Department,Priority,Tags
0,"Dear Customer Support Team,I am writing to report a significant problem with the centralized account management portal, which currently appears to be offline. This outage is blocking access to account settings, leading to substantial inconvenience. I have attempted to log in multiple times using different browsers and devices, but the issue persists.Could you please provide an update on the outage status and an estimated time for resolution? Also, are there any alternative ways to access and manage my account during this downtime?",Technical Support,high,"['Account', 'Disruption', 'Outage', 'IT', 'Tech Support']"
1,"Dear Customer Support Team,I hope this message reaches you well. I am reaching out to request detailed information about the capabilities of your smart home integration products listed on your website. As a potential customer aiming to develop a seamlessly interconnected home environment, it is essential to understand how your products interact with various smart home platforms.Could you kindly provide detailed compatibility information with popular smart home ecosystems such as Amazon Alexa, Google Assistant, and Apple?",Returns and Exchanges,medium,"['Product', 'Feature', 'Tech Support']"
2,"Dear Customer Support Team,I hope this message finds you well. I am reaching out to request clarification about the billing and payment procedures linked to my account. Recently, I observed some inconsistencies in the charges applied and would like to ensure I fully understand the billing cycle, accepted payment options, and any potential extra charges.Firstly, I would be grateful if you could provide a detailed explanation of how the billing cycle functions. Specifically, I am interested in knowing the start and end dates.Thank you for your assistance regarding these billing inquiries.",Billing and Payments,low,"['Billing', 'Payment', 'Account', 'Documentation', 'Feedback']"
3,"Dear Support Team,I hope this message reaches you well. I am reaching out to ask about the compatibility of your products with the specific needs of marketing agencies. Our company is considering adopting these solutions to streamline our current marketing processes and wants to confirm that the products are fully compatible with the tools and platforms we currently utilize.Could you please supply detailed information regarding the compatibility of your products with popular marketing software, CRM systems, email marketing applications, and analytics platforms? Additionally, I would appreciate any relevant case studies or documentation tailored to our use case.",Sales and Pre-Sales,medium,"['Product', 'Feature', 'Feedback', 'Tech Support']"
4,"Dear Customer Support,I hope this message reaches you in good health. I am eager to learn more about the features of one of your products. Would you be able to share comprehensive details about its functionalities, specifications, and any distinctive characteristics it may possess? Additionally, if there are user manuals, tutorials, or demonstration videos available, I would be grateful if you could provide those resources. Gaining a thorough understanding of the features will assist me in making an informed decision regarding the product.Thank you very much for your assistance. I look forward to your prompt reply.Best regards",Technical Support,high,"['Feature', 'Product', 'Documentation', 'Feedback']"


In [7]:
df.dropna(inplace=True)

In [8]:
df.isnull().sum()

Body          0
Department    0
Priority      0
Tags          0
dtype: int64

In [9]:
departments = df['Department'].unique().tolist()

In [10]:
priority = df['Priority'].unique().tolist()[::-1]
priority  

['low', 'medium', 'high']

In [11]:
df.columns.tolist()

['Body', 'Department', 'Priority', 'Tags']

In [12]:
X = df['Body']
y_department = df['Department']
y_priority = df['Priority']

In [13]:
X

0                                                                                                                                                                                                                          Dear Customer Support Team,I am writing to report a significant problem with the centralized account management portal, which currently appears to be offline. This outage is blocking access to account settings, leading to substantial inconvenience. I have attempted to log in multiple times using different browsers and devices, but the issue persists.Could you please provide an update on the outage status and an estimated time for resolution? Also, are there any alternative ways to access and manage my account during this downtime?
1                                                                                                                                                                                                                                    Dear Customer S

In [17]:
# --- 2. Data Preprocessing ---
def preprocess_text(text):
    if not isinstance(text, str):
        return "" # Handle non-string inputs gracefully
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Remove non-alphabetic characters
    text = text.lower() # Convert to lowercase
    words = text.split() # Tokenize
    words = [word for word in words if word not in stopwords.words('english')] # Remove stopwords
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words] # Lemmatize
    return ' '.join(words)

print("\nApplying text preprocessing...")
df['clean_description'] = df['Body'].apply(preprocess_text)
print("Text preprocessing complete.")
print(f"Sample clean description: {df['clean_description'].iloc[0]}")





Applying text preprocessing...
Text preprocessing complete.
Sample clean description: dear customer support teami writing report significant problem centralized account management portal currently appears offline outage blocking access account setting leading substantial inconvenience attempted log multiple time using different browser device issue persistscould please provide update outage status estimated time resolution also alternative way access manage account downtime


In [21]:
# --- 4. Feature Extraction (TF-IDF) ---
print("\nExtracting features using TF-IDF...")
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000, # Max number of features (words/ngrams)
    min_df=2,           # Ignore terms that appear in less than 5 documents
    ngram_range=(1, 2)  # Include unigrams and bigrams
)
X_tfidf = tfidf_vectorizer.fit_transform(df['clean_description'])
print(f"TF-IDF feature matrix shape: {X_tfidf.shape}")



Extracting features using TF-IDF...
TF-IDF feature matrix shape: (29650, 5000)


In [22]:
# --- 5. Label Encoding ---
print("\nEncoding target variables (Department and Priority)...")
le_department = LabelEncoder()
df['department_encoded'] = le_department.fit_transform(df['Department'])
print(f"Department classes: {le_department.classes_}")

le_priority = LabelEncoder()
df['priority_encoded'] = le_priority.fit_transform(df['Priority'])
print(f"Priority classes: {le_priority.classes_}")



Encoding target variables (Department and Priority)...
Department classes: ['Billing and Payments' 'Customer Service' 'General Inquiry'
 'Human Resources' 'IT Support' 'Product Support' 'Returns and Exchanges'
 'Sales and Pre-Sales' 'Service Outages and Maintenance'
 'Technical Support']
Priority classes: ['high' 'low' 'medium']


In [23]:
# --- 6. Data Splitting, and Model Training for Department ---
print("\n--- Training Model for Department Prediction ---")
X_train_dept, X_test_dept, y_train_dept, y_test_dept = train_test_split(
    X_tfidf, df['department_encoded'], test_size=0.2, random_state=42, stratify=df['department_encoded']
)

print(f"Department training class distribution: {Counter(y_train_dept)}")


--- Training Model for Department Prediction ---
Department training class distribution: Counter({9: 6894, 5: 4430, 1: 3586, 4: 2800, 0: 2413, 6: 1174, 8: 926, 7: 708, 3: 454, 2: 335})


In [None]:
# pd.DataFrame(X_train_tfidf_dept.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [27]:
# print("\n implement the logistic regression model")


# param_grid = {
#     'C': [0.01, 0.1, 1, 10], # Regularization strength
#     'l1_ratio': [0.5,0.8], # ElasticNet mixing parameter
#     'penalty': ['elasticnet'], # ElasticNet regularization
#     'solver': ['saga'], # SAGA solver supports ElasticNet
# }

# logistic_grid_search = GridSearchCV(
#     param_grid=param_grid,
#     estimator=LogisticRegression(random_state=42,n_jobs=1),
#     scoring='accuracy',
#     cv=5, # 5-fold cross-validation
#     verbose=2,
#     n_jobs=1 # Use 1 job to avoid memory issues in this environment

# )

# logistic_grid_search.fit(X_train_tfidf_dept, y_dept_train)


# Print the best parameters and best score found
# print("Best Parameters:", logistic_grid_search.best_params_) # The best combination of hyperparameters found
# print("Best Score:", logistic_grid_search.best_score_)       # The cross-validation score achieved with the best parameters

# # You can also access the best estimator directly
# best_log_reg_model = logistic_grid_search.best_estimator_ # The best model itself
# print(f"\n best model found: {best_log_reg_model}")


In [26]:
# <!-- Best Parameters: {'C': 10, 'l1_ratio': 0.8, 'penalty': 'elasticnet', 'solver': 'saga'}
# Best Score: 0.560443266682727

#  best model found: LogisticRegression(C=10, l1_ratio=0.8, n_jobs=1, penalty='elasticnet',
#                    random_state=42, solver='saga') -->

In [25]:
# Train Logistic Regression Model for Department
print("Training Logistic Regression model for Department...")
log_reg_dept = LogisticRegression(
    C=10, # Best parameter found from grid search
    l1_ratio=0.8, # Best parameter found from grid search
    penalty='elasticnet', # ElasticNet regularization
    solver='saga', # SAGA solver supports ElasticNet
    random_state=42,
    n_jobs=1 # Use 1 job to avoid memory issues in this environment
)
log_reg_dept.fit(X_train_dept, y_train_dept) # Changed to use X_train_dept, y_train_dept
print("Department model training complete.")



Training Logistic Regression model for Department...
Department model training complete.


In [28]:
# --- 7. Data Splitting, and Model Training for Priority ---
print("\n--- Training Model for Priority Prediction ---")
X_train_prio, X_test_prio, y_train_prio, y_test_prio = train_test_split(
    X_tfidf, df['priority_encoded'], test_size=0.2, random_state=42, stratify=df['priority_encoded']
)

print(f"Priority training class distribution: {Counter(y_train_prio)}")

# Train Logistic Regression Model for Priority
print("Training Logistic Regression model for Priority...")
log_reg_prio = LogisticRegression(
      C=10, # Best parameter found from grid search
    l1_ratio=0.8, # Best parameter found from grid search
    penalty='elasticnet', # ElasticNet regularization
    solver='saga', # SAGA solver supports ElasticNet
    random_state=42,
    n_jobs=1 # Use 1 job to avoid memory issues in this environment
)
log_reg_prio.fit(X_train_prio, y_train_prio) # Changed to use X_train_prio, y_train_prio
print("Priority model training complete.")



--- Training Model for Priority Prediction ---
Priority training class distribution: Counter({2: 9701, 0: 9209, 1: 4810})
Training Logistic Regression model for Priority...
Priority model training complete.


In [30]:
# --- 8. Model Evaluation ---
print("\n--- Evaluating Department Model ---")
y_pred_dept = log_reg_dept.predict(X_test_dept)
accuracy_dept = accuracy_score(y_test_dept, y_pred_dept)
print(f"Logistic Regression Accuracy for Department: {accuracy_dept:.4f}")

# Generate a detailed classification report for Department
print("\nClassification Report for Department:")
print(classification_report(y_test_dept, y_pred_dept, target_names=le_department.classes_))



--- Evaluating Department Model ---
Logistic Regression Accuracy for Department: 0.5880

Classification Report for Department:
                                 precision    recall  f1-score   support

           Billing and Payments       0.84      0.82      0.83       604
               Customer Service       0.51      0.51      0.51       896
                General Inquiry       0.73      0.43      0.54        84
                Human Resources       0.80      0.56      0.66       114
                     IT Support       0.50      0.46      0.48       700
                Product Support       0.51      0.55      0.53      1108
          Returns and Exchanges       0.65      0.49      0.56       293
            Sales and Pre-Sales       0.61      0.50      0.55       177
Service Outages and Maintenance       0.75      0.63      0.69       231
              Technical Support       0.58      0.65      0.61      1723

                       accuracy                           0.59     

In [31]:
print("\n--- Evaluating Priority Model ---")
y_pred_prio = log_reg_prio.predict(X_test_prio)
accuracy_prio = accuracy_score(y_test_prio, y_pred_prio)
print(f"Logistic Regression Accuracy for Priority: {accuracy_prio:.4f}")

# Generate a detailed classification report for Priority
print("\nClassification Report for Priority:")
print(classification_report(y_test_prio, y_pred_prio, target_names=le_priority.classes_))



--- Evaluating Priority Model ---
Logistic Regression Accuracy for Priority: 0.6046

Classification Report for Priority:
              precision    recall  f1-score   support

        high       0.64      0.64      0.64      2302
         low       0.53      0.48      0.50      1203
      medium       0.61      0.63      0.62      2425

    accuracy                           0.60      5930
   macro avg       0.59      0.58      0.59      5930
weighted avg       0.60      0.60      0.60      5930



In [39]:

# --- 9. Prediction Function ---
def predict_ticket_category(description: str) -> tuple[str, str]:
   
    print(f"\nPredicting for new ticket: '{description}'")
    # Preprocess the new description
    clean_description = preprocess_text(description)

    # Transform using the trained TF-IDF vectorizer
    description_tfidf = tfidf_vectorizer.transform([clean_description])

    # Predict Department
    predicted_dept_encoded = log_reg_dept.predict(description_tfidf)
    predicted_department = le_department.inverse_transform(predicted_dept_encoded)[0]

    # Predict Priority
    predicted_prio_encoded = log_reg_prio.predict(description_tfidf)
    predicted_priority = le_priority.inverse_transform(predicted_prio_encoded)[0]

    return predicted_department, predicted_priority

# --- Example Usage of the Prediction Function ---
if __name__ == '__main__':
    # This block will only run when the script is executed directly
    # and not when imported as a module.
    new_ticket_description = """I am writing to report persistent and highly disruptive issues with my company email (Outlook 365) and calendar synchronization. This problem began immediately after the system-wide software update that was pushed out last Tuesday.

Specifically, my Outlook email client on my desktop (Windows 10, Dell Latitude 7420) is failing to sync new emails in real-time. There's a significant delay, often up to 15-20 minutes, before new messages appear in my inbox. I've tried restarting Outlook, restarting my laptop, and even checking my internet connection (which is stable and fast). The issue persists whether I'm connected via Wi-Fi or Ethernet.

Furthermore, my Outlook calendar is not syncing correctly with my mobile device (iPhone 13, iOS 17.5.1). Meetings I accept or create on my desktop do not show up on my phone, and vice-versa. This is causing me to miss important appointments and double-book myself, leading to significant professional embarrassment and impacting project deadlines. I've already tried re-adding my email account on my iPhone, but the problem remains. I also noticed that shared calendars are particularly affected; updates from my team members' calendars are not reflecting on my end.

I rely heavily on real-time email communication and an accurate calendar for my role in project management. This ongoing issue is severely hindering my productivity and ability to collaborate effectively. Could you please investigate this matter as soon as possible? I am available for a remote session or a desk visit at your earliest convenience"""
    predicted_dept, predicted_prio = predict_ticket_category(new_ticket_description)

    print(f"Predicted Department: {predicted_dept}")
    print(f"Predicted Priority: {predicted_prio}")




Predicting for new ticket: 'I am writing to report persistent and highly disruptive issues with my company email (Outlook 365) and calendar synchronization. This problem began immediately after the system-wide software update that was pushed out last Tuesday.

Specifically, my Outlook email client on my desktop (Windows 10, Dell Latitude 7420) is failing to sync new emails in real-time. There's a significant delay, often up to 15-20 minutes, before new messages appear in my inbox. I've tried restarting Outlook, restarting my laptop, and even checking my internet connection (which is stable and fast). The issue persists whether I'm connected via Wi-Fi or Ethernet.

Furthermore, my Outlook calendar is not syncing correctly with my mobile device (iPhone 13, iOS 17.5.1). Meetings I accept or create on my desktop do not show up on my phone, and vice-versa. This is causing me to miss important appointments and double-book myself, leading to significant professional embarrassment and impacti

### logistic Regression output
1. for Department :- 58.80%
2. for Priority :- 60.46%

In [None]:
import pickle as pkl
# Define file paths for saving
tfidf_vectorizer_path = 'tfidf_vectorizer.pkl'
le_department_path = 'le_department.pkl'
le_priority_path = 'le_priority.pkl'
log_reg_dept_path = 'log_reg_dept_model.pkl'
log_reg_prio_path = 'log_reg_prio_model.pkl'

In [41]:
# Save TF-IDF Vectorizer
with open(tfidf_vectorizer_path, 'wb') as f:
    pkl.dump(tfidf_vectorizer, f)
print(f"TF-IDF Vectorizer saved to {tfidf_vectorizer_path}")

# Save Department LabelEncoder
with open(le_department_path, 'wb') as f:
    pkl.dump(le_department, f)
print(f"Department LabelEncoder saved to {le_department_path}")

# Save Priority LabelEncoder
with open(le_priority_path, 'wb') as f:
    pkl.dump(le_priority, f)
print(f"Priority LabelEncoder saved to {le_priority_path}")

# Save Department Logistic Regression Model
with open(log_reg_dept_path, 'wb') as f:
    pkl.dump(log_reg_dept, f)
print(f"Department Logistic Regression Model saved to {log_reg_dept_path}")

# Save Priority Logistic Regression Model
with open(log_reg_prio_path, 'wb') as f:
    pkl.dump(log_reg_prio, f)
print(f"Priority Logistic Regression Model saved to {log_reg_prio_path}")

print("\nAll models and transformers exported successfully!")

TF-IDF Vectorizer saved to tfidf_vectorizer.pkl
Department LabelEncoder saved to le_department.pkl
Priority LabelEncoder saved to le_priority.pkl
Department Logistic Regression Model saved to log_reg_dept_model.pkl
Priority Logistic Regression Model saved to log_reg_prio_model.pkl

All models and transformers exported successfully!
