In [None]:
# Load and preprocess the data
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to C:\Users\PARMEET
[nltk_data]     KAUR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\PARMEET
[nltk_data]     KAUR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\PARMEET
[nltk_data]     KAUR\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\PARMEET
[nltk_data]     KAUR\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [34]:
import pandas as pd

# Load your Excel file
df = pd.read_excel("ai_dev_assignment_tickets_complex_1000.xls")
print(df[['issue_type', 'urgency_level']].isnull().sum())
df = df.dropna(subset=['issue_type', 'urgency_level'])

df['ticket_text'] = df['ticket_text'].fillna('')
# Check it's loaded
df.head()


issue_type       76
urgency_level    52
dtype: int64


Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2
2,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300
3,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam
5,6,Can you tell me more about the PhotoSnap Cam w...,General Inquiry,Medium,PhotoSnap Cam
6,7,is malfunction. It stopped working after just...,Product Defect,Low,EcoBreeze AC


In [10]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer



# Hardcoded stopword list (simplified)
stop_words = set([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "he", "him", "his",
    "she", "her", "hers", "it", "its", "they", "them", "their", "theirs",
    "what", "which", "who", "whom", "this", "that", "these", "those",
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "a", "an", "the",
    "and", "but", "if", "or", "because", "as", "until", "while",
    "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under", "again", "further",
    "then", "once", "here", "there", "when", "where", "why", "how", "all", "any",
    "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor",
    "not", "only", "own", "same", "so", "than", "too", "very", "can", "will", "just"
])

# Simple Lemmatizer (fallback if WordNet fails)
def basic_lemmatize(word):
    # Very basic rules (for noun/plural, verb endings)
    word = re.sub(r'(ing|ed|s)$', '', word)
    return word

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [basic_lemmatize(w) for w in tokens]
    return ' '.join(tokens)

In [11]:
import pandas as pd
df = pd.read_excel("ai_dev_assignment_tickets_complex_1000.xls")

df['clean_text'] = df['ticket_text'].apply(clean_text)

# Check results
print(df[['ticket_text', 'clean_text']].head())


                                         ticket_text  \
0  Payment issue for my SmartWatch V2. I was unde...   
1  Can you tell me more about the UltraClean Vacu...   
2  I ordered SoundWave 300 but got EcoBreeze AC i...   
3  Facing installation issue with PhotoSnap Cam. ...   
4  Order #30903 for Vision LED TV is 13 days late...   

                                          clean_text  
0         payment issue smartwatch v underbill order  
1  tell ultraclean vacuum warranty also available...  
2  order soundwave got ecobreeze ac instead order...  
3  fac installation issue photosnap cam setup fai...  
4  order vision l tv day late order march also co...  


In [12]:
# Feature Selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob

In [13]:
# Sentiment Score
df['sentiment'] = df['ticket_text'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)

# Ticket Length
df['text_length'] = df['ticket_text'].apply(lambda x: len(str(x).split()))

# TF-IDF Features
tfidf = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf.fit_transform(df['clean_text'])

In [14]:
# Combine with custom features
import numpy as np
X = np.hstack((X_tfidf.toarray(),
               df[['sentiment', 'text_length']].values))

In [15]:
# Encode labels
issue_encoder = LabelEncoder()
urgency_encoder = LabelEncoder()
y_issue = issue_encoder.fit_transform(df['issue_type'])
y_urgency = urgency_encoder.fit_transform(df['urgency_level'])

In [25]:
# Train ML 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [30]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
import re

# 0. Define a basic text cleaning function
def clean_text(text):
    text = str(text).lower()  # Lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation/numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Apply cleaning
df['clean_text'] = df['ticket_text'].apply(clean_text)

# 1. TF-IDF on cleaned text
tfidf = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf.fit_transform(df['clean_text'])

# 2. Add extra features
df['sentiment'] = df['ticket_text'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
df['text_length'] = df['ticket_text'].apply(lambda x: len(str(x).split()))

# 3. Combine features
X = np.hstack((X_tfidf.toarray(), df[['sentiment', 'text_length']].values))

# 4. Encode targets
issue_encoder = LabelEncoder()
urgency_encoder = LabelEncoder()

y_issue = issue_encoder.fit_transform(df['issue_type'])
y_urgency = urgency_encoder.fit_transform(df['urgency_level'])


In [35]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Train-test split
X_train, X_test, y_issue_train, y_issue_test = train_test_split(X, y_issue, test_size=0.2, random_state=42)
_, _, y_urgency_train, y_urgency_test = train_test_split(X, y_urgency, test_size=0.2, random_state=42)

# Train classifiers
clf_issue = RandomForestClassifier()
clf_issue.fit(X_train, y_issue_train)

clf_urgency = RandomForestClassifier()
clf_urgency.fit(X_train, y_urgency_train)

# Evaluate
print("Issue Type Classification:\n", classification_report(y_issue_test, clf_issue.predict(X_test), target_names=issue_encoder.classes_.astype(str)))
print("Urgency Level Classification:\n", classification_report(y_urgency_test, clf_urgency.predict(X_test), target_names=urgency_encoder.classes_.astype(str)))


Issue Type Classification:
                     precision    recall  f1-score   support

    Account Access       0.91      0.94      0.92        32
   Billing Problem       0.94      0.91      0.93        35
   General Inquiry       0.71      0.96      0.82        26
Installation Issue       0.96      0.89      0.92        27
     Late Delivery       0.86      0.90      0.88        20
    Product Defect       0.90      0.82      0.86        22
        Wrong Item       0.87      0.95      0.91        21
               nan       0.00      0.00      0.00        17

          accuracy                           0.83       200
         macro avg       0.77      0.80      0.78       200
      weighted avg       0.81      0.83      0.82       200

Urgency Level Classification:
               precision    recall  f1-score   support

        High       0.29      0.33      0.31        67
         Low       0.32      0.27      0.29        64
      Medium       0.25      0.25      0.25        61
 

In [None]:
# Entity Extraction
import dateutil.parser

# Complaint Keywords (expand as needed)
keywords = ["broken", "error", "late", "missing", "damaged", "failed", "problem"]

# Product List (get from dataset column)
product_list = df['product'].dropna().unique().tolist()

def extract_entities(text):
    entities = {"product": None, "dates": [], "keywords": []}
    
    # Product
    for product in product_list:
        if product and product.lower() in text.lower():
            entities["product"] = product
            break

    # Dates
    found_dates = re.findall(r"\b(?:\d{1,2}[-/th|st|nd|rd\s]*)?(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*[-/\s]?\d{2,4}\b", text, re.IGNORECASE)
    entities["dates"] = found_dates

    # Keywords
    entities["keywords"] = [kw for kw in keywords if kw in text.lower()]
    
    return entities

<function extract_entities at 0x00000275A17F2CA0>


In [40]:
# Integration function
def analyze_ticket(text):
    clean = clean_text(text)
    tfidf_vec = tfidf.transform([clean])
    length = len(text.split())
    sentiment = TextBlob(str(text)).sentiment.polarity
    custom_feats = np.array([[sentiment, length]])
    full_input = np.hstack((tfidf_vec.toarray(), custom_feats))
    
    issue_pred = issue_encoder.inverse_transform(clf_issue.predict(full_input))[0]
    urgency_pred = urgency_encoder.inverse_transform(clf_urgency.predict(full_input))[0]
    entities = extract_entities(text)
    
    return {
        "issue_type": issue_pred,
        "urgency_level": urgency_pred,
        "entities": entities
    }
