In [1]:
import pandas as pd
import re
import nltk
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Keep the NLTK resources inside the project; the path is resolved against the
# notebook's current working directory.
nltk_data_dir = os.path.abspath("venv/nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)

# Register the directory with NLTK only once, so re-running this cell does not
# keep appending duplicate entries to the search path.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Download the resources used below into that directory:
# tokenizer models, stop-word lists, WordNet and the English POS tagger.
for resource in ('punkt_tab', 'stopwords', 'wordnet', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource, download_dir=nltk_data_dir)

In [3]:
# Load the raw ticket dataset (path is relative to the notebook's working directory).
df = pd.read_excel('dataset/ai_dev_assignment_tickets_complex_1000.xls')
# Preview the first rows.
df.head()

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2
1,2,Can you tell me more about the UltraClean Vacu...,General Inquiry,,UltraClean Vacuum
2,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300
3,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam
4,5,Order #30903 for Vision LED TV is 13 days late...,Late Delivery,,Vision LED TV


In [4]:
# Display the raw frame (pandas truncates the middle rows in the rendered output).
df

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2
1,2,Can you tell me more about the UltraClean Vacu...,General Inquiry,,UltraClean Vacuum
2,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300
3,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam
4,5,Order #30903 for Vision LED TV is 13 days late...,Late Delivery,,Vision LED TV
...,...,...,...,...,...
995,996,I ordered EcoBreeze AC but got FitRun Treadmil...,Wrong Item,High,EcoBreeze AC
996,997,I ordered SoundWave 300 but got PowerMax Batte...,Wrong Item,Low,SoundWave 300
997,998,,Installation Issue,Medium,EcoBreeze AC
998,999,Payment issue fr mi SoundWave 300. I was debit...,Billing Problem,Low,SoundWave 300


In [5]:
# Dimensions of the raw data as (rows, columns).
df.shape

(1000, 5)

In [6]:
# Count the missing values in each column.
df.isna().sum()

ticket_id         0
ticket_text      55
issue_type       76
urgency_level    52
product           0
dtype: int64

In [7]:
# Handling missing values: drop every row where ticket_text, issue_type or
# urgency_level is missing. The frame is modified in place and the original
# row index is kept, so the index has gaps after this step.
df.dropna(subset=['ticket_text', 'issue_type', 'urgency_level'], inplace=True)

In [8]:
# Dimensions after dropping the incomplete rows.
df.shape

(826, 5)

In [9]:
# Confirm that no missing values remain in any column.
df.isna().sum()

ticket_id        0
ticket_text      0
issue_type       0
urgency_level    0
product          0
dtype: int64

In [10]:
# Inspect the frame after the incomplete rows were removed.
df

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2
2,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300
3,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam
5,6,Can you tell me more about the PhotoSnap Cam w...,General Inquiry,Medium,PhotoSnap Cam
6,7,is malfunction. It stopped working after just...,Product Defect,Low,EcoBreeze AC
...,...,...,...,...,...
994,995,Is this item in stock?,General Inquiry,High,RoboChef Blender
995,996,I ordered EcoBreeze AC but got FitRun Treadmil...,Wrong Item,High,EcoBreeze AC
996,997,I ordered SoundWave 300 but got PowerMax Batte...,Wrong Item,Low,SoundWave 300
998,999,Payment issue fr mi SoundWave 300. I was debit...,Billing Problem,Low,SoundWave 300


In [11]:
from nltk.corpus import wordnet

# Function to map POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    """Translate a POS tag into the WordNet POS constant used for lemmatizing.

    Tags beginning with J, V, N or R map to adjective, verb, noun and adverb
    respectively; any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [12]:
# English stop words held in a set, used for the membership test in preprocess_text.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance reused by every preprocess_text call.
lemmatizer = WordNetLemmatizer()

## 1. Data Preprocessing

In [13]:
def preprocess_text(text):
    """Turn one raw ticket string into a list of lemmatized tokens.

    Steps: lowercase, replace every character outside ``a-z`` with a space
    (this drops digits and punctuation), tokenize, remove English stop words,
    POS-tag the remaining tokens and lemmatize each one with its WordNet POS.

    Parameters
    ----------
    text : str
        Raw ticket text. Missing values (``None``, ``NaN``) and any other
        non-string input are treated as "no text".

    Returns
    -------
    list of str
        Lemmatized tokens; an empty list when there is nothing to process.
    """
    # Missing cells arrive as None / float NaN, which have no .lower();
    # return an empty token list instead of raising AttributeError.
    if not isinstance(text, str):
        return []

    text = re.sub(r'[^a-z]', ' ', text.lower())
    # Nothing alphabetic left (empty, digits or punctuation only): the result
    # is an empty list, so skip the NLTK calls entirely.
    if not text.strip():
        return []

    tokens = [t for t in word_tokenize(text) if t not in stop_words]
    tagged_tokens = nltk.pos_tag(tokens)
    return [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in tagged_tokens
    ]

In [14]:
# Preprocess every ticket; each cell of the new column holds a list of tokens.
df['preprocessed_tokens'] = df['ticket_text'].apply(preprocess_text)


In [15]:
# Spot-check the new token column.
df.head()

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product,preprocessed_tokens
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2,"[payment, issue, smartwatch, v, underbilled, o..."
2,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300,"[order, soundwave, get, ecobreeze, ac, instead..."
3,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam,"[face, installation, issue, photosnap, cam, se..."
5,6,Can you tell me more about the PhotoSnap Cam w...,General Inquiry,Medium,PhotoSnap Cam,"[tell, photosnap, cam, warranty, also, availab..."
6,7,is malfunction. It stopped working after just...,Product Defect,Low,EcoBreeze AC,"[malfunction, stop, work, day]"


In [16]:
# Number of tickets per issue type.
df['issue_type'].value_counts()

issue_type
Account Access        133
Billing Problem       128
Installation Issue    128
General Inquiry       127
Product Defect        110
Wrong Item            103
Late Delivery          97
Name: count, dtype: int64

In [17]:
# Number of tickets per urgency level.
df['urgency_level'].value_counts()

urgency_level
High      288
Medium    278
Low       260
Name: count, dtype: int64

In [None]:
# Save the cleaned frame without the index column.
# NOTE: 'preprocessed_tokens' holds Python lists; CSV stores them as their
# string representation, so they come back as plain strings when the file is
# read again and must be parsed (e.g. with ast.literal_eval) before use.
df.to_csv('dataset/preprocessed_dataset.csv', index=False)

In [20]:
# create a transformer to preprocess the text in future
from sklearn.preprocessing import FunctionTransformer

def preprocess_series(series):
    """Apply ``preprocess_text`` to every element of a collection of texts.

    Parameters
    ----------
    series : pandas.Series or list-like of str
        Raw ticket texts. A ``Series`` is used as-is (its index is preserved);
        any other list-like (list, tuple, 1-D array) is wrapped in a
        ``Series`` first so the transformer also accepts plain Python inputs.

    Returns
    -------
    pandas.Series
        One list of lemmatized tokens per input text.
    """
    if not isinstance(series, pd.Series):
        series = pd.Series(series)
    return series.apply(preprocess_text)

# Wrap the function as a scikit-learn transformer. validate=False passes the
# input to preprocess_series as given, without scikit-learn's input
# validation / array conversion.
preprocess_transformer = FunctionTransformer(preprocess_series, validate=False)

In [None]:
# Run the same preprocessing through the transformer wrapper.
# NOTE(review): this recomputes what 'preprocessed_tokens' already holds and
# stores token lists under a column named '..._text'. It also runs after the
# CSV export, so this column is not part of the saved file. Confirm whether
# the column is needed or whether this was only a smoke test of the wrapper.
df['preprocessed_text'] = preprocess_transformer.transform(df['ticket_text'])

In [22]:
# Final look at the processed frame.
df

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product,preprocessed_tokens
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2,"[payment, issue, smartwatch, v, underbilled, o..."
2,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300,"[order, soundwave, get, ecobreeze, ac, instead..."
3,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam,"[face, installation, issue, photosnap, cam, se..."
5,6,Can you tell me more about the PhotoSnap Cam w...,General Inquiry,Medium,PhotoSnap Cam,"[tell, photosnap, cam, warranty, also, availab..."
6,7,is malfunction. It stopped working after just...,Product Defect,Low,EcoBreeze AC,"[malfunction, stop, work, day]"
...,...,...,...,...,...,...
994,995,Is this item in stock?,General Inquiry,High,RoboChef Blender,"[item, stock]"
995,996,I ordered EcoBreeze AC but got FitRun Treadmil...,Wrong Item,High,EcoBreeze AC,"[order, ecobreeze, ac, get, fitrun, treadmill,..."
996,997,I ordered SoundWave 300 but got PowerMax Batte...,Wrong Item,Low,SoundWave 300,"[order, soundwave, get, powermax, battery, ins..."
998,999,Payment issue fr mi SoundWave 300. I was debit...,Billing Problem,Low,SoundWave 300,"[payment, issue, fr, mi, soundwave, debit, inc..."


In [23]:
import joblib

# Make sure the target folder exists; joblib.dump does not create it.
os.makedirs('models', exist_ok=True)

# NOTE: the transformer wraps functions defined in this notebook, and pickle
# stores functions by reference only. Code that loads this file must have
# `preprocess_series` (and the helpers it calls) available under the same
# module path, otherwise loading fails.
joblib.dump(preprocess_transformer, 'models/preprocess_transformer.pkl')


['models/preprocess_transformer.pkl']