In [1]:
import re
from dateutil import parser
import pandas as pd

In [4]:
# Load the preprocessed ticket dataset. The path is relative, so it resolves
# against the directory the notebook kernel was started in.
df = pd.read_csv("../dataset/preprocessed_dataset.csv")

In [5]:
# Display the loaded frame to check its columns (ticket_id, ticket_text,
# issue_type, urgency_level, product, preprocessed_tokens).
df

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product,preprocessed_tokens
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2,"['payment', 'issue', 'smartwatch', 'v', 'under..."
1,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300,"['order', 'soundwave', 'get', 'ecobreeze', 'ac..."
2,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam,"['face', 'installation', 'issue', 'photosnap',..."
3,6,Can you tell me more about the PhotoSnap Cam w...,General Inquiry,Medium,PhotoSnap Cam,"['tell', 'photosnap', 'cam', 'warranty', 'also..."
4,7,is malfunction. It stopped working after just...,Product Defect,Low,EcoBreeze AC,"['malfunction', 'stop', 'work', 'day']"
...,...,...,...,...,...,...
821,995,Is this item in stock?,General Inquiry,High,RoboChef Blender,"['item', 'stock']"
822,996,I ordered EcoBreeze AC but got FitRun Treadmil...,Wrong Item,High,EcoBreeze AC,"['order', 'ecobreeze', 'ac', 'get', 'fitrun', ..."
823,997,I ordered SoundWave 300 but got PowerMax Batte...,Wrong Item,Low,SoundWave 300,"['order', 'soundwave', 'get', 'powermax', 'bat..."
824,999,Payment issue fr mi SoundWave 300. I was debit...,Billing Problem,Low,SoundWave 300,"['payment', 'issue', 'fr', 'mi', 'soundwave', ..."


In [6]:
# Catalogue of known product names: the distinct non-null values of the
# `product` column. Used below as the lookup list for product-name extraction.
product_list = df['product'].dropna().unique().tolist()

In [7]:
# Lower-case words/phrases that flag a complaint. They are matched against
# lower-cased ticket text on word boundaries, so entries must stay lower-case.
complaint_keywords = [
    'broken',
    'damaged',
    'not working',
    'error',
    'late',
    'delay',
    'issue',
    'problem',
    'failed',
    'crash',
    'defect',
    'not received',
]

In [8]:
def extract_dates(text):
    """Return the distinct calendar dates found in ``text`` as ISO strings.

    The text is split into word tokens and every window of up to three
    consecutive tokens is passed to ``dateutil``'s strict (non-fuzzy) parser.
    Each window that parses contributes one ``YYYY-MM-DD`` string.

    Args:
        text: Ticket text to scan. Non-string values (for example the NaN
            pandas uses for a missing cell) yield an empty list.

    Returns:
        Sorted list of unique ``YYYY-MM-DD`` strings; empty when nothing
        parses as a date.
    """
    if not isinstance(text, str):
        return []

    tokens = re.findall(r'\b\w+\b', text)
    dates = set()
    for i in range(len(tokens)):
        # The last two windows are shorter than three tokens.
        # NOTE(review): dateutil fills missing fields from today's date, so a
        # lone number or month name in a short trailing window can still
        # parse as a date - confirm this is acceptable for the dataset.
        chunk = ' '.join(tokens[i:i+3])
        try:
            dt = parser.parse(chunk, fuzzy=False)
        except (ValueError, OverflowError, TypeError):
            # The window is not a date. Only parse failures are skipped, so
            # real bugs and KeyboardInterrupt are no longer swallowed.
            continue
        dates.add(str(dt.date()))
    # Sorting makes the result deterministic (set iteration order is not).
    return sorted(dates)


In [9]:
def extract_product_names(text, product_list):
    """Return the products from ``product_list`` that are mentioned in ``text``.

    Matching is a case-insensitive substring test, so a product name also
    matches when it appears inside a longer word or number.

    Args:
        text: Ticket text to search. Non-string values (for example NaN for a
            missing cell) yield an empty list.
        product_list: Iterable of product names. Falsy entries (``None``,
            empty string) are skipped.

    Returns:
        List of matching product names, in ``product_list`` order and with
        their original casing.
    """
    if not isinstance(text, str):
        return []

    # Lower-case the text once instead of once per product.
    haystack = text.lower()
    return [
        product
        for product in product_list
        if product and product.lower() in haystack
    ]


In [10]:
def extract_complaint_keywords(text, keywords):
    """Return the entries of ``keywords`` that occur in ``text`` as whole words.

    The text is lower-cased before matching and each keyword is matched
    literally between word boundaries, so keywords are expected to be
    lower-case and do not match inside longer words.

    Args:
        text: Ticket text to search. Non-string values (for example NaN for a
            missing cell) yield an empty list.
        keywords: Iterable of keyword strings or phrases.

    Returns:
        List of matching keywords, in ``keywords`` order.
    """
    if not isinstance(text, str):
        return []

    # Lower-case the text once instead of once per keyword.
    lowered = text.lower()
    return [
        kw
        for kw in keywords
        if re.search(rf'\b{re.escape(kw)}\b', lowered)
    ]

In [11]:
def extract_entities(ticket_text):
    """Run all three extractors on one ticket and bundle the results.

    Uses the notebook-level ``product_list`` and ``complaint_keywords``.

    Returns:
        Dict with keys ``product_names``, ``dates`` and
        ``complaint_keywords``, each holding the list produced by the
        corresponding extractor.
    """
    products = extract_product_names(ticket_text, product_list)
    found_dates = extract_dates(ticket_text)
    keyword_hits = extract_complaint_keywords(ticket_text, complaint_keywords)
    return dict(
        product_names=products,
        dates=found_dates,
        complaint_keywords=keyword_hits,
    )

In [12]:
# Extract entities for every ticket; each cell holds one dict of lists.
df['extracted_entities'] = df['ticket_text'].apply(extract_entities)

# Expand the dicts into one column per entity type. The new frame reuses
# df's index so the concat below aligns row-for-row even if df is not
# indexed 0..n-1.
df_entities = pd.DataFrame(df['extracted_entities'].tolist(), index=df.index)

# Drop entity columns left over from a previous run of this cell so that
# re-running it replaces them instead of appending duplicate columns.
df = pd.concat(
    [df.drop(columns=df_entities.columns, errors='ignore'), df_entities],
    axis=1,
)


In [13]:
# Spot-check the first rows: ticket text next to the three extracted columns.
df[['ticket_text', 'product_names', 'dates', 'complaint_keywords']].head()

Unnamed: 0,ticket_text,product_names,dates,complaint_keywords
0,Payment issue for my SmartWatch V2. I was unde...,[SmartWatch V2],[],[issue]
1,I ordered SoundWave 300 but got EcoBreeze AC i...,"[SoundWave 300, EcoBreeze AC]",[],[]
2,Facing installation issue with PhotoSnap Cam. ...,[PhotoSnap Cam],[2025-06-01],[issue]
3,Can you tell me more about the PhotoSnap Cam w...,[PhotoSnap Cam],[],[]
4,is malfunction. It stopped working after just...,[],[],[]


In [14]:
# Display the frame again; it now also carries `extracted_entities` and the
# per-entity columns `product_names`, `dates` and `complaint_keywords`.
df

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product,preprocessed_tokens,extracted_entities,product_names,dates,complaint_keywords
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2,"['payment', 'issue', 'smartwatch', 'v', 'under...","{'product_names': ['SmartWatch V2'], 'dates': ...",[SmartWatch V2],[],[issue]
1,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300,"['order', 'soundwave', 'get', 'ecobreeze', 'ac...","{'product_names': ['SoundWave 300', 'EcoBreeze...","[SoundWave 300, EcoBreeze AC]",[],[]
2,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam,"['face', 'installation', 'issue', 'photosnap',...","{'product_names': ['PhotoSnap Cam'], 'dates': ...",[PhotoSnap Cam],[2025-06-01],[issue]
3,6,Can you tell me more about the PhotoSnap Cam w...,General Inquiry,Medium,PhotoSnap Cam,"['tell', 'photosnap', 'cam', 'warranty', 'also...","{'product_names': ['PhotoSnap Cam'], 'dates': ...",[PhotoSnap Cam],[],[]
4,7,is malfunction. It stopped working after just...,Product Defect,Low,EcoBreeze AC,"['malfunction', 'stop', 'work', 'day']","{'product_names': [], 'dates': [], 'complaint_...",[],[],[]
...,...,...,...,...,...,...,...,...,...,...
821,995,Is this item in stock?,General Inquiry,High,RoboChef Blender,"['item', 'stock']","{'product_names': [], 'dates': [], 'complaint_...",[],[],[]
822,996,I ordered EcoBreeze AC but got FitRun Treadmil...,Wrong Item,High,EcoBreeze AC,"['order', 'ecobreeze', 'ac', 'get', 'fitrun', ...","{'product_names': ['EcoBreeze AC', 'FitRun Tre...","[EcoBreeze AC, FitRun Treadmill]",[],[]
823,997,I ordered SoundWave 300 but got PowerMax Batte...,Wrong Item,Low,SoundWave 300,"['order', 'soundwave', 'get', 'powermax', 'bat...","{'product_names': ['SoundWave 300', 'PowerMax ...","[SoundWave 300, PowerMax Battery]",[2025-04-22],[]
824,999,Payment issue fr mi SoundWave 300. I was debit...,Billing Problem,Low,SoundWave 300,"['payment', 'issue', 'fr', 'mi', 'soundwave', ...","{'product_names': ['SoundWave 300'], 'dates': ...",[SoundWave 300],[],[issue]


In [15]:
from sklearn.base import BaseEstimator, TransformerMixin
import re
from dateutil import parser

class EntityExtractor(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that extracts entities from ticket text.

    For each input text it finds mentioned product names, parseable dates and
    complaint keywords, and returns them as three list-valued columns.

    Args:
        product_list: Product names to look for (case-insensitive substring
            match). Falsy entries are skipped.
        complaint_keywords: Lower-case words/phrases to look for as whole
            words in the lower-cased text.
    """

    # Column order of the frame returned by ``transform``.
    _ENTITY_COLUMNS = ('product_names', 'dates', 'complaint_keywords')

    def __init__(self, product_list, complaint_keywords):
        self.product_list = product_list
        self.complaint_keywords = complaint_keywords

    def extract_dates(self, text):
        """Return the sorted unique ``YYYY-MM-DD`` dates found in ``text``.

        Every window of up to three consecutive word tokens is passed to
        ``dateutil``'s strict (non-fuzzy) parser. Non-string input yields
        an empty list.
        """
        if not isinstance(text, str):
            return []

        tokens = re.findall(r'\b\w+\b', text)
        dates = set()
        for i in range(len(tokens)):
            # The last two windows are shorter than three tokens.
            chunk = ' '.join(tokens[i:i+3])
            try:
                dt = parser.parse(chunk, fuzzy=False)
            except (ValueError, OverflowError, TypeError):
                # The window is not a date. Only parse failures are skipped,
                # so real bugs and KeyboardInterrupt are not swallowed.
                continue
            dates.add(str(dt.date()))
        # Sorting makes the result deterministic (set order is not).
        return sorted(dates)

    def extract_product_names(self, text):
        """Return the configured products mentioned in ``text``.

        Matching is a case-insensitive substring test. Non-string input
        yields an empty list.
        """
        if not isinstance(text, str):
            return []
        # Lower-case the text once instead of once per product.
        haystack = text.lower()
        return [prod for prod in self.product_list if prod and prod.lower() in haystack]

    def extract_complaint_keywords(self, text):
        """Return the configured keywords that occur in ``text`` as whole words.

        The text is lower-cased before matching. Non-string input yields an
        empty list.
        """
        if not isinstance(text, str):
            return []
        # Lower-case the text once instead of once per keyword.
        lowered = text.lower()
        return [kw for kw in self.complaint_keywords if re.search(rf'\b{re.escape(kw)}\b', lowered)]

    def fit(self, X, y=None):
        """No-op; the transformer learns nothing from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Extract entities for every text in ``X``.

        Args:
            X: pandas Series of ticket texts, or any other iterable of texts.

        Returns:
            DataFrame with columns ``product_names``, ``dates`` and
            ``complaint_keywords`` (each cell a list). The result carries
            ``X``'s index when ``X`` is a Series, and always has the three
            columns, even for empty input.
        """
        texts = X if isinstance(X, pd.Series) else pd.Series(list(X), dtype=object)
        records = [self.extract_entities(text) for text in texts]
        # Passing index/columns explicitly keeps row alignment with X and
        # guarantees a stable schema when there are no records.
        return pd.DataFrame(records, index=texts.index, columns=list(self._ENTITY_COLUMNS))

    def extract_entities(self, ticket_text):
        """Return a dict with the three entity lists for one ticket text."""
        return {
            'product_names': self.extract_product_names(ticket_text),
            'dates': self.extract_dates(ticket_text),
            'complaint_keywords': self.extract_complaint_keywords(ticket_text)
        }


In [16]:
from sklearn.pipeline import Pipeline

In [17]:
# Single-step pipeline wrapping the extractor, configured with the product
# catalogue and complaint keyword list built earlier in the notebook.
entity_pipeline = Pipeline(
    steps=[
        (
            'entity_extractor',
            EntityExtractor(
                product_list=product_list,
                complaint_keywords=complaint_keywords,
            ),
        ),
    ]
)

In [18]:
# Run the pipeline over the raw ticket text; the extractor's fit() is a no-op,
# so this is effectively just transform().
entity_features = entity_pipeline.fit_transform(df['ticket_text'])

In [19]:
# Display the pipeline output: one row per ticket, three list-valued columns.
entity_features

Unnamed: 0,product_names,dates,complaint_keywords
0,[SmartWatch V2],[],[issue]
1,"[SoundWave 300, EcoBreeze AC]",[],[]
2,[PhotoSnap Cam],[2025-06-01],[issue]
3,[PhotoSnap Cam],[],[]
4,[],[],[]
...,...,...,...
821,[],[],[]
822,"[EcoBreeze AC, FitRun Treadmill]",[],[]
823,"[SoundWave 300, PowerMax Battery]",[2025-04-22],[]
824,[SoundWave 300],[],[issue]


In [20]:
import joblib
# Persist the pipeline to disk; the relative path resolves against the
# notebook's working directory.
# NOTE(review): EntityExtractor is defined inside this notebook, and pickle
# stores classes by reference (module + name), so loading this file elsewhere
# presumably requires the same class to be defined/importable under the same
# module name - confirm before deploying.
joblib.dump(entity_pipeline, '../pipeline/entity_extraction_pipeline.pkl')

['../pipeline/entity_extraction_pipeline.pkl']