In [19]:
import matplotlib.pyplot as plt
import numpy as np
import polars as pol
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [10]:
df_short = pol.read_csv("../data/vodafone_data_short_head_long_emails.csv")
df_long = pol.read_csv("../data/vodafone_data_long_tail_long_emails.csv")

In [11]:
print(f"Shape of the short head dataset: {df_short.shape}")
print(f"Shape of the long tail dataset: {df_long.shape}")

Shape of the short head dataset: (11340, 9)
Shape of the long tail dataset: (1430, 9)


In [12]:
df_short.head()

Incident_No,Description,Assigned_Group,Incident_Type,Production_Category,Operational_Category,n_words,clean_text,Assigned_Group_fixed
str,str,str,str,str,str,i64,str,str
"""INC00000776689…","""CRM Applicatio…","""War Room""","""User Service R…","""Business Appli…","""Issue_Applicat…",182,"""crm applicatio…","""war room"""
"""INC00000776477…","""HR | VOICE TT …","""CM Support""","""User Service R…","""Business Appli…","""Critical Appli…",124,"""hr | voice tt …","""cm support"""
"""INC00000776473…","""Pending Active…","""L1- CRM Postpa…","""User Service R…","""Business Appli…","""Critical Appli…",173,"""pending active…","""l1-crm postpai…"
"""INC00000776402…","""AV definitions…","""L1-Wintel""","""User Service R…","""Software_Opera…","""Other_other_ot…",412,"""av definitions…","""l1-wintel"""
"""INC00000776390…","""old Doc not re…","""L2 - CRM Ops S…","""User Service R…","""Business Appli…","""Issue_Applicat…",229,"""old doc not re…","""l2-crm ops sup…"


In [13]:
df_short = df_short.with_columns(
    (pol.col("Assigned_Group_fixed").map_elements(lambda x: x).alias("label"))
)
df_long = df_long.with_columns(
    (pol.col("Assigned_Group_fixed").map_elements(lambda x: "unk")).alias("label")
)

In [14]:
df_short.head()

Incident_No,Description,Assigned_Group,Incident_Type,Production_Category,Operational_Category,n_words,clean_text,Assigned_Group_fixed,label
str,str,str,str,str,str,i64,str,str,str
"""INC00000776689…","""CRM Applicatio…","""War Room""","""User Service R…","""Business Appli…","""Issue_Applicat…",182,"""crm applicatio…","""war room""","""war room"""
"""INC00000776477…","""HR | VOICE TT …","""CM Support""","""User Service R…","""Business Appli…","""Critical Appli…",124,"""hr | voice tt …","""cm support""","""cm support"""
"""INC00000776473…","""Pending Active…","""L1- CRM Postpa…","""User Service R…","""Business Appli…","""Critical Appli…",173,"""pending active…","""l1-crm postpai…","""l1-crm postpai…"
"""INC00000776402…","""AV definitions…","""L1-Wintel""","""User Service R…","""Software_Opera…","""Other_other_ot…",412,"""av definitions…","""l1-wintel""","""l1-wintel"""
"""INC00000776390…","""old Doc not re…","""L2 - CRM Ops S…","""User Service R…","""Business Appli…","""Issue_Applicat…",229,"""old doc not re…","""l2-crm ops sup…","""l2-crm ops sup…"


In [15]:
df_long.head()

Incident_No,Description,Assigned_Group,Incident_Type,Production_Category,Operational_Category,n_words,clean_text,Assigned_Group_fixed,label
str,str,str,str,str,str,i64,str,str,str
"""INC00000776594…","""Regarding recu…","""Middleware""","""User Service R…","""Software_SubSy…","""Request_Applic…",120,"""regarding recu…","""middleware""","""unk"""
"""INC00000776263…","""KER- ODI - Ass…","""SSO - EAI""","""User Service R…","""Application_Op…","""Issue_Applicat…",127,"""ker- odi - ass…","""sso-eai""","""unk"""
"""INC00000776007…","""UPC not receiv…","""L2-MNP Operati…","""User Service R…","""Business Appli…","""Issue_Applicat…",277,"""upc not receiv…","""l2-mnp operati…","""unk"""
"""INC00000775977…","""Voucher Status…","""mpesa core""","""User Service R…","""Business Appli…","""Request_Applic…",111,"""voucher status…","""mpesa core""","""unk"""
"""INC00000775945…","""Required Chann…","""L2 - VTOPUP""","""User Service R…","""Business Appli…","""Critical Appli…",106,"""required chann…","""l2-vtopup""","""unk"""


In [16]:
df = pol.concat([df_short, df_long])
print(f"Shape of the concatenated dataframe: {df.shape}")

Shape of the concatenated dataframe: (12770, 10)


In [17]:
df = df.select(pol.col("clean_text"), pol.col("label"))
df_train, df_test = train_test_split(
    df, test_size=0.2, random_state=32, shuffle=True, stratify=df.select("label")
)

In [18]:
print(f"Shape of the training data: {df_train.shape}")
print(f"Shape of the test data: {df_test.shape}")

Shape of the training data: (10216, 2)
Shape of the test data: (2554, 2)


In [33]:
ensemble = VotingClassifier(
    estimators=[
        ("svm", SVC(kernel="rbf", random_state=42)),
        ("rf", RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)),
    ],
    voting="soft",
)

In [34]:
clf = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1, 3), stop_words="english")),
        ("tfidf", TfidfTransformer()),
        ("clf", ensemble),
    ]
)

In [35]:
clf.fit(X=df_train["clean_text"].to_list(), y=df_train["label"].to_list())

In [36]:
clf.score(X=df_test["clean_text"].to_list(), y=df_test["label"].to_list())

AttributeError: predict_proba is not available when  probability=False