In [1]:
import pickle

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [2]:
# Load the pre-cleaned train and test splits from the working directory.
train_df, test_df = pd.read_csv("clean_train.csv"), pd.read_csv("clean_test.csv")

# Quick sanity check: (rows, columns) of each split, one per line.
print(train_df.shape, test_df.shape, sep="\n")


(10780, 7)
(2697, 7)


In [3]:
# TF-IDF features over the cleaned text, keeping at most 10,000 terms.
vectorizer = TfidfVectorizer(max_features=10000)

# The vocabulary/IDF weights are learned from the training split only
# (fit_transform); the test split is merely projected onto them (transform).
# Tuple elements are evaluated left to right, so the fit happens first.
X_train, X_test = (
    vectorizer.fit_transform(train_df['clean_text']),
    vectorizer.transform(test_df['clean_text']),
)

# Targets for the category classifier.
y_train, y_test = train_df['category'], test_df['category']

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


X_train shape: (10780, 7575)
X_test shape: (2697, 7575)


In [4]:
# Category classifier: logistic regression on the TF-IDF features.
# max_iter is raised to 1000 to give the solver more iterations to converge.
# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

print("Model training completed.")


Model training completed.


In [5]:
# Score the category classifier on the held-out test split.
y_pred = model.predict(X_test)

# Overall accuracy first, then per-class precision / recall / F1.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred),
    sep="\n",
)


Accuracy: 0.9840563589173156

Classification Report:

              precision    recall  f1-score   support

       forum       0.98      0.98      0.98       450
  promotions       0.99      0.99      0.99       449
social_media       0.98      0.98      0.98       449
        spam       0.98      0.98      0.98       449
     updates       0.98      0.98      0.98       449
 verify_code       0.99      1.00      0.99       451

    accuracy                           0.98      2697
   macro avg       0.98      0.98      0.98      2697
weighted avg       0.98      0.98      0.98      2697



In [6]:
# Persist the fitted category classifier together with the TF-IDF vectorizer
# that produced its training features; both are needed to score new text.
#
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically, even if pickling raises. Passing a bare open(...) to
# pickle.dump leaves closing the file to the garbage collector.
#
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# these artifacts from a trusted location.
with open("category_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("Model and vectorizer saved successfully.")


Model and vectorizer saved successfully.


In [7]:
def assign_urgency(text):
    """Assign a rule-based urgency label to a piece of text.

    The text is lower-cased and scanned for fixed keyword lists. "High"
    keywords take precedence over "Medium" ones; text that matches neither
    list is "Low".

    Matching is by plain substring, not whole word, so for example
    "helpful" matches the medium keyword "help" and "terror" matches the
    high keyword "error".

    Parameters
    ----------
    text : str
        The text to label. Non-string values (e.g. ``None`` or the ``NaN``
        that pandas produces for empty CSV cells) are treated as having no
        text and are labelled "Low" instead of raising ``AttributeError``.

    Returns
    -------
    str
        One of "High", "Medium" or "Low".
    """
    # Missing / non-text cells contain no keywords by definition.
    if not isinstance(text, str):
        return "Low"

    lowered = text.lower()

    high_keywords = ("urgent", "asap", "immediately", "not working", "failed", "error")
    medium_keywords = ("issue", "problem", "delay", "help", "support")

    # Check the high tier first so it wins when both tiers match.
    if any(keyword in lowered for keyword in high_keywords):
        return "High"

    if any(keyword in lowered for keyword in medium_keywords):
        return "Medium"

    return "Low"


In [8]:
# Derive a keyword-based urgency label for every row of both splits.
# Series.map applies assign_urgency element-wise to each cleaned text.
train_df["urgency"] = train_df["clean_text"].map(assign_urgency)
test_df["urgency"] = test_df["clean_text"].map(assign_urgency)

# Class balance of the generated training labels (displayed as the cell output).
train_df["urgency"].value_counts()


urgency
Low       8730
Medium    1326
High       724
Name: count, dtype: int64

In [9]:
# Targets for the urgency classifier: the "urgency" column of each split.
y_train_urgency, y_test_urgency = train_df["urgency"], test_df["urgency"]


In [10]:
# Urgency classifier: reuses the TF-IDF features built for the category model.
# class_weight="balanced" reweights classes inversely to their frequency,
# since the value counts above show "Low" far outnumbering the other labels.
# NOTE(review): the targets are produced by keyword rules applied to the same
# clean_text the features come from, so this model is fitted to reproduce
# that rule rather than an independent judgement of urgency.
urgency_model = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
).fit(X_train, y_train_urgency)

print("Urgency model trained successfully.")


Urgency model trained successfully.


In [11]:
# Score the urgency classifier on the held-out test split.
# NOTE(review): the reference labels come from the keyword rule applied to the
# same text, so these scores measure agreement with that rule.
y_pred_urgency = urgency_model.predict(X_test)

print("Urgency Accuracy:", accuracy_score(y_test_urgency, y_pred_urgency))
print(
    "\nUrgency Classification Report:\n",
    classification_report(y_test_urgency, y_pred_urgency),
    sep="\n",
)


Urgency Accuracy: 0.9888765294771968

Urgency Classification Report:

              precision    recall  f1-score   support

        High       0.99      0.93      0.96       172
         Low       0.99      1.00      0.99      2179
      Medium       0.98      0.97      0.98       346

    accuracy                           0.99      2697
   macro avg       0.99      0.97      0.98      2697
weighted avg       0.99      0.99      0.99      2697



In [12]:
# Persist the fitted urgency classifier. It was trained on features from the
# same TF-IDF vectorizer as the category model, so that vectorizer is needed
# to score new text with it.
#
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically, even if pickling raises. Passing a bare open(...) to
# pickle.dump leaves closing the file to the garbage collector.
#
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# this artifact from a trusted location.
with open("urgency_model.pkl", "wb") as urgency_model_file:
    pickle.dump(urgency_model, urgency_model_file)

print("Urgency model saved successfully.")


Urgency model saved successfully.


In [13]:
# Write the training split (now carrying the "urgency" column) back to disk,
# without the index. NOTE: this overwrites the same file the notebook loads
# in its first data cell.
train_df.to_csv(path_or_buf="clean_train.csv", index=False)


In [14]:
# Write the test split (now carrying the "urgency" column) back to disk,
# without the index. NOTE: this overwrites the same file the notebook loads
# in its first data cell.
test_df.to_csv(path_or_buf="clean_test.csv", index=False)
