In [1]:
import pandas as pd
import string
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator, TransformerMixin

In [70]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\parma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\parma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\parma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [71]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [72]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r'\d+', '', text)
        text = text.translate(str.maketrans('', '', string.punctuation))
        tokens = nltk.word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens if word not in self.stop_words]
        return " ".join(tokens)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.Series(X).apply(self.clean_text)


In [73]:
df = pd.read_csv(r'C:\Users\parma\OneDrive\Desktop\govSer\grievance_data.csv')

In [74]:
X = df['complaint_text']
yd = df['department']
yu = df['urgency']

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, yd, test_size=0.2, random_state=42, stratify=yd)
X_train_u, X_test_u, y_train_u, y_test_u = train_test_split(X, yu, test_size=0.2, random_state=42)

In [79]:
dept_pipeline = Pipeline([
    ('preprocess',TextPreprocessor()),
    ('tfidf', TfidfVectorizer(ngram_range=(1,2))),
    ('clf', LogisticRegression(max_iter = 1000))
])
urgency_pipeline = Pipeline([
    ('preprocess', TextPreprocessor()),
    ('tfidf', TfidfVectorizer(ngram_range=(1,2))),
    ('clf', LogisticRegression(max_iter = 1000))
])

In [80]:
dept_pipeline.fit(X_train, y_train)
urgency_pipeline.fit(X_train_u, y_train_u)

In [81]:
yd_pred = dept_pipeline.predict(X_test)
print("Department Classification Report:")
print(classification_report(y_test, y_pred))

Department Classification Report:
              precision    recall  f1-score   support

 Electricity       0.00      0.00      0.00         3
Municipality       0.40      0.33      0.36         6
         PWD       0.20      0.25      0.22         4
  Sanitation       0.00      0.00      0.00         4
 Water Board       0.00      0.00      0.00         5

    accuracy                           0.14        22
   macro avg       0.12      0.12      0.12        22
weighted avg       0.15      0.14      0.14        22



In [82]:
yu_pred = urgency_pipeline.predict(X_test_u)
print("Urgency Classification Report:")
print(classification_report(y_test_u, yu_pred))

Urgency Classification Report:
              precision    recall  f1-score   support

        High       0.44      0.64      0.52        11
         Low       0.00      0.00      0.00         0
      Medium       0.20      0.09      0.13        11

    accuracy                           0.36        22
   macro avg       0.21      0.24      0.21        22
weighted avg       0.32      0.36      0.32        22



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [83]:
test_text = "There's a brpken electric pole in the street"
predicted_dept = dept_pipeline.predict([test_text])[0]
print("Predicted Department:", predicted_dept)
predicted_urgency = urgency_pipeline.predict([test_text])[0]
print("Predicted level of urgency:", predicted_urgency)

Predicted Department: Electricity
Predicted level of urgency: High


In [84]:
import pickle
pickle.dump(dept_pipeline, open('dept_pipeline.pkl', 'wb'))
pickle.dump(urgency_pipeline, open('urgency_pipeline.pkl', 'wb'))

In [87]:
#test prediction function
def predict_complaint(text):
    dept_model = pickle.load(open("dept_pipeline.pkl", "rb"))
    urg_model = pickle.load(open("urgency_pipeline.pkl", "rb"))

    dept = dept_model.predict([text])[0]
    urgency = urg_model.predict([text])[0]

    return {
        "text":text,
        "predicted_dept":dept,
        "predicted_urg":urgency
    }

res = predict_complaint("Power cut in Rajendra Nagar since last night")
print(res)

{'text': 'Power cut in Rajendra Nagar since last night', 'predicted_dept': 'Electricity', 'predicted_urg': 'High'}
