In [None]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Read the JSON file into a DataFrame.
# NOTE(review): this is an absolute path on the machine the notebook ran on —
# adjust it when running elsewhere.
dataset_path = '/content/idmanual (1).json'
data = pd.read_json(dataset_path)


In [None]:
# Drop the 'status' column.
# `data` is reassigned by this cell, so on a second execution the column is
# already gone; errors='ignore' keeps the cell re-runnable instead of raising
# KeyError.
data = data.drop(columns='status', errors='ignore')

In [None]:
# Data Cleaning
# Replace every character that is not an ASCII letter with a space.
# Missing descriptions are turned into empty strings first (and any non-string
# value into its string form) so that a single bad row cannot abort the cell
# with a TypeError; for ordinary string values the result is unchanged.
data['description'] = (
    data['description']
    .fillna('')
    .astype(str)
    .str.replace(r'[^a-zA-Z]', ' ', regex=True)
)

In [None]:
# Text Preprocessing
# The NLTK stopword corpus is not bundled with the package: on a fresh kernel
# stopwords.words() raises LookupError until the corpus has been downloaded,
# so fetch it on demand and retry.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


def preprocess_description(text):
    """Remove English stopwords from ``text`` and stem the remaining words.

    Parameters
    ----------
    text : str
        Text to process; it is tokenised by splitting on whitespace.

    Returns
    -------
    str
        The stems of the non-stopword tokens, joined by single spaces.
    """
    # Stopword membership is tested on the lower-cased token; the token itself
    # is handed to the Porter stemmer.
    return ' '.join(
        stemmer.stem(word) for word in text.split() if word.lower() not in stop_words
    )


# NOTE: this overwrites data['description'] in place, so re-running the cell
# stems text that has already been stemmed, which may alter it further.
data['description'] = data['description'].apply(preprocess_description)

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same from run to run.
descriptions = data['description']
class_ids = data['class_id']
X_train, X_test, y_train, y_test = train_test_split(
    descriptions,
    class_ids,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Feature Engineering and Model Training
# Two stages: TF-IDF vectorisation of the text, then a multinomial Naive Bayes
# classifier on the resulting features.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
]
pipeline = Pipeline(steps=pipeline_steps)


In [None]:
# 5-fold cross-validation of the pipeline, using the training split only
cv_scores = cross_val_score(estimator=pipeline, X=X_train, y=y_train, cv=5)
# Report the mean of the per-fold scores as a percentage
average_accuracy = cv_scores.mean()
print("Cross-Validation Accuracy: {:.2f}%".format(100 * average_accuracy))



Cross-Validation Accuracy: 62.47%


In [None]:
# Fit the pipeline (vectoriser and classifier) on the whole training split
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Predict a class_id for every held-out description with the fitted pipeline
predictions = pipeline.predict(X=X_test)


In [None]:
# Score the held-out predictions against the true labels and report as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Testing Set Accuracy: {:.2f}%".format(100 * accuracy))

Testing Set Accuracy: 64.33%


In [None]:
# Manual cross-check of the test accuracy: the fraction of predictions that
# equal the true label.
# NOTE(review): for these inputs this is the same quantity the accuracy_score
# cell reports, yet the two recorded outputs differ (64.33% vs 79.66%) —
# presumably the cells were executed against different kernel state. Confirm
# with Restart Kernel -> Run All.
is_correct = predictions == y_test
accuracy = is_correct.mean()
print("Testing Set Accuracy: {:.2f}%".format(accuracy * 100))

Testing Set Accuracy: 79.66%


In [None]:
# User Input and Prediction
user_input = input("Enter a description: ")

# Replace every non-letter character with a space
user_input_cleaned = re.sub('[^a-zA-Z]', ' ', user_input)

# Skip stopwords (checked on the lower-cased token) and stem everything else
kept_stems = []
for token in user_input_cleaned.split():
    if token.lower() in stop_words:
        continue
    kept_stems.append(stemmer.stem(token))
user_input_preprocessed = ' '.join(kept_stems)

# The pipeline expects an iterable of documents, hence the one-element list
prediction = pipeline.predict([user_input_preprocessed])
print("Predicted Class ID: ", prediction[0])

Enter a description: i need a fried chicken barnd
Predicted Class ID:  029
