In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import LinearSVC
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score

In [18]:
# Load the dataset from JSON file
data = pd.read_json('idmanual (1).json')
columns = data.columns

In [19]:
# Drop unwanted columns
data = data.drop(['status'], axis=1)


In [20]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['description'], data['class_id'], test_size=0.2, random_state=42)

In [21]:
# Create TF-IDF vectorizer with NLP techniques
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 2), lowercase=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dudec\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# Fit and transform the training data
X_train_vec = vectorizer.fit_transform(X_train.apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()])))

# Transform the testing data
X_test_vec = vectorizer.transform(X_test.apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()])))

In [23]:
# Perform cross-validation
classifier = LinearSVC()
cv_scores = cross_val_score(classifier, X_train_vec, y_train, cv=5)
average_accuracy = cv_scores.mean()
print("Cross-Validation Accuracy: {:.2f}%".format(average_accuracy * 100))




Cross-Validation Accuracy: 81.04%


In [24]:
# Train the final model on the entire training set
classifier.fit(X_train_vec, y_train)


LinearSVC()

In [25]:
# Predict the class_id for test data
predictions = classifier.predict(X_test_vec)

In [26]:
# Evaluate the model on the separate testing set
accuracy = accuracy_score(y_test, predictions)
print("Testing Set Accuracy: {:.2f}%".format(accuracy * 100))

Testing Set Accuracy: 82.10%


In [27]:
# Save the trained model and vectorizer
joblib.dump(classifier, "svm_model.joblib")
joblib.dump(vectorizer, "vectorizer.joblib")

['vectorizer.joblib']

In [45]:
# User input for prediction
user_input = input("Enter a description: ")
user_input_vec = vectorizer.transform([user_input])
prediction = classifier.predict(user_input_vec)

print("Prediction: ", prediction)

Enter a description: i need new beer barnd
Prediction:  ['032']
