In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load the data from JSON file
with open('/content/idmanual.json', 'r') as f:
    data = json.load(f)

In [None]:
# Convert data to pandas DataFrame
df = pd.DataFrame(data)

In [None]:
df.head()


Unnamed: 0,id_tx,class_id,description,status
0,009-4140,9,Bank note acceptors for separating good bank n...,A
1,009-4136,9,Fingerprint imagers,A
2,009-4133,9,Laboratory swabs [laboratory instruments],A
3,009-4131,9,Ear plugs for divers,A
4,009-4130,9,DVD recorders,A


In [None]:
# Preprocessing - Stemming and Stop-word Removal
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


In [None]:
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Stemming
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    # Remove stop words
    filtered_tokens = [token for token in stemmed_tokens if token.lower() not in stop_words]
    # Rejoin the tokens into a single string
    preprocessed_text = ' '.join(filtered_tokens)
    return preprocessed_text


In [None]:
# Apply preprocessing to the 'description' column
df['preprocessed_description'] = df['description'].apply(preprocess_text)

In [None]:
# Preprocessing - Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['preprocessed_description'])
y = df['class_id']


In [None]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)


In [None]:
# Predict on the test set
y_pred = rf_classifier.predict(X_test)


In [None]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy:.4f}')

Test Accuracy: 0.7464


In [None]:
# Perform cross-validation
cv_scores = cross_val_score(rf_classifier, X, y, cv=5)
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {np.mean(cv_scores):.4f}')



Cross-Validation Scores: [0.70564688 0.70385827 0.71064736 0.73143101 0.7334753 ]
Mean CV Accuracy: 0.7170


In [None]:
# Retrain the model with the preprocessed data
rf_classifier.fit(X, y)

In [None]:

# Preprocess and vectorize the new input
new_input =input()
new_input_preprocessed = preprocess_text(new_input)
new_input_vector = vectorizer.transform([new_input_preprocessed])

 i have a new  cement  formula


In [None]:
# Predict the class for the new input
predicted_class = rf_classifier.predict(new_input_vector)
print(f'Predicted Class: {predicted_class[0]}')

Predicted Class: 019
