In [11]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [12]:
# Step 1: Data Preprocessing - Training Dataset
# Read the training JSON file.
# The file is opened explicitly as UTF-8 rather than with the platform's
# locale encoding, so non-ASCII text decodes the same way on every machine.
# Later cells call train_data.items(), so the top-level JSON value is
# expected to be an object (loaded as a dict).
with open('traindata (2).json', encoding='utf-8') as file:
    train_data = json.load(file)

In [14]:
# Remove unwanted classes from training data.
# Each key is compared after stripping literal double quotes - the same
# normalisation the label-extraction step applies to keys - so a key stored
# as '"000"' is dropped just like '000'. Surviving entries keep their
# original (un-stripped) keys.
unwanted_classes = ["000", "200", "B", "A"]
train_data = {
    class_number: lines
    for class_number, lines in train_data.items()
    if class_number.strip('"') not in unwanted_classes
}

In [15]:
# Flatten the {class key: lines} mapping into two parallel lists:
# train_lines[i] is one sample and train_classes[i] is its label.
train_classes, train_lines = [], []

for raw_key, samples in train_data.items():
    # Labels are the dictionary keys with any surrounding double quotes removed.
    label = raw_key.strip('"')
    train_lines += list(samples)
    # One copy of the label for every sample stored under this key.
    train_classes += [label for _ in range(len(samples))]


In [16]:
# Step 2: Feature Extraction - Training Dataset
# Fit a TF-IDF vectorizer on the training lines and convert them into a
# numerical feature matrix. The fitted vectorizer is reused unchanged for
# the test set and for ad-hoc queries in later cells.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_lines)

# Train the Random Forest Model.
# A fixed random_state pins the forest's internal randomness, so a fresh
# Restart & Run All rebuilds the same model and reproduces the same
# evaluation numbers instead of drifting between runs.
RANDOM_SEED = 42
rf_model = RandomForestClassifier(random_state=RANDOM_SEED)
rf_model.fit(X_train, train_classes)

RandomForestClassifier()

In [17]:
# Step 3: Model Evaluation - Testing Dataset
# Read the testing JSON file.
# Opened explicitly as UTF-8 so decoding does not depend on the platform's
# locale encoding.
with open('testdata (2).json', encoding='utf-8') as file:
    test_data = json.load(file)

# Remove unwanted classes from testing data (reuses unwanted_classes from the
# training-data filter). Keys are compared after stripping literal double
# quotes - the same normalisation applied to the labels below - so a key
# stored as '"000"' is dropped just like '000'.
test_data = {
    class_number: lines
    for class_number, lines in test_data.items()
    if class_number.strip('"') not in unwanted_classes
}

# Extract class labels and lines from testing data into two parallel lists:
# test_lines[i] is one sample and test_classes[i] is its label.
test_classes = []
test_lines = []

for key, values in test_data.items():
    class_number = key.strip('"')
    # Repeat the label once per sample stored under this key.
    test_classes.extend([class_number] * len(values))
    test_lines.extend(values)

In [18]:
# Step 4: Feature Extraction - Testing Dataset
# Project the test lines into the feature space of the already-fitted
# vectorizer (transform only, no re-fitting).
X_test = vectorizer.transform(test_lines)

# Ask the trained Random Forest for one predicted label per test line.
y_pred = rf_model.predict(X_test)

# Compute both evaluation summaries first, then print them:
# the per-class report followed by the overall accuracy.
accuracy = accuracy_score(test_classes, y_pred)
report = classification_report(test_classes, y_pred)
print(report)
print("Accuracy Score:", accuracy)

              precision    recall  f1-score   support

         001       0.75      0.70      0.73       210
         002       0.85      0.82      0.83        61
         003       0.73      0.75      0.74       159
         004       0.82      0.69      0.75        52
         005       0.80      0.74      0.77       211
         006       0.81      0.87      0.83       163
         007       0.82      0.80      0.81       233
         008       0.86      0.75      0.80       100
         009       0.55      0.87      0.67       546
         010       0.73      0.69      0.71       185
         011       0.80      0.68      0.73       172
         012       0.84      0.78      0.81       155
         013       0.81      0.61      0.69        41
         014       0.69      0.54      0.61        57
         015       0.80      0.62      0.70        39
         016       0.82      0.82      0.82       288
         017       0.84      0.64      0.73        77
         018       0.69    

In [24]:
# Step 5: Prediction
# Classify a single free-text query with the fitted vectorizer and model.
query = " i need a patent for mynew beer"

# transform() is given a one-element list, so the model receives a batch
# containing just this query.
query_batch = [query]
preprocessed_query = vectorizer.transform(query_batch)
predicted_class = rf_model.predict(preprocessed_query)

# predict() returns one label per input row; show the label for our single query.
top_label = predicted_class[0]
print("Predicted Class:", top_label)

Predicted Class: 032
