In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import wordnet
import random
import nltk

# Download the WordNet resource
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load data
data = {
    'text': [
        "This is a sports article about football.",
        "The new scientific discovery is groundbreaking.",
        "The stock market is showing positive growth.",
        "Astronomers observe a rare cosmic event.",
        "The basketball team won the championship.",
        "Scientists conduct an experiment in the lab.",
        "The company's revenue increased this quarter.",
    ],
    'label': ['Sport', 'Science', 'Business', 'Science', 'Sport', 'Science', 'Business']
}

# Define a function for data augmentation with synonyms
def augment_with_synonyms(text):
    words = text.split()
    augmented_texts = []
    for idx, word in enumerate(words):
        synsets = wordnet.synsets(word)
        if synsets:
            synonyms = [synset.lemmas()[0].name() for synset in synsets]
            for synonym in synonyms:
                words_copy = words.copy()
                words_copy[idx] = synonym
                augmented_texts.append(" ".join(words_copy))
    return augmented_texts

# Augment the data using synonyms
data_augmented = {'text': [], 'label': []}
for idx, row in enumerate(data['text']):
    augmented_texts = augment_with_synonyms(row)
    data_augmented['text'].extend(augmented_texts)
    data_augmented['label'].extend([data['label'][idx]] * len(augmented_texts))

# Combine original data and augmented data
data_combined = {
    'text': data['text'] + data_augmented['text'],
    'label': data['label'] + data_augmented['label']
}

# Convert the data into a DataFrame
df = pd.DataFrame(data_combined)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Transform the text data into TF-IDF features
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train the Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# User Interface for predicting new text class
while True:
    new_text = input("Enter a text (type 'exit' to quit): ")
    if new_text.lower() == 'exit':
        break

    # Check if the word is available in WordNet
    words = new_text.split()
    all_words_found = True
    for word in words:
        synsets = wordnet.synsets(word)
        if not synsets:
            print(f"Word not found : {word}")
            all_words_found = False
            break
    
    if not all_words_found:
        continue

    text_tfidf = vectorizer.transform([new_text])
    predicted_class = classifier.predict(text_tfidf)[0]
    print("Predicted Class:", predicted_class)

[nltk_data] Downloading package wordnet to C:\Users\THIS
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\THIS
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Accuracy: 1.0
              precision    recall  f1-score   support

    Business       1.00      1.00      1.00        21
     Science       1.00      1.00      1.00        14
       Sport       1.00      1.00      1.00         9

    accuracy                           1.00        44
   macro avg       1.00      1.00      1.00        44
weighted avg       1.00      1.00      1.00        44



Enter a text (type 'exit' to quit):  market


Predicted Class: Business


Enter a text (type 'exit' to quit):  discovery


Predicted Class: Science


Enter a text (type 'exit' to quit):  astronomers


Predicted Class: Science


Enter a text (type 'exit' to quit):  basketball


Predicted Class: Sport


Enter a text (type 'exit' to quit):  football


Predicted Class: Sport


Enter a text (type 'exit' to quit):  lab


Predicted Class: Science


Enter a text (type 'exit' to quit):  revenue


Predicted Class: Business


Enter a text (type 'exit' to quit):  scientific


Predicted Class: Science


Enter a text (type 'exit' to quit):  stock


Predicted Class: Business


Enter a text (type 'exit' to quit):  article


Predicted Class: Sport


Enter a text (type 'exit' to quit):  experiment


Predicted Class: Science


Enter a text (type 'exit' to quit):  cdbhsjhmafukjnawmkdsileji


Word not found : cdbhsjhmafukjnawmkdsileji


Enter a text (type 'exit' to quit):  navyasree


Word not found : navyasree


Enter a text (type 'exit' to quit):  mobile


Predicted Class: Science


Enter a text (type 'exit' to quit):  exit
