In [1]:
pip install nltk scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Fetch the NLTK corpora used below: the stop-word list and the WordNet data
# (plus the Open Multilingual WordNet package) needed by the lemmatizer.
for resource_name in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource_name)

# Text preprocessing function
#Lowercasing: Converts the text to lowercase so that "Football" and "football" are treated as the same word.
def preprocess_text(text):
    """Normalize a raw document into a space-separated string of tokens.

    Steps, in order: lowercase, split on whitespace, trim punctuation from
    the edges of each token, drop English stop words, stem with
    PorterStemmer, then lemmatize with WordNetLemmatizer.

    Args:
        text: The raw document as a string.

    Returns:
        A single string of the processed tokens joined by spaces. The
        string is empty when no token survives the filtering.
    """
    # Lowercasing: "Football" and "football" are treated as the same word.
    text = text.lower()

    # Stopword removal: drop common words (like "the", "and") that carry no
    # useful signal for classification.
    stop_words = set(stopwords.words('english'))

    # Trim punctuation from both ends of every token. Without this, tokens
    # such as "intense!" or "features." never match a stop word and are not
    # reduced by the stemmer, so "features." and "features" would end up as
    # different features. Only the edges are trimmed, so contractions such
    # as "don't" keep their inner apostrophe.
    words = [word.strip(string.punctuation) for word in text.split()]

    # Tokens that were pure punctuation are now empty and are dropped too.
    words = [word for word in words if word and word not in stop_words]

    # Stemming: reduce words to a root form using the PorterStemmer.
    # For example, "running" becomes "run".
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    # Lemmatization: map each token to its WordNet dictionary form.
    # NOTE: lemmatize() is called without a part-of-speech argument, so it
    # uses the library default (noun, per the NLTK docs); adjective examples
    # like "better" -> "good" would need pos='a'. Because the tokens have
    # already been stemmed, this step changes few of them.
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Join words back into a single string
    return ' '.join(words)

# Sample documents (sports vs technology)
documents = [
    "The football match was thrilling and intense!",
    "A new smartphone has been released with great features.",
    "The team won the championship and celebrated with fans.",
    "The new laptop features faster processing and more storage.",
    "Basketball players are working hard for the upcoming season.",
    "Technology companies are introducing new AI systems."
]

# Labels (1 for Sports, 0 for Technology)
labels = [1, 0, 1, 0, 1, 0]

# Preprocess the documents
processed_documents = [preprocess_text(doc) for doc in documents]

# Split the data into training and testing sets BEFORE vectorizing, so the
# vocabulary is learned from the training documents only. Fitting the
# vectorizer on every document would leak information about the held-out
# documents into the features the model is trained on.
docs_train, docs_test, y_train, y_test = train_test_split(
    processed_documents, labels, test_size=0.33, random_state=42
)

# Vectorize the text data (convert text to numerical format).
# CountVectorizer builds a "bag of words" representation, where each document
# is represented by the frequency of the words in it. The vocabulary is fitted
# on the training split and then reused, unchanged, for the test split.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)

# Train a Naive Bayes model.
# MultinomialNB is a simple but effective model for text classification on
# word-count features.
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model on the held-out documents
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Example prediction: the sample goes through the same preprocessing and the
# already-fitted vectorizer as the training data.
sample_text = "The new AI technology is revolutionizing industries."
processed_sample = preprocess_text(sample_text)
sample_vector = vectorizer.transform([processed_sample])
predicted_label = model.predict(sample_vector)

# predict() returns an array with one entry per input row; index the single
# prediction instead of relying on the truth value of an array comparison.
category = "Sports" if predicted_label[0] == 1 else "Technology"
print(f"The document is categorized as: {category}")
# NOTE: with six documents and test_size=0.33 only two documents are held out,
# so the reported accuracy can only be 0%, 50% or 100% and is not a reliable
# estimate of how the model would perform on a larger dataset.

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/misha_personal/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/misha_personal/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/misha_personal/nltk_data...


Accuracy: 100.00%
The document is categorized as: Technology
