In [1]:
pip install nltk scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    #Lowercasing: We convert the text to lowercase to standardize it.
    text = text.lower()

    # Remove stopwords
    #Stopword Removal: We remove common words (like "the", "and", etc.) that don't add much value to classification.
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]

    # Apply Lemmatization
    #Lemmatization: We use WordNetLemmatizer to convert words to their base form (e.g., "better" to "good"
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Join words back into a single string
    return ' '.join(words)
#10 sports-related documents: These are now varied and cover a broad range of sports.
# Updated and expanded documents (sports vs technology)
documents = [
    # Sports-related documents
    "The football match was thrilling and intense!",
    "A new football player just broke a world record for goals.",
    "The basketball team celebrated after winning the final match.",
    "Baseball season is starting soon, and fans are excited.",
    "The soccer game was a nail-biting experience.",
    "Athletes are training hard for the upcoming tournament.",
    "The tennis match was fast-paced with many exciting rallies.",
    "The volleyball championship will be held next month.",
    "Cycling competitions are becoming more popular worldwide.",
    "Olympic athletes are preparing for the next games.",

    #10 technology-related documents: Similarly, these cover various technologies like AI, quantum computing, 5G, and others.
    # Technology-related documents
    "The new smartphone features amazing AI capabilities.",
    "Artificial intelligence is shaping the future of technology.",
    "Quantum computing is revolutionizing the tech industry.",
    "The tech conference introduced several new innovations.",
    "New advancements in 5G technology will change the world.",
    "Electric vehicles are becoming a popular choice among consumers.",
    "The tech startup just raised millions in funding for their app.",
    "Blockchain technology is being integrated into various industries.",
    "Virtual reality is becoming mainstream in entertainment.",
    "Wearable tech like smartwatches is growing rapidly."
]

#
# Labels (1 for Sports, 0 for Technology)
labels = [1] * 10 + [0] * 10  # 10 Sports and 10 Technology labels

# Preprocess the documents
processed_documents = [preprocess_text(doc) for doc in documents]

# Vectorize the text data using TF-IDF (Term Frequency - Inverse Document Frequency)
# We use TfidfVectorizer from sklearn to convert the documents into a TF-IDF matrix. 
# This method calculates the importance of a word in a document by considering its frequency and its inverse frequency across all documents.
# This helps in giving less weight to common words and more weight to words that are unique to a particular document.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_documents)

# Split the data into training and testing sets
#We split the dataset into training and testing sets (67% training and 33% testing) using
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.33, random_state=42)

# Train a Logistic Regression model
# We use Logistic Regression, which is a powerful classification model for text. It works well with high-dimensional data (like text data).
# After training the model, we use it to predict labels on the test set.

model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
# We use accuracy_score to evaluate the model’s performance on the test data.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print the classification report
# We also print a detailed classification report that includes precision, recall, and F1-score for each class (Sports and Technology).
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Example prediction
sample_text = "AI technology is transforming the way we live and work."
processed_sample = preprocess_text(sample_text)
sample_vector = vectorizer.transform([processed_sample])
predicted_label = model.predict(sample_vector)

category = "Sports" if predicted_label == 1 else "Technology"
print(f"\nThe document is categorized as: {category}")

#Classification Report: The report provides detailed metrics for both classes (Sports and Technology). We now expect the model to be able to distinguish between these categories more effectively.
# This provides detailed performance metrics like:
#Precision: The percentage of correctly predicted instances for each class.
#Recall: The percentage of actual instances for each class that were correctly identified.
#F1-score: The harmonic mean of precision and recall.

#Even Larger Dataset: You can improve the model by adding even more training data (ideally in the tens of thousands of samples) to capture more variations in sports and technology text.

Accuracy: 42.86%

Classification Report:
              precision    recall  f1-score   support

           0       0.43      1.00      0.60         3
           1       0.00      0.00      0.00         4

    accuracy                           0.43         7
   macro avg       0.21      0.50      0.30         7
weighted avg       0.18      0.43      0.26         7


The document is categorized as: Technology


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/misha_personal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/misha_personal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/misha_personal/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
