## Supervised learning

This time we also load the labels of the 20 Newsgroups dataset from the scikit-learn library, so that a classifier can be trained on them.

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Fetch the dataset: `.data` holds the raw post texts, `.target` the integer
# newsgroup label of each post.
newsgroups_data = fetch_20newsgroups(subset='all')
X, y = newsgroups_data.data, newsgroups_data.target

# Split the data into train and test sets (80% / 20%); the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the raw documents to a matrix of TF-IDF features.
# The vocabulary and IDF weights are learned from the training texts only and
# then reused for the test texts, so no test information leaks into training.
# NOTE: X_train / X_test are rebound here from lists of strings to sparse
# TF-IDF matrices; later cells rely on the matrix form.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Initialize the LinearSVC model. The solver can use a random number generator
# while fitting, so the seed is pinned to make the reported scores reproducible.
model = LinearSVC(random_state=42)

# Fit the model on the train data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Print the per-class precision / recall / F1 report for the held-out test set
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.93      0.93      0.93       151
           1       0.84      0.88      0.86       202
           2       0.89      0.86      0.88       195
           3       0.77      0.78      0.78       183
           4       0.91      0.91      0.91       205
           5       0.92      0.91      0.91       215
           6       0.85      0.87      0.86       193
           7       0.93      0.95      0.94       196
           8       0.98      0.96      0.97       168
           9       0.99      0.99      0.99       211
          10       0.96      0.99      0.98       198
          11       0.98      0.97      0.97       201
          12       0.93      0.87      0.90       202
          13       0.95      0.95      0.95       194
          14       0.96      0.99      0.97       189
          15       0.97      0.98      0.97       202
          16       0.94      0.96      0.95       188
          17       0.98    

## Visualizing the results

In [2]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np


# Reduce the dimensionality of the data to two dimensions for plotting.
# TruncatedSVD is used instead of PCA because it works directly on the sparse
# TF-IDF matrix. PCA required `.toarray()`, which materialises one dense column
# per vocabulary term for every document and can exhaust memory on this corpus.
# The object keeps the name `pca` so that code referring to it keeps working.
pca = TruncatedSVD(n_components=2, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Train a second LinearSVC on the two projected features only, so that its
# decision regions can be drawn in the plane. The seed is pinned for
# reproducibility.
model_pca = LinearSVC(random_state=42)
model_pca.fit(X_train_pca, y_train)

# The function to visualize the decision boundaries
def plot_decision_boundary(model, X, y, h=.02, padding=1, ax=None):
    """Draw a classifier's decision regions in a 2-D plane with the samples on top.

    A regular mesh covering the samples (plus ``padding`` on every side) is
    classified with ``model.predict``; the predicted labels are drawn as
    filled regions and the samples are scattered over them. Regions and
    points share one colour scale, so a point has the same colour as the
    region of its class.

    Parameters
    ----------
    model : object
        Anything with a ``predict`` method that accepts an ``(n_points, 2)``
        array and returns one numeric label per row.
    X : array-like of shape (n_samples, 2)
        Sample coordinates; column 0 is plotted on x, column 1 on y.
    y : array-like of shape (n_samples,)
        Numeric class label of each sample (integer-coded labels are assumed
        for the colour levels).
    h : float, default .02
        Step size of the mesh along both axes.
    padding : float, default 1
        Margin added around the data range on every side.
    ax : matplotlib Axes, optional
        Axes to draw on; the current pyplot axes are used when omitted.

    Raises
    ------
    ValueError
        If ``X`` does not have exactly two columns or ``h`` is not positive.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.ndim != 2 or X.shape[1] != 2:
        raise ValueError("X must have shape (n_samples, 2), got %r" % (X.shape,))
    if h <= 0:
        raise ValueError("h must be positive, got %r" % (h,))
    if ax is None:
        ax = plt.gca()

    # Plot extent: the data range widened by `padding`; reused for the mesh
    # and for the axis limits so both always agree.
    x_min, x_max = X[:, 0].min() - padding, X[:, 0].max() + padding
    y_min, y_max = X[:, 1].min() - padding, X[:, 1].max() + padding
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Obtain a label for each mesh point and fold it back onto the grid
    Z = np.asarray(model.predict(np.c_[xx.ravel(), yy.ravel()])).reshape(xx.shape)

    # One colour scale for regions and points: half-integer levels put every
    # integer label in the middle of its own band, and the same vmin/vmax is
    # handed to the scatter so both map a label to the same colour.
    labels = np.union1d(Z.ravel(), y)
    vmin, vmax = labels.min() - 0.5, labels.max() + 0.5
    levels = np.arange(vmin, vmax + 1)

    # Put the result into a color plot
    ax.contourf(xx, yy, Z, levels=levels, vmin=vmin, vmax=vmax, alpha=0.8)

    # Plot also the original points
    ax.scatter(X[:, 0], X[:, 1], c=y, vmin=vmin, vmax=vmax,
               edgecolors='k', linewidth=1, marker='o', s=65)
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)

# Visualize the model: decision regions of the 2-D classifier with the test
# samples drawn on top, coloured by their true label.
plot_decision_boundary(model_pca, X_test_pca, y_test)
# Title and axis labels let the figure stand alone when the notebook is skimmed.
plt.title("LinearSVC decision regions on the 2-D projection (test set)")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.show()