In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import numpy as np

In [44]:
# Tiny hand-written corpus: (document text, class label) pairs.
# Labels must be spelled consistently -- a stray space would make a label
# a different string from its clean form.
labeled_data = [
    ("This is a document about sports", "Sports"),
    ("Gujarati news says here is a savage PM ", "News"),
    ("Ipl gets into peak in sports", "Sports"),
    ("There is no need of elections among the autcracy minded pm", "Politics"),
    ("Flute plays a good  music", "Music"),
]

# Documents without a class label; they are vectorized together with the
# labeled texts further down.
unlabeled_data = [
    "This document discusses machine learning",
    "Another document about music",
    "A short text sample",
]

In [69]:
# Full corpus for vectorization: labeled texts first, unlabeled texts after,
# so the first len(labeled_data) entries line up with the labeled pairs.
all_data = [pair[0] for pair in labeled_data]
all_data.extend(unlabeled_data)

In [68]:
# Split the (text, label) pairs into two parallel tuples.
texts = tuple(pair[0] for pair in labeled_data)
labels = tuple(pair[1] for pair in labeled_data)

In [67]:
# Learn the TF-IDF vocabulary (capped at 500 terms) on the whole corpus,
# then encode that same corpus; rows follow the order of `all_data`.
vectorizer = TfidfVectorizer(max_features=500)
vectorizer.fit(all_data)
features = vectorizer.transform(all_data)

In [70]:
# Materialize the vectorizer output as a dense array; later cells slice it
# by row (rows are in `all_data` order: labeled documents first).
features_dense = features.toarray()

In [71]:
# Distinct class names in sorted order; a name's position in this list is
# used as its integer class id in the cells below.
all_labels = list(set(labels))
all_labels.sort()

In [72]:
# One-hot encoding of the labeled documents: one row per document, one column
# per class in `all_labels`, with a 1 in the column of the document's class.
label_distributions = np.zeros((len(texts), len(all_labels)))
for one_hot_row, class_name in zip(label_distributions, labels):
    # Each row is a view into the matrix, so this writes in place.
    one_hot_row[all_labels.index(class_name)] = 1

In [78]:
# Hold out 20% of the labeled rows for evaluation. Only the first len(texts)
# rows of the dense matrix are labeled, so the split is restricted to them.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features_dense[: len(texts)],
    labels,
    random_state=2,
    test_size=0.2,
)

In [79]:
# Encode the training class names as their integer positions in `all_labels`.
y_train_indices = np.array(list(map(all_labels.index, y_train)))

In [80]:
# LabelPropagation is a semi-supervised estimator: samples whose target is -1
# are treated as unlabeled and still take part in the similarity graph.
# Fitting on the labeled training rows alone would ignore the unlabeled
# documents entirely, so append them here with the -1 marker.
n_labeled = len(texts)
X_unlabeled = features_dense[n_labeled:]  # rows after the labeled documents
X_semi = np.vstack([X_train, X_unlabeled])
y_semi = np.concatenate([y_train_indices, np.full(len(X_unlabeled), -1)])

semi_clf = LabelPropagation()
semi_clf.fit(X_semi, y_semi)

In [81]:
# Predict integer class ids (positions in `all_labels`) for the held-out rows.
predictions = semi_clf.predict(X_test)

In [82]:
# Encode the held-out class names once, using the same integer ids
# (positions in `all_labels`) that the classifier was trained on.
y_test_indices = np.array([all_labels.index(label) for label in y_test])

accuracy = accuracy_score(y_test_indices, predictions)

# Weighted precision/recall over every class present in the truth or the
# predictions. Restricting `labels` to the predicted classes would silently
# drop true classes the model never predicted and inflate the scores;
# `zero_division=0` scores such undefined per-class values as 0 instead.
precision = precision_score(
    y_test_indices, predictions, average='weighted', zero_division=0
)
recall = recall_score(
    y_test_indices, predictions, average='weighted', zero_division=0
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
