In [None]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups corpus using its predefined train/test partition.
# The same `remove` tuple (headers, footers, quotes) is passed for both
# subsets; 'train' is fetched first, then 'test'.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, remove=('headers', 'footers', 'quotes'))
    for split in ('train', 'test')
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings.
tfidfvect = TfidfVectorizer()

# The vectorizer is fitted on the training documents only (and vectorizes them
# in the same call); the test documents are then transformed with that
# already-fitted vectorizer. The tuple is evaluated left to right, so the fit
# always happens before the test transform.
X_train, X_test = (
    tfidfvect.fit_transform(newsgroups_train.data),
    tfidfvect.transform(newsgroups_test.data),
)


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Pick 5 training documents at random; the seed is fixed so the same
# documents are drawn every time this cell runs.
np.random.seed(42)
random_docs = np.random.choice(X_train.shape[0], 5, replace=False)

for idx in random_docs:
    # Cosine similarity between the query document and every training document
    # (the query itself included); [0] takes the single result row.
    cossim = cosine_similarity(X_train[idx], X_train)[0]

    # Rank all documents from most to least similar, then drop the query by its
    # index rather than by position. Slicing [1:6] would assume the query is
    # always ranked first, which does not hold when its vector is all zeros
    # (similarity 0 to everything) or when other documents tie with it at the
    # top score; in those cases a real neighbour would be discarded and the
    # query could be listed as its own neighbour.
    ranked = np.argsort(cossim)[::-1]
    most_similar_docs = ranked[ranked != idx][:5]  # 5 most similar, query excluded

    print(f"Documento original (ID: {idx}): {newsgroups_train.data[idx][:200]}...")
    print(f"Clase original: {newsgroups_train.target_names[newsgroups_train.target[idx]]}")
    for sim_idx in most_similar_docs:
        print(f"Documento similar (ID: {sim_idx}): {newsgroups_train.data[sim_idx][:200]}...")
        print(f"Clase similar: {newsgroups_train.target_names[newsgroups_train.target[sim_idx]]}")
    print("\n")


Documento original (ID: 7492): Could someone please post any info on these systems.

Thanks.
BoB
-- 
---------------------------------------------------------------------- 
Robert Novitskey | "Pursuing women is similar to banging o...
Clase original: comp.sys.mac.hardware
Documento similar (ID: 10935): Hey everybody:

   I want to buy a mac and I want to get a good price...who doesn't?  So,
could anyone out there who has found a really good deal on a Centris 650
send me the price.  I don't want to k...
Clase similar: comp.sys.mac.hardware
Documento similar (ID: 7258): Hay all:

    Has anyone out there heard of any performance stats on the fabled p24t.
 I was wondering what it's performance compared to the 486/66 and/or
pentium would be.  Any info would be helpful....
Clase similar: comp.sys.ibm.pc.hardware
Documento similar (ID: 4971): Could someone please send instructions for installing simms and vram to 
jmk13@po.cwru.edu?  He's just gotten his 700 and wants to drop in some 
extra 

In [None]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import f1_score

def _fit_predict_f1(model):
    """Fit `model` on the training matrix and evaluate it on the test matrix.

    Returns a tuple `(predictions, macro_f1)`: the predicted test labels and
    their macro-averaged F1-score against `newsgroups_test.target`.
    """
    model.fit(X_train, newsgroups_train.target)
    predictions = model.predict(X_test)
    macro_f1 = f1_score(newsgroups_test.target, predictions, average='macro')
    return predictions, macro_f1


# Multinomial Naive Bayes (default hyperparameters).
clf_multinomial = MultinomialNB()
y_pred_multinomial, f1_multinomial = _fit_predict_f1(clf_multinomial)

# Complement Naive Bayes (default hyperparameters).
clf_complement = ComplementNB()
y_pred_complement, f1_complement = _fit_predict_f1(clf_complement)

print(f"F1-score MultinomialNB: {f1_multinomial}")
print(f"F1-score ComplementNB: {f1_complement}")


F1-score MultinomialNB: 0.5854345727938506
F1-score ComplementNB: 0.692953349950875


In [None]:
# Transpose the document-term matrix so that each row corresponds to a term
# and each column to a training document.
X_train_transposed = X_train.T

# Pick 5 terms at random.
# NOTE: this draws from NumPy's global RNG without seeding it here, so the
# selection depends on the RNG state left by whatever ran before this cell
# (the only np.random.seed call visible in this notebook is in the
# document-similarity cell).
vocab_size = X_train_transposed.shape[0]
random_words = np.random.choice(vocab_size, 5, replace=False)

# Reverse lookup: term index -> term string.
idx2word = {v: k for k, v in tfidfvect.vocabulary_.items()}

for word_idx in random_words:
    word = idx2word[word_idx]
    # Cosine similarity between this term's row and every term's row.
    cossim_words = cosine_similarity(X_train_transposed[word_idx], X_train_transposed)[0]

    # Rank all terms from most to least similar, then drop the query term by
    # its index rather than by position. Terms that only ever appear together
    # in the same documents have proportional rows and therefore tie at the top
    # score, so the query is not guaranteed to be ranked first; slicing [1:6]
    # could discard a genuine neighbour and report the query term as similar
    # to itself.
    ranked_words = np.argsort(cossim_words)[::-1]
    most_similar_words = ranked_words[ranked_words != word_idx][:5]

    print(f"Palabra original: {word}")
    for sim_idx in most_similar_words:
        similar_word = idx2word[sim_idx]
        print(f"Palabra similar: {similar_word}")
    print("\n")


Palabra original: 5k6
Palabra similar: ajwmw
Palabra similar: rk1ir
Palabra similar: kjznkh
Palabra similar: mk0
Palabra similar: lyjs


Palabra original: calmed
Palabra similar: decoys
Palabra similar: beater
Palabra similar: smack
Palabra similar: canadians
Palabra similar: bb


Palabra original: difficult
Palabra similar: emr
Palabra similar: conqueror
Palabra similar: grinds
Palabra similar: actuaries
Palabra similar: riflemen


Palabra original: 9zq6
Palabra similar: g_s
Palabra similar: 47x
Palabra similar: g__r
Palabra similar: g_eb
Palabra similar: 12aa8


Palabra original: fs3knsl
Palabra similar: e8270825a
Palabra similar: hvx
Palabra similar: mlyj
Palabra similar: 46s8v
Palabra similar: 9s13l


