<a href="https://colab.research.google.com/github/nh0875/Desafios-Procesamiento-del-Habla-974/blob/main/Desafio_Datasets_C4_Hilliard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Vectorización de texto y modelo de clasificación Naïve Bayes con el dataset 20 newsgroups**

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_20newsgroups
import numpy as np
newsgroups_train=fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test=fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
tfidfvect=TfidfVectorizer(max_df=0.8, min_df=2, ngram_range=(1, 2), sublinear_tf=True)
newsgroups_train.data[0]
X_train=tfidfvect.fit_transform(newsgroups_train.data)
X_test=tfidfvect.transform(newsgroups_test.data)
print(type(X_train))
print(f'shape: {X_train.shape}')
print(f'cantidad de documentos: {X_train.shape[0]}')
print(f'tamaño del vocabulario (dimensionalidad de los vectores): {X_train.shape[1]}')
idx2word={v: k for k,v in tfidfvect.vocabulary_.items()}
y_train=newsgroups_train.target
y_train[:10]
print(f'clases {np.unique(newsgroups_test.target)}')
newsgroups_test.target_names

# **1) Vectorizar Documentos**

In [30]:
import random
from sklearn.metrics.pairwise import cosine_similarity

if X_test.shape[0] >= 20:
    random_indices=random.sample(range(20), 5)
    selected_docs=X_test[random_indices]
    similarities=cosine_similarity(selected_docs, X_test[:20])

    for idx, sim in zip(random_indices, similarities):
        similar_indices=np.argsort(sim)[-6:-1][::-1]
        print(f"Document ID: {idx}")
        print("Most similar documents:")
        for sim_idx in similar_indices:
            print(f"Document ID: {sim_idx}, Similarity: {sim[sim_idx]:.4f}")
        print()
else:
    print("Not enough documents in X_test.")


Document ID: 5
Most similar documents:
Document ID: 3, Similarity: 0.0578
Document ID: 14, Similarity: 0.0535
Document ID: 0, Similarity: 0.0462
Document ID: 1, Similarity: 0.0436
Document ID: 7, Similarity: 0.0391

Document ID: 14
Most similar documents:
Document ID: 5, Similarity: 0.0535
Document ID: 3, Similarity: 0.0512
Document ID: 12, Similarity: 0.0498
Document ID: 18, Similarity: 0.0448
Document ID: 7, Similarity: 0.0351

Document ID: 11
Most similar documents:
Document ID: 14, Similarity: 0.0267
Document ID: 8, Similarity: 0.0252
Document ID: 12, Similarity: 0.0218
Document ID: 5, Similarity: 0.0206
Document ID: 3, Similarity: 0.0199

Document ID: 12
Most similar documents:
Document ID: 14, Similarity: 0.0498
Document ID: 3, Similarity: 0.0415
Document ID: 5, Similarity: 0.0350
Document ID: 1, Similarity: 0.0316
Document ID: 18, Similarity: 0.0265

Document ID: 10
Most similar documents:
Document ID: 7, Similarity: 0.0183
Document ID: 5, Similarity: 0.0139
Document ID: 9, Simi

# **2) Entrenar el Modelo de Clasificacion**

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

X_train, X_val, y_train, y_val=train_test_split(X_train, y_train, test_size=0.2, random_state=42)

models={
    "MultinomialNB": MultinomialNB(),
    "ComplementNB": ComplementNB()
}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred=model.predict(X_val)
    f1=f1_score(y_val, y_pred, average='macro')
    print(f"{model_name} F1 Score: {f1:.4f}")

MultinomialNB F1 Score: 0.5718
ComplementNB F1 Score: 0.7314


# **3) Transponer Matriz y estudiar la similaridad**

In [32]:
X_term_doc=X_train.T

In [48]:
import re

words_to_check=["mouse", "science", "technology", "news", "sports"]

word_indices=[tfidfvect.vocabulary_.get(word) for word in words_to_check if word in tfidfvect.vocabulary_]

for word, idx in zip(words_to_check, word_indices):
    if idx is not None:
        word_vector=tfidfvect.transform([word])
        similarities=cosine_similarity(word_vector, X_train).flatten()
        similar_indices=np.argsort(similarities)[-10:][::-1]
        print(f"Word being considered: {word}")
        print("Most similar words:")
        count=0
        for sim_idx in similar_indices:
            similar_word=idx2word[sim_idx]
            if re.match("^[A-Za-z]+$", similar_word):
                print(f"Word: {similar_word}, Similarity: {similarities[sim_idx]:.4f}")
                count += 1
            if count == 5:
                break
        print()
    else:
        print(f"The word '{word}' is not in the vocabulary.")

Word being considered: mouse
Most similar words:
Word: darwin, Similarity: 0.4084
Word: beb, Similarity: 0.3933
Word: akron, Similarity: 0.3720
Word: axis, Similarity: 0.3354
Word: beginning, Similarity: 0.3296

Word being considered: science
Most similar words:
Word: couriers, Similarity: 0.2383
Word: busch, Similarity: 0.2255
Word: console, Similarity: 0.2118
Word: ashland, Similarity: 0.2089
Word: airframe, Similarity: 0.2029

Word being considered: technology
Most similar words:
Word: arianespace, Similarity: 0.3920
Word: anarchists, Similarity: 0.3652
Word: crunching, Similarity: 0.2950
Word: bandwagon, Similarity: 0.2934
Word: damp, Similarity: 0.2318

Word being considered: news
Most similar words:
Word: archival, Similarity: 0.3344
Word: cora, Similarity: 0.3103
Word: aloud, Similarity: 0.3103
Word: ceilings, Similarity: 0.2972
Word: awesley, Similarity: 0.2918

Word being considered: sports
Most similar words:
Word: atf, Similarity: 0.3238
Word: cache, Similarity: 0.3093
Word: