In [1]:
%pip install numpy scikit-learn

Collecting numpy
  Using cached numpy-2.3.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.16.0-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached numpy-2.3.1-cp311-cp311-win_amd64.whl (13.0 MB)
Downloading scikit_learn-1.7.1-cp311-cp311-win_amd64.whl (8.9 MB)
   ---------------------------------------- 0.0/8.9 MB ? eta -:--:--
   ------------------------------- -------- 7.1/8.9 MB 36.4 MB/s eta 0:00:01
   ---------------------------------------  8.9/8.9 MB 37.0 MB/s eta 0:00:01
   ---------------------------------------  8.9/8.9 MB 37.0 MB/s eta 0:00:01
   ---------------------------------------

### Vectorización de texto y modelo de clasificación Naïve Bayes con el dataset 20 newsgroups

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import f1_score

# 20newsgroups por ser un dataset clásico de NLP ya viene incluido y formateado
# en sklearn
from sklearn.datasets import fetch_20newsgroups
import numpy as np

## Carga de datos

In [3]:
# cargamos los datos (ya separados de forma predeterminada en train y test)
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

## Vectorización

In [4]:
# instanciamos un vectorizador
# ver diferentes parámetros de instanciación en la documentación de sklearn https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidfvect = TfidfVectorizer()

In [5]:
# en el atributo `data` accedemos al texto
print(newsgroups_train.data[0])

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [6]:
# con la interfaz habitual de sklearn podemos fitear el vectorizador
# (obtener el vocabulario y calcular el vector IDF)
# y transformar directamente los datos
X_train = tfidfvect.fit_transform(newsgroups_train.data)
# `X_train` la podemos denominar como la matriz documento-término

In [7]:
# recordar que las vectorizaciones por conteos son esparsas
# por ello sklearn convenientemente devuelve los vectores de documentos
# como matrices esparsas
print(type(X_train))
print(f'shape: {X_train.shape}')
print(f'Cantidad de documentos: {X_train.shape[0]}')
print(f'Tamaño del vocabulario (dimensionalidad de los vectores): {X_train.shape[1]}')

<class 'scipy.sparse._csr.csr_matrix'>
shape: (11314, 101631)
Cantidad de documentos: 11314
Tamaño del vocabulario (dimensionalidad de los vectores): 101631


In [8]:
# una vez fiteado el vectorizador, podemos acceder a atributos como el vocabulario
# aprendido. Es un diccionario que va de términos a índices.
# El índice es la posición en el vector de documento.
tfidfvect.vocabulary_['car']

25775

In [9]:
# es muy útil tener el diccionario opuesto que va de índices a términos
idx2word = {v: k for k,v in tfidfvect.vocabulary_.items()}

In [10]:
# en `y_train` guardamos los targets que son enteros
y_train = newsgroups_train.target
y_train[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [11]:
# hay 20 clases correspondientes a los 20 grupos de noticias
print(f'clases {np.unique(newsgroups_test.target)}')
newsgroups_test.target_names

clases [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Similaridad de documentos

In [12]:
# Veamos similaridad de documentos. Tomemos algún documento
idx = 4811
print(newsgroups_train.data[idx])

THE WHITE HOUSE

                  Office of the Press Secretary
                   (Pittsburgh, Pennslyvania)
______________________________________________________________
For Immediate Release                         April 17, 1993     

             
                  RADIO ADDRESS TO THE NATION 
                        BY THE PRESIDENT
             
                Pittsburgh International Airport
                    Pittsburgh, Pennsylvania
             
             
10:06 A.M. EDT
             
             
             THE PRESIDENT:  Good morning.  My voice is coming to
you this morning through the facilities of the oldest radio
station in America, KDKA in Pittsburgh.  I'm visiting the city to
meet personally with citizens here to discuss my plans for jobs,
health care and the economy.  But I wanted first to do my weekly
broadcast with the American people. 
             
             I'm told this station first broadcast in 1920 when
it reported that year's presidential elec

In [None]:
# midamos la similaridad coseno con todos los documentos de train
cossim = cosine_similarity(X_train[idx], X_train)[0]

In [14]:
# podemos ver los valores de similaridad ordenados de mayor a menos
np.sort(cossim)[::-1]

array([1.        , 0.70930477, 0.67474953, ..., 0.        , 0.        ,
       0.        ], shape=(11314,))

In [15]:
# y a qué documentos corresponden
np.argsort(cossim)[::-1]

array([ 4811,  6635,  4253, ...,  1534, 10055,  4750], shape=(11314,))

In [16]:
# los 5 documentos más similares:
mostsim = np.argsort(cossim)[::-1][1:6]

In [17]:
# el documento original pertenece a la clase:
newsgroups_train.target_names[y_train[idx]]

'talk.politics.misc'

In [18]:
# y los 5 más similares son de las clases:
for i in mostsim:
  print(newsgroups_train.target_names[y_train[i]])

talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc


### Modelo de clasificación Naïve Bayes

In [19]:
# es muy fácil instanciar un modelo de clasificación Naïve Bayes y entrenarlo con sklearn
clf = MultinomialNB()
clf.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [20]:
# con nuestro vectorizador ya fiteado en train, vectorizamos los textos
# del conjunto de test
X_test = tfidfvect.transform(newsgroups_test.data)
y_test = newsgroups_test.target
y_pred =  clf.predict(X_test)

In [21]:
# el F1-score es una metrica adecuada para reportar desempeño de modelos de claificación
# es robusta al desbalance de clases. El promediado 'macro' es el promedio de los
# F1-score de cada clase. El promedio 'micro' es equivalente a la accuracy que no
# es una buena métrica cuando los datasets son desbalanceados
f1_score(y_test, y_pred, average='macro')

0.5854345727938506

### Consigna del desafío 1

**1**. Vectorizar documentos. Tomar 5 documentos al azar y medir similaridad con el resto de los documentos.
Estudiar los 5 documentos más similares de cada uno analizar si tiene sentido
la similaridad según el contenido del texto y la etiqueta de clasificación.

**2**. Entrenar modelos de clasificación Naïve Bayes para maximizar el desempeño de clasificación
(f1-score macro) en el conjunto de datos de test. Considerar cambiar parámteros
de instanciación del vectorizador y los modelos y probar modelos de Naïve Bayes Multinomial
y ComplementNB.

**3**. Transponer la matriz documento-término. De esa manera se obtiene una matriz
término-documento que puede ser interpretada como una colección de vectorización de palabras.
Estudiar ahora similaridad entre palabras tomando 5 palabras y estudiando sus 5 más similares. **La elección de palabras no debe ser al azar para evitar la aparición de términos poco interpretables, elegirlas "manualmente"**.


Cantidad de documentos: 11314

Tomaremos el doc 2100, 3100, 4100, 4101, 5100

In [22]:
idx1 = 2100
idx2 = 3100
idx3 = 4100
idx4 = 4101
idx5 = 5100
cossim1 = cosine_similarity(X_train[idx1], X_train)[0]
cossim2 = cosine_similarity(X_train[idx2], X_train)[0]
cossim3 = cosine_similarity(X_train[idx3], X_train)[0]
cossim4 = cosine_similarity(X_train[idx4], X_train)[0]
cossim5 = cosine_similarity(X_train[idx5], X_train)[0]

El primero va a ser nuestro doc seleccionado

In [38]:
top_indices = np.argsort(cossim1)[::-1][:6]

for idx in top_indices:
    print(newsgroups_train.target_names[y_train[idx]])
    print(f"Index: {idx}")
    print(newsgroups_train.data[idx])  # Print the first 500 characters
    print("-" * 80)

sci.electronics
Index: 2100
Does any one know if the 6551 is timing/pin compatible with the 6551.. 
It seems the 6551 has in iheirent bug with cts/rts handshaking and i need
a suitable pin replacement to put in my serial card... possibly a buffered
version perhaps?


--------------------------------------------------------------------------------
sci.electronics
Index: 7268


No, the 6551A is able to operate in a 2 MHz system, the 6551 can only
take 1 MHz without problems.

If you see a 8551 made by MOS or CSG, take it, its a 6551A.


I know no fixed version of the 6551. There are different possibilities
to work around this bug. The easiest is to tie _CTS to GND and
use _DSR or _DCD as _CTS. It should be easy to fix the software,
_DSR is bit 6 and DCD ist bit 5 of the Status Register (Read 6551
with RS1 = low and RS0 = high).

Using the _CTS-line can lead into big trouble. The 6551 _instantly_
stops transmission if _CTS goes high. This means, that you may
get only a half byte...

Gerri

Primer caso, todos parecen estar hablando de chips o leds, el doc mas similar hasta parece una respuesta directa. Aquellos con una etiqueta distinta tambien se encuentran en una tematica similar.

In [37]:
top_indices = np.argsort(cossim2)[::-1][:6]

for idx in top_indices:
    print(newsgroups_train.target_names[y_train[idx]])
    print(f"Index: {idx}")
    print(newsgroups_train.data[idx])  # Print the first 500 characters
    print("-" * 80)

talk.politics.mideast
Index: 3100
In today's Israeline posting, at the end (an afterthought?), I read:


While I applaud investing of money in Yehuda, Shomron, v'Chevel-Azza,
in order to create jobs for their residents, I find it deplorable that
this has never been an active policy of any Israeli administration
since 1967, *with regard to their Jewish residents*. Past governments
found funds to subsidize cheap (read: affordable) housing and the
requisite infrastructure, but where was the investment for creating
industry (which would have generated income *and* jobs)? 

After 26 years, Yehuda and Shomron remain barren, bereft of even 
middle-sized industries, and the Jewish settlements are sterile
"bedroom communities", havens for (in the main) Israelis (both
secular *and* religious) who work in Tel-Aviv or Jerusalem but
cannot afford to live in either city or their surrounding suburbs.

There's an old saying: "bli giboosh, ayn kivoosh" -- just living there
wasn't enough, we had to *rea

Todos estan etiquetados en politica, y hablan de trabajo o inversiones, ninguno habla especificamente de los temas del texto seleccionado.

In [36]:
top_indices = np.argsort(cossim3)[::-1][:6]

for idx in top_indices:
    print(newsgroups_train.target_names[y_train[idx]])
    print(f"Index: {idx}")
    print(newsgroups_train.data[idx])  # Print the first 500 characters
    print("-" * 80)

sci.crypt
Index: 4100
Someone in Canada asked me to send him some public domain DES file
encryption code I have.  Is it legal for me to send it?

Thanx.
--
Eschew Obfuscation

Rob deFriesse                    Mail:  rj@ri.cadre.com
Cadre Technologies Inc.          Phone:  (401) 351-5950
222 Richmond St.                 Fax:    (401) 351-7380
Providence, RI  02903
--------------------------------------------------------------------------------
rec.sport.hockey
Index: 10211
I found this press release from Trial Lawyers for Public Justice on
another system, and thought it would be of interest on campuses 
where the administration or the athletics department wants to 
eliminate the women's ice hockey team.

 Women Athletes, TLP Win Sex Discrimination Ruling: Brown University
Ordered to Restore Two Women's Varsity Teams
To: National Desk, Sports Writer
 Contact: Lynette Labinger, 401-421-9794, home 401-274-7507, or
          Ray Marcaccio, 401-831-8900, both of Trial Lawyers For
          P

No tan preciso como los anteriores, la construccion de similitudes dieron como resultado una interpretación muy abierta de la palabra "public". El etiquetado refleja esto.

In [35]:
top_indices = np.argsort(cossim4)[::-1][:6]

for idx in top_indices:
    print(newsgroups_train.target_names[y_train[idx]])
    print(f"Index: {idx}")
    print(newsgroups_train.data[idx])  # Print the first 500 characters
    print("-" * 80)

talk.politics.mideast
Index: 4101
Mr. Freeman:

Please find something more constructive to do with your time rather
than engaging in fantasy..... Not that I have a particular affinty
to Arafat or anything.

John


--------------------------------------------------------------------------------
talk.politics.mideast
Index: 1577



	Hate to be simple minded about this Tim, but I think its
really very simple.  He was a dirty Jew.  And the only good Jew, in
some peoples mind, is a dead Jew.  Thats what 40 years of propaganda
that fails to discriminate between Jew and Zionist will do.  Thats
what 20 years of statements like the ones I've appended will do to
someones mind.  They make people sick.  They drag down political
discourse to the point where killing your opponent is an honorable way
to resolve a dispute.

	What else can come of such demagogery?  Peace?

Adam


Arafat on political pluralism:

	``Any Palestinian leader who suggests ending the intifada
	exposes himself to the bullets o

Los primeros 2 textos estan bastante relacionados, mas bien por nombre que otra cosa, pero es algo esperable considerando lo breve y general del texto seleccionado.

In [39]:
top_indices = np.argsort(cossim5)[::-1][:6]

for idx in top_indices:
    print(newsgroups_train.target_names[y_train[idx]])
    print(f"Index: {idx}")
    print(newsgroups_train.data[idx])  # Print the first 500 characters
    print("-" * 80)

rec.sport.baseball
Index: 5100
Name            Pos   AB    H    2B    3B    HR    RBI    RS    SB    E    AVG
------------------------------------------------------------------------------
Boston          OF    12    7                        2     6              .583
Galarraga       1B    28   13     3           1      9     2              .464
Tatum           3B     5    2     1                                       .400
Cole            CF    24    9           1            2     8     2        .375
E. Young        2B    28    9     1     1     1      5    10     5    3   .321
Hayes           3B    25    7     1           2      5     2     1    2   .280
Murphy          OF     4    1                        1                    .250
Bichette        RF    21    5                 1      5     3     1        .238
Clark           LF    24    5     2                        2          1   .208
Girardi          C    25    5     1     1            3     2              .200
Castilla        SS   

El texto original era solamente una tabla, selecciono todos textos con tablas, 3 de los cuales son de baseball, en ese sentido su precisión es initeresante, parece que la estructura común de las tablas de baseball afecta su selección

### Prueba con otros modelos

In [40]:
#multinomial con un alpha de 0.5
nb_model = MultinomialNB(alpha=0.5, fit_prior=True)
nb_model.fit(X_train, y_train)

0,1,2
,alpha,0.5
,force_alpha,True
,fit_prior,True
,class_prior,


In [42]:
X_test = tfidfvect.transform(newsgroups_test.data)
y_test = newsgroups_test.target
y_pred =  nb_model.predict(X_test)

In [43]:
f1_score(y_test, y_pred, average='macro')

0.615341523969213

Mejor que los resultados originales, no por mucho igual.

In [44]:
#ComplementNb default
com_model = ComplementNB()
com_model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,
,norm,False


In [46]:
X_test = tfidfvect.transform(newsgroups_test.data)
y_test = newsgroups_test.target
y_pred = com_model.predict(X_test)

In [47]:
f1_score(y_test, y_pred, average='macro')

0.692953349950875

Mucho mejor, una mejora del 10% respecto al original.

In [52]:
#ComplementNb con un alpha de 0.5
com_model2 = ComplementNB(alpha= 0.5)
com_model2.fit(X_train, y_train)

0,1,2
,alpha,0.5
,force_alpha,True
,fit_prior,True
,class_prior,
,norm,False


In [54]:
X_test = tfidfvect.transform(newsgroups_test.data)
y_test = newsgroups_test.target
y_pred = com_model2.predict(X_test)

In [55]:
f1_score(y_test, y_pred, average='macro')

0.6961156947315815

Ninguna diferencia notable con el otro ComplementNB.

**3**. Transponer la matriz documento-término. De esa manera se obtiene una matriz
término-documento que puede ser interpretada como una colección de vectorización de palabras.
Estudiar ahora similaridad entre palabras tomando 5 palabras y estudiando sus 5 más similares. **La elección de palabras no debe ser al azar para evitar la aparición de términos poco interpretables, elegirlas "manualmente"**.

In [56]:
X_train_T = X_train.T

In [59]:
#cossim1 = cosine_similarity(X_train[idx1], X_train)[0]
palabras = ['car', 'war', 'computer', 'sports', 'job']
word2idx = tfidfvect.vocabulary_

for word in palabras:
    if word not in word2idx:
        print(f"{word} no esta")

In [61]:
for word in palabras:
    idx = word2idx[word]
    word_vec = X_train_T[idx]

    # Similaridad coseno entre esta palabra y todas las demás
    cossim = cosine_similarity(word_vec, X_train_T).flatten()

    # Obtener los índices de las 6 más similares (incluye la palabra misma)
    top_indices = np.argsort(cossim)[::-1][1:6]  # excluye a sí misma (índice 0)

    # Mapear índices a palabras
    idx2word = {i: w for w, i in word2idx.items()}

    print(f"\nPalabra: {word}")
    for i, sim_idx in enumerate(top_indices):
        print(f"  Similar {i+1}: {idx2word[sim_idx]} (cos sim = {cossim[sim_idx]:.4f})")


Palabra: car
  Similar 1: cars (cos sim = 0.1797)
  Similar 2: criterium (cos sim = 0.1770)
  Similar 3: civic (cos sim = 0.1748)
  Similar 4: owner (cos sim = 0.1689)
  Similar 5: dealer (cos sim = 0.1681)

Palabra: war
  Similar 1: irag (cos sim = 0.2548)
  Similar 2: dresden (cos sim = 0.2376)
  Similar 3: 1948 (cos sim = 0.2366)
  Similar 4: lauches (cos sim = 0.2071)
  Similar 5: drugs (cos sim = 0.1981)

Palabra: computer
  Similar 1: decwriter (cos sim = 0.1563)
  Similar 2: harkens (cos sim = 0.1522)
  Similar 3: deluged (cos sim = 0.1522)
  Similar 4: shopper (cos sim = 0.1443)
  Similar 5: the (cos sim = 0.1361)

Palabra: sports
  Similar 1: wip (cos sim = 0.3699)
  Similar 2: rockin (cos sim = 0.3599)
  Similar 3: pollute (cos sim = 0.3599)
  Similar 4: jockeys (cos sim = 0.3599)
  Similar 5: admittdly (cos sim = 0.2565)

Palabra: job
  Similar 1: estatic (cos sim = 0.2049)
  Similar 2: wage (cos sim = 0.1814)
  Similar 3: computationally (cos sim = 0.1725)
  Similar 4: han

Las relaciones estan bastante bien construidas, excepto en computer, donde incluso hace uso de la palabra "the" la cual es en exceso general.