# Taller Clasificación

1. Crear copia del programa del taller anterior. Realizar cambios para construir un clasificador que pueda clasificar un nuevo documento a una clase particular usando Naive Bayes. Probar con los ejercicios realizados en clase.

In [1]:
import pandas
import functools
import math
from nltk.tokenize import WordPunctTokenizer

In [2]:
def naive_bayes(training, test):
    categories = {}
    for document in training:
        if document['cat'] in categories:
            categories[document['cat']]['count'] += 1
        else:
            categories[document['cat']] = {}
            categories[document['cat']]['count'] = 1
            categories[document['cat']]['tokens'] = {}
            
    for category in categories:
        categories[category]['probability'] = categories[category]['count'] / len(training)
        
    for document in training:
        document['tokens'] = WordPunctTokenizer().tokenize(document['doc'])
    
    for document in training:
        for token in document['tokens']:
            if token in categories[document['cat']]['tokens']:
                categories[document['cat']]['tokens'][token] += 1
            else:
                categories[document['cat']]['tokens'][token] = 1

    vocabulary = [y for y in {token: 1 for x in categories for token in categories[x]['tokens']}]
    
    test_tokens = {token: 1 for token in WordPunctTokenizer().tokenize(test)}
    
    test = {
        'categories': {
            category: {
                'probability': 0,
                'tokens': dict(test_tokens)
            } for category in categories
        }
    }
        
    for category in test['categories']:
        for test_token in test_tokens:
            try:
                test['categories'][category]['tokens'][test_token] += categories[category]['tokens'][test_token]
            except KeyError:
                #print(f"Skipping token '{test_token}' on category '{category}'")
                category
    
    for category in test['categories']:
        test['categories'][category]['probability'] = categories[category]['probability']
        test['categories'][category]['probability'] *= functools.reduce(lambda x, y: x * y,
            [(test['categories'][category]['tokens'][token] / (len(categories[category]['tokens']) + len(vocabulary))) for token in test['categories'][category]['tokens']])
    
    maximum = 0
    classified_category = ''
    
    for category in test['categories']:
        if test['categories'][category]['probability'] > maximum:
            maximum = test['categories'][category]['probability']
            classified_category = category
    
    maximum *= 100
    maximum = f"{round(maximum, 4)}%"

    print(f"The test document was classified on the '{classified_category}' category with a probability of {maximum}")

In [3]:
training_documents = [
    {'cat': '-', 'doc': 'just plain boring'},
    {'cat': '-', 'doc': 'entirely predictable and lacks energy'},
    {'cat': '-', 'doc': 'no surprises and very few laughs'},
    {'cat': '+', 'doc': 'very powerful'},
    {'cat': '+', 'doc': 'the most fun film of the summer'}
]

test = 'predictable with no fun'

naive_bayes(training_documents, test)

The test document was classified on the '-' category with a probability of 0.0002%


In [4]:
training_documents = [
    {'cat': 'China', 'doc': 'Chinese Beijing Chinese'},
    {'cat': 'China', 'doc': 'Chinese Chinese Shanghai'},
    {'cat': 'China', 'doc': 'Chinese Macao'},
    {'cat': 'Japan', 'doc': 'Tokyo Japan Chinese'}
]

test = 'Chinese Chinese Chinese Tokyo Japan'

naive_bayes(training_documents, test)

The test document was classified on the 'China' category with a probability of 0.45%


2. Basado en la función que implementaron para calcular el tf (conteo de términos para el modelo de lenguaje), calcular el tf-idf y armar los vectores de documentos basados en tfidf y luego normalizar para que un vector de documentos tenga una magnitud de 1. Implementar la similitud coseno entre los dos vectores.

In [5]:
# Frecuencia de terminos en el documento dado
def tf_d(document):
    tokens = {}
    for token in WordPunctTokenizer().tokenize(document):
        tokens[token] = 1 if token not in tokens else tokens[token] + 1
    return(tokens)

In [6]:
def df_t(documents, term):
    return len(
        [
            lst for lst in [
            [
                token for token in WordPunctTokenizer().tokenize(document)] for document in documents
            ] if term in lst
        ]
    )

In [7]:
def tf_idf(documents):
    vocabulary = {token: 0 for token in[token for document in documents for token in WordPunctTokenizer().tokenize(document)]}
    df_t_vocabulary = dict(vocabulary)
    
    for token in df_t_vocabulary:
        df_t_vocabulary[token] = df_t(documents, token)
    
    idf_t = dict(df_t_vocabulary)
    
    for token in idf_t:
        idf_t[token] = math.log10(len(vocabulary) / idf_t[token])
    
    w = {}
    
    for item in range(len(documents)):
        document_tf_d = tf_d(documents[item])
        w[f"Document {item}"] = dict(vocabulary)
        for token in document_tf_d:
            w[f"Document {item}"][token] = math.log10(1 + document_tf_d[token]) * idf_t[token]
        
    return w

In [8]:
print('Documents')
print('---------')
documents = [document['doc'] for document in training_documents]
[print(doc) for doc in documents]

tf_idf_docs = tf_idf(documents)

print('\nDocuments tf_idf (w)')
print('--------------------')
print(pandas.DataFrame(tf_idf_docs))

Documents
---------
Chinese Beijing Chinese
Chinese Chinese Shanghai
Chinese Macao
Tokyo Japan Chinese

Documents tf_idf (w)
--------------------
          Document 0  Document 1  Document 2  Document 3
Beijing     0.234247    0.000000    0.000000    0.000000
Chinese     0.084017    0.084017    0.053009    0.053009
Japan       0.000000    0.000000    0.000000    0.234247
Macao       0.000000    0.000000    0.234247    0.000000
Shanghai    0.000000    0.234247    0.000000    0.000000
Tokyo       0.000000    0.000000    0.000000    0.234247


In [9]:
for document in tf_idf_docs:
    norm = math.sqrt(sum([x * x for x in [tf_idf_docs[document][x] for x in tf_idf_docs[document]]]))
    for token in tf_idf_docs[document]:
        tf_idf_docs[document][token] /= norm
        
print('Vectors normalized')
print('------------------')
print(pandas.DataFrame(tf_idf_docs))

Vectors normalized
------------------
          Document 0  Document 1  Document 2  Document 3
Beijing     0.941286    0.000000    0.000000    0.000000
Chinese     0.337609    0.337609    0.220714    0.158004
Japan       0.000000    0.000000    0.000000    0.698224
Macao       0.000000    0.000000    0.975339    0.000000
Shanghai    0.000000    0.941286    0.000000    0.000000
Tokyo       0.000000    0.000000    0.000000    0.698224


In [10]:
def cosine_similarity(vector_A, vector_B):
    
    return sum([
        element[0] * element[1]
        for element in list(zip([vector_A[x] for x in vector_A], [vector_B[x] for x in vector_B]))
    ])

In [11]:
elements = [tf_idf_docs[item] for item in tf_idf_docs]

for idx, item in enumerate(elements):
    for idy, element in enumerate(elements[idx + 1:]):
        print(f"Cosine between {idx} and {idy + idx + 1}:")
        print(cosine_similarity(elements[idx], elements[idy + idx + 1]))
    print('\n')

Cosine between 0 and 1:
0.11398009621209179
Cosine between 0 and 2:
0.07451500289402613
Cosine between 0 and 3:
0.05334372216788929


Cosine between 1 and 2:
0.07451500289402613
Cosine between 1 and 3:
0.05334372216788929


Cosine between 2 and 3:
0.034873699389777406






3. Con el programa del punto anterior, usando los mismos ejercicios usados en el punto 1 clasificar usando K vecinos más cercanos.

En este punto se tomará el último documento de la lista y se comparará con los tres primeros, con `k = 1`.

In [35]:
k = 1

docs = [tf_idf_docs[doc] for doc in tf_idf_docs]

test = docs.pop(len(docs) - 1)

maximum = (0, 0)

for idx, item in enumerate(docs):
    maximum = (idx, cosine_similarity(item, test))

print(f"Test document is more likely to be in the same category as the {maximum[0] + 1} document, having a score of {round(maximum[1], 4)}:")

for x in docs[maximum[0]]:
    print(f"{x}: {docs[maximum[0]][x]}") 

Test document is more likely to be in the same category as the 3 document, having a score of 0.0349:
Chinese: 0.2207136602535519
Beijing: 0.0
Shanghai: 0.0
Macao: 0.9753386489714635
Tokyo: 0.0
Japan: 0.0
