# Taller Clasificación

1. Crear copia del programa del taller anterior. Realizar cambios para construir un clasificador que pueda clasificar un nuevo documento a una clase particular usando Naive Bayes. Probar con los ejercicios realizados en clase.

In [1]:
import pandas
import functools
from nltk.tokenize import WordPunctTokenizer

In [2]:
def naive_bayes(training, test):
    categories = {}
    for document in training:
        if document['cat'] in categories:
            categories[document['cat']]['count'] += 1
        else:
            categories[document['cat']] = {}
            categories[document['cat']]['count'] = 1
            categories[document['cat']]['tokens'] = {}
            
    for category in categories:
        categories[category]['probability'] = categories[category]['count'] / len(training)
        
    for document in training:
        document['tokens'] = WordPunctTokenizer().tokenize(document['doc'])
    
    for document in training:
        for token in document['tokens']:
            if token in categories[document['cat']]['tokens']:
                categories[document['cat']]['tokens'][token] += 1
            else:
                categories[document['cat']]['tokens'][token] = 1

    vocabulary = [y for y in {token: 1 for x in categories for token in categories[x]['tokens']}]
    
    test_tokens = {token: 1 for token in WordPunctTokenizer().tokenize(test)}
    
    test = {
        'categories': {
            category: {
                'probability': 0,
                'tokens': dict(test_tokens)
            } for category in categories
        }
    }
        
    for category in test['categories']:
        for test_token in test_tokens:
            try:
                test['categories'][category]['tokens'][test_token] += categories[category]['tokens'][test_token]
            except KeyError:
                #print(f"Skipping token '{test_token}' on category '{category}'")
                category
    
    for category in test['categories']:
        test['categories'][category]['probability'] = categories[category]['probability']
        test['categories'][category]['probability'] *= functools.reduce(lambda x, y: x * y,
            [(test['categories'][category]['tokens'][token] / (len(categories[category]['tokens']) + len(vocabulary))) for token in test['categories'][category]['tokens']])
    
    maximum = 0
    classified_category = ''
    
    for category in test['categories']:
        if test['categories'][category]['probability'] > maximum:
            maximum = test['categories'][category]['probability']
            classified_category = category
    
    maximum *= 100
    maximum = f"{round(maximum, 4)}%"

    print(f"The test document was classified on the '{classified_category}' category with a probability of {maximum}")

In [3]:
training_documents = [
    {'cat': '-', 'doc': 'just plain boring'},
    {'cat': '-', 'doc': 'entirely predictable and lacks energy'},
    {'cat': '-', 'doc': 'no surprises and very few laughs'},
    {'cat': '+', 'doc': 'very powerful'},
    {'cat': '+', 'doc': 'the most fun film of the summer'}
]

test = 'predictable with no fun'

naive_bayes(training_documents, test)

The test document was classified on the '-' category with a probability of 0.0002%


In [13]:
training_documents = [
    {'cat': 'China', 'doc': 'Chinese Beijing Chinese'},
    {'cat': 'China', 'doc': 'Chinese Chinese Shanghai'},
    {'cat': 'China', 'doc': 'Chinese Macao'},
    {'cat': 'Japan', 'doc': 'Tokyo Japan Chinese'}
]

test = 'Chinese Chinese Chinese Tokyo Japan'

naive_bayes(training_documents, test)

The test document was classified on the 'China' category with a probability of 0.45%


2. Basado en la función que implementaron para calcular el tf (conteo de términos para el modelo de lenguaje), calcular el tf-idf y armar los vectores de documentos basados en tfidf y luego normalizar para que un vector de documentos tenga una magnitud de 1. Implementar la similitud coseno entre los dos vectores.

In [60]:
# Frecuencia de terminos en el documento dado
def tf_d(document):
    tokens = {}
    for token in WordPunctTokenizer().tokenize(document):
        tokens[token] = 1 if token not in tokens else tokens[token] + 1
    return(tokens)
    
def df_t(documents, term):
    return len(
        [
            lst for lst in [
            [
                token for token in WordPunctTokenizer().tokenize(document)] for document in documents
            ] if term in lst
        ]
    )

def tf_idf(documents):
    vocabulary = {token: 0 for token in[token for document in documents for token in WordPunctTokenizer().tokenize(document)]}
    df_t_vocabulary = dict(vocabulary)
    for token in df_t_vocabulary:
        df_t_vocabulary[token] = df_t(documents, token)
    print(df_t_vocabulary)

print(tf_idf([document['doc'] for document in training_documents]))

{'Chinese': 4, 'Beijing': 1, 'Shanghai': 1, 'Macao': 1, 'Tokyo': 1, 'Japan': 1}
None
