# Welcome to Jupyter!

This repo contains an introduction to [Jupyter](https://jupyter.org) and [IPython](https://ipython.org).

Outline of some basics:

* [Notebook Basics](../examples/Notebook/Notebook%20Basics.ipynb)
* [IPython - beyond plain python](../examples/IPython%20Kernel/Beyond%20Plain%20Python.ipynb)
* [Markdown Cells](../examples/Notebook/Working%20With%20Markdown%20Cells.ipynb)
* [Rich Display System](../examples/IPython%20Kernel/Rich%20Output.ipynb)
* [Custom Display logic](../examples/IPython%20Kernel/Custom%20Display%20Logic.ipynb)
* [Running a Secure Public Notebook Server](../examples/Notebook/Running%20the%20Notebook%20Server.ipynb#Securing-the-notebook-server)
* [How Jupyter works](../examples/Notebook/Multiple%20Languages%2C%20Frontends.ipynb) to run code in different languages.

You can also get this tutorial and run it on your laptop:

    git clone https://github.com/ipython/ipython-in-depth

Install IPython and Jupyter:

with [conda](https://www.anaconda.com/download):

    conda install ipython jupyter

with pip:

    # first, always upgrade pip!
    pip install --upgrade pip
    pip install --upgrade ipython jupyter

Start the notebook in the tutorial directory:

    cd ipython-in-depth
    jupyter notebook

In [32]:
'''
import numpy as np

titulos = (['alargue', 'automóvil', 'casa', 'novedoso'])

x = ([[3, 3, 1, 6, 'SPAM'],
		[1,	4,	7,	1,	'HAM'],
		[1,	4,	3,	1,	'HAM'],
		[4,	2,	1,	5,	'SPAM'],
		[1,	3,	4,	2,	'HAM'],
		[2,	4,	5,	1,	'HAM']])
tabla = np.asarray(x)

conjunta_SPAM = ([0,	0,	0,	0,	'SPAM'])
conjunta_HAM = ([0,	0,	0,	0,	'HAM'])

contador = 0
cantidad_SPAM = 0
cantidad_HAM = 0
for fila in tabla:
    contador += 1
    if fila[4] == 'SPAM':
        cantidad_SPAM += 1
        conjunta_SPAM[0] = conjunta_SPAM[0] + float(fila[0])
        conjunta_SPAM[1] = conjunta_SPAM[1] + float(fila[1])
        conjunta_SPAM[2] = conjunta_SPAM[2] + float(fila[2])
        conjunta_SPAM[3] = conjunta_SPAM[3] + float(fila[3])
    else:
        cantidad_HAM += 1
        conjunta_HAM[0] = conjunta_HAM[0] + float(fila[0])
        conjunta_HAM[1] = conjunta_HAM[1] + float(fila[1])
        conjunta_HAM[2] = conjunta_HAM[2] + float(fila[2])
        conjunta_HAM[3] = conjunta_HAM[3] + float(fila[3])

p_SPAM = cantidad_SPAM/contador
p_HAM = cantidad_HAM/contador
print("La probablidad de que sea SPAM es: " + str(round(p_SPAM, 2)))
print("La probablidad de que sea HAM es: " + str(round(p_HAM, 2)))

#P(Alargue/SPAM) = P(Alargue,SPAM) * P(SPAM)
condicional_SPAM = []
condicional_HAM = []
i = 0
for palabra in titulos:
    condicional_SPAM.append(conjunta_SPAM[i] * p_SPAM)
    condicional_HAM.append(conjunta_HAM[i] * p_HAM)
    print("La probabilidad de " + palabra + " siendo la categoria SPAM es: " + str(round(condicional_SPAM[i], 2)))
    print("La probabilidad de " + palabra + " siendo la categoria HAM es: " + str(round(condicional_HAM[i], 2)))
    i += 1
'''

from sklearn.feature_extraction.text import CountVectorizer
import os
from collections import Counter
import numpy as np
from matplotlib import pyplot as plt

def make_Dictionary():
    """Build the hard-coded toy corpus used by the rest of the script.

    Returns:
        tuple: ``(vocabulary, emails_array, labels)`` where ``vocabulary``
        is the list of the four known words, ``emails_array`` is a list of
        six e-mail strings made only of those words (space separated, in
        vocabulary order), and ``labels`` is a boolean NumPy array with one
        entry per e-mail (``True`` for the first and fourth e-mails).
    """
    vocabulary = ['alargue', 'automóvil', 'casa', 'novedoso']

    # How many times each vocabulary word appears in each e-mail,
    # one tuple per e-mail, columns in vocabulary order.
    occurrences_per_email = (
        (3, 3, 1, 6),
        (1, 4, 7, 1),
        (1, 4, 3, 1),
        (4, 2, 1, 5),
        (1, 3, 4, 2),
        (2, 2, 5, 1),
    )

    # Expand every count tuple into its "word word word ..." text form.
    emails_array = [
        " ".join(
            " ".join([word] * times)
            for word, times in zip(vocabulary, occurrences)
        )
        for occurrences in occurrences_per_email
    ]

    labels = np.array([True, False, False, True, False, False])
    return vocabulary, emails_array, labels

# Load the toy corpus: the known words, the raw e-mail texts and their labels.
vocabulary, emails_array, labels = make_Dictionary()
# Restrict the vectorizer to the fixed vocabulary instead of learning one.
count_vectorizer = CountVectorizer(vocabulary=vocabulary)
# Word counts per e-mail; presumably one row per e-mail and one column per
# vocabulary word, as a SciPy sparse matrix -- confirm against scikit-learn.
feature_matrix = count_vectorizer.fit_transform(emails_array)


def get_histogram_from_matrix(alpha, feature_matrix):
    """Return smoothed, normalised column totals of ``feature_matrix``.

    Args:
        alpha: Constant added to every column total before normalising.
        feature_matrix: Object whose ``sum(axis=0)`` yields per-column
            totals (the script passes slices of the count matrix).

    Returns:
        The column totals plus ``alpha``, divided by their overall sum, so
        the entries add up to 1.
    """
    smoothed_totals = feature_matrix.sum(axis=0) + alpha
    # The grand total already contains one ``alpha`` per column, so no extra
    # smoothing term is needed in the denominator.
    grand_total = smoothed_totals.sum()
    return smoothed_totals / grand_total

# Split the count rows by label: rows whose label is True go to the SPAM
# matrix, the remaining rows to the HAM matrix.
spam_feature_matrix = feature_matrix[labels,:]
ham_feature_matrix = feature_matrix[np.logical_not(labels),:]

# P(x_i | SPAM): per-word distribution over the SPAM rows, smoothed with alpha = 1
spam_words_distribution = get_histogram_from_matrix(1, spam_feature_matrix)
# P(x_i | HAM): per-word distribution over the HAM rows, smoothed with alpha = 1
ham_words_distribution = get_histogram_from_matrix(1, ham_feature_matrix)

def get_accuracy(feature_matrix, labels, spam_words_distribution, ham_words_distribution):
    """Classify every e-mail with Naive Bayes and report the accuracy.

    For each row of ``feature_matrix`` the log-score of each class is the
    sum, over the words present in the row, of ``count * log P(word | class)``
    plus the log of the class prior estimated from ``labels``. A row is
    predicted SPAM when its SPAM score is greater than or equal to its HAM
    score. The fraction of rows whose prediction matches the label is printed
    with three decimals and also returned.

    Args:
        feature_matrix: Word counts, one row per e-mail and one column per
            word. May be a SciPy sparse matrix (anything exposing
            ``toarray()``) or any array-like accepted by ``np.asarray``.
        labels: Boolean sequence, ``True`` meaning SPAM, one entry per row.
        spam_words_distribution: ``P(word | SPAM)`` for every column; any
            shape that flattens to one value per column.
        ham_words_distribution: ``P(word | HAM)``, same layout.

    Returns:
        float: The fraction of correctly classified rows.

    Raises:
        ZeroDivisionError: If ``labels`` is empty.
    """
    # Densify once so sparse matrices, np.matrix objects and plain arrays all
    # go through the same code path below.
    if hasattr(feature_matrix, "toarray"):
        counts = feature_matrix.toarray()
    else:
        counts = np.asarray(feature_matrix)
    labels = np.asarray(labels, dtype=bool)

    n_emails = len(labels)
    n_spam = int(np.count_nonzero(labels))
    # Exercise prompt from the original notes: why is the prior not
    # necessary in this case?
    P_SPAM = n_spam / n_emails
    P_HAM = (n_emails - n_spam) / n_emails

    # Logs are loop invariant. A zero probability legitimately maps to -inf,
    # so silence the divide warning instead of failing.
    with np.errstate(divide="ignore"):
        log_spam_words = np.log(np.asarray(spam_words_distribution, dtype=float)).ravel()
        log_ham_words = np.log(np.asarray(ham_words_distribution, dtype=float)).ravel()
        log_p_spam = np.log(P_SPAM)
        log_p_ham = np.log(P_HAM)

    correct_count = 0
    for i, row in enumerate(counts):
        # Only words that occur in the e-mail contribute; skipping absent
        # words also avoids 0 * -inf when a word has zero probability.
        present = np.flatnonzero(row)
        spam_log_likelihood = np.dot(row[present], log_spam_words[present]) + log_p_spam
        ham_log_likelihood = np.dot(row[present], log_ham_words[present]) + log_p_ham
        is_spam = bool(spam_log_likelihood >= ham_log_likelihood)  # ties go to SPAM
        if is_spam == bool(labels[i]):
            correct_count += 1

    accuracy = correct_count / n_emails
    print('%0.3f'%(accuracy))
    return accuracy



# Score the classifier on the same e-mails the word distributions were
# estimated from, so the printed figure is a training accuracy.
get_accuracy(feature_matrix, labels, spam_words_distribution, ham_words_distribution)


###################RESOLUCION###################
#Punto 1
P(SPAM) = 1/3
P(HAM) = 2/3
#Punto 2
#Esto es para cada palabra
P(w=alargue/SPAM) = 7/25
#Punto 3
P(SPAM/x1, x2, x3, x4) = (P(x1/SPAM)^4.P(x2/SPAM).P(x3/SPAM).P(x4/SPAM)^5.P(SPAM))/P(x1, x2, x3, x4)
P(HAM/x1, x2, x3, x4) = (P(x1/HAM)^4.P(x2/HAM).P(x3/HAM).P(x4/HAM)^5.P(HAM))/P(x1, x2, x3, x4)

1.000
