In [27]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer

### Explorando los datos

In [18]:
# Load the e-mail dataset from the CSV file next to the notebook and preview
# its first rows.
df = pd.read_csv(filepath_or_buffer='./emails3.csv')
df.head(n=5)

Unnamed: 0,message,class
0,------000000000000000000000\n\nContent-Type: t...,spam
1,Legal TV Descarmbler\n\n\n\nWant to watch Spor...,spam
2,FROM: COL. MICHAEL BUNDU. \n\nDEMOCRATIC REPUB...,spam
3,"<html>\n\n<body>\n\n<p align=""center""><a href=...",spam
4,ATTN:SIR/MADAN \n\n\n\n ...,spam


In [28]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

(3000, 2)

### Separando datos de prueba y entrenamiento

In [29]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split repeatable between runs.
train_set, test_set = train_test_split(
    df,
    random_state=1,
    test_size=0.2,
)
train_set.shape

(2400, 2)

### Preprocesando los datos

Se convierten los mensajes a una matriz donde cada columna representa el número de veces que aparece una palabra en un determinado mensaje (fila).

In [30]:
# Independent copy of the label column for the training rows.
targets = train_set['class'].copy()

# Learn the vocabulary from the training messages and build their
# document-term count matrix in one step.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(raw_documents=train_set['message'].values)

In [31]:
# Show the count matrix as a dense array for visual inspection.
# NOTE(review): toarray() materializes the whole matrix in dense form; for a
# quick look a slice such as counts[:5].toarray() would be much cheaper.
counts.toarray()

array([[1, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [32]:
# Preview the first training labels; the index keeps the original row ids.
targets.head()

268     spam
32      spam
199     spam
1488     ham
228     spam
Name: class, dtype: object

### Probando varios algoritmos de ML

In [33]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

#### Algunos datos rápidos de prueba

In [34]:
# Two hand-written messages used as a quick smoke test for each classifier.
examples = [
    'Free Viagra now!!!',
    "Hi Bob, how about a game of golf tomorrow?",
]
# Encode them with the already-fitted vectorizer (transform only, no refit).
example_counts = vectorizer.transform(raw_documents=examples)

#### SVM

In [35]:
# Support-vector classifier with default settings, trained on the count matrix.
svm_clf = SVC().fit(X=counts, y=targets)

# Sanity check on the two hand-written examples.
svm_clf.predict(X=example_counts)

array(['ham', 'ham'], dtype=object)

#### KNN

In [36]:
# k-nearest-neighbours classifier with default settings, trained on the count matrix.
knn_clf = KNeighborsClassifier().fit(X=counts, y=targets)

# Sanity check on the two hand-written examples.
knn_clf.predict(X=example_counts)

array(['ham', 'ham'], dtype=object)

#### DTree

In [37]:
# Decision tree with default hyper-parameters. The seed is fixed because tree
# construction is randomized; without it the fitted tree, and every score
# computed from it, can change from one run to the next. The value 1 matches
# the seed used for the train/test split.
tree_clf = DecisionTreeClassifier(random_state=1)
tree_clf.fit(counts, targets)

# Sanity check on the two hand-written examples.
tree_clf.predict(example_counts)

array(['ham', 'ham'], dtype=object)

#### NaiveBayes

In [38]:
# Multinomial Naive Bayes with default settings, trained on the count matrix.
nv_clf = MultinomialNB().fit(X=counts, y=targets)

# Sanity check on the two hand-written examples.
nv_clf.predict(X=example_counts)

array(['spam', 'ham'], dtype='<U4')

### Midiendo la exactitud (accuracy)

In [39]:
# SVM: accuracy on each of the 3 cross-validation folds.
cross_val_score(
    estimator=svm_clf,
    X=counts,
    y=targets,
    scoring="accuracy",
    cv=3,
)

array([0.92375, 0.945  , 0.91875])

In [40]:
# KNN: accuracy on each of the 3 cross-validation folds.
cross_val_score(
    estimator=knn_clf,
    X=counts,
    y=targets,
    scoring="accuracy",
    cv=3,
)

array([0.935  , 0.95125, 0.96375])

In [41]:
# Decision tree: accuracy on each of the 3 cross-validation folds.
cross_val_score(
    estimator=tree_clf,
    X=counts,
    y=targets,
    scoring="accuracy",
    cv=3,
)

array([0.97   , 0.965  , 0.96625])

In [42]:
# Naive Bayes: accuracy on each of the 3 cross-validation folds.
cross_val_score(
    estimator=nv_clf,
    X=counts,
    y=targets,
    scoring="accuracy",
    cv=3,
)

array([0.9625, 0.94  , 0.9675])

### Cambio de métrica de evaluación

La exactitud (accuracy) no es una métrica muy fiable a la hora de evaluar un clasificador, sobre todo cuando las clases están desbalanceadas; es mejor utilizar la matriz de confusión.

Vamos a trabajar con los modelos Árbol de decisión y Naive Bayes, que son los que mejor rindieron.

In [43]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

In [45]:
# Decision tree: out-of-fold predictions for every training row (3-fold CV).
tree_pred = cross_val_predict(tree_clf, counts, targets, cv=3)
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones, with labels in sorted order (ham, spam).
confusion_matrix(targets, tree_pred)

array([[1960,   37],
       [  48,  355]])

In [47]:
# Naive Bayes: out-of-fold predictions for every training row (3-fold CV).
nv_pred = cross_val_predict(nv_clf, counts, targets, cv=3)
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones, with labels in sorted order (ham, spam).
confusion_matrix(targets, nv_pred)

array([[1991,    6],
       [  98,  305]])

### Basado en la matriz de confusión se puede calcular el nivel de precisión del modelo
$$\text{precisión} = \frac{TP}{TP + FP}$$

TP is the number of true positives, and FP is the number of false positives.

In [60]:
from sklearn.metrics import precision_score, recall_score

In [57]:
# Precision of the decision tree on its out-of-fold predictions, treating
# 'spam' as the positive class.
precision_score(
    y_true=targets,
    y_pred=tree_pred,
    pos_label='spam',
    average='binary',
)

0.9056122448979592

In [58]:
# Precision of Naive Bayes on its out-of-fold predictions, treating 'spam'
# as the positive class.
precision_score(
    y_true=targets,
    y_pred=nv_pred,
    pos_label='spam',
    average='binary',
)

0.9807073954983923

### El modelo Naive Bayes supera al de árbol de decisión en cuanto a precisión.

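Existe otra métrica complementaria a la precisión, llamada recall (exhaustividad); suele haber un compromiso entre ambas: al aumentar una, normalmente disminuye la otra. Mientras que la precisión mide qué fracción de los elementos clasificados como positivos es realmente positiva, el recall mide qué fracción de los positivos reales es detectada por el modelo: recall = TP / (TP + FN), donde FN es el número de falsos negativos (positivos no detectados).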

In [61]:
# Recall of Naive Bayes on its out-of-fold predictions, treating 'spam' as
# the positive class.
recall_score(
    y_true=targets,
    y_pred=nv_pred,
    pos_label='spam',
    average='binary',
)

0.7568238213399504

### Validando con datos de pruebas

In [62]:
# Held-out labels and messages; the messages are encoded with the vectorizer
# fitted on the training split (transform only, no refit).
targets_test = test_set['class'].copy()
counts_test = vectorizer.transform(raw_documents=test_set['message'].values)

# Predict with the Naive Bayes model on the held-out rows.
test_predict = nv_clf.predict(X=counts_test)

# Precision on the test split, treating 'spam' as the positive class.
precision_score(
    y_true=targets_test,
    y_pred=test_predict,
    pos_label='spam',
    average='binary',
)

0.9866666666666667