# Naive Bayes (ALGORITMO)

### Import

In [185]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Análise Exploratória de Dados

In [186]:
# Load the raw dataset; the relative path is resolved against the current working directory.
emails = pd.read_csv('spam_or_not_spam.csv')

In [187]:
# Preview the first rows of the raw frame.
emails.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [188]:
# (rows, columns) of the raw frame, before any cleaning.
emails.shape

(3000, 2)

In [189]:
# Drop rows that are exact duplicates across all columns; `emails` itself is left unchanged.
emails_dedupl = emails.drop_duplicates()

In [190]:
# Drop rows that have a missing value in any column.
emails_filtered = emails_dedupl.dropna()

In [191]:
# (rows, columns) remaining after de-duplication and null removal.
emails_filtered.shape

(2872, 2)

### Tratamento de Texto

In [192]:
# Work on a copy so the column assignments in the following cells do not touch `emails_filtered`.
df = emails_filtered.copy()

In [193]:
# Single Porter stemmer instance, reused for every token in the stemming step.
stemmer = PorterStemmer()

In [194]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download("stopwords")
# Resolve the corpus through the `nltk` package rather than through the
# imported `stopwords` name: this cell rebinds `stopwords` to a plain list of
# English stop words, so `stopwords.words(...)` would raise AttributeError if
# the cell were executed a second time in the same kernel.
stopwords = nltk.corpus.stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\laura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [195]:
# Tokenise each email on whitespace; every row of `unstemmed` becomes a list of tokens.
df['unstemmed'] = df['email'].str.split()

In [196]:
# Replace every non-letter with a space, lower-case, then split again so that
# each list element is a single clean word. Without the re-split a token such
# as "won't" would stay as the one string "won t" (and a purely numeric token
# as a run of spaces), which the stop-word filter can never match and the
# stemmer cannot treat as one word.
df['unstemmed'] = df['unstemmed'].apply(
    lambda tokens: [
        piece
        for token in tokens
        for piece in re.sub(r"[^A-Za-z]", " ", token).lower().split()
    ]
)

In [197]:
# Stem every token that does not appear in the stop-word list.
df['stemmed'] = df['unstemmed'].apply(
    lambda tokens: [stemmer.stem(tok) for tok in tokens if tok not in stopwords]
)

In [198]:
# Second stop-word pass, this time over the stems: drop any stem that itself
# appears in the stop-word list.
df['stemmed_stopwords'] = df['stemmed'].apply(
    lambda stems: list(filter(lambda stem: stem not in stopwords, stems))
)

In [199]:
# Glue the surviving stems back into one space-separated string per email.
df['features'] = df['stemmed_stopwords'].map(' '.join)

In [200]:
# Keep only the model input text and the target column.
df_prod = df.loc[:, ['features', 'label']]

In [201]:
# Preview the processed text next to its label.
df_prod.head()

Unnamed: 0,features,label
0,date wed number aug number number number numbe...,0
1,martin post tasso papadopoulo greek sculptor b...,0
2,man threaten explos moscow thursday august num...,0
3,klez viru die alreadi prolif viru ever klez co...,0
4,ad cream spaghetti carbonara effect pasta make...,0


In [202]:
# (rows, columns) of the modelling frame.
df_prod.shape

(2872, 2)

### Codificações Baseadas em Palavras

In [203]:
# `train_test_split` is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.

# Fixed seed so the three subsets are identical on every Restart & Run All.
SPLIT_SEED = 42

# Hold out the default 25% of rows as the test set. Stratifying on `label`
# keeps the ham/spam ratio the same in every subset, which matters because
# spam is the minority class in this dataset.
train_and_validation, test = train_test_split(
    df_prod,
    stratify=df_prod['label'],
    random_state=SPLIT_SEED,
)

# Carve 20% of the remaining rows off as the validation set.
train, validation = train_test_split(
    train_and_validation,
    test_size=0.2,
    stratify=train_and_validation['label'],
    random_state=SPLIT_SEED,
)

print('Entradas por Dataset')
print('Dataset de Treino: ', len(train), 'linhas.')
print('Dataset de Validação: ', len(validation), 'linhas.')
print('Dataset de Teste: ', len(test), 'linhas.')

Entradas por Dataset
Dataset de Treino:  1723 linhas.
Dataset de Validação:  431 linhas.
Dataset de Teste:  718 linhas.


In [204]:
# Preview the training subset; the index keeps the row labels from `df_prod`.
train.head()

Unnamed: 0,features,label
2011,url url date number number numbertnumb number ...,0
282,mon number number number number number matthia...,0
1,martin post tasso papadopoulo greek sculptor b...,0
533,john hall wrote fast normal termin veloc much ...,0
1283,matthia saou wrote upon time roi wrote oh xmm ...,0


In [205]:
# Partition the training rows by class: label 0 goes to ham, label 1 to spam.
train_ham = train.loc[train['label'].eq(0)]
train_spam = train.loc[train['label'].eq(1)]

In [206]:
# Report how many training rows fall in each class.
print("Treino #ham: ", len(train_ham))
print("Treino #spam: ", len(train_spam))

Treino #ham:  1448
Treino #spam:  275


In [207]:
# Whole number of times the spam set fits into the ham set. Clamped to at
# least 1 so that, if spam ever outnumbers ham, the floor division cannot
# produce 0 and make the oversampling step discard every spam row.
oversampling_factor = max(1, len(train_ham) // len(train_spam))

In [208]:
# Oversample the spam rows by drawing with replacement until the set is
# `oversampling_factor` times its current size. The fixed `random_state`
# makes the resampled set identical on every run.
# NOTE: this cell rebinds `train_spam`, so re-running it on its own resamples
# the already-oversampled frame; re-run the cell that builds `train_spam` first.
train_spam = train_spam.sample(n=len(train_spam) * oversampling_factor,
                               replace=True,
                               random_state=42)

In [209]:
# Rebuild the training set from the ham rows followed by the oversampled spam
# rows. Index labels are kept as-is, so rows drawn more than once share the
# same label. This rebinds `train`; the ham/spam selection cell must not be
# re-run after this one without first re-running the split.
train = pd.concat([train_ham, train_spam])

# Naive Bayes