### OS:
 The OS module in Python provides a way of using operating system dependent functionality.
### IO:
 in Python 3.x it is the default interface to access files and streams.

In [31]:
import os
import io

In [32]:
import numpy as np
from pandas import DataFrame

### Count Vectorizer 
Converts a collection of text documents to a matrix of token counts

This implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix.

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

### Multinomial Naive Bayes:
Parameter estimation and event models. ... The assumptions on distributions of features are called the event model of the Naive Bayes classifier. For discrete features like the ones encountered in document classification (include spam filtering), multinomial and Bernoulli distributions are popular.

In [34]:
from sklearn.naive_bayes import MultinomialNB

In [61]:
def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)
            
            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            
            for line in f:
                if inBody:
                    lines.append(line)
                elif line == '\n': # it finds the first empty line and we suppose everything under that empty line is body.
                    inBody=True
            f.close()
            message = '\n'.join(lines)
            yield path, message
                

### good explanation about "yield"
https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do

### example of os.walk()

In [36]:
for root, dirs, files in os.walk(".", topdown=False):
   for name in files:
      print(os.path.join(root, name))
   for name in dirs:
      print(os.path.join(root, name))

.\.ipynb_checkpoints\Implementing a Spam Classifier with  Naive Bayes-checkpoint.ipynb
.\Implementing a Spam Classifier with  Naive Bayes.ipynb
.\.ipynb_checkpoints


In [62]:
def dataFrameFromDirectory(path, classification):
    rows=[]
    index=[]
    for filename, message in readFiles(path):
        rows.append({'message':message, 'class':classification})
        index.append(filename)
    return DataFrame(rows, index=index)    

In [65]:
data = DataFrame({'message':[], 'class':[]})
data = data.append(dataFrameFromDirectory('C:\\Users\\mojiway\\Desktop\\tutorials\\data-science-frank-kane-udemy\\my-files-data-science-udemy-frank-kane\\DataScience-Python3\\emails\\spam', 'spam'))
data = data.append(dataFrameFromDirectory('C:\\Users\\mojiway\\Desktop\\tutorials\\data-science-frank-kane-udemy\\my-files-data-science-udemy-frank-kane\\DataScience-Python3\\emails\\ham', 'ham'))


In [67]:
data

Unnamed: 0,class,message
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\spam\00001.7848dde101aa985090474a91ec93fcf0,spam,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr..."
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\spam\00002.d94f1b97e48ed3b553b3508d116e6a09,spam,1) Fight The Risk of Cancer!\n\nhttp://www.adc...
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\spam\00003.2ee33bc6eacdb11f38d052c44819ba6c,spam,1) Fight The Risk of Cancer!\n\nhttp://www.adc...
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\spam\00004.eac8de8d759b7e74154f142194282724,spam,##############################################...
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\spam\00005.57696a39d7d84318ce497886896bf90d,spam,I thought you might like these:\n\n1) Slim Dow...
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\spam\00006.5ab5620d3d7c6c0db76234556a16f6c1,spam,A POWERHOUSE GIFTING PROGRAM You Don't Want To...
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\spam\00007.d8521faf753ff9ee989122f6816f87d7,spam,Help wanted. We are a 14 year old fortune 500...
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\spam\00008.dfd941deb10f5eed78b1594b131c9266,spam,<html>\n\n<head>\n\n<title>ReliaQuote - Save U...
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\spam\00009.027bf6e0b0c4ab34db3ce0ea4bf2edab,spam,TIRED OF THE BULL OUT THERE?\n\nWant To Stop L...
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\spam\00010.445affef4c70feec58f9198cfbc22997,spam,"Dear ricardo1 ,\n\n\n\n<html>\n\n<body>\n\n<ce..."


### what'a next?

 we need to use countVectorizer() to go through all messages, and count all words and tell us how many times each word repeats.
 we call that tokenizing the words. the variable "counts" here will demonstrate it. simply each message will be indicated as a list of numbers for each word's repetition. 

In [69]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

<3000x62964 sparse matrix of type '<class 'numpy.int64'>'
	with 429785 stored elements in Compressed Sparse Row format>

now its time to use the multinomial naive bayes classifier to do the analysis

In [73]:
classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## now we can try it out

In [74]:
examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['spam', 'ham'], dtype='<U4')

### train-test split

In [106]:
from sklearn.model_selection import train_test_split

In [107]:
X = data['message']
y = data['class']

In [108]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [109]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(X_train.values)
classifier = MultinomialNB()
targets = y_train.values

In [110]:
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [113]:

example_counts = vectorizer.transform(X_test)
pred = classifier.predict(example_counts)

In [114]:
pred

array(['ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam',
       'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam',
       'ham', 'ham', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham',
       'ham', 'ham', 'spam', 'ham', 'spam', 'spam', 'spam', 'ham', 'ham',
       'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham',
       'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam',
       'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam',
       'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam',
       'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', '

In [115]:
submission = DataFrame({
     "prediction": pred,
        "Reality": y_test
       
    })
submission.to_excel('results.xlsx', index=False)

In [116]:
submission

Unnamed: 0,Reality,prediction
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\ham\01302.06ac4ee24e3c434afa330b8c0408649b,ham,ham
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\ham\00691.fe0daf79c97e1e314de953d18efc37e2,ham,ham
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\ham\01318.db11c5ecba49aba4fb12fbead712f815,ham,ham
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\spam\00252.7e355e0c5fd1de609684544262435579,spam,spam
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\ham\02006.e6e362eb554a8cebfec08a622aa57ade,ham,ham
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\ham\00618.bec430e993398552fbf09c76e04b9994,ham,ham
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\ham\00912.f7faf669f2794a54dd91d62e7ac3b904,ham,ham
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\ham\01614.005f290e71d13de44d3503c640dfe57c,ham,ham
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\spam\00409.e59f63e813b6766a9a4ddf0790634ca3,spam,spam
C:\Users\mojiway\Desktop\tutorials\data-science-frank-kane-udemy\my-files-data-science-udemy-frank-kane\DataScience-Python3\emails\ham\02080.dada2901a28c2e2eb93daad554ff9e0e,ham,ham
