# NLP Demonstration with Jupyter Notebook

## Load Training Data

In [1]:
import pandas as pd

# Read the training CSV and keep only the `text` and `spam` columns.
data = pd.read_csv("data/emails.csv")[['text', 'spam']]


In [2]:
# Show the loaded training frame; pandas abbreviates the printed output,
# so only the leading and trailing rows plus the overall shape appear.
print(data)

                                                   text spam
0     Subject: naturally irresistible your corporate...    1
1     Subject: the stock trading gunslinger  fanny i...    1
2     Subject: unbelievable new homes made easy  im ...    1
3     Subject: 4 color printing special  request add...    1
4     Subject: do not have money , get software cds ...    1
...                                                 ...  ...
5725  Subject: re : research and development charges...    0
5726  Subject: re : receipts from visit  jim ,  than...    0
5727  Subject: re : enron case study update  wow ! a...    0
5728  Subject: re : interest  david ,  please , call...    0
5729  Subject: news : aurora 5 . 2 update  aurora ve...    0

[5730 rows x 2 columns]


## Initialise Model

In [3]:
# set correct imports
from src.model import SpamModel
from src.lemmantisation import lemmantisation
from src.porter_stemmer import run_porter
from src.stop_words import remove_stopwords
from src.nlp import nlp
from src.tokenisation import tokenise

# Build the classifier from the imported preprocessing callables.
# They are passed positionally, so their order must stay as written.
model = SpamModel(tokenise, remove_stopwords, lemmantisation, run_porter, nlp)

## Train the Model

In [4]:
# Choose your preprocessing strategy
# Choose '1' -> Lemmatisation
# Choose '2' -> Porter Stemmer
#
# The examples that follow are presented as lemmatisation tests, so
# strategy 1 is used here (the Porter run is trained separately later on).
# Note that `data` is rebound to whatever train_model returns.
data = model.train_model(data, 1)

## Test Examples

#### Testing with Lemmatisation Preprocessing

In [5]:
# Wrap one message in a single-row frame with a `text` column,
# run it through the trained model and print the result.
data_test = pd.DataFrame({'text': ['Hi pione get rich quick encod iast']})
predict = model.predict(data_test)
print(predict)

SPAM


In [6]:
# Short greeting as the single message to classify.
data_test = pd.DataFrame({'text': ['Hi there']})
predict = model.predict(data_test)
print(predict)

HAM


In [7]:
# Single sentence containing an apostrophe and punctuation as the message to classify.
data_test = pd.DataFrame({'text': ["I'm going to be rich!"]})
predict = model.predict(data_test)
print(predict)

SPAM


#### Testing with Porter Stemming

In [8]:
# Reload from the CSV: the earlier training call rebound `data` to its
# return value, so the raw `text`/`spam` frame has to be read again.
data = pd.read_csv("data/emails.csv")[['text', 'spam']]

In [9]:
# Construct a new SpamModel instance for the second training run,
# passing the same callables in the same positional order as before.
model = SpamModel(tokenise, remove_stopwords, lemmantisation, run_porter, nlp)

In [10]:
# Preprocessing strategy selector passed as the second argument:
#   1 -> lemmatisation
#   2 -> Porter stemmer (used for this section)
data = model.train_model(data, 2)

In [11]:
# Wrap one message in a single-row frame with a `text` column,
# run it through the retrained model and print the result.
data_test = pd.DataFrame({'text': ['Hi pione get rich quick encod iast']})
predict = model.predict(data_test)
print(predict)

SPAM


In [12]:
# Short greeting as the single message to classify.
data_test = pd.DataFrame({'text': ['Hi there']})
predict = model.predict(data_test)
print(predict)

HAM


In [13]:
# Single sentence containing an apostrophe and punctuation as the message to classify.
data_test = pd.DataFrame({'text': ["I'm going to be rich!"]})
predict = model.predict(data_test)
print(predict)

SPAM
