<a href="https://colab.research.google.com/github/phongloihong/email_classification/blob/main/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

# NLTK resources needed by the preprocessing cell: the English stop-word list
# and the Punkt models behind nltk.word_tokenize.
# Newer NLTK releases load the tokenizer from 'punkt_tab' rather than 'punkt',
# so request both to keep the notebook runnable on either.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Fetch the dataset CSV from Google Drive by file ID; the recorded run saved it
# as /content/spam_text_classifier.csv.
# NOTE(review): newer gdown releases take the file ID positionally and may have
# deprecated `--id` — confirm against the installed gdown version.
!gdown --id 1mjQXesbVYhMrhi1CWDgAQtadblU9VMmh

Downloading...
From: https://drive.google.com/uc?id=1mjQXesbVYhMrhi1CWDgAQtadblU9VMmh
To: /content/spam_text_classifier.csv
100% 486k/486k [00:00<00:00, 47.4MB/s]


In [None]:
# Load the downloaded CSV and display only the rows whose Category is 'spam'.
DATASET_PATH = '/content/spam_text_classifier.csv'
df = pd.read_csv(DATASET_PATH)
is_spam = df['Category'] == 'spam'
df[is_spam]

Unnamed: 0,Category,Message
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
5537,spam,Want explicit SEX in 30 secs? Ring 02073162414...
5540,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,spam,Had your contract mobile 11 Mnths? Latest Moto...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [None]:
# Raw message texts and their category names as plain Python lists.
messages = df['Message'].tolist()
labels = df['Category'].tolist()

## Encode labels

In [None]:
# Learn the category -> integer mapping, then encode every label with it.
# `le` stays in scope so predictions can be decoded back to class names later.
le = LabelEncoder().fit(labels)
y = le.transform(labels)
print(f'Classes: {le.classes_}')
print(f'Encoded labels: {y}')

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


## Preprocessing

In [None]:
def lowercase(text):
  """Return `text` converted to lower case."""
  lowered = text.lower()
  return lowered


def punctuation_removal(txt):
  """Return `txt` with every character found in `string.punctuation` dropped."""
  kept_chars = [char for char in txt if char not in string.punctuation]
  return ''.join(kept_chars)


def tokenize(text):
  """Split `text` into tokens with NLTK's `word_tokenize` and return them."""
  tokens = nltk.word_tokenize(text)
  return tokens


# Stop-word sets already loaded from NLTK, keyed by language name.
_STOPWORD_CACHE = {}


def _stopword_set(language='english'):
  """Return the NLTK stop words for `language` as a frozenset, loading them once."""
  if language not in _STOPWORD_CACHE:
    _STOPWORD_CACHE[language] = frozenset(nltk.corpus.stopwords.words(language))
  return _STOPWORD_CACHE[language]


def remove_stopwords(tokens):
  """Drop English stop words from a token sequence.

  Args:
    tokens: Iterable of string tokens.

  Returns:
    list: The tokens that are not in NLTK's English stop-word list, in their
    original order.
  """
  # The corpus is read only on the first call; a set makes each membership
  # test O(1) instead of a scan over the whole stop-word list.
  stopwords = _stopword_set()
  return [token for token in tokens if token not in stopwords]


def stemming(tokens):
  """Return the Porter stem of every token, in the same order as the input."""
  stem = nltk.stem.PorterStemmer().stem
  return list(map(stem, tokens))


def preprocessing_text(text):
  """Run the full cleaning pipeline on a raw string and return the result.

  Stages are applied in this order: lowercase, punctuation_removal, tokenize,
  remove_stopwords, stemming. Each stage receives the previous stage's output.
  """
  pipeline = (lowercase, punctuation_removal, tokenize, remove_stopwords, stemming)
  result = text
  for stage in pipeline:
    result = stage(result)
  return result


# Sanity-check the pipeline on one sentence.
# Note: INPUT_TEXT is rebound from the raw string to the value returned by
# preprocessing_text (the processed tokens printed below).
INPUT_TEXT = "The cat's toys are scattered everywhere!"
INPUT_TEXT = preprocessing_text(INPUT_TEXT)
print(INPUT_TEXT)

['cat', 'toy', 'scatter', 'everywher']


In [None]:
# Tokenize from the untouched DataFrame column rather than from `messages`
# itself: re-running this cell would otherwise feed already-tokenized lists
# back into preprocessing_text, which expects raw strings.
messages = [preprocessing_text(message) for message in df['Message']]

In [None]:
# Show the processed tokens of the first message.
messages[0]

['go',
 'jurong',
 'point',
 'crazi',
 'avail',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amor',
 'wat']

## Build vocabulary

In [None]:
def create_dictionary(messages):
    """Collect the vocabulary of a tokenized corpus.

    Args:
        messages: Iterable of token sequences, one sequence per message.
            Tokens must be hashable (strings in this notebook).

    Returns:
        list: Every distinct token, ordered by first appearance.
    """
    # dict keys are unique and keep insertion order, so one pass with O(1)
    # membership replaces rescanning a growing list for every token.
    seen = {}

    for tokens in messages:
        for token in tokens:
            seen.setdefault(token, None)

    return list(seen)

def create_features(tokens, dictionary):
    """Build a bag-of-words count vector for one tokenized message.

    Args:
        tokens: Iterable of tokens belonging to a single message.
        dictionary: Sequence of vocabulary tokens; element i of the result
            counts how often dictionary[i] occurs in `tokens`.

    Returns:
        numpy.ndarray: Float vector of length len(dictionary). Tokens that are
        not in the dictionary are ignored.
    """
    features = np.zeros(len(dictionary))

    # Map each vocabulary token to its column once, instead of scanning the
    # list twice per token (`in` + `.index`). setdefault keeps the first
    # position if a token is repeated, matching list.index.
    column_of = {}
    for column, word in enumerate(dictionary):
        column_of.setdefault(word, column)

    for token in tokens:
        column = column_of.get(token)
        if column is not None:
            features[column] += 1

    return features

# Build the vocabulary from all processed messages, then stack one count
# vector per message into X (one row per message, one column per vocabulary
# token, as the printed shape shows).
dictionary = create_dictionary(messages)
X = np.array([create_features(tokens, dictionary) for tokens in messages])
print(X.shape)
print(X)

(5572, 8166)
[[1. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Split into train, validation, and test sets

In [None]:
# Hold-out fractions. TEST_SIZE is applied to what is left after the
# validation split, so 0.125 of the remaining 80% is 10% of the full dataset.
VAL_SIZE = 0.2
TEST_SIZE = 0.125
SEED = 0
IS_SHUFFLE = True

# Options shared by both splits.
split_options = {'shuffle': IS_SHUFFLE, 'random_state': SEED}

# First carve off the validation set ...
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_SIZE, **split_options
)

# ... then take the test set out of the remaining training data.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=TEST_SIZE, **split_options
)

print(f'Number of training samples {X_train.shape[0]}')
print(f'Number of validate samples {X_val.shape[0]}')
print(f'Number of test samples {X_test.shape[0]}')

Number of training samples 3899
Number of validate samples 1115
Number of test samples 558


## Train model

In [None]:
%%time
# Fit a Gaussian Naive Bayes classifier on the training split; %%time reports
# how long the whole cell takes.
# NOTE(review): the features are raw token counts; MultinomialNB is the usual
# Naive Bayes variant for count data — worth comparing before relying on this.
model = GaussianNB()
print('Start training...')

model = model.fit(X_train, y_train)
print("Training completed")

Start training...
Training completed
CPU times: user 410 ms, sys: 285 ms, total: 695 ms
Wall time: 720 ms


## Evaluation

In [None]:
# Score the fitted model on the validation split ...
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

# ... and on the held-out test split.
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Val accuracy: {val_accuracy}')
print(f'Test accuracy: {test_accuracy}')


Val accuracy: 0.8816143497757848
Test accuracy: 0.8602150537634409


## Prediction

In [None]:
def predict(text, model, dictionary):
  """Classify one raw message and return its class name.

  Args:
    text: Raw, unprocessed message string.
    model: Fitted classifier whose `predict` accepts a (1, n_features) array.
    dictionary: Vocabulary list that was used to build the training features.

  Returns:
    The decoded class label for `text`, obtained through the notebook-level
    LabelEncoder `le`.
  """
  processed_text = preprocessing_text(text)
  # Featurize the processed tokens, not the raw string: iterating the raw
  # string would match individual characters against the vocabulary.
  features = create_features(processed_text, dictionary)
  features = np.array(features).reshape(1, -1)
  prediction = model.predict(features)
  prediction_cls = le.inverse_transform(prediction)[0]

  return prediction_cls

# Try the classifier on a single SMS-style message.
# NOTE(review): 'ú1.20' in the text looks like a mis-encoded currency symbol
# (possibly '£1.20') — confirm against the dataset's encoding before changing.
test_input = "England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to 87077 Try:WALES, SCOTLAND 4txt/ú1.20 POBOXox36504W45WQ 16+"
prediction_cls = predict(test_input, model, dictionary)
print(f'Prediction: {prediction_cls}')

Prediction: spam
