<a href="https://colab.research.google.com/github/ngohuy1811/AIO---Module-2/blob/main/Text_Classification_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PREPARE DATA**

In [36]:
#Load libraries
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')

import pandas as pd #doc file csv
import numpy as np #thu vien cho array
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB #model naive bayes
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
#Download data
!gdown --id 1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R

Downloading...
From: https://drive.google.com/uc?id=1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R
To: /content/2cls_spam_text_cls.csv
100% 486k/486k [00:00<00:00, 122MB/s]


In [37]:
#Set data path and read
DATASET_PATH = '/content/2cls_spam_text_cls.csv'
df = pd.read_csv(DATASET_PATH) #read data file
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [38]:
#Split Category and Message into different list
messages = df['Message'].values.tolist() #.tolist return a list of the values
labels = df['Category'].values.tolist()

In [45]:
# Đưa 2 labels về dạng 0, 1
le = LabelEncoder()
y = le.fit_transform(labels)
print(f'Classes: {le.classes_}')
print(f'Encoded labels: {y}')

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


In [39]:
#LOWERCASE
def lowercase(text):
  return text.lower()

#PUNCTUATION REMOVAL
def punctuation_removal(text):
  translator = str.maketrans('', '', string.punctuation)

  return text.translate(translator)

#TOKENIZE
def tokenize(text):
  return nltk.word_tokenize(text)

#REMOVE STOPWORD
def remove_stopwords(tokens):
  stop_words = nltk.corpus.stopwords.words('english')

  return [token for token in tokens if token not in stop_words]

#STEMMING
def stemming(tokens):
  stemmer = nltk.stem.porter.PorterStemmer()

  return [stemmer.stem(token) for token in tokens]

def preprocess_text(text):
  text = lowercase(text)
  text = punctuation_removal(text)
  tokens = tokenize(text)
  tokens = remove_stopwords(tokens)
  tokens = stemming(tokens)

  return tokens

# **DATA PRE-PROCESSING**

In [40]:
messages = [preprocess_text(message) for message in messages]

In [41]:
messages

[['go',
  'jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'u', 'oni'],
 ['free',
  'entri',
  '2',
  'wkli',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkt',
  '21st',
  'may',
  '2005',
  'text',
  'fa',
  '87121',
  'receiv',
  'entri',
  'questionstd',
  'txt',
  'ratetc',
  'appli',
  '08452810075over18'],
 ['u', 'dun', 'say', 'earli', 'hor', 'u', 'c', 'alreadi', 'say'],
 ['nah', 'dont', 'think', 'goe', 'usf', 'live', 'around', 'though'],
 ['freemsg',
  'hey',
  'darl',
  '3',
  'week',
  'word',
  'back',
  'id',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chg',
  'send',
  '£150',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'mell',
  'mell',
  'oru',
  'minnaminungint',
  'nurungu',
  'vettam',
  'set',
  'callertun',
  'caller',
  'press',
  '9',
  'copi',
  '

In [44]:
#BUILD VOCAB
def create_dictionary(messages):
  dictionary = []
  for tokens in messages:
    for token in tokens:
      if token not in dictionary:
        dictionary.append(token)
  return dictionary

dictionary = create_dictionary(messages)
dictionary [:10] #in 10 tu dau tien

['go',
 'jurong',
 'point',
 'crazi',
 'avail',
 'bugi',
 'n',
 'great',
 'world',
 'la']

In [43]:
#CREATE FEATURES:
def create_features(tokens, dictionary):
  features = np.zeros(len(dictionary))

  for token in tokens:
    if token in dictionary:
      features[dictionary.index(token)] += 1
  return features

x = np.array([create_features(tokens,dictionary) for tokens in messages])
print(x.shape)
print(x)

(5572, 8166)
[[1. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# **PROCESSING**

In [52]:
#Phan chia bo du lieu ra theo 7 train - 2 validate - 1 test
VAL_SIZE = 0.2
TEST_SIZE = 0.125
SEED = 0
IS_SHUFFLE = True

x_train, x_val, y_train, y_val = train_test_split(
    x, y,
    test_size=VAL_SIZE,
    shuffle=IS_SHUFFLE,
    random_state=SEED
    )
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train,
    test_size=TEST_SIZE,
    shuffle=IS_SHUFFLE,
    random_state=SEED
    )

In [53]:
print(x_train.shape[0])
print(x_val.shape[0])
print(x_test.shape[0])

3899
1115
558


In [55]:
#MODEL TRAINING

%%time
model = GaussianNB()
print('Hold my beer...!')
model.fit(x_train, y_train)
print('NAILED IT!')


Hold my beer...!
NAILED IT!
CPU times: user 322 ms, sys: 139 ms, total: 461 ms
Wall time: 454 ms


# **EVALUATION**

In [56]:
# EVALUATION
y_val_pred = model.predict(x_val)
y_test_pred = model.predict(x_test)
val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Validation Accuracy: {val_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

Validation Accuracy: 0.8816143497757848
Test Accuracy: 0.8602150537634409


In [59]:
#PREDICTION
def predict(text, model, dictionary):
  processed_text = preprocess_text(text) # pre-process
  features = create_features(text, dictionary) # create feature
  features = np.array([features]).reshape(1, -1) # reshape
  prediction = model.predict(features) # predict
  prediction_cls = le.inverse_transform(prediction)[0] # prediction decoding

  return prediction_cls


In [63]:
test_input = "Wanna buy a house with cheap prize?"
prediction_cls = predict(test_input, model, dictionary)
print(f'Prediction: {prediction_cls}')

Prediction: ham
