### Import Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import sys
sys.path.append("../data")

from process_data import process_data

import numpy as np
import pandas as pd
import re

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mikelu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load & pre-process Data Frame

In [2]:
X_train, y_train, X_test, y_test = process_data(buckets=10)

print("Before Cleanining")
X_train[['title','description']]

Before Cleanining


Unnamed: 0,title,description
2316,Filthiest Dunks in NBA History,The best dunks in NBA history. Let me know if ...
406,Rating EVERY American Cereal with Ludwig and A...,This will be the ultimate tier list of all the...
645,Easiest (and Hardest) Countries to Take Over,The world is filled with dominating military f...
1206,"$1 vs $1,000,000 Hotel Room!",The hotel at the end is worth the wait!\n\nDow...
592,Mayweather vs. Paul: Ceremonial Weigh-In | SHO...,Hall of Fame boxing legend Floyd Mayweather an...
...,...,...
1095,Elden Ring OFFICIAL DLC in Miquella's Dream Co...,The DLC is finally revealed! ELDEN RING: Shado...
1462,Kirby but tier list,Kirby Triple Deluxe but I finally have an opin...
2197,"LATEST! MARCH 18, 2023 PINOY boxer NO MERC...","LATEST! MARCH 18, 2023 PINOY boxer NO MERC..."
1039,As It Was - Harry Styles,🎶Lyrics:\nHoldin' me back\nGravity's holdin' m...


In [3]:
def clean_text(text):
    """Function to clean up text by making all characters lowercase,
    removing non-alphanumeric characters, and removing common stop words"""

    # make the text lowercase
    text = text.lower()
    
    # remove non-alphabetic characters (including digits and punctuation)
    text = re.sub("[^a-zA-Z]", ' ', text)

    # remove common stop words
    stop_words = set(stopwords.words("english"))
    text = [w for w in text.split() if w not in stop_words]

    return " ".join(text)

X_train['description'] = X_train['description'].apply(clean_text)
X_test['description'] = X_test['description'].apply(clean_text)
X_train['title'] = X_train['title'].apply(clean_text)
X_test['title'] = X_test['title'].apply(clean_text)

print("After Cleaning")
X_train[['title','description']]

After Cleaning


Unnamed: 0,title,description
2316,filthiest dunks nba history,best dunks nba history let know make part than...
406,rating every american cereal ludwig abroad japan,ultimate tier list different cereal could find...
645,easiest hardest countries take,world filled dominating military forces around...
1206,vs hotel room,hotel end worth wait download experian app htt...
592,mayweather vs paul ceremonial weigh showtime ppv,hall fame boxing legend floyd mayweather socia...
...,...,...
1095,elden ring official dlc miquella dream confirm...,dlc finally revealed elden ring shadow erdtree...
1462,kirby tier list,kirby triple deluxe finally opinion thank note...
2197,latest march pinoy boxer mercy tinalo ang form...,latest march pinoy boxer mercy tinalo ang form...
1039,harry styles,lyrics holdin back gravity holdin back want ho...


### Classification Using Bag of Words Approach

In [4]:
def classifyByText(X_train, Y_train, X_test, Y_test):
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train) 

    print(f"  {X_train_counts.shape[0]:0d} records containing",
          f"{X_train_counts.shape[1]:0d} unique words")

    clf_MNB_pipe = Pipeline([("vect", CountVectorizer()), 
                             ("tfidf", TfidfTransformer()), 
                             ("clf_nominalNB", MultinomialNB())])
    clf_MNB_pipe.fit(X_train, Y_train)

    predictedMNB = clf_MNB_pipe.predict(X_test)

    print(f"  Accuracy on Test dataset: {np.mean(predictedMNB == Y_test)*100:0.2f}%")
    
print("Classify by Description")
classifyByText(X_train['description'], y_train['views_category_10'], 
               X_test['description'], y_test['views_category_10'])
print("")
print("Classify by Title")
classifyByText(X_train['title'], y_train['views_category_10'],
               X_test['description'], y_test['views_category_10'])

Classify by Description
  1859 records containing 29442 unique words
  Accuracy on Test dataset: 21.08%

Classify by Title
  1859 records containing 4605 unique words
  Accuracy on Test dataset: 11.40%


### Classification Using Embeddings

In [5]:
def classifyByTextEmbeddings(X_train, Y_train, X_test, Y_test,
                             vocab_size=5000, sequence_len=100,
                             embedding_dim=2, num_epochs=5):

    tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
    tokenizer.fit_on_texts(pd.concat([X_train, X_test]))

    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_train_padded_seq = pad_sequences(X_train_seq, 
                                       maxlen=sequence_len, 
                                       padding='post', 
                                       truncating='post')
    
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(input_dim=vocab_size,
                                        output_dim=embedding_dim,
                                        input_length=sequence_len))
    model.add(tf.keras.layers.GlobalAveragePooling1D())
    model.add(tf.keras.layers.Dense(units=10, activation='softmax'))

    model.compile(optimizer='adam', 
                  loss='sparse_categorical_crossentropy', 
                  metrics=['accuracy'])
    
    display(model.summary())
    
    model.fit(
        x=X_train_padded_seq,
        y=Y_train,
        epochs=num_epochs,
        batch_size=64,
        validation_split=0.1,
        verbose=1
    )
    
    X_test_seq = tokenizer.texts_to_sequences(X_test)
    X_test_padded_seq = pad_sequences(X_test_seq, 
                                      maxlen=sequence_len, 
                                      padding='post', 
                                      truncating='post')
    
    metrics = model.evaluate(
                  x=X_test_padded_seq,
                  y=Y_test,
                  return_dict=True
              )
    
    print("")
    print(f"Test Loss: {metrics['loss']:0.4f}")
    print(f"Test Accuracy: {metrics['accuracy']:0.4f}")

In [6]:
classifyByTextEmbeddings(X_train=X_train['description'], 
                         Y_train=y_train['views_category_10'],
                         X_test=X_test['description'],
                         Y_test=y_test['views_category_10'],
                         vocab_size=1500, sequence_len=5,
                         embedding_dim=3, num_epochs=10)

Metal device set to: Apple M2 Pro
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 5, 3)              4500      
                                                                 
 global_average_pooling1d (G  (None, 3)                0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 10)                40        
                                                                 
Total params: 4,540
Trainable params: 4,540
Non-trainable params: 0
_________________________________________________________________


None

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Test Loss: 0.2208
Test Accuracy: 0.1097


In [7]:
classifyByTextEmbeddings(X_train=X_train['title'], 
                         Y_train=y_train['views_category_10'],
                         X_test=X_test['title'],
                         Y_test=y_test['views_category_10'],
                         vocab_size=100, sequence_len=3,
                         embedding_dim=3, num_epochs=10)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 3)              300       
                                                                 
 global_average_pooling1d (G  (None, 3)                0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 10)                40        
                                                                 
Total params: 340
Trainable params: 340
Non-trainable params: 0
_________________________________________________________________


None

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Test Loss: 0.2145
Test Accuracy: 0.1097
