# 3. Category Prediction using Video Title and Tags - Part 2

# Table of contents
1. [Import libraries](#import_libraries)
2. [Load GloVe embeddings and processed data](#load)
3. [Create GloVe feature vectors](#glove)
5. [Create TF-IDF feature vectors](#tfidf)
6. [Training and tuning FFN](#ffn) (Commented code contains the combinations we tried)

### Import libraries <a name="import_libraries"></a>

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from tqdm import tqdm
from pathlib import Path
cwd = Path('.')

  '{0}.{1}.{2}'.format(*version.hdf5_built_version_tuple)


In [2]:
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from collections import Counter
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AJL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AJL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
tf.__version__

'2.0.0'

In [4]:
from random import seed
RANDOM = 42
seed(RANDOM)
np.random.seed(RANDOM)
tf.random.set_seed(RANDOM)

### Load GloVe embeddings and processed data <a name="load"></a>

In [5]:
# Location of your glove.twitter.27B.200d.txt file
GLOVE_LOCATION = r'D:\Downloads\glove.twitter.27B\glove.twitter.27B.200d.txt'

In [6]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r', encoding="utf8") as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [7]:
def sentence_to_avg(sentence, word_to_vec_map):
    words = sentence.split()
    avg = np.zeros(200) # depends on glove
    num_valid_words = 0
    for w in words:
        if w in word_to_vec_map:
            avg += word_to_vec_map[w]
            num_valid_words += 1
    if num_valid_words == 0:
        return np.zeros(200)
    else:
        return avg / num_valid_words

In [8]:
df = pd.read_csv(cwd/'output'/"title_tags_videos_df.csv")

### Create GloVe feature vectors <a name="glove"></a>

In [9]:
if os.path.isfile('X_GloVe.npy'): # if cwd has the file
    X = np.load("X_GloVe.npy", allow_pickle=True)
else:
    title_and_tags = df['title_and_tags'].astype(str).tolist()
    word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(GLOVE_LOCATION)
    X = np.array([sentence_to_avg(x, word_to_vec_map) for x in tqdm(title_and_tags)])
    X = np.nan_to_num(X)
    np.save('X_GloVe.npy', X)

100%|██████████| 3388/3388 [00:00<00:00, 16654.27it/s]


### Create TF-IDF feature vectors <a name="tfidf"></a>

In [10]:
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(df['title_and_tags'].values).todense()

In [11]:
y = df['category_id'].to_numpy()

In [12]:
y.shape

(3388,)

In [13]:
X.shape

(3388, 200)

In [14]:
X_tfidf.shape

(3388, 16376)

### Training and Tuning FFN <a name="ffn"></a>
Best Combination --- Hidden layers: 768, 384, 64 w/ acc of 0.68)

In [15]:
# We can either use GloVe embeddings or TF-IDF vectors as input. Change network_input according.
# If using GLoVe embeddings, you will have 200 dimensions, if TF-IDF then 16376 dimensions.
network_input = X_tfidf # or X

In [16]:
network_input.shape

(3388, 16376)

In [17]:
num_layer_1 = [768] #[1024, 896, 768, 640, 512, 384, 256]
num_layer_2 = [384] #[896, 768, 640, 512, 384, 256, 128]
num_layer_3 = [64] #[768, 640, 512, 384, 256, 128, 64]

In [18]:
permutations= [(num_1, num_2, num_3) for num_3 in num_layer_3 for num_2 in num_layer_2 for num_1 in num_layer_1\
               if num_1 > num_2 > num_3]

#### Since we use early stopping and restore best wrights, we can afford to set a high epoch - it will automatically stop once loss stops improving

In [19]:
parameters_to_accuracy = {}
for permutation in permutations:
    accuracies = []
    skf = StratifiedKFold(n_splits=5)
    skf.get_n_splits(network_input, y)
    split_count = 1
    for train_index, test_index in skf.split(network_input, y):
        #print(f"Currently doing --- layer_1: {num_1}, layer_2: {num_2}, split: {split_count}/5")
        X_train, X_test = network_input[train_index], network_input[test_index]
        y_train, y_test = y[train_index], y[test_index]
        tf.keras.backend.clear_session()
        model = Sequential()
        for i, layer in enumerate(permutation):
            if i == 0:
                model.add(Dense(layer, input_dim=network_input.shape[1], activation='relu'))
            else:
                model.add(Dense(layer, activation='relu'))
            model.add(Dropout(rate=0.5))
        model.add(Dense(len(Counter(y)), activation='softmax'))
        model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        callbacks = [EarlyStopping(monitor='val_accuracy', patience=25, restore_best_weights=True)] #impt
        model.fit(X_train, y_train, epochs=1000, batch_size=32, validation_data=(X_test, y_test), callbacks=callbacks, verbose=0)
        _, accuracy = model.evaluate(X_test, y_test, verbose=0)
        accuracies.append(accuracy)
        print(f"Completed --------- {permutation}, accuracy: {accuracy}, split: {split_count}/5")
        split_count += 1
    average_accuracy = sum(accuracies)/len(accuracies)
    parameters_to_accuracy[permutation] = average_accuracy

Completed --------- (768, 384, 64), accuracy: 0.6950146555900574, split: 1/5
Completed --------- (768, 384, 64), accuracy: 0.6842877864837646, split: 2/5
Completed --------- (768, 384, 64), accuracy: 0.7132353186607361, split: 3/5
Completed --------- (768, 384, 64), accuracy: 0.6696296334266663, split: 4/5
Completed --------- (768, 384, 64), accuracy: 0.6358209252357483, split: 5/5


#### Sorted combinations (best to worst)

In [29]:
for i in sorted(list(parameters_to_accuracy.items()), key=lambda x:x[1], reverse=True):
    print(f"Parameters: {i[0]}, accuracy: {i[1]}")

Parameters: (768, 384, 64), accuracy: 0.6795976638793946
