# 4. Category Prediction using Video Title and Tags - Part 2

### Preparing GloVe Vectors

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from tqdm import tqdm
from pathlib import Path
cwd = Path('.')

  '{0}.{1}.{2}'.format(*version.hdf5_built_version_tuple)


In [2]:
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from collections import Counter
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AJL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AJL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
tf.__version__

'2.0.0'

In [4]:
from random import seed
RANDOM = 42
seed(RANDOM)
np.random.seed(RANDOM)
tf.random.set_seed(RANDOM)

In [5]:
# Location of glove.twitter.27B.200d.txt file
GLOVE_LOCATION = r'glove.twitter.27B.200d.txt'

In [6]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r', encoding="utf8") as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [7]:
def sentence_to_avg(sentence, word_to_vec_map):
    words = sentence.split()
    avg = np.zeros(200) # depends on glove
    num_valid_words = 0
    for w in words:
        if w in word_to_vec_map:
            avg += word_to_vec_map[w]
            num_valid_words += 1
    if num_valid_words == 0:
        return np.zeros(200)
    else:
        return avg / num_valid_words

In [8]:
df = pd.read_csv(cwd/'output'/"title_tags_videos_df.csv")

In [9]:
if os.path.isfile('X_GloVe.npy'): # if cwd has the file
    X = np.load("X_GloVe.npy", allow_pickle=True)
else:
    title_and_tags = df['title_and_tags'].astype(str).tolist()
    print(len(title_and_tags))
    word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(GLOVE_LOCATION)
    X = np.array([sentence_to_avg(x, word_to_vec_map) for x in tqdm(title_and_tags)])
    X = np.nan_to_num(X)
    np.save('X_GloVe.npy', X)

3388


FileNotFoundError: [Errno 2] No such file or directory: 'glove.twitter.27B.200d.txt'

In [14]:
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(df['title_and_tags'].values).todense()

In [15]:
y = df['category_id'].to_numpy()

In [16]:
y.shape

(3388,)

In [17]:
X.shape

(3388, 200)

In [18]:
X_tfidf.shape

(3388, 16376)

# Training and Tuning FFN
Best Combination --- Hidden layers: 768, 384, 64 w/ acc of 0.6876203536987304)

In [35]:
# We can either use GloVe embeddings or TF-IDF vectors as input. Change network_input according.
# If using GLoVe embeddings, you will have 200 dimensions, if TF-IDF then 16376 dimensions.
network_input = X_tfidf # or X

In [36]:
network_input.shape

(3388, 16376)

In [37]:
num_layer_1 = [768] #[1024, 896, 768, 640, 512, 384, 256]
num_layer_2 = [384] #[896, 768, 640, 512, 384, 256, 128]
num_layer_3 = [64] #[768, 640, 512, 384, 256, 128, 64]

In [38]:
permutations= [(num_1, num_2, num_3) for num_3 in num_layer_3 for num_2 in num_layer_2 for num_1 in num_layer_1\
               if num_1 > num_2 > num_3]

In [41]:
parameters_to_accuracy = {}
for permutation in permutations:
    accuracies = []
    skf = StratifiedKFold(n_splits=5)
    skf.get_n_splits(network_input, y)
    split_count = 1
    for train_index, test_index in skf.split(network_input, y):
        #print(f"Currently doing --- layer_1: {num_1}, layer_2: {num_2}, split: {split_count}/5")
        X_train, X_test = network_input[train_index], network_input[test_index]
        y_train, y_test = y[train_index], y[test_index]
        tf.keras.backend.clear_session()
        model = Sequential()
        for i, layer in enumerate(permutation):
            if i == 0:
                model.add(Dense(layer, input_dim=network_input.shape[1], activation='relu'))
            else:
                model.add(Dense(layer, activation='relu'))
            model.add(Dropout(rate=0.5))
        model.add(Dense(len(Counter(y)), activation='softmax'))
        model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        callbacks = [EarlyStopping(monitor='val_accuracy', patience=25, restore_best_weights=True)] #impt
        model.fit(X_train, y_train, epochs=1000, batch_size=32, validation_data=(X_test, y_test), callbacks=callbacks, verbose=0)
        _, accuracy = model.evaluate(X_test, y_test, verbose=0)
        accuracies.append(accuracy)
        print(f"Completed --------- {permutation}, accuracy: {accuracy}, split: {split_count}/5")
        split_count += 1
    average_accuracy = sum(accuracies)/len(accuracies)
    parameters_to_accuracy[permutation] = average_accuracy

Completed --------- (1024, 896, 768), accuracy: 0.6686217188835144, split: 1/5
Completed --------- (1024, 896, 768), accuracy: 0.6607929468154907, split: 2/5
Completed --------- (1024, 896, 768), accuracy: 0.7014706134796143, split: 3/5
Completed --------- (1024, 896, 768), accuracy: 0.6696296334266663, split: 4/5
Completed --------- (1024, 896, 768), accuracy: 0.6313433051109314, split: 5/5
Completed --------- (1024, 896, 640), accuracy: 0.6891495585441589, split: 1/5
Completed --------- (1024, 896, 640), accuracy: 0.6740087866783142, split: 2/5
Completed --------- (1024, 896, 640), accuracy: 0.7014706134796143, split: 3/5
Completed --------- (1024, 896, 640), accuracy: 0.6725925803184509, split: 4/5
Completed --------- (1024, 896, 640), accuracy: 0.6313433051109314, split: 5/5
Completed --------- (1024, 768, 640), accuracy: 0.6818181872367859, split: 1/5
Completed --------- (1024, 768, 640), accuracy: 0.6710719466209412, split: 2/5
Completed --------- (1024, 768, 640), accuracy: 0.70

In [77]:
x = parse_this.split("\n")

In [78]:
x.pop(0)

''

In [84]:
perm_acc = {}
for i in x:
    perm = i.split("(")[1].split(')')[0]
    acc = float(i.split(":")[1].split(",")[0].strip())
    if perm not in perm_acc:
        perm_acc[perm] = [acc]
    else:
        perm_acc[perm].append(acc)
for perm, acc in perm_acc.items():
    print(f"{perm}: {sum(acc)/len(acc)}")
    perm_acc[perm] = sum(acc)/len(acc)

1024, 896, 768: 0.6663716435432434
1024, 896, 640: 0.673712968826294
1024, 768, 640: 0.6692713618278503
896, 768, 640: 0.6711137294769287
1024, 896, 512: 0.6704530596733094
1024, 768, 512: 0.670440399646759
896, 768, 512: 0.6769663333892822
1024, 640, 512: 0.673743999004364
896, 640, 512: 0.674328339099884
768, 640, 512: 0.6672100186347961
1024, 896, 384: 0.6813761830329895
1024, 768, 384: 0.6731601357460022
896, 768, 384: 0.6727999329566956
1024, 640, 384: 0.6678440928459167
896, 640, 384: 0.6754782319068908
768, 640, 384: 0.6748789310455322
1024, 512, 384: 0.6689960598945618
896, 512, 384: 0.6760589957237244
768, 512, 384: 0.6727942943572998
640, 512, 384: 0.6745686292648315
1024, 896, 256: 0.676700484752655
1024, 768, 256: 0.6728493928909302
896, 768, 256: 0.6743555068969727
1024, 640, 256: 0.6742910742759705
896, 640, 256: 0.6749008655548095
768, 640, 256: 0.6692893266677856
1024, 512, 256: 0.6787471294403076
896, 512, 256: 0.6719219923019409
768, 512, 256: 0.6728258013725281
640, 

In [86]:
perm_acc = list(perm_acc.items())
perm_acc.sort(key=lambda x: x[1], reverse=True)

In [88]:
perm_acc[0]

('768, 384, 64', 0.6876203536987304)