# Category prediction using video title and tags

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

import time
import pickle
import collections
import string
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Seed Python's `random` module and NumPy's global RNG so that the shuffling
# done later in the notebook repeats from run to run.
from random import seed
RANDOM = 42  # shared seed; also passed as random_state to scikit-learn calls below
seed(RANDOM)
np.random.seed(RANDOM)

In [4]:
# Load the video metadata table from a CSV file in the working directory.
videos_df = pd.read_csv("videos.csv")

In [5]:
# Replace the raw category ids with consecutive integer labels.
# The fitted encoder stays available as `le` for mapping labels back later.
le = preprocessing.LabelEncoder()
videos_df['category_id'] = le.fit_transform(videos_df['category_id'])

In [6]:
# Map each encoded category id to its display name.
# The first row seen for an id supplies the name and the dict keeps
# first-appearance order, exactly as the former row-by-row loop did; using
# drop_duplicates does it in one vectorised pass instead of iterrows().
_first_per_category = videos_df.drop_duplicates(subset='category_id', keep='first')
category_id_to_word = dict(
    zip(
        _first_per_category['category_id'].tolist(),   # plain Python ints as keys
        _first_per_category['category_name'].tolist(),
    )
)

In [7]:
# Preview the first rows after label encoding.
videos_df.head()

Unnamed: 0,video_id,title,channel_title,category_id,tags,views,likes,dislikes,comment_total,thumbnail_link,date,category_name,likes_log,views_log,dislikes_log,comment_log,like_rate,dislike_rate,comment_rate
0,XpVt6Z1Gjjo,1 YEAR OF VLOGGING -- HOW LOGAN PAUL CHANGED Y...,Logan Paul Vlogs,9,logan paul vlog|logan paul|logan|paul|olympics...,4394029,320053,5931,46245,https://i.ytimg.com/vi/XpVt6Z1Gjjo/default.jpg,13.09,Entertainment,12.676245,15.295757,8.688117,10.74173,7.283816,0.134979,1.052451
1,K4wEI5zhHB0,iPhone X — Introducing iPhone X — Apple,Apple,13,Apple|iPhone 10|iPhone Ten|iPhone|Portrait Lig...,7860119,185853,26679,0,https://i.ytimg.com/vi/K4wEI5zhHB0/default.jpg,13.09,Science & Technology,12.132717,15.877312,10.19167,0.0,2.364506,0.339422,0.0
2,cLdxuaxaQwc,My Response,PewDiePie,7,[none],5845909,576597,39774,170708,https://i.ytimg.com/vi/cLdxuaxaQwc/default.jpg,13.09,People & Blogs,13.264901,15.581253,10.590994,12.047716,9.863257,0.680373,2.920128
3,WYYvHb03Eog,Apple iPhone X first look,The Verge,13,apple iphone x hands on|Apple iPhone X|iPhone ...,2642103,24975,4542,12829,https://i.ytimg.com/vi/WYYvHb03Eog/default.jpg,13.09,Science & Technology,10.125671,14.787086,8.421343,9.459541,0.94527,0.171909,0.48556
4,sjlHnJvXdQs,iPhone X (parody),jacksfilms,8,jacksfilms|parody|parodies|iphone|iphone x|iph...,1168130,96666,568,6666,https://i.ytimg.com/vi/sjlHnJvXdQs/default.jpg,13.09,Comedy,11.479027,13.970916,6.34388,8.804925,8.275278,0.048625,0.570656


In [8]:
# Save the frame with the encoded category ids.
# index=False keeps the row index out of the file: the loaded data already
# carries an "Unnamed: 0" column (apparently left by an earlier index dump),
# and the default index=True would add yet another unnamed column. This also
# matches the other to_csv call in this notebook.
videos_df.to_csv("videos_w_category_name.csv", index=False)

In [9]:
# Show the encoded id -> category name mapping.
category_id_to_word

{9: 'Entertainment',
 13: 'Science & Technology',
 7: 'People & Blogs',
 8: 'Comedy',
 0: 'Film & Animation',
 11: 'Howto & Style',
 2: 'Music',
 10: 'News & Politics',
 4: 'Sports',
 12: 'Education',
 1: 'Autos & Vehicles',
 6: 'Gaming',
 3: 'Pets & Animals',
 14: 'Nonprofits & Activism',
 5: 'Travel & Events',
 15: 'Shows'}

In [10]:
# Non-null count of every column per encoded category (doubles as a class-size check).
videos_df.groupby(['category_id']).agg(['count'])

Unnamed: 0_level_0,video_id,title,channel_title,tags,views,likes,dislikes,comment_total,thumbnail_link,date,category_name,likes_log,views_log,dislikes_log,comment_log,like_rate,dislike_rate,comment_rate
Unnamed: 0_level_1,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count
category_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
0,771,771,771,771,771,771,771,771,771,771,771,771,771,771,771,767,767,767
1,183,183,183,183,183,183,183,183,183,183,183,183,183,183,183,183,183,183
2,2547,2547,2547,2547,2547,2547,2547,2547,2547,2547,2547,2547,2547,2547,2547,2532,2532,2533
3,194,194,194,194,194,194,194,194,194,194,194,194,194,194,194,194,194,194
4,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114
5,94,94,94,94,94,94,94,94,94,94,94,94,94,94,94,94,94,94
6,334,334,334,334,334,334,334,334,334,334,334,334,334,334,334,334,334,334
7,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035
8,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312
9,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114


In [11]:
# Drop the two categories encoded as 14 and 15 ('Nonprofits & Activism' and
# 'Shows' in the id -> name mapping displayed earlier).
videos_df = videos_df[~videos_df['category_id'].isin([14, 15])]

In [12]:
# Re-check the per-category counts after the category filter.
videos_df.groupby(['category_id']).agg(['count'])

Unnamed: 0_level_0,video_id,title,channel_title,tags,views,likes,dislikes,comment_total,thumbnail_link,date,category_name,likes_log,views_log,dislikes_log,comment_log,like_rate,dislike_rate,comment_rate
Unnamed: 0_level_1,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count
category_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
0,771,771,771,771,771,771,771,771,771,771,771,771,771,771,771,767,767,767
1,183,183,183,183,183,183,183,183,183,183,183,183,183,183,183,183,183,183
2,2547,2547,2547,2547,2547,2547,2547,2547,2547,2547,2547,2547,2547,2547,2547,2532,2532,2533
3,194,194,194,194,194,194,194,194,194,194,194,194,194,194,194,194,194,194
4,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114,1114
5,94,94,94,94,94,94,94,94,94,94,94,94,94,94,94,94,94,94
6,334,334,334,334,334,334,334,334,334,334,334,334,334,334,334,334,334,334
7,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035,2035
8,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312,1312
9,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114,3114


In [13]:
# Preview the first rows of the filtered frame.
videos_df.head()

Unnamed: 0,video_id,title,channel_title,category_id,tags,views,likes,dislikes,comment_total,thumbnail_link,date,category_name,likes_log,views_log,dislikes_log,comment_log,like_rate,dislike_rate,comment_rate
0,XpVt6Z1Gjjo,1 YEAR OF VLOGGING -- HOW LOGAN PAUL CHANGED Y...,Logan Paul Vlogs,9,logan paul vlog|logan paul|logan|paul|olympics...,4394029,320053,5931,46245,https://i.ytimg.com/vi/XpVt6Z1Gjjo/default.jpg,13.09,Entertainment,12.676245,15.295757,8.688117,10.74173,7.283816,0.134979,1.052451
1,K4wEI5zhHB0,iPhone X — Introducing iPhone X — Apple,Apple,13,Apple|iPhone 10|iPhone Ten|iPhone|Portrait Lig...,7860119,185853,26679,0,https://i.ytimg.com/vi/K4wEI5zhHB0/default.jpg,13.09,Science & Technology,12.132717,15.877312,10.19167,0.0,2.364506,0.339422,0.0
2,cLdxuaxaQwc,My Response,PewDiePie,7,[none],5845909,576597,39774,170708,https://i.ytimg.com/vi/cLdxuaxaQwc/default.jpg,13.09,People & Blogs,13.264901,15.581253,10.590994,12.047716,9.863257,0.680373,2.920128
3,WYYvHb03Eog,Apple iPhone X first look,The Verge,13,apple iphone x hands on|Apple iPhone X|iPhone ...,2642103,24975,4542,12829,https://i.ytimg.com/vi/WYYvHb03Eog/default.jpg,13.09,Science & Technology,10.125671,14.787086,8.421343,9.459541,0.94527,0.171909,0.48556
4,sjlHnJvXdQs,iPhone X (parody),jacksfilms,8,jacksfilms|parody|parodies|iphone|iphone x|iph...,1168130,96666,568,6666,https://i.ytimg.com/vi/sjlHnJvXdQs/default.jpg,13.09,Comedy,11.479027,13.970916,6.34388,8.804925,8.275278,0.048625,0.570656


In [14]:
def clean_and_join_title_and_tags(row, to_stem):
    """Build one cleaned text string from a video's title and tags.

    The title and the '|'-separated tags are joined with spaces, tokenized
    with ``word_tokenize``, lower-cased, and reduced to purely alphabetic
    tokens that are not in the module-level ``stop_words`` list.

    Parameters
    ----------
    row : mapping with 'title' and 'tags' entries (e.g. a DataFrame row)
        A missing / non-string title or tags value is treated as empty text
        instead of raising a TypeError during string concatenation.
    to_stem : bool
        When true, every remaining word is stemmed with ``PorterStemmer``.

    Returns
    -------
    str
        The surviving words joined by single spaces ('' if none survive).
    """
    # NOTE(review): rows whose tags are the placeholder "[none]" (visible in
    # the data preview) presumably still contribute the word "none" — confirm
    # whether that placeholder should be dropped before vectorising.
    title = row['title'] if isinstance(row['title'], str) else ''
    tags = row['tags'] if isinstance(row['tags'], str) else ''
    # join title and tags
    text = title + ' ' + " ".join(tags.split("|"))
    # a set gives constant-time membership tests; the list was scanned per token
    stop_set = set(stop_words)
    # tokenize, lower-case, keep alphabetic tokens that are not stop words
    words = []
    for token in word_tokenize(text):
        word = token.lower()
        if word.isalpha() and word not in stop_set:
            words.append(word)
    # stemming words
    if to_stem:
        stem = PorterStemmer().stem
        words = [stem(word) for word in words]
    return " ".join(words)

In [15]:
# Stemming is switched off for this run.
to_stem = False
# One cleaned text string per video, computed row by row.
videos_df['title_and_tags'] = videos_df.apply(
    clean_and_join_title_and_tags, axis=1, args=(to_stem,)
)

In [16]:
# Inspect the cleaned text of the row whose index label is 0.
videos_df.loc[0, 'title_and_tags']

'year vlogging logan paul changed youtube forever logan paul vlog logan paul logan paul olympics logan paul youtube vlog daily comedy hollywood parrot maverick bird maverick clothes diamond play button logan paul diamond play button subscribers logan paul year vlogging year vlog dwarf mamba play button logan paul history youtube history plaque youtube button diamond button logang logang life'

In [17]:
# Keep only the first video for each distinct cleaned text.
videos_df = videos_df.drop_duplicates(subset=['title_and_tags'])

In [18]:
# Number of videos left after de-duplication.
videos_df.shape[0]

3388

In [19]:
# Save the filtered, de-duplicated frame (including the cleaned text column)
# without writing the row index.
videos_df.to_csv("14_category_videos_w_clean_title_tags.csv", index=False)

In [20]:
# Vectorise the cleaned texts with TF-IDF using the default settings.
# Note: despite its name, `counts` holds the TfidfVectorizer output
# (TF-IDF weights), not raw term counts.
vectorizer = TfidfVectorizer()
counts = vectorizer.fit_transform(videos_df['title_and_tags'].values)
# (n_videos, vocabulary_size)
counts.shape

(3388, 16376)

In [21]:
# Print the learned vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)



In [22]:
# Feature matrix: the TF-IDF output computed above.
X = counts
# Targets: the encoded category ids as a NumPy array.
y = videos_df['category_id'].to_numpy()

In [23]:
def unison_shuffled_copies(a, b):
    """Return ``a`` and ``b`` reordered by one shared random permutation.

    Both inputs must have the same length along their first axis, otherwise
    an AssertionError is raised. The permutation is drawn from NumPy's global
    random state and applied to each input by integer-array indexing, so
    row ``i`` of the first result still corresponds to row ``i`` of the second.
    """
    n_rows = a.shape[0]
    assert n_rows == b.shape[0]
    order = np.random.permutation(n_rows)
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b
# Shuffle features and labels with the same permutation so rows stay aligned.
X, y = unison_shuffled_copies(X, y)
# NOTE(review): X comes from TfidfVectorizer and is presumably a SciPy sparse
# matrix; np.nan_to_num may hand such an object back unchanged instead of
# cleaning its stored values — confirm whether this line has any effect.
X = np.nan_to_num(X)

#### Check the baseline accuracy of several models to see whether the task is feasible

In [22]:
# Candidate models for the baseline comparison, keyed by class name.
# All use default settings except SVC, which enables probability estimates.
classifiers = dict(
    LogisticRegression=LogisticRegression(),
    MultinomialNB=MultinomialNB(),
    RandomForestClassifier=RandomForestClassifier(),
    SVC=SVC(probability=True),
    KNeighborsClassifier=KNeighborsClassifier(),
    DecisionTreeClassifier=DecisionTreeClassifier(),
)

In [72]:
# Hold out 20% of the rows, then fit every candidate on the remaining 80%
# and record its accuracy on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM
)
accuracies = {}
for classifier, model in classifiers.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracies[classifier] = accuracy_score(y_test, y_pred)

In [86]:
# Report each model's baseline accuracy in aligned columns, then the mean.
for classifier, accuracy in accuracies.items():
    label = f"{classifier} has baseline accuracy of:"
    print("{0:<50}{1:<25}\n".format(label, accuracy))
print(f"\nThe average baseline accuracy is {sum(accuracies.values())/len(accuracies)}")

MultinomialNB has baseline accuracy of:           0.532448377581121        

RandomForestClassifier has baseline accuracy of:  0.6710914454277286       

SVC has baseline accuracy of:                     0.17551622418879056      

KNeighborsClassifier has baseline accuracy of:    0.6032448377581121       

DecisionTreeClassifier has baseline accuracy of:  0.6150442477876106       


The average baseline accuracy is 0.5194690265486726


In [109]:
# Accuracy of uniform random guessing: one over the number of distinct classes.
1 / len(np.unique(y))

0.07142857142857142

Uniform random guessing over the 14 remaining categories would give an accuracy of about 7%.

### Tuning
#### Note: you won't always get the same best parameters, because a different random seed or a different computer can give different results :(

#### Tuning KNN
Best: 0.649941 using {'algorithm': 'auto', 'n_neighbors': 6, 'weights': 'distance'}

In [41]:
# Grid-search KNN over the neighbourhood size with 5-fold cross-validation.
# The search is fitted on all of X and y, so best_score_ is a mean
# cross-validation score rather than a held-out test score.
# Params
n_neighbors = range(1, 20)
algorithm = ['auto']
weights = ['distance']

param_grid = dict(n_neighbors=n_neighbors, algorithm=algorithm, weights=weights)

model = KNeighborsClassifier()
CV = GridSearchCV(estimator=model, param_grid=param_grid, cv = 5, n_jobs=-1, verbose=3)
start_time = time.time()
result = CV.fit(X, y)
elapsed = time.time() - start_time  # difference of time.time() values is in seconds

# Summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
# Fix: the elapsed value is in seconds, so label it 's' instead of 'ms'.
print("Execution time: " + str(elapsed) + ' s')

Fitting 5 folds for each of 19 candidates, totalling 95 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best: 0.649941 using {'algorithm': 'auto', 'n_neighbors': 6, 'weights': 'distance'}
Execution time: 1.6805050373077393 ms


[Parallel(n_jobs=-1)]: Done  95 out of  95 | elapsed:    1.6s finished


#### Tuning Decision Tree
The baseline (default settings) already seems to be the best, I think.

Best: 0.603601 using {'random_state': 42}

In [95]:
# The grid holds a single candidate (only random_state is fixed), so this is
# effectively a 5-fold cross-validation of the default decision tree.
# The search is fitted on all of X and y, so best_score_ is a mean
# cross-validation score rather than a held-out test score.
# Params
random_state = [RANDOM]

param_grid = dict(random_state=random_state)

model = DecisionTreeClassifier()
CV = GridSearchCV(estimator=model, param_grid=param_grid, cv = 5, n_jobs=-1, verbose=3)
start_time = time.time()
result = CV.fit(X, y)
elapsed = time.time() - start_time  # difference of time.time() values is in seconds

# Summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
# Fix: the elapsed value is in seconds, so label it 's' instead of 'ms'.
print("Execution time: " + str(elapsed) + ' s')

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.5s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.1s finished


Best: 0.603601 using {'random_state': 42}
Execution time: 1.9198668003082275 ms


#### Tuning Random Forest
Best: 0.688312 using {'n_estimators': 87, 'random_state': 42}

In [45]:
# Grid-search the number of trees (81..100) with 5-fold cross-validation.
# The search is fitted on all of X and y, so best_score_ is a mean
# cross-validation score rather than a held-out test score.
# Params
n_estimators = range(81, 101)
random_state = [RANDOM]

param_grid = dict(n_estimators=n_estimators, random_state=random_state)

model = RandomForestClassifier()
CV = GridSearchCV(estimator=model, param_grid=param_grid, cv = 5, n_jobs=-1, verbose=3)
start_time = time.time()
result = CV.fit(X, y)
elapsed = time.time() - start_time  # difference of time.time() values is in seconds

# Summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
# Fix: the elapsed value is in seconds, so label it 's' instead of 'ms'.
print("Execution time: " + str(elapsed) + ' s')

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   43.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.8min finished


Best: 0.688312 using {'n_estimators': 87, 'random_state': 42}
Execution time: 108.98533844947815 ms


#### Tuning Logistic Regression
Best: 0.703365 using {'C': 50, 'penalty': 'l2', 'random_state': 42}

In [62]:
# Grid-search the inverse regularisation strength C (40..60) for an
# L2-penalised logistic regression with 5-fold cross-validation.
# The search is fitted on all of X and y, so best_score_ is a mean
# cross-validation score rather than a held-out test score.
# Params
penalty = ['l2']
C = range(40, 61)
random_state = [RANDOM]

param_grid = dict(penalty=penalty, C=C, random_state=random_state)

model = LogisticRegression()
CV = GridSearchCV(estimator=model, param_grid=param_grid, cv = 5, n_jobs=-1, verbose=3)
start_time = time.time()
result = CV.fit(X, y)
elapsed = time.time() - start_time  # difference of time.time() values is in seconds

# Summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
# Fix: the elapsed value is in seconds, so label it 's' instead of 'ms'.
print("Execution time: " + str(elapsed) + ' s')

Fitting 5 folds for each of 21 candidates, totalling 105 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:   10.1s finished


Best: 0.703365 using {'C': 50, 'penalty': 'l2', 'random_state': 42}
Execution time: 10.411162614822388 ms


#### Tuning Support Vector Machine
Best: 0.693329 using {'C': 11.1, 'gamma': 0.09, 'kernel': 'rbf', 'random_state': 42}

In [103]:
# Grid-search C and gamma for an RBF-kernel SVM (5 x 5 candidates) with
# 5-fold cross-validation.
# The search is fitted on all of X and y, so best_score_ is a mean
# cross-validation score rather than a held-out test score.
# Params
C = [10.7, 10.8, 10.9, 11.0, 11.1]
gamma = [0.07, 0.08, 0.09, 0.10, 0.11]
kernel = ['rbf']
random_state = [RANDOM]
param_grid = dict(C=C, gamma=gamma, kernel=kernel, random_state=random_state)

model = SVC()
CV = GridSearchCV(estimator=model, param_grid=param_grid, cv = 5, n_jobs=-1, verbose=3)
start_time = time.time()
result = CV.fit(X, y)
elapsed = time.time() - start_time  # difference of time.time() values is in seconds

# Summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
# Fix: the elapsed value is in seconds, so label it 's' instead of 'ms'.
print("Execution time: " + str(elapsed) + ' s')

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:  1.9min finished


Best: 0.693329 using {'C': 11.1, 'gamma': 0.09, 'kernel': 'rbf', 'random_state': 42}
Execution time: 116.46165132522583 ms


#### Tuning Naive Bayes
Best: 0.709563 using {'alpha': 0.12, 'fit_prior': False}

In [24]:
# Grid-search the smoothing strength alpha (21 values from 0.0 to 0.2) and
# fit_prior for multinomial naive Bayes with 5-fold cross-validation.
# The search is fitted on all of X and y, so best_score_ is a mean
# cross-validation score rather than a held-out test score.
# NOTE(review): the grid includes alpha=0.0 (no smoothing); scikit-learn may
# warn about or adjust that value — confirm it is intended.
# Params
alpha = np.linspace(0.0, 0.2, num=21)
fit_prior = [True, False]
param_grid = dict(alpha=alpha, fit_prior=fit_prior)

model = MultinomialNB()
CV = GridSearchCV(estimator=model, param_grid=param_grid, cv = 5, n_jobs=-1, verbose=3)
start_time = time.time()
result = CV.fit(X, y)
elapsed = time.time() - start_time  # difference of time.time() values is in seconds

# Summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
# Fix: the elapsed value is in seconds, so label it 's' instead of 'ms'.
print("Execution time: " + str(elapsed) + ' s')

Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.4s


Best: 0.708087 using {'alpha': 0.19, 'fit_prior': False}
Execution time: 2.180708885192871 ms


[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:    2.1s finished
