# Pattern Recognition in Daily Top Trending YouTube Videos

## NLP Classification Sub-Experiment

## Setup

In [0]:
import pandas as pd
import re
from google.colab import drive
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import GridSearchCV

**Load Data From Google Drive**

In [2]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Read-in CSV & JSON**

In [0]:
videos = pd.read_csv("/content/drive/My Drive/cs6140 project/data/USvideos.csv")
videos_categories = pd.read_json("/content/drive/My Drive/cs6140 project/data/US_category_id.json")

This block of code links the csv to json categories

In [0]:
# source: https://www.kaggle.com/skalskip/youtube-data-exploration-and-plotly-visualization
categories = {category['id']: category['snippet']['title'] for category in videos_categories['items']}
videos.insert(4, 'category', videos['category_id'].astype(str).map(categories))

In [5]:
videos.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,People & Blogs,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,Entertainment,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,Comedy,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,Entertainment,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,Entertainment,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...


**Remove Duplicates**

The outcome of this experiment will be heavily impacted by duplicate videos. For example, if there are duplicates, splitting the dataset into test and training data might result in the same video appearing in both sets.

In [0]:
videos.drop_duplicates(subset="title", keep="last", inplace=True)

**Creating New Attributes**

In [0]:
"""
function: get_tags
param(s): text, a string
returns: a list of tokenized strings
"""
def get_tags(text):
  # split text into list of words
  data = re.split("\"| |\|", str(text))
  return " ".join(data)

In [0]:
# a tfidf vectorizer takes care of this later on
# videos['tags'] = videos['tags'].apply(get_tags)

In [0]:
# rate of likes per view
videos["likes_per_view"] = videos["likes"]/videos["views"]
# rate of dislikes per view
videos["dislikes_per_view"] = videos["dislikes"]/videos["views"]
# rate of comments per view
videos["comments_per_view"] = videos["comment_count"]/videos["views"]

In [0]:
videos["total_likes_dislikes"] = videos["likes"] + videos["dislikes"]
videos["total_likes_dislikes_per_view"] = videos["total_likes_dislikes"]/videos["views"]
videos["likes_percentage"] = videos["likes"]/videos["total_likes_dislikes"]
videos["dislikes_percentage"] =videos["dislikes"]/videos["total_likes_dislikes"]

In [11]:
videos['category'].value_counts()

Entertainment            1644
Music                     821
Howto & Style             601
Comedy                    548
News & Politics           510
People & Blogs            502
Sports                    455
Science & Technology      391
Film & Animation          322
Education                 257
Pets & Animals            144
Gaming                    104
Autos & Vehicles           73
Travel & Events            64
Nonprofits & Activism      15
Shows                       4
Name: category, dtype: int64

We can already see some issues - let's finish data preprocessing before tackling this.

In [0]:
# source https://www.kaggle.com/skalskip/youtube-data-exploration-and-plotly-visualization

videos['trending_date'] = pd.to_datetime(videos['trending_date'], format='%y.%d.%m').dt.date
reformatted_time = pd.to_datetime(videos['publish_time'], format='%Y-%m-%dT%H:%M:%S.%fZ')

videos['publish_date'] = reformatted_time.dt.date
videos['publish_time'] = reformatted_time.dt.time
videos['publish_hour'] = reformatted_time.dt.hour
videos['publish_month'] = reformatted_time.dt.month
videos['publish_year'] = reformatted_time.dt.year

In [13]:
videos['publish_year'].value_counts()

2018    4209
2017    2176
2013      13
2015      10
2016       9
2011       8
2012       8
2014       7
2010       6
2009       5
2008       3
2006       1
Name: publish_year, dtype: int64

For a later classification sub=experiment, I will classify videos by their year using natural language processing. To accomplish this I will use two labels: 2018 and pre-2018.

In [0]:
"""
function: new_year_labels
params: year, an integer
returns: an integer (binary value [2017,2018])
does: converts publish_year column to 2018 and pre-2018 values
"""
def new_year_labels(year):
  if int(year) == 2018:
    return '2018'
  return '2017'

In [0]:
# assign new values to the year_classes column
videos['year_classes'] = videos['publish_year'].apply(new_year_labels)

In [16]:
videos['year_classes'].value_counts()

2018    4209
2017    2246
Name: year_classes, dtype: int64

**Testing and Metrics Function**

In [0]:
"""
function: test_model
params: clf, a function; y_pred, a list; test_labels, a list
returns: nothing
does: prints out precision, recall, f-score, and ROC AUC
"""
def test_model(clf, y_pred, test_labels):
  metrics = precision_recall_fscore_support(y_true=test_labels, 
                                            y_pred=y_pred, 
                                            average='weighted')
  print('Test Precision: %.4f' %metrics[0])
  print('Test Recall: %.4f' %metrics[1])
  print('Test F-Score: : %.4f' %metrics[2])

In [0]:
# function to keep output less verbose and ignore deprication and other warnings
# source: https://stackoverflow.com/questions/32612180/eliminating-warnings-from-scikit-learn
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

`FeatureUnion` combines two `TfidfVectorizers` to clean text for our model

In [0]:
vec_W = TfidfVectorizer(lowercase=True, 
                        analyzer='word',
                        stop_words=None, 
                        ngram_range = (1,1), 
                        max_df=1.0, min_df=1, 
                        max_features=None, 
                        norm = 'l2')
vec_C = TfidfVectorizer(lowercase=True, 
                        analyzer='char', 
                        stop_words=None, 
                        ngram_range = (1,1), 
                        max_df=1.0, min_df=1, 
                        max_features=None, 
                        norm = 'l2')

combined_features = FeatureUnion([('word', vec_W), ('char', vec_C)])

## Classify Video Category

In [0]:
# create training and testing sets
train_dataset = videos.sample(frac=0.8,random_state=12345)
test_dataset = videos.drop(train_dataset.index)

In [0]:
# clean data
train_dataset = train_dataset.dropna()
test_dataset = test_dataset.dropna()

In [0]:
# get labels
train_labels = train_dataset.pop('category')
test_labels = test_dataset.pop('category')

In [23]:
# get lengths of train and test datasets
print(train_dataset.shape[0])
print(test_dataset.shape[0])

5051
1269


Combine text columns into variables to be passed to models


In [0]:
combined_train = train_dataset['title'] + train_dataset['description'] + train_dataset['channel_title'] + train_dataset['tags']
combined_test = test_dataset['title'] + test_dataset['description'] + test_dataset['channel_title'] + test_dataset['tags']

**MultinomialNB (strawman)**

For our strawman model, we will use MultinomialNB to predict `cateogory` label for each example

In [25]:
clf = Pipeline([
    ('features', combined_features),
    ('clf', MultinomialNB())
    ])

parameters = {'clf__alpha': (0, 1, 2),
              'clf__fit_prior': (True, False)}

clf = GridSearchCV(clf, parameters, cv=3)

clf = clf.fit(combined_train, train_labels)
y_pred = clf.predict(combined_test)
test_model(clf, y_pred, test_labels)

Test Precision: 0.7936
Test Recall: 0.7597
Test F-Score: : 0.7520


**LogisticRegressionCV**

In [26]:
clf = Pipeline([
    ('features', combined_features),
    ('clf', LogisticRegressionCV(cv=3, solver='newton-cg', multi_class='multinomial'))
    ])

clf = clf.fit(combined_train, train_labels)
y_pred = clf.predict(combined_test)
test_model(clf, y_pred, test_labels)

Test Precision: 0.8027
Test Recall: 0.8054
Test F-Score: : 0.7994


**Aggregate Labels**

Even though we got an accuracy of ~80%, that's still not good enough. We can do better. One of the problems we are running into is that there are not enough examples for each of the caterogies, even after smoothing is applied. Here are the values of our labels:

In [27]:
train_labels.value_counts()

Entertainment            1277
Music                     639
Howto & Style             476
Comedy                    426
News & Politics           404
People & Blogs            386
Sports                    353
Science & Technology      308
Film & Animation          260
Education                 204
Pets & Animals            117
Gaming                     83
Autos & Vehicles           53
Travel & Events            52
Nonprofits & Activism      10
Shows                       3
Name: category, dtype: int64

Some of these values are high in number, while others are pretty low (less than 100 examples). We do not have enough examples, even after smoothing is performed. There's simply too many categories to perform classification. We need to reduce the number of classes to get a better result. Let us aggregate attributes.

**New Labels**

I will redefine the given labels as a binary classification problem using the labels `Entertainment` and `Informational`. Given the labels, it is easy to see that these two labels encompass the entirety of the classes.

`Entertainment` = `Entertainment` + `Music` + `Comedy` + `Film & Animation` + `Gaming` + `Shows` + `Pets & Animals`

`Informational` = `News & Politics` + `Nonpofits & Activism` + `Education` + `Travel & Events` + `Science & Technology` + `Autos & Vehicles` + `Howto & Style` + `People & Blogs` + `Sports`

**Create New Labels Function**

In [0]:
"""
function: create_new_labels
params: label, a string
returns: a new label
does: aggregates labels for the YouTube top daily trending videos dataset into 
      new categories
"""
def create_new_labels(label):
    if str(label) in ['Music', 'Comedy', 'Gaming', 'Shows', 'Film & Animation', 'Pets & Animals', 'Entertainment']:
        return 'Other Entertainment'
    elif str(label) in ['People & Blogs', 'Education', 'Nonprofits & Activism', 'Travel & Events', 
                        'Autos & Vehicles', 'Science & Technology','Howto & Style', 'News & Politics', 'Sports']:
        return 'Informational'

In [0]:
train_labels = train_labels.apply(create_new_labels)
test_labels = test_labels.apply(create_new_labels)

**BernoulliNB with New Labels**

We need to verify that our strawman model improves with this binary distirbution of the category label. Note that we are now using a bernoulli distribution instead of multinomial.

In [30]:
train_labels.value_counts()

Other Entertainment    2805
Informational          2246
Name: category, dtype: int64

In [31]:
clf = Pipeline([
    ('features', combined_features),
    ('clf', BernoulliNB())
    ])

parameters = {'clf__alpha': (0.5, 0.5, 1),
              'clf__fit_prior': (True, False)}

clf = GridSearchCV(clf, parameters, cv=3)

clf = clf.fit(combined_train, train_labels)
y_pred = clf.predict(combined_test)
test_model(clf, y_pred, test_labels)

Test Precision: 0.9040
Test Recall: 0.9039
Test F-Score: : 0.9035


**LogisticRegressionCV with New Labels**

Next, we test a model. We choose `LogisticRegressionCV`, this time we can increase cross validation to 10 fold given that we have more instances of each class label.

In [32]:
clf = Pipeline([
    ('features', combined_features),
    ('clf', LogisticRegressionCV(cv=3))
    ])

clf = clf.fit(combined_train, train_labels)
y_pred = clf.predict(combined_test)
test_model(clf, y_pred, test_labels)

Test Precision: 0.9077
Test Recall: 0.9078
Test F-Score: : 0.9077


**Conclusion**

We can now see that given an aggregation of labels to a binary distribution, we are can easily predict the type of video it is. 

## Classify Video Publishing Year

In [0]:
# create training and testing sets
train_dataset = videos.sample(frac=0.8,random_state=42)
test_dataset = videos.drop(train_dataset.index)

In [0]:
# clean the data
train_dataset = train_dataset.dropna()
test_dataset = test_dataset.dropna()

In [0]:
# get labels
train_labels = train_dataset.pop('year_classes')
test_labels = test_dataset.pop('year_classes')

In [36]:
print(train_dataset.shape[0])
print(test_dataset.shape[0])

5059
1261


Combine text columns into variables to be passed to models

In [0]:
combined_train = train_dataset['title'] + train_dataset['description'] + train_dataset['channel_title'] + train_dataset['tags']
combined_test = test_dataset['title'] + test_dataset['description'] + test_dataset['channel_title'] + test_dataset['tags']

**BernoulliNB (strawman model)**

In [38]:
clf = Pipeline([
    ('features', combined_features),
    ('clf', BernoulliNB())
    ])

parameters = {'clf__alpha': (0.5, 0.5, 1),
              'clf__fit_prior': (True, False)}

clf = GridSearchCV(clf, parameters, cv=3)

clf = clf.fit(combined_train, train_labels)
y_pred = clf.predict(combined_test)
test_model(clf, y_pred, test_labels)

Test Precision: 0.6987
Test Recall: 0.7105
Test F-Score: : 0.7020


**LogisticRegressionCV (n=10)**

In [39]:
clf = Pipeline([
    ('features', combined_features),
    ('clf', LogisticRegressionCV(cv=3, solver='liblinear'))
])

clf = clf.fit(combined_train, train_labels)
y_pred = clf.predict(combined_test)
test_model(clf, y_pred, test_labels)

Test Precision: 0.7222
Test Recall: 0.7328
Test F-Score: : 0.7243


**MLPClassifier**

In [40]:
clf = Pipeline([
    ('features', combined_features),
    ('clf', MLPClassifier(hidden_layer_sizes=(100, 150, 2), max_iter=400, 
                          activation='relu', random_state=42, 
                          validation_fraction=0.3, early_stopping=True, 
                          warm_start=True, solver='adam'))
])

clf = clf.fit(combined_train, train_labels)
y_pred = clf.predict(combined_test)
test_model(clf, y_pred, test_labels)

Test Precision: 0.6991
Test Recall: 0.7153
Test F-Score: : 0.6755
