# Characterising the evolution of the user interests on YT

This notebook covers the whole pipeline of the project.

Here is the table of contents of the main tasks:
- **Data Processing**
- **Preparing Pyspark Model**
- **Topic modelling**
- **Results: Topic Coherence**


### Imports

In [71]:
import collections
import fasttext
import gzip
import json
import nltk
import os
import pickle
import scipy.sparse
import sys

import numpy as np
import pandas as pd
import zstandard as zstd

from collections import Counter
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from scipy.sparse import dok_matrix

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, StructField, StructType
from pyspark.ml.clustering import LDA, LDAModel, LocalLDAModel
from pyspark.ml.linalg import Vectors, SparseVector

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/olam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Data Processing 

In this section, we select the data used for topic modelling as follows:

- videos with at least 10'000 views
- videos from channels with at least 100'000 subscribers

To do so, we proceed as follows:

- a first pass over the whole dataset builds the vocabulary and records the indices of the relevant videos
- a second pass over the whole dataset constructs an NxM sparse matrix, where N is the number of videos and M is the number of words in the vocabulary

In [49]:
# BUILD THE SET OF RELEVANT CHANNEL IDS

# Prefix stripped from every channel link to recover the bare channel id
channel_url_prefix = 'http://www.youtube.com/channel/'


def _link_to_channel_id(link):
    '''Return the channel link with the YouTube channel URL prefix removed.'''
    return link.replace(channel_url_prefix, '')


df_channelcrawler = pd.read_csv('/dlabdata1/youtube_large/channelcrawler.csv')
df_channelcrawler['channel_id'] = df_channelcrawler['link'].apply(
    _link_to_channel_id)

# Only channels with at least 100'000 subscribers are considered relevant
has_enough_subscribers = df_channelcrawler['subscribers'] >= 100000
df_channelcrawler = df_channelcrawler[has_enough_subscribers]

# Kept as a set because membership is tested once per video later on
set_relevant_channels = set(df_channelcrawler['channel_id'])

print('There are ' + str(len(set_relevant_channels)) + ' relevant channels.')

There are 50456 relevant channels.


In [50]:
# Setting NLP pre-processing features
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
s_stemmer = SnowballStemmer(language='english')

In [48]:
def check_10000_views(video, min_views=10000):
    '''Tell whether a video reaches the minimum number of views.

    Parameters
    ----------
    video : dict
        Video metadata; the view count is read from the key 'view_count'.
    min_views : int, optional
        Minimum number of views required (default 10000).

    Returns
    -------
    bool
        True when the view count is at least `min_views`. False when the key
        is missing or when its value cannot be compared with a number
        (for instance None).
    '''
    try:
        return video['view_count'] >= min_views
    except (KeyError, TypeError):
        # KeyError: no 'view_count' entry.
        # TypeError: the value (or `video` itself) is None or non numeric.
        return False

In [68]:
def check_channel(video, relevant_channels=None):
    '''Tell whether a video belongs to one of the relevant channels.

    Parameters
    ----------
    video : dict
        Video metadata; the channel is read from the key 'channel_id'.
    relevant_channels : collection, optional
        Channel ids to test against. When omitted, the notebook-level
        `set_relevant_channels` is used.

    Returns
    -------
    bool
        True when the channel id is in the collection. False when the key is
        missing or the value cannot be looked up (for instance unhashable).
    '''
    if relevant_channels is None:
        # Resolved at call time; a missing global now fails loudly instead of
        # silently marking every video as irrelevant.
        relevant_channels = set_relevant_channels

    try:
        return video['channel_id'] in relevant_channels
    except (KeyError, TypeError):
        return False

In [51]:
def isEnglishAlpha(s):
    '''Tell whether the string `s` contains only ASCII characters.

    Returns True for ASCII-only strings (including the empty string) and
    False as soon as one character cannot be encoded as ASCII.
    '''
    try:
        s.encode('ascii')
    except UnicodeError:
        # UnicodeError also covers characters that cannot be encoded at all
        # (e.g. lone surrogates), which must be rejected rather than crash.
        return False
    return True

In [52]:
def get_freq_tokens_per_video(video):
    '''Count the stemmed tokens found in the title and the tags of a video.

    The title and the tags are lower-cased and tokenized separately, stop
    words are dropped, and both token lists are concatenated (duplicates are
    kept on purpose). Tokens shorter than 3 characters, purely numeric tokens
    and tokens containing non-ASCII characters are discarded; the remaining
    tokens are stemmed.

    Parameters
    ----------
    video : dict
        Video metadata; text is read from the keys 'title' and 'tags'.
        A missing or None field is treated as an empty text.

    Returns
    -------
    collections.Counter
        Number of occurrences of each stemmed token.
    '''
    tokens_per_video = []
    for field in ('title', 'tags'):
        # A missing or null field contributes no token instead of crashing
        text = video.get(field) or ''
        tokens_per_video.extend(
            w for w in tokenizer.tokenize(text.lower()) if w not in stop_words)

    # Drop short, numeric and non-ASCII tokens, then stem what is left
    stemmed_tokens_per_video = [
        s_stemmer.stem(token)
        for token in tokens_per_video
        if len(token) >= 3 and not token.isnumeric() and isEnglishAlpha(token)
    ]

    return collections.Counter(stemmed_tokens_per_video)

### First pass 
The first pass on the dataset will allow us to recover relevant videos and list of tokens

In [69]:
# Line indices (in the metadata file) of the relevant videos, i.e. videos that
# belong to a relevant channel and have enough views. Reused by the later
# passes over the same file.
set_relevant_vid = set()

# Vocabulary of stemmed tokens. Built as a set so that already seen tokens are
# detected efficiently; it is turned into an indexed mapping later on.
set_stemmed_tokens = set()

# NOTE(review): the recorded output of this cell is a zlib "Error -3 while
# decompressing data", which suggests the archive may be truncated or
# corrupted - verify the input file before relying on the results.
with gzip.open('/dlabdata1/youtube_large/yt_metadata_en_dd.jsonl.gz', 'rb') as f:

    for i, line in enumerate(f):

        try:
            # line holds one JSON document as bytes, video is the parsed dict
            video = json.loads(line)
        except ValueError:
            # Malformed line (invalid JSON or undecodable bytes): replace it
            # by a placeholder that can never match a relevant channel
            video = {'channel_id': None}

        if check_channel(video) and check_10000_views(video):

            tokens_per_video = get_freq_tokens_per_video(video)

            set_relevant_vid.add(i)
            # Updating a set from a Counter adds its keys, i.e. the tokens
            set_stemmed_tokens.update(tokens_per_video)

        if i % 1000000 == 0 and i != 0:
            print('Processed ' + str(i) + ' videos...')

error: Error -3 while decompressing data: invalid code lengths set

In [None]:
# Save the results of the first pass so that they can be reloaded later.
# The `with` statement closes each file on exit, so no explicit close is needed.

with open('/dlabdata1/youtube_large/olam/data/final_res/set_relevant_vid.pickle', 'wb') as f:
    pickle.dump(set_relevant_vid, f)

with open('/dlabdata1/youtube_large/olam/data/final_res/set_stemmed_tokens.pickle', 'wb') as f:
    pickle.dump(set_stemmed_tokens, f)

### Second pass 
The second pass on the dataset will allow us to create a NxM sparse matrix, where N is the number of videos and M is the number of words in the vocabulary

Notes:
- To keep the memory usage low, we fill sparse matrices of at most 1'000'000 rows each and save them in the csr format
- The saved sparse matrices are then stacked together to obtain the final sparse matrix

In [73]:
def remove_zero_rows(M):
    '''Return the sparse matrix M without the rows that have no stored entry.

    M is expected to expose `indptr` the way a csr matrix does.
    '''
    # Consecutive indptr values delimit the stored entries of each row
    entries_per_row = M.indptr[1:] - M.indptr[:-1]
    keep_row = entries_per_row != 0
    return M[keep_row]

In [82]:
def fill_underlying_dict(freq_tokens_per_video, word2id, i_vid, chunk_size=1000000):
    '''Build the sparse-matrix entries of one video.

    Parameters
    ----------
    freq_tokens_per_video : mapping
        Token -> number of occurrences in the video.
    word2id : mapping
        Token -> column index in the sparse matrix (one column per token).
    i_vid : int
        Running index of the video among the relevant videos.
    chunk_size : int, optional
        Number of rows of one matrix chunk (default 1000000); the row index
        of the entries is `i_vid % chunk_size`.

    Returns
    -------
    dict
        (row, column) -> number of occurrences, for every token of the video
        that is present in `word2id`. Unknown tokens are skipped.
    '''
    row = i_vid % chunk_size

    dict_freq_tokens_for_sparse_matrix = {}

    for token, freq in freq_tokens_per_video.items():
        # Tokens outside the vocabulary have no column and are ignored
        if token not in word2id:
            continue
        dict_freq_tokens_for_sparse_matrix[(row, word2id[token])] = freq

    return dict_freq_tokens_for_sparse_matrix

In [95]:
# Dimensions of the sparse matrix
size_of_tokens_dict = len(set_stemmed_tokens)
number_of_vid = len(set_relevant_vid)

# Map every token to its column index. The tokens are sorted first because
# the iteration order of a set of strings is not guaranteed to be the same
# from one interpreter session to the next; sorting keeps the column indices
# consistent with matrices saved by an earlier run.
word2id = {token: i for i, token in enumerate(sorted(set_stemmed_tokens))}

# Reverse mapping: column index -> token
id2word = {i: token for token, i in word2id.items()}

# Create mini sparse matrix (one chunk of 1'000'000 videos)
S = dok_matrix((1000000, size_of_tokens_dict), dtype=np.uint8)

In [None]:
# Number of videos (rows) per saved chunk. It must stay equal to the modulus
# that fill_underlying_dict applies to the row index.
chunk_size = 1000000
matrices_dir = '/dlabdata1/youtube_large/olam/data/final_res/matrices/'


def _build_chunk(rows, cols, counts, n_rows):
    '''Assemble a csr matrix of shape (n_rows, size_of_tokens_dict) from
    (row, column, count) triplets.'''
    # Saturate at 255 so that the cast to uint8 can never wrap around
    data = np.minimum(np.asarray(counts, dtype=np.int64), 255).astype(np.uint8)
    coords = (np.asarray(rows, dtype=np.int64), np.asarray(cols, dtype=np.int64))
    return scipy.sparse.csr_matrix(
        (data, coords), shape=(n_rows, size_of_tokens_dict), dtype=np.uint8)


# Running index of the current video among the relevant videos
i_vid = 0

# Entries of the chunk being filled, stored as parallel lists. This replaces
# the previous dict.update(S, ...) on a dok_matrix, which only works while
# dok_matrix is a dict subclass and failed once S had been converted to csr.
rows, cols, counts = [], [], []

# Reading the file
with gzip.open('/dlabdata1/youtube_large/yt_metadata_en_dd.jsonl.gz', 'rb') as f:

    for i, line in enumerate(f):

        if i in set_relevant_vid:

            video = json.loads(line)

            # Get the tokens for each video and their number of occurrences
            freq_tokens_per_video = get_freq_tokens_per_video(video)

            # Entries of this video, keyed by (row in chunk, token column)
            dict_freq_tokens_for_sparse_matrix = fill_underlying_dict(
                freq_tokens_per_video, word2id, i_vid)

            for (row, col), count in dict_freq_tokens_for_sparse_matrix.items():
                rows.append(row)
                cols.append(col)
                counts.append(count)

            # Increase i_vid
            i_vid += 1

            # Flush exactly once, right after the chunk has been filled.
            # (Checking at the top of every line re-triggered the flush for
            # every non relevant line that followed.)
            if i_vid % chunk_size == 0:
                file_name = 'S' + str(i_vid // chunk_size) + '.npz'

                if not os.path.isfile(matrices_dir + file_name):
                    scipy.sparse.save_npz(
                        matrices_dir + file_name,
                        _build_chunk(rows, cols, counts, chunk_size))

                # Always start the next chunk from scratch, even when the
                # file already existed and nothing was written
                rows, cols, counts = [], [], []

        if i % 1000000 == 0 and i != 0:
            print('Processed ' + str(i) + ' videos...')

# Save last sparse matrix. It holds exactly the videos seen since the last
# flush: rows are not filtered on their content, so a video without any token
# keeps its row and the row indices stay aligned with the order of the
# relevant videos.
S = _build_chunk(rows, cols, counts, i_vid % chunk_size)
scipy.sparse.save_npz(matrices_dir + 'S_last.npz', S)

In [None]:
# Get final sparse matrix by stacking every saved chunk, in order
matrices_dir = '/dlabdata1/youtube_large/olam/data/final_res/matrices/'

chunks = []

# Full chunks are named S1.npz, S2.npz, ...; load them until one is missing
chunk_id = 1
while os.path.isfile(matrices_dir + 'S' + str(chunk_id) + '.npz'):
    chunks.append(scipy.sparse.load_npz(
        matrices_dir + 'S' + str(chunk_id) + '.npz'))
    chunk_id += 1

# Add last matrix
chunks.append(scipy.sparse.load_npz(matrices_dir + 'S_last.npz'))

# Stack once instead of growing the matrix pairwise
S = scipy.sparse.vstack(chunks, format='csr')

In [None]:
# Persist the stacked matrix so that the later sections can reload it
full_matrix_path = '/dlabdata1/youtube_large/olam/data/final_res/matrices/S_full.npz'
scipy.sparse.save_npz(full_matrix_path, S)

## Process the data for Topic Modelling with PySpark

First, to obtain a better topic model, we select the videos from the sparse matrix as follows:
- we group the videos by their respective `channel_id`, `category` and upload year (the first four characters of `upload_date`)
- we keep the 20 videos with the highest `view_count` in each group
- we keep the tokens that appear in at least 100 videos

Then, we build a Spark dataframe from our final sparse matrix in order to train the models on the hadoop cluster.

### Filter the videos for topic modelling 

In [None]:
# Column names of the pandas DataFrame of relevant videos
columns_names = ['idx', 'channel_id',
                 'view_counts', 'uploaded_year', 'category']

# One entry per relevant video, with one value per column of columns_names
list_relevant_data = []

# Running index of the video among the relevant videos (0, 1, 2, ...)
idx_new = 0

# Reading the file
with gzip.open('/dlabdata1/youtube_large/yt_metadata_en_dd.jsonl.gz', 'rb') as f:

    for i, line in enumerate(f):
        if i % 1000000 == 0 and i != 0:
            print('Progress: ' + str(int(i/1000000)) + '/85')

        if i in set_relevant_vid:

            # line holds one JSON document, video is the parsed dict
            video = json.loads(line)

            list_relevant_data.append([
                idx_new,
                video['channel_id'],
                video['view_count'],
                # The year is the first four characters of the upload date
                video['upload_date'][:4],
                video['categories'],
            ])

            idx_new += 1

In [None]:
# One row per relevant video
df_relevant_data = pd.DataFrame(data=list_relevant_data, columns=columns_names)

# Within each (category, year, channel) group, keep the 20 most viewed videos:
# rows are ordered by decreasing views, then the first 20 of each group are kept
group_keys = ['category', 'uploaded_year', 'channel_id']
df_sorted_by_views = df_relevant_data.sort_values(
    by='view_counts', ascending=False)
df_relevant_data_top20 = df_sorted_by_views.groupby(group_keys).head(20)

# Row labels of the selected videos, in increasing order
set_relevant_vid_top20 = sorted(df_relevant_data_top20.index.values)

In [None]:
# Save intermediate result: the sorted indices of the top-20 videos.
# The variable is the one that is reloaded under the same name further down;
# the `with` statement closes the file, so no explicit close is needed.
with open('/dlabdata1/youtube_large/olam/data/final_res/model/set_relevant_vid_top20.pickle', 'wb') as f:
    pickle.dump(set_relevant_vid_top20, f)

### Keep only relevant tokens

In [None]:
# Load intermediate data
S = scipy.sparse.load_npz(
    '/dlabdata1/youtube_large/olam/data/final_res/matrices/S_full.npz')

# Load the indices of the videos in the top 20, as processed above.
# NOTE: pickle.load must only be used on files produced by this notebook.
# The `with` statement closes the file, so no explicit close is needed.
with open('/dlabdata1/youtube_large/olam/data/final_res/model/set_relevant_vid_top20.pickle', 'rb') as f:
    set_relevant_vid_top20 = pickle.load(f)

In [None]:
# Keep only the rows of the top-20 videos.
# NOTE: this rebinds S, so the cell must not be run twice without reloading S.
S = S[set_relevant_vid_top20, :]

# Convert matrix to csc for efficient column-wise computing
S = S.tocsc()

# Drop explicitly stored zeros so that the number of stored entries of a
# column equals its number of non zero entries
S.eliminate_zeros()

# Minimum number of videos a token must appear in to be kept
min_videos_per_token = 100

# Number of videos containing each token, computed for all columns at once
videos_per_token = S.getnnz(axis=0)

# Column indices of the tokens appearing in at least min_videos_per_token videos
list_relevant_tokens = np.flatnonzero(
    videos_per_token >= min_videos_per_token).tolist()

In [None]:
# Restrict the matrix to the relevant token columns, go back to the csr format
# and store the result
final_matrix_path = '/dlabdata1/youtube_large/olam/data/final_res/matrices/S_final.npz'
S = S[:, list_relevant_tokens].tocsr()
scipy.sparse.save_npz(final_matrix_path, S)

### Build a dictionnary id2word for final sparse matrix

`id2word_top20` is a dictionary that maps each token id to the token itself, restricted to the tokens that are relevant for the top-20 data.

In [None]:
# New column index (position in list_relevant_tokens) -> token
id2word_top20 = {
    new_id: id2word[old_id]
    for new_id, old_id in enumerate(list_relevant_tokens)
}

### Get Spark dataframe to use the hadoop cluster to perform topic modelling

In [None]:
# Spark settings applied on top of the master URL
spark_settings = [
    ('spark.executor.memory', '4g'),
    ('spark.driver.memory', '16g'),
    ('spark.driver.maxResultSize', '0'),
]

conf = SparkConf()
conf.setMaster("local[4]")
conf.setAll(spark_settings)

# Create the session (or reuse the one that already exists)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [None]:
def get_dict_for_row(row, S):
    '''Build the bag-of-words SparseVector of one video.

    `row` is an iterable of (key, value) pairs in which key[1] is the token
    column; `S` is only used for its number of columns, which gives the size
    of the vector.
    '''
    n_tokens = S.shape[1]
    counts_by_token = {position[1]: count for position, count in row}
    return SparseVector(n_tokens, counts_by_token)

In [None]:
# One [video id, bag-of-words vector] record per row of S
data = []

print('Process video for topic modelling...')
n_videos = S.shape[0]

for video_id in range(n_videos):

    if video_id % 1000000 == 0:
        print(str(video_id) + ' videos processed...')

    # ((row, column), count) pairs of the current video
    row_entries = S.getrow(video_id).todok().items()
    data.append([video_id, get_dict_for_row(row_entries, S)])


# Spark dataframe used as input of the LDA models
all_df = spark.createDataFrame(data, schema=["id", "features"])

In [None]:
# Write the dataframe as gzip-compressed JSON
sparkdf_path = '/dlabdata1/youtube_large/olam/data/final_res/model/sparkdf.json'
json_writer = all_df.write.option('compression', 'gzip')
json_writer.json(sparkdf_path)

## Topic Modelling


## Results: Topic Coherence

In [None]:
class FakedGensimDict:
    """
    Locally made class for `~gensim.corpora.dictionary.Dictionary`

    Attributes set by the constructor:
    - id2token: the mapping given as `data` (token id -> token)
    - token2id: the reverse mapping (token -> token id)
    - doc2bow: the object given as `S`, stored as is
    """

    def __init__(self, data, S=None):
        """
        Parameters
        ----------
        data : dict
            Mapping token id -> token.
        S : object, optional
            Stored unchanged in the attribute `doc2bow` (default None).

        Raises
        ------
        ValueError
            If `data` is not a dict.
        """
        if not isinstance(data, dict):
            raise ValueError('`data` must be an instance of `dict`')

        self.id2token = data
        self.token2id = {v: k for k, v in data.items()}
        # NOTE(review): stored as a plain attribute, not a callable - confirm
        # this is what the consumer of this class expects
        self.doc2bow = S

    @staticmethod
    def from_vocab(vocab, S=None):
        """Build an instance from a sequence of tokens; the id of a token is
        its position in `vocab`. `S` is forwarded to the constructor."""
        return FakedGensimDict(dict(enumerate(vocab)), S)

### Preparing the coherence model

In [None]:
# For every video (row of S), the list of tokens whose count is non zero
texts = [
    [id2word_top20[token_indice] for token_indice in S.getrow(i).nonzero()[1]]
    for i in range(S.shape[0])
]

In [None]:
# Bag-of-words corpus: for every video, the list of (token id, count) pairs
# of its non zero entries, ordered by token id.
# The rows are read straight from the sparse structure; converting S to a
# dense array would allocate one cell per (video, token) pair.
corpus = []

# Work on a copy in canonical csr form: sorted column indices, no duplicate
# and no explicitly stored zero
S_rows = S.tocsr(copy=True)
S_rows.sum_duplicates()
S_rows.eliminate_zeros()

for i in range(S_rows.shape[0]):
    start, end = S_rows.indptr[i], S_rows.indptr[i + 1]
    bow = [(int(token_id), int(count))
           for token_id, count in zip(S_rows.indices[start:end],
                                      S_rows.data[start:end])]
    corpus.append(bow)

### Get the coherence scores for each model 

In [None]:
# NOTE(review): CoherenceModel is not imported in the visible import cell;
# presumably `from gensim.models import CoherenceModel` is needed - confirm.

# Numbers of topics of the tuned models: 40, 45, ..., 160
n_topics_list = list(range(40, 161, 5))

# One score per entry of n_topics_list, in the same order
coherence_scores = []
coherence_scores_umass = []

describe_topics_prefix = '/dlabdata1/youtube_large/olam/data/final_res/model/tune/describe_topics_'

for n_topics in n_topics_list:

    print('Computing coherence score for model with ' +
          str(n_topics) + ' topics...')

    # Get describe_topics dataframe
    describe_topics = spark.read.json(
        describe_topics_prefix + str(n_topics) + '.json')

    # Characterize each topic by the tokens of its first term indices
    # (at most 12 per topic)
    topics = [
        [id2word_top20[token_id] for token_id in row.termIndices[:12]]
        for row in describe_topics.sort('topic').rdd.collect()
    ]

    # c_v coherence model
    coherence_model = CoherenceModel(
        topics=topics,
        corpus=S,
        dictionary=FakedGensimDict(id2word_top20, S),
        texts=texts,
        coherence='c_v')

    # u_mass coherence model
    coherence_model_umass = CoherenceModel(
        topics=topics,
        corpus=corpus,
        dictionary=FakedGensimDict(id2word_top20, S),
        coherence='u_mass')

    print('Getting c_v coherence score...')
    coherence_scores.append(coherence_model.get_coherence())
    print('Getting u_mass coherence score...')
    coherence_scores_umass.append(coherence_model_umass.get_coherence())
    print('')

In [None]:
# NOTE(review): plt is not imported in the visible import cell; presumably
# `import matplotlib.pyplot as plt` is needed - confirm.

# Two y axes sharing the same x axis: c_v on the left, u_mass on the right
fig, ax = plt.subplots(figsize=(20, 10))
ax2 = ax.twinx()

# Left axis: c_v coherence
ax.set_title('Coherence score for a given number of topics', fontsize=24)
ax.set_xlabel('Number of Topics', fontsize=16)
ax.set_ylabel('Coherence Score c_v', fontsize=16)
ax.grid('on')
ax.plot(coherence_scores, label='c_v coherence score', linewidth=3)
ax.legend(fontsize=16)

# Right axis: u_mass coherence
ax2.set_ylabel('Coherence Score u_mass', fontsize=16)
ax2.plot(coherence_scores_umass, label='u_mass coherence score',
         linewidth=3, color='orange')
ax2.legend(loc='upper right', bbox_to_anchor=(1, 0.93), fontsize=16)

# The scores are plotted against their position; label each position with
# the corresponding number of topics
tick_positions = np.arange(len(n_topics_list))
plt.xticks(tick_positions, n_topics_list)

In [None]:
# Write the coherence figure to disk
coherence_figure_path = '/home/olam/coherence_scores'
fig.savefig(coherence_figure_path)