In [1]:
import collections
import json
import nltk
import pickle
import pyLDAvis
import random
import scipy.sparse
import sys
import time

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import zstandard as zstd


from collections import Counter
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from pyspark import SparkContext
from pyspark.sql import SparkSession
from scipy.sparse import dok_matrix

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/olam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
s_stemmer = SnowballStemmer(language='english')

# Select select the data

### Selection criteria
- Not used to train the topic modelling
- Video with more than 10'000 views

In [3]:
class Zreader:

    def __init__(self, file, chunk_size=16384):
        '''Init method'''
        self.fh = open(file,'rb')
        self.chunk_size = chunk_size
        self.dctx = zstd.ZstdDecompressor()
        self.reader = self.dctx.stream_reader(self.fh)
        self.buffer = ''


    def readlines(self):
        '''Generator method that creates an iterator for each line of JSON'''
        while True:
            chunk = self.reader.read(self.chunk_size).decode(errors="ignore")
            if not chunk:
                break
            lines = (self.buffer + chunk).split("\n")

            for line in lines[:-1]:
                yield line

            self.buffer = lines[-1]

In [4]:
# Load set of videos to consider
with open('/dlabdata1/youtube_large/olam/data/view10000_sub10000/idx_vid_to_consider.pickle', 'rb') as f:
    idx_vid_to_consider = pickle.load(f)
f.close()

In [14]:
# Load channels
df_channelcrawler = pd.read_csv('/dlabdata1/youtube_large/channelcrawler.csv')

df_channelcrawler['channel_id'] = df_channelcrawler['link'].apply(lambda x: x.replace('http://www.youtube.com/channel/', ''))

# filter the channels
df_channelcrawler_100000sub = df_channelcrawler[df_channelcrawler['subscribers'] >= 100000]
set_channelcrawler_100000sub = set(df_channelcrawler_100000sub['channel_id'])

In [13]:
reader = Zreader("/dlabdata1/youtube_large/yt_metadata_all.jsonl.zst", chunk_size=2**28)

idx = 0
array_relevant_infos = []

for line in reader.readlines():
    ###start_iter = time.time()
    idx += 1
    
    if idx % 1000000 == 0:
        print('Progress: ' + str(int(idx/1000000)) + '/85')
        
    if idx in idx_vid_to_consider:
        
        # line is a str dict, video is the dict corresponding to the str dict
        video = json.loads(line)
        
        array_vid_relevant_infos = [video['channel_id']]
        array_vid_relevant_infos.append(video['view_count'])
        array_vid_relevant_infos.append(video['upload_date'][:4])
        array_vid_relevant_infos.append(video['categories'])
        
        array_relevant_infos.append(array_vid_relevant_infos)
        
    

Progress: 1/85
Progress: 2/85
Progress: 3/85
Progress: 4/85
Progress: 5/85
Progress: 6/85
Progress: 7/85
Progress: 8/85
Progress: 9/85
Progress: 10/85
Progress: 11/85
Progress: 12/85
Progress: 13/85
Progress: 14/85
Progress: 15/85
Progress: 16/85
Progress: 17/85
Progress: 18/85
Progress: 19/85
Progress: 20/85
Progress: 21/85
Progress: 22/85
Progress: 23/85
Progress: 24/85
Progress: 25/85
Progress: 26/85
Progress: 33/85
Progress: 34/85
Progress: 35/85
Progress: 36/85
Progress: 37/85
Progress: 38/85
Progress: 39/85
Progress: 40/85
Progress: 41/85
Progress: 42/85
Progress: 43/85
Progress: 44/85
Progress: 45/85
Progress: 46/85
Progress: 47/85
Progress: 48/85
Progress: 49/85
Progress: 50/85
Progress: 51/85
Progress: 52/85
Progress: 53/85
Progress: 54/85
Progress: 55/85
Progress: 56/85
Progress: 57/85
Progress: 58/85
Progress: 59/85
Progress: 60/85
Progress: 61/85
Progress: 62/85
Progress: 63/85
Progress: 64/85
Progress: 65/85
Progress: 66/85
Progress: 67/85
Progress: 68/85
Progress: 69/85
P

In [15]:
# Get the dataframe of all the videos that we will consider

column_names = ['channel_id', 'view_counts', 'uploaded_year', 'category']

df = pd.DataFrame(array_relevant_infos, columns=column_names)

In [45]:
df.shape

(21714294, 4)

### Remove all the videos that are used for topic modelling

Find the video indices in the dataframe such that:
- more than 100'000 subscribers
- top20 from category/channel/year

In [29]:
df_sub100000 = df[df['channel_id'].isin(set_channelcrawler_100000sub)]
df_top20 = df_sub100000.sort_values(['view_counts'], ascending=False).groupby(['category', 'uploaded_year', 'channel_id']).head(20)
df_top20.head()

Unnamed: 0,channel_id,view_counts,uploaded_year,category
21326653,UC0C-w0YjGpqDXGB8IHb662A,4468090305,2017,Music
10648298,UCVp3nfGRxmMadNDuVbJSk8A,4295905423,2015,Music
7977749,UCcdwLMPsaU2ezNSJU1nFoBQ,3838039119,2016,Education
4655616,UCmfFGTSsfJVu6CGvL8r75qg,3709532958,2014,Music
13575776,UCN1hnUccO4FD5WfM7ithXaw,3055180938,2015,Music


In [33]:
index_to_remove = set(df_top20.index)

In [52]:
idx_vid_to_consider_classifier = []

for index in df.index:
    if index not in index_to_remove:
        idx_vid_to_consider_classifier.append(index)

In [54]:
len(index_data)

17801854

In [64]:
with open('/dlabdata1/youtube_large/olam/data/classifier/idx_vid_to_consider_classifier.pickle', 'wb') as f:
    pickle.dump(idx_vid_to_consider_classifier, f)
f.close()

# Process the data

- Transform every video into BoW, according to the topic model vocabulary
- Get the transformed data -> distribution over the topic for each video
- Separate into train set and test set

In [5]:
def isEnglishAlpha(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

In [6]:
def get_freq_tokens_per_video(video):
    ''''''
    
    title_tokens = [w for w in tokenizer.tokenize(video['title'].lower()) if not w in stop_words]
    tag_tokens = [w for w in tokenizer.tokenize(video['tags'].lower()) if not w in stop_words]
    
    # We want to keep duplicates !!
    tokens_per_video = title_tokens + tag_tokens

    #Â Filter token with length < 3, with non english alphabet since fastext is not 100% accurate and remove numerical token 
    tokens_keep = []
    for token in tokens_per_video:
        if len(token) >= 3 and (not token.isnumeric()) and isEnglishAlpha(token):
            tokens_keep.append(token)
    
    
    # Stemming
    stemmed_tokens_per_video = ([s_stemmer.stem(w) for w in tokens_keep])
    
    
    # Return a Counter object of the tokens
    return collections.Counter(tokens_keep)

In [64]:
def fill_underlying_dict(freq_tokens_per_video, dict_stemmed_tokens, i_vid):
    '''Method to fill the underlying dictionnary in order to 
    update the sparse matrix incrementally by videos'''
    
    dict_freq_tokens_for_sparse_matrix = {}
    
    for key in freq_tokens_per_video.keys():
        
        # Column index in the sparse matrix (one column for each token)
        try:
            j_token = dict_stemmed_tokens[key]
            
            # Filling the underlying dict
            dict_freq_tokens_for_sparse_matrix[(i_vid % 1000000, j_token)] = freq_tokens_per_video[key]
            
        except KeyError:
            None
    
    return dict_freq_tokens_for_sparse_matrix

In [61]:
def remove_zero_rows(M):
    '''Function that removes all rows from sparse matrix M that contains only zero.'''
    num_nonzeros = np.diff(M.indptr)
    return M[num_nonzeros != 0]

In [28]:
# Load dictionnary of words
with open('/dlabdata1/youtube_large/olam/data/view10000_sub100000/id2word_tok100vid_sub100000.pickle', 'rb') as f:
    id2word = pickle.load(f)
f.close()

# Load index of data for classifier
with open('/dlabdata1/youtube_large/olam/data/classifier/idx_vid_to_consider_classifier.pickle', 'rb') as f:
    idx_vid_to_consider_classifier = pickle.load(f)
f.close()

In [43]:
word2id = {v: k for k, v in id2word.items()}

In [29]:
idx_vid_to_consider_sorted = list(idx_vid_to_consider)
idx_vid_to_consider_sorted.sort()

In [38]:
index_data = set([idx_vid_to_consider_sorted[i] for i in idx_vid_to_consider_classifier])

In [42]:
with open('/dlabdata1/youtube_large/olam/data/classifier/index_data.pickle', 'wb') as f:
    pickle.dump(index_data, f)
f.close()

In [33]:
vocab = list(id2word.values())

In [69]:
data = dok_matrix((1000000, len(vocab)), dtype=np.uint8)
groundtruth = []


reader = Zreader("/dlabdata1/youtube_large/yt_metadata_all.jsonl.zst", chunk_size=2**28)

idx = 0
i_vid = 0

for line in reader.readlines():
    ###start_iter = time.time()
    idx += 1
    
    if idx % 1000000 == 0:
        print('Processed ' + str(idx) + ' videos...')
        
    if i_vid % 1000000 == 0 and i_vid != 0:
        print('Size of matrix dok: ' + str(sys.getsizeof(data)))
        print('Shape of S : ' + str(data.get_shape()) + ' and number of elems : ' + str(data.getnnz()))
        data = data.tocsr()
        print('Size of matrix csr: ' + str(sys.getsizeof(data)))
        file_name = 'data' + str(int(i_vid / 1000000))
        scipy.sparse.save_npz('/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/' + file_name + '.npz', data)
        data = dok_matrix((1000000, len(vocab)), dtype=np.uint8)
        print('Shape of S : ' + str(data.get_shape()) + ' and number of elems : ' + str(data.getnnz()))
        print('Processed ' + str(i_vid) + ' videos.')
        print('')
        
    if idx in index_data:
        
        # line is a str dict, video is the dict corresponding to the str dict
        video = json.loads(line)
        
        # Get the tokens for each video and theirs number of occurences
        freq_tokens_per_video = get_freq_tokens_per_video(video)
        
        # For each video, create a underlying dictionnary for filling the sparse matrix efficiently
        dict_freq_tokens_for_sparse_matrix = fill_underlying_dict(freq_tokens_per_video, word2id, i_vid)
        
        # Need to check that the video contains token from the reduced vocabulary
        if dict_freq_tokens_for_sparse_matrix != {}:
            
            # Update the Sparse Matrix
            dict.update(data, dict_freq_tokens_for_sparse_matrix)
            i_vid += 1
            
            # Get groundtruth values
            groundtruth.append(video['categories'])
            
# Save last sparse matrix
data = data.tocsr()
scipy.sparse.save_npz('/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/data_last.npz', data)

Processed 1000000 videos...
Processed 2000000 videos...
Processed 3000000 videos...
Processed 4000000 videos...
Size of matrix dok: 671088752
Shape of S : (1000000, 42757) and number of elems : 19355417
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 1000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 1000000 videos.

Processed 5000000 videos...
Processed 6000000 videos...
Processed 7000000 videos...
Processed 8000000 videos...
Processed 9000000 videos...
Size of matrix dok: 671088752
Shape of S : (1000000, 42757) and number of elems : 19499467
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 2000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 2000000 vid

Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of m

Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of m

Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of m

Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of m

Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 3000000 videos.

Size of m

Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 8000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 8000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 8000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 8000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 8000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 800

Shape of S : (1000000, 42757) and number of elems : 0
Processed 8000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 8000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 8000000 videos.

Size of matrix dok: 248
Shape of S : (1000000, 42757) and number of elems : 0
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 8000000 videos.

Processed 39000000 videos...
Processed 40000000 videos...
Processed 41000000 videos...
Processed 42000000 videos...
Processed 43000000 videos...
Size of matrix dok: 671088752
Shape of S : (1000000, 42757) and number of elems : 19047880
Size of matrix csr: 48
Shape of S : (1000000, 42757) and number of elems : 0
Processed 9000000 videos.

Processed 44000000 videos...
Proc

### Get training features for classifier

- get full matrix of BoW
- process for pyspark
- !!! on the cluster, run the model, SAVE the model AND transformed data 
- transform the data to have k features, which are the distribution over the topics 

In [None]:
# get full matrix of BoW

data = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/data1.npz')

for i in range(2, 18):
    data_next = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/data' + str(i) + '.npz')
    data = scipy.sparse.vstack([data, data_next])

# Add last matrix
data_last = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/data_last.npz')
data = scipy.sparse.vstack([data, data_last])

In [None]:
remove_zero_rows(data).shape

In [58]:
scipy.sparse.save_npz('/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/data_final.npz', data)

KeyboardInterrupt: 

In [None]:
# process for pyspark

data_spark = []

print('Process video for topic modelling...')
for i in range(data.shape[0]):

    if i % 1000000 == 0:
        print(str(i) + ' videos processed...')

    data_spark.append([i, get_dict_for_row(data.getrow(i).todok().items(), data)])
    
    
# Construct dataframe for LDA
df = spark.createDataFrame(data_spark, ["id", "features"])

In [None]:
# Get model and/or transformed data!

# Train the classifier

- Train set into train' and validation set, in order to do cross validation
