In [3]:
import collections
import fasttext
import json
import nltk
import pickle
import random
import scipy.sparse
import sys
import time

import numpy as np
import pandas as pd
import zstandard as zstd

from collections import Counter
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from scipy.sparse import dok_matrix

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, StructField, StructType
from pyspark.ml.clustering import LDA, LDAModel, LocalLDAModel
from pyspark.ml.linalg import Vectors, SparseVector

# Make sure the NLTK 'stopwords' corpus is available locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/olam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Remove videos that have 0 tokens in the new vocabulary

In [4]:
# Load the sparse matrix from disk.
# NOTE(review): rows are presumably videos and columns vocabulary tokens — confirm.
S = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/matrices/S_final2.npz')

In [9]:
# Indices of the rows of S that contain at least one non-zero value.
# The per-row counts come straight from the CSR row pointer (indptr) in one
# vectorised pass, instead of slicing S once per row in a Python loop.
_S_csr = S.tocsr()
if not _S_csr.data.all():
    # Explicitly stored zeros would be counted by indptr but not by
    # count_nonzero(); drop them on a copy so that S itself is left untouched.
    _S_csr = _S_csr.copy()
    _S_csr.eliminate_zeros()
row_with_tokens = np.flatnonzero(np.diff(_S_csr.indptr)).tolist()

In [10]:
# Number of rows of S that have at least one non-zero entry.
len(row_with_tokens)

68566847

In [11]:
# Display the summary of S (shape, dtype, stored elements, storage format).
S

<68638982x663127 sparse matrix of type '<class 'numpy.uint8'>'
	with 1251231562 stored elements in Compressed Sparse Row format>

In [12]:
# Save a copy of S restricted to the rows listed in row_with_tokens.
scipy.sparse.save_npz('/dlabdata1/youtube_large/olam/matrices/S_final3.npz', S[row_with_tokens,:])

## Get the list of indices of relevant videos for each category, each year and each channel


In [9]:
class Zreader:
    '''Line-by-line reader for a Zstandard-compressed text file.

    The file is decompressed as a stream, `chunk_size` bytes at a time, so the
    whole content never has to be held in memory.  The object can be used as a
    context manager so that the underlying file handle gets closed.
    '''

    def __init__(self, file, chunk_size=16384):
        '''Open `file` in binary mode and set up a streaming decompressor.

        file: path of the compressed file to read.
        chunk_size: number of decompressed bytes requested per read.
        '''
        self.fh = open(file,'rb')
        self.chunk_size = chunk_size
        self.dctx = zstd.ZstdDecompressor()
        self.reader = self.dctx.stream_reader(self.fh)
        self.buffer = ''

    def close(self):
        '''Close the underlying file handle.'''
        self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def readlines(self):
        '''Generator method that yields each line of the file (without the newline).

        Lines are split on "\\n".  A final line that is not terminated by a
        newline is yielded as well.  Undecodable bytes are ignored.
        '''
        import codecs

        # An incremental decoder keeps the trailing bytes of a multi-byte
        # character that is cut by a chunk boundary and completes it with the
        # next chunk; decoding every chunk on its own would silently drop it.
        decoder = codecs.getincrementaldecoder("utf-8")(errors="ignore")

        while True:
            raw = self.reader.read(self.chunk_size)
            # Test the raw bytes: a chunk that decodes to an empty string
            # (e.g. only a partial character) must not end the stream early.
            if not raw:
                break
            lines = (self.buffer + decoder.decode(raw)).split("\n")

            for line in lines[:-1]:
                yield line

            # The last piece may be an incomplete line; keep it for the next chunk.
            self.buffer = lines[-1]

        # Flush what is left: the file may not end with a newline.
        remainder = self.buffer + decoder.decode(b"", final=True)
        self.buffer = ''
        if remainder:
            yield remainder

In [10]:
# Load the collection of video indices to consider.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('/dlabdata1/youtube_large/olam/filtered10000/idx_vid_to_consider.pickle', 'rb') as f:
    idx_vid_to_consider = pickle.load(f)

In [11]:
# Dimensions of the table of per-video information: one row per selected video.
n_rows = len(idx_vid_to_consider)
n_columns = 5

# Column labels used when the collected records are turned into a DataFrame.
columns_names = ['idx', 'channel_id', 'view_counts', 'uploaded_year', 'category']

In [12]:
# Accumulator for the per-video records; filled while scanning the metadata file.
array_relevant_infos = []

In [13]:
# Stream the compressed metadata file in chunks of 2**28 bytes (256 MiB).
reader = Zreader("/dlabdata1/youtube_large/yt_metadata_all.jsonl.zst", chunk_size=2**28)

In [14]:
# Scan the metadata file once and record, for every selected video, the fields
# used later to rank videos per category / year / channel.
start = time.time()
idx = 0
idx_new = 0
# idx is the 1-based position of the current line in the metadata file.
for idx, line in enumerate(reader.readlines(), start=1):

    if idx % 1000000 == 0:
        print(f'Progress: {idx // 1000000}/85')

    # Skip videos that are not part of the selection.
    if idx not in idx_vid_to_consider:
        continue

    # line is a JSON string; video is the corresponding dict.
    video = json.loads(line)
    record = [
        idx_new + 1,               # 1-based rank among the selected videos
        video['channel_id'],
        video['view_count'],
        video['upload_date'][:4],  # first four characters of the upload date
        video['categories'],
    ]
    array_relevant_infos.append(record)

    idx_new += 1

elapsed = time.time() - start
print(str(elapsed))

Progress: 1/85
Progress: 2/85
Progress: 3/85
Progress: 4/85
Progress: 5/85
Progress: 6/85
Progress: 7/85
Progress: 8/85
Progress: 9/85
Progress: 10/85
Progress: 11/85
Progress: 12/85
Progress: 13/85
Progress: 14/85
Progress: 15/85
Progress: 16/85
Progress: 17/85
Progress: 18/85
Progress: 19/85
Progress: 20/85
Progress: 21/85
Progress: 22/85
Progress: 23/85
Progress: 24/85
Progress: 25/85
Progress: 26/85
Progress: 27/85
Progress: 28/85
Progress: 29/85
Progress: 30/85
Progress: 31/85
Progress: 32/85
Progress: 33/85
Progress: 34/85
Progress: 35/85
Progress: 36/85
Progress: 37/85
Progress: 38/85
Progress: 39/85
Progress: 40/85
Progress: 41/85
Progress: 42/85
Progress: 43/85
Progress: 44/85
Progress: 45/85
Progress: 46/85
Progress: 47/85
Progress: 48/85
Progress: 49/85
Progress: 50/85
Progress: 51/85
Progress: 52/85
Progress: 53/85
Progress: 54/85
Progress: 55/85
Progress: 56/85
Progress: 57/85
Progress: 58/85
Progress: 59/85
Progress: 60/85
Progress: 61/85
Progress: 62/85
Progress: 63/85
P

In [15]:
# One row per selected video, in the order the videos were read from the metadata file.
df = pd.DataFrame(array_relevant_infos, columns=columns_names)

In [16]:
# (number of videos, number of columns) of the collected table.
df.shape

(21714294, 5)

In [17]:
# Preview the first 5 records.
df.head(5)

Unnamed: 0,idx,channel_id,view_counts,uploaded_year,category
0,1,UCzzzZ3-icktxbC3j7hkWqRw,1888967,2016,Howto & Style
1,2,UCzzzZ3-icktxbC3j7hkWqRw,1297474,2016,Howto & Style
2,3,UCzzzZ3-icktxbC3j7hkWqRw,582615,2016,Howto & Style
3,4,UCzzzZ3-icktxbC3j7hkWqRw,14507,2016,Howto & Style
4,5,UCzzzZ3-icktxbC3j7hkWqRw,171671,2016,Howto & Style


In [18]:
# Number of selected videos; can be compared with the row count of df.
len(idx_vid_to_consider)

21714294

In [19]:
# Keep the N most-viewed videos of every (category, upload year, channel) group.
# The table is sorted and grouped a single time; head(N) then returns the first
# N rows of each group in that (descending view count) order.
_df_by_views = df.sort_values(['view_counts'], ascending=False)
_df_by_views_grouped = _df_by_views.groupby(['category', 'uploaded_year', 'channel_id'])
df_top5 = _df_by_views_grouped.head(5)
df_top10 = _df_by_views_grouped.head(10)
df_top20 = _df_by_views_grouped.head(20)

In [20]:
# Row labels of the kept videos, in ascending order, for each top-N selection.
# NOTE(review): these are the DataFrame index labels (0-based positions in df),
# not the 1-based 'idx' column — confirm this is what downstream code expects.
(
    sorted_idx_relevant_vid_top5,
    sorted_idx_relevant_vid_top10,
    sorted_idx_relevant_vid_top20,
) = (sorted(selection.index.values) for selection in (df_top5, df_top10, df_top20))

In [21]:
# Save the three sorted index lists, one pickle file per top-N selection.
# The `with` block closes each file on exit, so no explicit close() is needed.
for _top_n, _sorted_idx in (
    (5, sorted_idx_relevant_vid_top5),
    (10, sorted_idx_relevant_vid_top10),
    (20, sorted_idx_relevant_vid_top20),
):
    _path = '/dlabdata1/youtube_large/olam/filtered10000/sorted_idx_relevant_vid_top' + str(_top_n) + '.pickle'
    with open(_path, 'wb') as f:
        pickle.dump(_sorted_idx, f)

In [22]:
# Number of videos kept by the top-5 selection.
len(sorted_idx_relevant_vid_top5)

3432979

In [23]:
# Number of videos kept by the top-10 selection.
len(sorted_idx_relevant_vid_top10)

5453143

In [24]:
# Number of videos kept by the top-20 selection.
len(sorted_idx_relevant_vid_top20)

8116217

In [1]:
# NOTE(review): presumably the share of selected videos among ~68M videos;
# the origin of 7728523 is not shown in this notebook — confirm.
7728523 / 68000000

0.11365475

In [2]:
# 45421300 equals the row count displayed for the S_view1000_sub10000 matrix in this notebook.
# NOTE(review): the origin of 7728523 is not shown here — confirm.
7728523 / 45421300

0.17015195513998937

# Save the file to work on the Hadoop cluster

In [25]:
# Local Spark configuration: master "local[4]" plus explicit memory settings.
spark_settings = [
    ('spark.executor.memory', '4g'),
    ('spark.driver.memory', '16g'),
    ('spark.driver.maxResultSize', '0'),
]
conf = SparkConf().setMaster("local[4]").setAll(spark_settings)

# Create the session (or reuse one that already exists).
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Context attached to the session.
sc = spark.sparkContext

In [26]:
def get_dict_for_row(row, S):
    '''Build the SparseVector bag-of-words of one row (video).

    row: iterable of (key, value) pairs where key[1] is the column index.
    S: object whose shape[1] gives the size of the resulting vector.
    '''
    value_by_column = {position[1]: count for position, count in row}
    n_features = S.shape[1]
    return SparseVector(n_features, value_by_column)

def remove_zero_rows(M):
    '''Return M without the rows that have no stored entry.

    M is expected to expose a CSR-style `indptr` array; the number of stored
    entries of row i is indptr[i + 1] - indptr[i].
    '''
    stored_per_row = M.indptr[1:] - M.indptr[:-1]
    keep_mask = stored_per_row != 0
    return M[keep_mask]

In [30]:
# Load data
print('Loading data...')
S = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/data/view10000_sub10000/csr_matrices/S_final_tok100vid.npz')

# Load the sorted indices of the videos to consider.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('/dlabdata1/youtube_large/olam/data/view10000_sub10000/sorted_idx_relevant_vid_top20.pickle', 'rb') as f:
    sorted_idx_relevant_vid_top = pickle.load(f)

# Select videos, then drop the selected rows that have no stored entry
S = S[sorted_idx_relevant_vid_top,:]
S = remove_zero_rows(S)


all_data = []

print('Process video for topic modelling...')
for i in range(S.shape[0]):

    if i % 1000000 == 0:
        print(str(i) + ' videos processed...')

    # The id stored with each vector is the row position in the filtered matrix,
    # not the index of the video in the original matrix.
    all_data.append([i, get_dict_for_row(S.getrow(i).todok().items(), S)])


# Construct dataframe for LDA
all_df = spark.createDataFrame(all_data, ["id", "features"])

Loading data...
Process video for topic modelling...
0 videos processed...
1000000 videos processed...
2000000 videos processed...


In [31]:
# Number of rows in the Spark DataFrame built for LDA.
all_df.count()

2042687

In [32]:
# Save the DataFrame as gzip-compressed JSON.
# NOTE(review): the schema printed in this notebook after reading a sparkdf.json
# back shows `features` as a struct (indices/size/type/values) rather than a
# vector, so readers presumably have to rebuild the SparseVectors — confirm.
all_df.write\
        .option('compression', 'gzip')\
        .json('/dlabdata1/youtube_large/olam/data/view10000_sub10000/LDA_models/top20/sparkdf.json')

In [16]:
# Schema of the in-memory DataFrame.
all_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- features: vector (nullable = true)



In [4]:
# Read a saved JSON dataset back (note: this is the top10 path, not top20).
all_df_test = spark.read.json('/dlabdata1/youtube_large/olam/data/view10000_sub10000/LDA_models/top10/sparkdf.json')

In [7]:
# Schema of the DataFrame as read back from JSON.
all_df_test.printSchema()

root
 |-- features: struct (nullable = true)
 |    |-- indices: array (nullable = true)
 |    |    |-- element: long (containsNull = true)
 |    |-- size: long (nullable = true)
 |    |-- type: long (nullable = true)
 |    |-- values: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |-- id: long (nullable = true)



In [55]:
# Bring every row of the reloaded DataFrame to the driver as a list of Row objects.
df_collect = all_df_test.collect()

In [8]:
# Rebuild [id, SparseVector] records from the struct fields of the reloaded rows.
collected_rows = all_df_test.collect()
data = []
for record in collected_rows:
    struct = record['features']
    vector = SparseVector(struct['size'], struct['indices'], struct['values'])
    data.append([record['id'], vector])

KeyboardInterrupt: 

In [68]:
# Spark DataFrame built from the rebuilt [id, SparseVector] records.
df_test = spark.createDataFrame(data, ['id', 'features'])

In [69]:
# Schema of the rebuilt DataFrame.
df_test.printSchema()

root
 |-- id: long (nullable = true)
 |-- features: vector (nullable = true)



In [70]:
# Fit an LDA model with k=10 topics and seed=1 on the rebuilt DataFrame.
lda = LDA(k=10, seed=1)
model = lda.fit(df_test)

In [72]:
# Topic descriptions returned by the fitted model.
describe_topics = model.describeTopics()

In [None]:
# Print the topic descriptions.
describe_topics.show()

In [49]:
# 'features' field of the first collected row, used to try the conversion on one example.
row_test = df_collect[0]['features']

In [52]:
# Inspect the raw struct of the example row.
row_test

Row(indices=[12768, 12772, 42897, 47518, 55859, 72328, 77508, 78050, 87151, 125552, 136529, 150635, 156780, 161250, 166006], size=166209, type=0, values=[1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0, 2.0])

In [53]:
# Convert the example struct into a SparseVector from its size, indices and values.
SparseVector(row_test['size'], row_test['indices'], row_test['values'])

SparseVector(166209, {12768: 1.0, 12772: 1.0, 42897: 3.0, 47518: 1.0, 55859: 1.0, 72328: 3.0, 77508: 1.0, 78050: 1.0, 87151: 1.0, 125552: 1.0, 136529: 3.0, 150635: 1.0, 156780: 1.0, 161250: 2.0, 166006: 2.0})

In [28]:
# Schema of the DataFrame read back from JSON, for comparison with all_df.
all_df_test.printSchema()

root
 |-- features: struct (nullable = true)
 |    |-- indices: array (nullable = true)
 |    |    |-- element: long (containsNull = true)
 |    |-- size: long (nullable = true)
 |    |-- type: long (nullable = true)
 |    |-- values: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |-- id: long (nullable = true)



In [29]:
# Schema of the in-memory DataFrame, for comparison with all_df_test.
all_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- features: vector (nullable = true)



In [33]:
# Show the first 5 feature vectors without truncating the column.
all_df.select('features').show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                                                                                                                  |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|(416311,[5881,16455,18197,77164,152293,162655,185041,203528,225256,295597,356334,375392,377662,386188],[2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0])                                                                         |
|(416311,[3414,5881,8565,18197,34688,62895,110100,14

In [3]:
# Load the matrix stored under filtered/csr_matrices.
S_view1000_sub10000 = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/filtered/csr_matrices/S_final_tok100vid.npz')

In [6]:
# Display shape, dtype and number of stored elements.
S_view1000_sub10000

<45421300x166209 sparse matrix of type '<class 'numpy.uint8'>'
	with 473957128 stored elements in Compressed Sparse Row format>

In [4]:
# Load the matrix stored under filtered/csr_matrices_100000sub and display its summary.
S_view1000_sub100000 = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/filtered/csr_matrices_100000sub/S_final_tok100vid.npz')
S_view1000_sub100000

<24839929x74362 sparse matrix of type '<class 'numpy.uint8'>'
	with 146450642 stored elements in Compressed Sparse Row format>

In [8]:
# Load the matrix stored under filtered10000/csr_matrices and display its summary.
S_view10000_sub10000 = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/filtered10000/csr_matrices/S_final_tok100vid.npz')
S_view10000_sub10000

<21714294x65907 sparse matrix of type '<class 'numpy.uint8'>'
	with 109510371 stored elements in Compressed Sparse Row format>

In [7]:
# Load the matrix stored under filtered10000/csr_matrices_100000sub and display its summary.
S_view10000_sub100000 = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/filtered10000/csr_matrices_100000sub/S_final_tok100vid.npz')
S_view10000_sub100000

<15167437x42757 sparse matrix of type '<class 'numpy.uint8'>'
	with 58140783 stored elements in Compressed Sparse Row format>

## Remove duplicate rows of a sparse matrix

In [19]:
def remove_duplicate_rows(data, compare_values=False):
    '''Return `data` restricted to the first occurrence of every distinct row.

    data: scipy sparse matrix (converted to CSR if it is not already).
    compare_values: when False (default, the historical behaviour) two rows are
        duplicates as soon as they have stored entries in the same columns,
        whatever the stored values are.  When True the stored values must
        match as well.

    Returns a CSR matrix with the kept rows, in their original order.
    '''
    csr = data.tocsr()

    # Hashable row signatures in a set give a constant-time membership test,
    # instead of scanning a list of all previously seen rows.
    seen_signatures = set()
    unique_row_indices = []

    for row_idx in range(csr.shape[0]):
        start, end = csr.indptr[row_idx], csr.indptr[row_idx + 1]
        columns = csr.indices[start:end].tolist()

        # Sort so that the order in which entries happen to be stored does not
        # make two identical rows look different.
        if compare_values:
            values = csr.data[start:end].tolist()
            signature = tuple(sorted(zip(columns, values)))
        else:
            signature = tuple(sorted(columns))

        if signature not in seen_signatures:
            seen_signatures.add(signature)
            unique_row_indices.append(row_idx)

    return csr[unique_row_indices]

In [4]:
# Small 10 x 5 matrix used to try remove_duplicate_rows by hand.
nrows = 10
ncolumns = 5

S_1 = dok_matrix((nrows, ncolumns))

In [9]:
# List the stored ((row, column), value) entries of the toy matrix.
S_1.items()

dict_items([((0, 0), 1.0), ((1, 0), 1.0)])

In [7]:
# Row 0 gets a single entry in column 0.
S_1[0,0] = 1

In [17]:
# Row 1 gets an entry in the same column as row 0, but with a different value.
S_1[1,0] = 2

In [22]:
# Dense view of the toy matrix.
S_1.todense()

matrix([[1., 0., 0., 0., 0.],
        [2., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

In [23]:
# Deduplicate the toy matrix (converted to CSR first) and show the result densely.
remove_duplicate_rows(S_1.tocsr()).todense()

matrix([[1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])