Notebook selecting the columns (and matching rows) of the sparse matrix for the channels that have the most comments.

In [47]:
import codecs
import gzip
import json
import operator
import pickle
import queue
import sys
import time

import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse
import zstandard as zstd

from scipy.sparse import dok_matrix
from scipy.sparse import csr_matrix, hstack

In [2]:
# Dictionary mapping the video_id to the channel_id.
# NOTE: read_pickle unpickles the file, so it must come from a trusted source.
vid_to_channels = pd.read_pickle("/dlabdata1/youtube_large/id_to_channel_mapping.pickle")

In [4]:
# Channels that are in the set_crawler dataset and whose language is English.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted data.
# The `with` block already closes the file, so no explicit close() is needed.
with open('../../../dlabdata1/youtube_large/olam/channels_id.pickle', 'rb') as f:
    channels_id = pickle.load(f)

In [6]:
# Map every channel id to its position in channels_id; that integer is the
# column/row of the channel in the sparse matrix.
channel_dict = {channel_id: ind for ind, channel_id in enumerate(channels_id)}

In [5]:
class Zreader:
    '''Line-by-line reader for a Zstandard-compressed text file.

    The file is decompressed as a stream, at most `chunk_size` bytes per read,
    so the whole file is never held in memory. The object can be used as a
    context manager (or closed with `close()`) to release the file handle.
    '''

    def __init__(self, file, chunk_size=16384):
        '''Open `file` and set up the streaming decompressor.

        PARAMETERS:
            - file: path of the zstd-compressed file to read
            - chunk_size: maximum number of decompressed bytes requested per read
        '''
        self.fh = open(file, 'rb')
        self.chunk_size = chunk_size
        self.dctx = zstd.ZstdDecompressor()
        self.reader = self.dctx.stream_reader(self.fh)
        # Text of the current, not yet terminated, line
        self.buffer = ''

    def __enter__(self):
        '''Return the reader itself so it can be used in a `with` statement.'''
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        '''Close the underlying file when leaving the `with` block.'''
        self.close()

    def close(self):
        '''Close the underlying file handle.'''
        self.fh.close()

    def readlines(self):
        '''Generator method that yields each line of the file, without its newline.

        Bytes that are not valid UTF-8 are dropped. A final line that is not
        terminated by a newline is yielded as well.
        '''
        # An incremental decoder keeps the trailing bytes of a multi-byte
        # character that is cut by a chunk boundary until the next chunk
        # arrives; decoding every chunk independently would drop that character.
        decoder = codecs.getincrementaldecoder("utf-8")(errors="ignore")

        while True:
            raw = self.reader.read(self.chunk_size)
            # End of stream is detected on the raw bytes: a chunk may decode to
            # an empty string (partial character) without the stream being over.
            if not raw:
                break
            lines = (self.buffer + decoder.decode(raw)).split("\n")

            for line in lines[:-1]:
                yield line

            # The last element is an incomplete line (or '' after a newline)
            self.buffer = lines[-1]

        # Flush what is left once the stream is exhausted
        tail = self.buffer + decoder.decode(b"", final=True)
        self.buffer = ''
        if tail:
            yield tail


Calculate the number of comments for each channel. These results are stored in a dictionary:
index_channel -> number_of_comments_for_index_channel

In [10]:
def add_edge(chan_id):
    '''
    Function to count one more comment for a channel
        PARAMETERS:
            - chan_id: id of the channel that received the comment; it must be
              a key of the module-level channel_dict

    The channel id is translated to its integer index with channel_dict and the
    counter stored under that index in the module-level graph_dict is
    incremented (starting from 0 the first time the index is seen).
    Raises KeyError when chan_id is not a key of channel_dict.
    '''
    chan_index = channel_dict[chan_id]
    graph_dict[chan_index] = graph_dict.get(chan_index, 0) + 1
        
# Adjust chunk_size as necessary -- defaults to 16,384 if not specified
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=16384)

# parameters
graph_dict = {}   # channel index -> number of comments counted so far
idx = 1           # 1-based number of the line being processed
begin_time = time.time()

# Read each line from the reader
for line in reader.readlines():
    # NOTE(review): the line is split on every comma after the quotes are
    # stripped, so a record with a comma inside one of its fields does not have
    # 9 parts and is skipped -- confirm that this undercount is acceptable.
    line_split = line.replace('"', '').split(',')
    if len(line_split) == 9:
        if idx == 1:
            # First line is the header: show the column names
            print(line_split)

        else:
            # Field 2 is the video_id; .get() returns None for unknown videos
            corr_channel = vid_to_channels.get(line_split[2])
            # channel_dict has exactly the ids of channels_id as keys, so this
            # is the same membership test with a constant-time dict lookup
            if corr_channel in channel_dict:
                add_edge(corr_channel)

    idx += 1
    if idx % 100000000 == 0:
        # Progress report: time spent on the last 100 million lines
        print('line number: ' + str(idx) + ' time: ' + str(time.time() - begin_time))
        begin_time = time.time()
    if idx % 1000000000 == 0:
        # Checkpoint of the partial counts; written and closed right here so it
        # is usable even if the run is interrupted later
        with open('../../../dlabdata1/youtube_large/jouven/channels_usage_'+str(idx)+'.pkl', 'wb') as output:
            pickle.dump(graph_dict, output)

# Final counts for the whole file
with open('../../../dlabdata1/youtube_large/jouven/channels_usage.pkl', 'wb') as output:
    pickle.dump(graph_dict, output)

['author_id', 'id', 'video_id', 'parent_id', 'crawled_at', 'likes', 'replies', 'author', 'content']
line number: 100000000 time: 402.5513184070587
line number: 200000000 time: 402.1791410446167
line number: 300000000 time: 429.0353271961212
line number: 400000000 time: 431.31581449508667
line number: 500000000 time: 426.8570764064789
line number: 600000000 time: 427.2702944278717
line number: 700000000 time: 416.2495291233063
line number: 800000000 time: 412.0062770843506
line number: 900000000 time: 436.35715770721436
line number: 1000000000 time: 420.625691652298
line number: 1100000000 time: 422.53399634361267
line number: 1200000000 time: 417.3296959400177
line number: 1300000000 time: 414.458532333374
line number: 1400000000 time: 424.5543465614319
line number: 1500000000 time: 428.0177056789398
line number: 1600000000 time: 424.02525520324707
line number: 1700000000 time: 487.32174491882324
line number: 1800000000 time: 536.2319900989532
line number: 1900000000 time: 540.82876753

Reduce the size of the sparse matrix in order to be able to create a networkx graph

In [30]:
# Load the dictionary channel index -> number of comments computed above.
# `with` guarantees the file is closed even if unpickling fails.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted data.
with open('../../../dlabdata1/youtube_large/jouven/channels_usage.pkl', 'rb') as pkl_file:
    channel_wanted = pickle.load(pkl_file)

In [31]:
# Order the (key, value) pairs of channel_wanted by decreasing value
channel_sorted = sorted(
    channel_wanted.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [32]:
# Keep only the first element of every pair, preserving the sorted order
channels = [pair[0] for pair in channel_sorted]

In [33]:
# Number of channels to keep, i.e. the number of rows/columns of the reduced matrix
length_matrix = 1000
# channels is ordered by decreasing count, so the slice keeps the first length_matrix of them
channels = channels[:length_matrix]

In [34]:
# Load the full sparse matrix stored in .npz format
sparse_matrix = scipy.sparse.load_npz('../../../dlabdata1/youtube_large/jouven/sparse_matrix_graph.npz')

In [35]:
# Indices of the selected channels, used to index the sparse matrix
cols_to_keep = np.array(channels)
# Keep only the columns of the selected channels (every row is still present)
m = sparse_matrix[:, cols_to_keep]

In [36]:
# Keep the rows of the same channels, in the same order as the columns kept above
m_r = m[cols_to_keep, :]

In [37]:
# Build a directed graph from the reduced matrix.
# NOTE(review): from_scipy_sparse_matrix was removed in networkx 3.0 in favour of
# from_scipy_sparse_array -- confirm the installed networkx version before re-running.
graph_most_used_channels = nx.from_scipy_sparse_matrix(m_r, create_using = nx.DiGraph())

In [21]:
# Save the final sparse matrix.
# m_r must already be defined by the row/column selection cells; run them first.
scipy.sparse.save_npz('../../../dlabdata1/youtube_large/jouven/sparse_matrix_4_most_channels_used.npz', m_r)

NameError: name 'm_r' is not defined