In [1]:
import collections
import fasttext
import json
import nltk
import pickle
import scipy.sparse
import sys
import time

import numpy as np
import pandas as pd
import zstandard as zstd

from collections import Counter
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from scipy.sparse import dok_matrix

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/olam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
s_stemmer = SnowballStemmer(language='english')

In [3]:
class Zreader:

    def __init__(self, file, chunk_size=16384):
        '''Init method'''
        self.fh = open(file,'rb')
        self.chunk_size = chunk_size
        self.dctx = zstd.ZstdDecompressor()
        self.reader = self.dctx.stream_reader(self.fh)
        self.buffer = ''


    def readlines(self):
        '''Generator method that creates an iterator for each line of JSON'''
        while True:
            chunk = self.reader.read(self.chunk_size).decode(errors="ignore")
            if not chunk:
                break
            lines = (self.buffer + chunk).split("\n")

            for line in lines[:-1]:
                yield line

            self.buffer = lines[-1]

In [4]:
class LanguageIdentification:

    def __init__(self):
        pretrained_lang_model = "/home/olam/fasttext/lid.176.bin"
        self.model = fasttext.load_model(pretrained_lang_model)

    def predict_lang(self, text):
        predictions = self.model.predict(text, k=1) 
        return predictions

In [5]:
LANGUAGE = LanguageIdentification()



In [6]:
def detect_language(text):
    '''Method that detect the language of the argument using langdetect'''
    
    
    # Create list to store the language detections
    #detections = []
    
    
    # TextBlob
    #start_detect = time.time()
    #blob = TextBlob(text)
    #textblob_res = blob.detect_language()
    #end_detect = time.time()
    #duration_detect = end_detect - start_detect
    #print('TextBlob : ' + str(duration_detect))
    
    # Langedetect
    #start_detect = time.time()
    #for i in range(5):
    #    detections.append(detect(text))
    #end_detect = time.time()
    #duration_detect = end_detect - start_detect
    #print('Langedetect : ' + str(duration_detect))
    
    # LangID
    #start_detect = time.time()
    #langid_res = langid.classify(text)[0]
    #end_detect = time.time()
    #duration_detect = end_detect - start_detect
    #print('LangId : ' + str(duration_detect))
    
    # FastText
    #start_detect = time.time()
    fastext_res = LANGUAGE.predict_lang(text)[0][0].replace('__label__', '')
    #end_detect = time.time()
    #duration_detect = end_detect - start_detect
    #print('Fastext : ' + str(duration_detect))
    
    # Create the counter to get the most detected language
    #c = Counter(detections)
    #language_detected, _ = c.most_common()[0]

    
    #print('LangId result : ' + langid_res)
    #print('FastText result : ' + fastext_res)
    
    return fastext_res

In [7]:
def check_en_video(vid_title, vid_description):
    '''Returns True if the language of the video is english'''
    
    detected_language = ''
    
    # First check for description and if there is none, chech for the title.
    # Handling exceptions when a text couldn't be used of langdetect
    if vid_description != '':
        try:
            detected_language = detect_language(vid_description)
        except:
            if vid_title != '':
                try:
                    detected_language = detect_language(vid_title)
                except:
                    detected_language = None
    elif vid_title != '':
        try:
            detected_language = detect_language(vid_title)
        except:
            detected_language = None
    else:
        detected_language = None
        
    return detected_language == 'en'

In [8]:
def get_freq_tokens_per_video(video):
    ''''''
    
    title_tokens = [w for w in tokenizer.tokenize(video['title'].lower()) if not w in stop_words]
    tag_tokens = [w for w in tokenizer.tokenize(video['tags'].lower()) if not w in stop_words]
    
    # We want to keep duplicates !!
    tokens_per_video = title_tokens + tag_tokens

    # Stemming
    stemmed_tokens_per_video = ([s_stemmer.stem(w) for w in tokens_per_video])
    
    # Return a Counter object of the tokens
    return collections.Counter(stemmed_tokens_per_video)

In [9]:
def fill_underlying_dict(freq_tokens_per_video, dict_stemmed_tokens, dict_freq_tokens_for_sparse_matrix, i_vid):
    '''Method to fill the underlying dictionnary in order to 
    update the sparse matrix incrementally by videos'''
    
    for key in freq_tokens_per_video.keys():
        
        # Column index in the sparse matrix (one column for each token)
        j_token = dict_stemmed_tokens[key]
    
        # Filling the underlying dict
        dict_freq_tokens_for_sparse_matrix[(i_vid % 1000000, j_token)] = freq_tokens_per_video[key]
    

In [10]:
### GET THE LIST OF CHANNELS IN channelcrawler.csv

df_channelcrawler = pd.read_csv('/dlabdata1/youtube_large/channelcrawler.csv')

df_channelcrawler['channel_id'] = df_channelcrawler['link'].apply(lambda x: x.replace('http://www.youtube.com/channel/', ''))

# Store in a set since it will be faster to check if a channel is in channelcrawler
set_channelcrawler = set(df_channelcrawler['channel_id'])

### A) First pass to build dict of tokens

In [11]:
reader = Zreader("/dlabdata1/youtube_large/yt_metadata_all.jsonl.zst", chunk_size=2**28)

In [12]:
idx = 0

# Variable that contains the idx of every non english vid and that
# belongs to a channel in channelcrawler.csv TO BE USED IN SECOND ITER
idx_vid_to_consider = set()

# Variable first instanciated as set to check existing tokens efficiently, 
# which will be a list in order to get the index for each tokens
list_stemmed_tokens = set()

for line in reader.readlines():
    #start_iter = time.time()
    idx += 1
    
    if idx % 1000000 == 0:
        print('Processed ' + str(idx) + ' videos.')
        
    # line is a str dict, video is the dict corresponding to the str dict
    video = json.loads(line)
    
    
    if video['channel_id'] in set_channelcrawler and \
    check_en_video(video['title'], video['description']):
        
        # Keep idx of video in memory
        idx_vid_to_consider.add(idx)
        
        # Get the stemmed token of the video
        tokens_per_video = get_freq_tokens_per_video(video).keys()
        
        # Update list_stemmed_tokens
        list_stemmed_tokens.update(tokens_per_video)
    #end_iter = time.time()
    #duration_iter = end_iter - start_iter
    #print('Time for 1 iteration : ' + str(duration_iter))

Processed 1000000 videos.
Processed 2000000 videos.
Processed 3000000 videos.
Processed 4000000 videos.
Processed 5000000 videos.
Processed 6000000 videos.
Processed 7000000 videos.
Processed 8000000 videos.
Processed 9000000 videos.
Processed 10000000 videos.
Processed 11000000 videos.
Processed 12000000 videos.
Processed 13000000 videos.
Processed 14000000 videos.
Processed 15000000 videos.
Processed 16000000 videos.
Processed 17000000 videos.
Processed 18000000 videos.
Processed 19000000 videos.
Processed 20000000 videos.
Processed 21000000 videos.
Processed 22000000 videos.
Processed 23000000 videos.
Processed 24000000 videos.
Processed 25000000 videos.
Processed 26000000 videos.
Processed 27000000 videos.
Processed 28000000 videos.
Processed 29000000 videos.
Processed 30000000 videos.
Processed 31000000 videos.
Processed 32000000 videos.
Processed 33000000 videos.
Processed 34000000 videos.
Processed 35000000 videos.
Processed 36000000 videos.
Processed 37000000 videos.
Processed 

In [13]:
list_stemmed_tokens = list(list_stemmed_tokens)

In [14]:
len(list_stemmed_tokens)

7987546

In [15]:
len(idx_vid_to_consider)

68642144

In [None]:
with open('/home/olam/idx_vid_to_consider.pickle', 'wb') as f:
    pickle.dump(idx_vid_to_consider, f)
f.close()

In [None]:
with open('/home/olam/list_stemmed_tokens.pickle', 'wb') as f:
    pickle.dump(list_stemmed_tokens, f)
f.close()

### B) Second pass to build the sparse matrix 

In [11]:
# Load set of videos to consider
with open('/home/olam/idx_vid_to_consider.pickle', 'rb') as f:
    idx_vid_to_consider = pickle.load(f)
f.close()

In [12]:
# Load dictionnary of tokens
with open('/home/olam/list_stemmed_tokens.pickle', 'rb') as f:
    list_stemmed_tokens = pickle.load(f)
f.close()

In [13]:
print('Number of videos to consider : ' + str(len(idx_vid_to_consider)))

Number of videos to consider : 68642144


In [14]:
print('Number of tokens : ' + str(len(list_stemmed_tokens)))

Number of tokens : 7987546


In [15]:
# Get dimension of sparse matrix
size_of_tokens_dict = len(list_stemmed_tokens)
number_of_vid = len(idx_vid_to_consider)

In [16]:
# Create dictionnary of tokens with their indice
dict_stemmed_tokens = {}

# Fill dictionnary of tokens
for i, token in enumerate(list_stemmed_tokens):
    dict_stemmed_tokens[token] = i

---

In [77]:
import random

In [78]:
nrows = 1000
ncolumns = 100000

In [91]:
S_1 = dok_matrix((nrows, ncolumns))

In [92]:
S_2 = dok_matrix((nrows, ncolumns))

In [93]:
S_1i = dok_matrix((nrows, ncolumns), dtype=np.uint8)

In [94]:
S_2i = dok_matrix((nrows, ncolumns), dtype=np.uint8)

In [95]:
for i in range(5000):
    x = random.randint(0, 999)
    y = random.randint(0, 99999)
    
    v = random.randint(0, 20)
    
    S_1[x, y] = v
    S_1i[x, y] = v
    
for i in range(10000):
    x = random.randint(0, 999)
    y = random.randint(0, 99999)
    
    v = random.randint(0, 20)
    
    S_2[x, y] = v
    S_2i[x, y] = v

In [96]:
S_1

<1000x100000 sparse matrix of type '<class 'numpy.float64'>'
	with 4751 stored elements in Dictionary Of Keys format>

In [97]:
S_1i

<1000x100000 sparse matrix of type '<class 'numpy.uint8'>'
	with 4751 stored elements in Dictionary Of Keys format>

In [98]:
S_2

<1000x100000 sparse matrix of type '<class 'numpy.float64'>'
	with 9494 stored elements in Dictionary Of Keys format>

In [99]:
S_2i

<1000x100000 sparse matrix of type '<class 'numpy.uint8'>'
	with 9494 stored elements in Dictionary Of Keys format>

In [100]:
sys.getsizeof(S_1)

147584

In [101]:
sys.getsizeof(S_1i)

147584

In [102]:
sys.getsizeof(S_2)

295032

In [103]:
sys.getsizeof(S_2i)

295032

In [115]:
S_1_csr = S_1.tocsr()
S_1i_csr = S_1i.tocsr()

In [116]:
S_1_csr

<1000x100000 sparse matrix of type '<class 'numpy.float64'>'
	with 4751 stored elements in Compressed Sparse Row format>

In [117]:
S_1i_csr

<1000x100000 sparse matrix of type '<class 'numpy.uint8'>'
	with 4751 stored elements in Compressed Sparse Row format>

In [118]:
sys.getsizeof(S_1_csr)

64

In [119]:
sys.getsizeof(S_1i_csr)

64

In [126]:
S_test = dok_matrix((20,10))

In [127]:
S1 = dok_matrix((10,10))

In [128]:
S2 = dok_matrix((10,10))

In [129]:
for i in range(20):
    for j in range(10):
        
        value = random.randint(0, 20)
        
        S_test[i, j] = value
        
        if i < 10:
            S1[i, j] = value
        else:
            S2[i - 10, j] = value
        

In [131]:
S_test.todense()

matrix([[ 5.,  7.,  6., 15., 14.,  5., 16.,  9., 20.,  1.],
        [ 7.,  8.,  3.,  3.,  3., 17., 14., 20., 20.,  7.],
        [ 8.,  5., 15.,  3., 10., 13.,  9., 14., 17., 12.],
        [ 7., 16.,  7., 18.,  8.,  3.,  8., 20.,  9., 17.],
        [ 5.,  0., 12.,  9., 15., 10.,  3., 10., 17.,  5.],
        [19.,  7.,  3.,  0., 15., 13.,  1.,  9.,  5.,  7.],
        [ 3., 12.,  5.,  6.,  9.,  4.,  5.,  6.,  2., 18.],
        [16.,  0.,  9.,  4., 15.,  4., 10., 12.,  1.,  9.],
        [14., 14., 18.,  0., 15.,  4.,  8.,  9., 18.,  0.],
        [ 8.,  5., 11.,  5., 14., 11., 14., 13., 20., 11.],
        [14., 10., 19.,  9., 17., 17., 11., 14., 10.,  8.],
        [ 9.,  9.,  3., 20., 17.,  6., 15., 17., 16., 11.],
        [ 6., 17., 19.,  2.,  8., 17.,  5.,  8.,  8.,  3.],
        [ 6.,  1.,  8., 11.,  7., 15., 10.,  4.,  3., 14.],
        [17., 18., 17.,  5., 19.,  8., 11.,  4., 11.,  8.],
        [17.,  6., 14., 11.,  8., 18., 11.,  6.,  0.,  7.],
        [13., 18., 20., 10., 17.,  7.,  

In [132]:
S1.todense()

matrix([[ 5.,  7.,  6., 15., 14.,  5., 16.,  9., 20.,  1.],
        [ 7.,  8.,  3.,  3.,  3., 17., 14., 20., 20.,  7.],
        [ 8.,  5., 15.,  3., 10., 13.,  9., 14., 17., 12.],
        [ 7., 16.,  7., 18.,  8.,  3.,  8., 20.,  9., 17.],
        [ 5.,  0., 12.,  9., 15., 10.,  3., 10., 17.,  5.],
        [19.,  7.,  3.,  0., 15., 13.,  1.,  9.,  5.,  7.],
        [ 3., 12.,  5.,  6.,  9.,  4.,  5.,  6.,  2., 18.],
        [16.,  0.,  9.,  4., 15.,  4., 10., 12.,  1.,  9.],
        [14., 14., 18.,  0., 15.,  4.,  8.,  9., 18.,  0.],
        [ 8.,  5., 11.,  5., 14., 11., 14., 13., 20., 11.]])

In [133]:
S2.todense()

matrix([[14., 10., 19.,  9., 17., 17., 11., 14., 10.,  8.],
        [ 9.,  9.,  3., 20., 17.,  6., 15., 17., 16., 11.],
        [ 6., 17., 19.,  2.,  8., 17.,  5.,  8.,  8.,  3.],
        [ 6.,  1.,  8., 11.,  7., 15., 10.,  4.,  3., 14.],
        [17., 18., 17.,  5., 19.,  8., 11.,  4., 11.,  8.],
        [17.,  6., 14., 11.,  8., 18., 11.,  6.,  0.,  7.],
        [13., 18., 20., 10., 17.,  7.,  3.,  3.,  6., 10.],
        [19.,  6.,  2., 18., 14., 16., 15., 14., 20., 12.],
        [ 4., 18.,  9.,  7.,  0., 14.,  7., 11.,  5., 19.],
        [ 8.,  6., 15.,  4., 17., 16., 19., 20.,  8.,  2.]])

In [134]:
S_testcsr = S_test.tocsr()

In [138]:
sys.getsizeof(S_test)

9344

In [137]:
sys.getsizeof(S_testcsr)

64

In [136]:
S_testcsr.todense()

matrix([[ 5.,  7.,  6., 15., 14.,  5., 16.,  9., 20.,  1.],
        [ 7.,  8.,  3.,  3.,  3., 17., 14., 20., 20.,  7.],
        [ 8.,  5., 15.,  3., 10., 13.,  9., 14., 17., 12.],
        [ 7., 16.,  7., 18.,  8.,  3.,  8., 20.,  9., 17.],
        [ 5.,  0., 12.,  9., 15., 10.,  3., 10., 17.,  5.],
        [19.,  7.,  3.,  0., 15., 13.,  1.,  9.,  5.,  7.],
        [ 3., 12.,  5.,  6.,  9.,  4.,  5.,  6.,  2., 18.],
        [16.,  0.,  9.,  4., 15.,  4., 10., 12.,  1.,  9.],
        [14., 14., 18.,  0., 15.,  4.,  8.,  9., 18.,  0.],
        [ 8.,  5., 11.,  5., 14., 11., 14., 13., 20., 11.],
        [14., 10., 19.,  9., 17., 17., 11., 14., 10.,  8.],
        [ 9.,  9.,  3., 20., 17.,  6., 15., 17., 16., 11.],
        [ 6., 17., 19.,  2.,  8., 17.,  5.,  8.,  8.,  3.],
        [ 6.,  1.,  8., 11.,  7., 15., 10.,  4.,  3., 14.],
        [17., 18., 17.,  5., 19.,  8., 11.,  4., 11.,  8.],
        [17.,  6., 14., 11.,  8., 18., 11.,  6.,  0.,  7.],
        [13., 18., 20., 10., 17.,  7.,  

In [144]:
S_stacked = scipy.sparse.vstack([S1, S2])
S_stacked

<20x10 sparse matrix of type '<class 'numpy.float64'>'
	with 193 stored elements in COOrdinate format>

In [145]:
S_stacked.todense()

matrix([[ 5.,  7.,  6., 15., 14.,  5., 16.,  9., 20.,  1.],
        [ 7.,  8.,  3.,  3.,  3., 17., 14., 20., 20.,  7.],
        [ 8.,  5., 15.,  3., 10., 13.,  9., 14., 17., 12.],
        [ 7., 16.,  7., 18.,  8.,  3.,  8., 20.,  9., 17.],
        [ 5.,  0., 12.,  9., 15., 10.,  3., 10., 17.,  5.],
        [19.,  7.,  3.,  0., 15., 13.,  1.,  9.,  5.,  7.],
        [ 3., 12.,  5.,  6.,  9.,  4.,  5.,  6.,  2., 18.],
        [16.,  0.,  9.,  4., 15.,  4., 10., 12.,  1.,  9.],
        [14., 14., 18.,  0., 15.,  4.,  8.,  9., 18.,  0.],
        [ 8.,  5., 11.,  5., 14., 11., 14., 13., 20., 11.],
        [14., 10., 19.,  9., 17., 17., 11., 14., 10.,  8.],
        [ 9.,  9.,  3., 20., 17.,  6., 15., 17., 16., 11.],
        [ 6., 17., 19.,  2.,  8., 17.,  5.,  8.,  8.,  3.],
        [ 6.,  1.,  8., 11.,  7., 15., 10.,  4.,  3., 14.],
        [17., 18., 17.,  5., 19.,  8., 11.,  4., 11.,  8.],
        [17.,  6., 14., 11.,  8., 18., 11.,  6.,  0.,  7.],
        [13., 18., 20., 10., 17.,  7.,  

In [146]:
scipy.sparse.save_npz('/home/olam/csr_matrices/test.npz', S_stacked)

In [147]:
S_loaded = scipy.sparse.load_npz('/home/olam/csr_matrices/test.npz')

In [150]:
sys.getsizeof(S_loaded)

64

---

In [17]:
# Create sparse matrix
S = dok_matrix((1000000, size_of_tokens_dict), dtype=np.uint8)

In [18]:
reader = Zreader("/dlabdata1/youtube_large/yt_metadata_all.jsonl.zst", chunk_size=2**28)

In [19]:
idx = 0

# Row index in the sparse matrix (one row for each video)
i_vid = 0


for line in reader.readlines():
    ###start_iter = time.time()
    idx += 1
    
    if idx % 1000000 == 0:
        print('Size of matrix dok: ' + str(sys.getsizeof(S)))
        print('Shape of S : ' + str(S.get_shape()) + ' and number of elems : ' + str(S.getnnz()))
        S = S.tocsr()
        print('Size of matrix csr: ' + str(sys.getsizeof(S)))
        file_name = 'S' + str(int(idx / 1000000))
        scipy.sparse.save_npz('/home/olam/csr_matrices/' + file_name + '.npz', S)
        S = dok_matrix((1000000, size_of_tokens_dict), dtype=np.uint8)
        print('Shape of S : ' + str(S.get_shape()) + ' and number of elems : ' + str(S.getnnz()))
        print('Processed ' + str(idx) + ' videos.')
        print('')
        
        
    # line is a str dict, video is the dict corresponding to the str dict
    video = json.loads(line)
    
    if idx in idx_vid_to_consider:
        
        # For each video, create a underlying dictionnary for filling the sparse matrix efficiently
        dict_freq_tokens_for_sparse_matrix = {}
    
        # Get the tokens for each video and theirs number of occurences
        ###start_freq = time.time()
        freq_tokens_per_video = get_freq_tokens_per_video(video)
        ###print('Time for getting tokens of video : ' + str(time.time() - start_freq))

        # Fill the underlying dict
        fill_underlying_dict(freq_tokens_per_video, dict_stemmed_tokens, dict_freq_tokens_for_sparse_matrix, i_vid)

        # Update the Sparse Matrix
        ###start_update = time.time()
        dict.update(S, dict_freq_tokens_for_sparse_matrix)
        ###print('Time for updating sparse_matrix : ' + str(time.time() - start_update))

        # Increment Row index for next video
        i_vid += 1
        
    ###print('Time for 1 iter : ' + str(time.time() - start_iter))
    
# Save last sparse matrix
S = S.tocsr()
scipy.sparse.save_npz('/home/olam/csr_matrices/S_last.npz', S)

Size of matrix dok: 671088768
Shape of S : (1000000, 7987546) and number of elems : 16182370
Size of matrix csr: 64
Shape of S : (1000000, 7987546) and number of elems : 0
Processed 1000000 videos.

Size of matrix dok: 671088768
Shape of S : (1000000, 7987546) and number of elems : 17469788
Size of matrix csr: 64
Shape of S : (1000000, 7987546) and number of elems : 0
Processed 2000000 videos.

Size of matrix dok: 671088768
Shape of S : (1000000, 7987546) and number of elems : 17077835
Size of matrix csr: 64
Shape of S : (1000000, 7987546) and number of elems : 0
Processed 3000000 videos.

Size of matrix dok: 671088768
Shape of S : (1000000, 7987546) and number of elems : 15748510
Size of matrix csr: 64
Shape of S : (1000000, 7987546) and number of elems : 0
Processed 4000000 videos.

Size of matrix dok: 671088768
Shape of S : (1000000, 7987546) and number of elems : 16932781
Size of matrix csr: 64
Shape of S : (1000000, 7987546) and number of elems : 0
Processed 5000000 videos.

Size 

Size of matrix csr: 64
Shape of S : (1000000, 7987546) and number of elems : 0
Processed 42000000 videos.

Size of matrix dok: 671088768
Shape of S : (1000000, 7987546) and number of elems : 16596418
Size of matrix csr: 64
Shape of S : (1000000, 7987546) and number of elems : 0
Processed 43000000 videos.

Size of matrix dok: 671088768
Shape of S : (1000000, 7987546) and number of elems : 17153841
Size of matrix csr: 64
Shape of S : (1000000, 7987546) and number of elems : 0
Processed 44000000 videos.

Size of matrix dok: 671088768
Shape of S : (1000000, 7987546) and number of elems : 16603929
Size of matrix csr: 64
Shape of S : (1000000, 7987546) and number of elems : 0
Processed 45000000 videos.

Size of matrix dok: 671088768
Shape of S : (1000000, 7987546) and number of elems : 15541993
Size of matrix csr: 64
Shape of S : (1000000, 7987546) and number of elems : 0
Processed 46000000 videos.

Size of matrix dok: 671088768
Shape of S : (1000000, 7987546) and number of elems : 16623830


Size of matrix csr: 64
Shape of S : (1000000, 7987546) and number of elems : 0
Processed 83000000 videos.

Size of matrix dok: 671088768
Shape of S : (1000000, 7987546) and number of elems : 16362815
Size of matrix csr: 64
Shape of S : (1000000, 7987546) and number of elems : 0
Processed 84000000 videos.

Size of matrix dok: 671088768
Shape of S : (1000000, 7987546) and number of elems : 17002625
Size of matrix csr: 64
Shape of S : (1000000, 7987546) and number of elems : 0
Processed 85000000 videos.



NotImplementedError: Save is not implemented for sparse matrix of format dok.

In [24]:
S = scipy.sparse.load_npz('/home/olam/csr_matrices/S1.npz')

In [25]:
for i in range(2, 86):
    S_next = scipy.sparse.load_npz('/home/olam/csr_matrices/S' + str(i) + '.npz')
    S = scipy.sparse.vstack([S, S_next])

# Add last matrix
S_last = scipy.sparse.load_npz('/home/olam/csr_matrices/S_last.npz')
S = scipy.sparse.vstack([S, S_last])

In [26]:
S

<86000000x7987546 sparse matrix of type '<class 'numpy.uint8'>'
	with 1412203435 stored elements in Compressed Sparse Row format>

In [31]:
S_full = scipy.sparse.load_npz('/home/olam/csr_matrices/S_full.npz')

In [32]:
S_full

<86000000x7987546 sparse matrix of type '<class 'numpy.uint8'>'
	with 1412203435 stored elements in Compressed Sparse Row format>