In [1]:
import numpy as np
import pickle
import random
import scipy.sparse
import sys
import time

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

In [2]:
conf = SparkConf().setMaster("local[12]").setAll([
                                   ('spark.executor.memory', '16g'),  # find
                                   ('spark.driver.memory','8g'), # your
                                   ('spark.driver.maxResultSize', '4G') # setup
                                  ])
# create the session
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# create the context
sc = spark.sparkContext

In [3]:
S = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/matrices/S_final2.npz')

In [4]:
data = []

In [5]:
def get_dict_for_row(row):
    ''''''
    tmp_dict = {}
    for key, value in row:
        tmp_dict[key[1]] = value
        
    return Vectors.sparse(len(tmp_dict), tmp_dict)

In [6]:
for i in range(S.shape[0]):
    
    if i % 10000000 == 0:
        print(str(i) + ' videos processed')
    if i == 500000:
        break
    
    # Data is a list of list of the following elems : index of doc and a bag-of-word sparse Vector
    data.append([i, get_dict_for_row(S.getrow(i).todok().items())])
    

0 videos processed


In [7]:
print('Length of array : ' + str(len(data)) + ' and first row : ' + str(data[258937]))

Length of array : 500000 and first row : [258937, SparseVector(24, {20451: 1.0, 49360: 1.0, 62889: 1.0, 90939: 1.0, 142981: 3.0, 165021: 1.0, 169010: 1.0, 190462: 1.0, 212862: 1.0, 247334: 2.0, 309667: 2.0, 374074: 1.0, 384153: 3.0, 428500: 1.0, 445034: 2.0, 447282: 1.0, 453390: 1.0, 470491: 1.0, 493674: 2.0, 507615: 1.0, 509141: 1.0, 629388: 1.0, 633252: 1.0, 649840: 2.0})]


In [232]:
np.savez_compressed('/dlabdata1/youtube_large/olam/list_data_lda_spark', data)

In [8]:
corpus = sc.parallelize(data)

In [9]:
model = LDA.train(corpus, k=20, seed=1)

In [15]:
model.save(sc, '/dlabdata1/youtube_large/olam/lda_sparkModel_500000vid')

In [10]:
model.describeTopics()[:5]

[([618890, 655263, 650755, 39325, 60788],
  [0.01591342080221603,
   0.01062900727127717,
   0.008840857290014758,
   0.007208700311898657,
   0.00694410222612207]),
 ([655263, 494551, 423399, 606524, 582861],
  [0.011805558715055611,
   0.010003007665561966,
   0.009368654497465557,
   0.008724152988832003,
   0.008045816303057063]),
 ([60953, 50414, 655263, 585888, 118607],
  [0.05121343464147456,
   0.014160465507474409,
   0.01202342934848488,
   0.011352127772678682,
   0.01000544035185211]),
 ([587583, 156037, 237158, 558227, 655263],
  [0.02260586094214811,
   0.009552352188027398,
   0.009114894433390077,
   0.008541334351027982,
   0.00792920881818704]),
 ([655263, 347101, 60788, 91913, 479998],
  [0.011984542545684037,
   0.006776620716003611,
   0.006500006164804386,
   0.005295908138002675,
   0.005032335627837785])]

In [222]:
same_model = LDAModel.load(sc, '/dlabdata1/youtube_large/olam/lda_sparkModel')

In [227]:
same_model.describeTopics()[0]

([214313, 389685, 702165, 671629, 420134, 544917],
 [0.266132635069856,
  0.04908022772804397,
  0.04380826578831293,
  0.04196875751770345,
  0.031857012470995164,
  0.015664738377080007])

## Understand results from LDA

In [11]:
# Load dictionnary of tokens
with open('/dlabdata1/youtube_large/olam/id2word_2.pickle', 'rb') as f:
    id2word = pickle.load(f)
f.close()

In [12]:
len(model.describeTopics())

20

In [13]:
model.describeTopics()[0][0]

[618890, 655263, 650755, 39325, 60788]

In [14]:
for i in range(len(model.describeTopics())):
    topic_words = model.describeTopics()[i][0]
    print('Topic ' + str(i))
    for i, id_token in enumerate(topic_words):
        print('   ' + id2word[id_token])
    print(' ')

Topic 0
   prime
   game
   surf
   roblox
   news
 
Topic 1
   game
   onlin
   final
   gta
   part
 
Topic 2
   minecraft
   mod
   game
   hack
   pokemon
 
Topic 3
   hair
   hairstyl
   madden
   tutori
   game
 
Topic 4
   game
   video
   news
   gopro
   laptop
 
Topic 5
   video
   game
   song
   khan
   music
 
Topic 6
   game
   song
   video
   final
   full
 
Topic 7
   dota
   draft
   abil
   pro
   game
 
Topic 8
   horizon
   forza
   game
   nba
   video
 
Topic 9
   war
   bindass
   game
   fail
   episod
 
Topic 10
   scooter
   game
   jamaica
   motorcycl
   uaap
 
Topic 11
   game
   video
   play
   music
   part
 
Topic 12
   muscl
   gain
   build
   workout
   fit
 
Topic 13
   watch
   face
   gear
   fit
   best
 
Topic 14
   lol
   video
   game
   surpris
   seri
 
Topic 15
   watch
   pokemon
   face
   game
   gear
 
Topic 16
   song
   tamil
   bhojpuri
   hit
   politician
 
Topic 17
   final
   fantasi
   hero
   game
   fortnit
 
Topic 18
   pvl


## Issue : need to remove tokens of length 1/2 and tokens with another alphabet

Hence, save the S_final2 sparse matrix and get the id2word2 dict that map the token_id to the token in the new matrix

In [129]:
id_token_to_remove = []
token_to_keep = []

In [130]:
def englishAlpha(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return True
    else:
        return False

In [131]:
for token_id, token in id2word.items():
    if len(token) < 3 or nonEnglishAlpha(token) or token.isnumeric():
        id_token_to_remove.append(token_id)
    else:
        token_to_keep.append(token)

In [132]:
len(id_token_to_remove)

81014

In [133]:
len(token_to_keep)

663127

In [134]:
len(token_to_keep) + len(id_token_to_remove) == len(id2word)

True

In [135]:
S

<68638982x744141 sparse matrix of type '<class 'numpy.uint8'>'
	with 1393937498 stored elements in Compressed Sparse Row format>

In [136]:
id_token_to_keep = (np.delete(np.arange(len(id2word)), id_token_to_remove))

In [137]:
S = S[:, id_token_to_keep]

In [138]:
S

<68638982x663127 sparse matrix of type '<class 'numpy.uint8'>'
	with 1251231562 stored elements in Compressed Sparse Row format>

In [139]:
token_to_keep = set(token_to_keep)

In [142]:
id2word_new = {}

k = 0
for i, token in enumerate(id2word.values()):
    if token in token_to_keep:
        id2word_new[k] = token
        k += 1

In [145]:
scipy.sparse.save_npz('/dlabdata1/youtube_large/olam/matrices/S_final2.npz', S)

In [146]:
with open('/dlabdata1/youtube_large/olam/id2word_2.pickle', 'wb') as f:
    pickle.dump(id2word_new, f)
f.close()