In [102]:
import numpy as np
import pickle
import pyLDAvis
import random
import scipy.sparse
import sys
import time

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.clustering import LDA, LDAModel, LocalLDAModel
from pyspark.ml.linalg import Vectors, SparseVector

In [3]:
# Spark configuration: local master "local[12]" plus memory limits, one key at a time.
conf = SparkConf()
conf.setMaster("local[12]")
conf.set('spark.executor.memory', '16g')
conf.set('spark.driver.memory', '8g')
conf.set('spark.driver.maxResultSize', '4G')

# Session built from the configuration above (getOrCreate reuses a running one)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Context object that belongs to the session
sc = spark.sparkContext

### Process train_data and test_data

We tune the number of topics by minimizing the perplexity on a subsample of videos drawn at random from all the videos.

In [4]:
# Load the sparse matrix stored on disk as S_final2.npz.
S = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/matrices/S_final2.npz')

In [5]:
# We want to tune the optimal number of topics => find it on a random subset of videos.
# Draws 100000 distinct row positions of S (sampling without replacement).
# NOTE(review): no random seed is set in the visible cells, so the subset differs between runs.
id_vid2train = random.sample(range(0,S.shape[0]), 100000)

In [6]:
# Keep only the sampled rows of S, with every column.
S_sub = S[id_vid2train,:]

In [7]:
S_sub

<100000x663127 sparse matrix of type '<class 'numpy.uint8'>'
	with 1823560 stored elements in Compressed Sparse Row format>

In [8]:
def get_dict_for_row(row):
    '''Build a SparseVector from the (key, value) pairs of one matrix row.

    `row` is an iterable of (key, value) pairs; element 1 of each key is used
    as the vector index and `value` as the entry stored at that index. The
    vector size is the number of columns of the module-level `S_sub` matrix.
    '''
    index_to_value = {key[1]: value for key, value in row}
    return SparseVector(S_sub.shape[1], index_to_value)

In [11]:
# Randomly pick 85% of the row positions of S_sub for the training split;
# kept as a set so the membership test in the split loop is cheap.
train_data_idx = set(random.sample(range(0,S_sub.shape[0]), int(0.85*S_sub.shape[0])))
len(train_data_idx)

85000

In [12]:
# Accumulators for the [id, features] rows of the train and test splits.
train_data = []
test_data = []

In [13]:
# Running ids: documents are renumbered from 0 inside each split.
train_data_idx_sorted = 0
test_data_idx_sorted = 0

# Iterate over the subsample `S_sub`: `train_data_idx` was drawn from its row
# positions and `get_dict_for_row` sizes its vectors with `S_sub.shape[1]`.
for i in range(S_sub.shape[0]):

    if i % 10000000 == 0:
        print(str(i) + ' videos processed')

    # Bag-of-words SparseVector of row i
    bow_vector = get_dict_for_row(S_sub.getrow(i).todok().items())

    # Each entry is a list [index of doc within its split, bag-of-words SparseVector]
    if i in train_data_idx:
        train_data.append([train_data_idx_sorted, bow_vector])
        train_data_idx_sorted += 1
    else:
        test_data.append([test_data_idx_sorted, bow_vector])
        test_data_idx_sorted += 1

0 videos processed


### Tuning the best number of topics

Since we only select the best number of topics from a subsample of videos, there is no need to keep the original indices of the videos.

- train the model on `train_data`
- compute the log-perplexity on `test_data`

- choose the number of topics such that the log-perplexity is minimized

In [14]:
# Candidate topic counts, plus result lists filled by the tuning loop
# (one entry per candidate, in the same order).
numbers_topics = [6, 8, 10, 12]
perplex_scores = []
models = []

In [19]:
# Turn the [id, features] lists into Spark DataFrames with those two column names.
train_df = spark.createDataFrame(train_data, ["id", "features"])
test_df = spark.createDataFrame(test_data, ["id", "features"])

In [23]:
# Start from empty result lists: re-running this cell must not append a second
# set of scores, otherwise positions in `perplex_scores` stop matching
# `numbers_topics` and the argmin below picks the wrong entry.
perplex_scores.clear()
models.clear()

for n_topic in numbers_topics:
    print('Computing with ' + str(n_topic) + ' topics...')

    # Fit on the training split, score on the held-out split
    lda = LDA(k=n_topic, seed=1)
    model = lda.fit(train_df)
    logperplexity = model.logPerplexity(test_df)

    models.append(model)
    perplex_scores.append(logperplexity)

# The candidate with the smallest log-perplexity is reported as the optimum
print('The optimal choice for the number of topics : ' + str(numbers_topics[np.argmin(perplex_scores)]))

Computing with 6 topics...
Computing with 8 topics...
Computing with 10 topics...
Computing with 12 topics...
The optimal choice for the number of topics : 6


In [34]:
perplex_scores

[15.257216790716088, 18.78972936215072, 23.02752573719546, 27.81120252103568]

### Get the model on the whole dataset

In [35]:
# Candidate topic count whose log-perplexity score is the smallest.
n_topics_opt = numbers_topics[np.argmin(perplex_scores)]

In [83]:
# Column labels 'Topic0', 'Topic1', ... — one per selected topic.
topic_columns = ['Topic' + str(topic_id) for topic_id in range(n_topics_opt)]

In [None]:
data = []
for i in range(S.shape[0]):

    if i % 10000000 == 0:
        print(str(i) + ' videos processed')

    # Each entry is a list [index of doc, bag-of-words SparseVector].
    # Rows are read from the full matrix `S`, the matrix this loop iterates over.
    # `get_dict_for_row` sizes the vector with S_sub.shape[1], which equals
    # S.shape[1] because S_sub keeps every column of S.
    data.append([i, get_dict_for_row(S.getrow(i).todok().items())])

In [None]:
# Spark DataFrame of the full dataset, with columns "id" and "features".
df = spark.createDataFrame(data, ["id", "features"])

In [None]:
# Estimator for the final model: selected topic count, fixed seed, and the
# per-document topic distribution written to column 'topicDistributionCol'.
lda = LDA(
    k=n_topics_opt,
    seed=1,
    topicDistributionCol='topicDistributionCol',
)

In [None]:
# Fit the final LDA model on the full-dataset DataFrame.
model = lda.fit(df)

#### Save the model with its attributes for visualization

In [None]:
# Persist the fitted model to disk.
# NOTE(review): presumably this fails when the target path already exists,
# since no overwrite is requested — confirm before re-running.
model.save('/dlabdata1/youtube_large/olam/LDA_Model/pyspark_ldamodel')

In [91]:
# Container for the model summary values that are pickled further down.
model_attributes = {}

In [98]:
# Record the model's log-perplexity on `df`, its vocabulary size and its topic count.
model_attributes.update({
    'perplexity': model.logPerplexity(df),
    'vocabSize': model.vocabSize(),
    'n_topic': model.getK(),
})

In [99]:
model_attributes

{'perplexity': 10.335341077216416, 'vocabSize': 663127, 'n_topic': 6}

In [78]:
# Export the 10 highest-weighted terms of every topic as gzip-compressed JSON.
(
    model.describeTopics(maxTermsPerTopic=10)
    .write
    .option('compression', 'gzip')
    .json('/dlabdata1/youtube_large/olam/LDA_Model/describe_topics.json')
)

In [86]:
# Export the topics matrix (one column per entry of `topic_columns`) as
# gzip-compressed JSON.
(
    spark.createDataFrame(model.topicsMatrix().toArray().tolist(), topic_columns)
    .write
    .option('compression', 'gzip')
    .json('/dlabdata1/youtube_large/olam/LDA_Model/topics_term_matrix.json')
)

In [96]:
# Export the model's output for `train_df` as gzip-compressed JSON.
# NOTE(review): this transforms `train_df` (the tuning subsample), not the
# full-dataset `df` — confirm that this is intended.
(
    model.transform(train_df)
    .write
    .option('compression', 'gzip')
    .json('/dlabdata1/youtube_large/olam/LDA_Model/topics_doc_matrix.json')
)

In [100]:
# Pickle the model summary dict; the `with` block closes the file on exit,
# so no explicit close() is needed.
with open('/dlabdata1/youtube_large/olam/LDA_Model/lda_model_attributes.pickle', 'wb') as f:
    pickle.dump(model_attributes, f)

#### Test loading the files

In [103]:
# Reload the model from the path it was saved to, as a LocalLDAModel.
loaded_model = LocalLDAModel.load('/dlabdata1/youtube_large/olam/LDA_Model/pyspark_ldamodel')

In [109]:
loaded_model.isDistributed()

False

In [106]:
# Load the pickled model attributes (the `with` block closes the file itself).
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('/dlabdata1/youtube_large/olam/LDA_Model/lda_model_attributes.pickle', 'rb') as f:
    loaded_model_attributes = pickle.load(f)

In [107]:
loaded_model_attributes

{'perplexity': 10.335341077216416, 'vocabSize': 663127, 'n_topic': 6}

In [116]:
# Read back the three JSON exports:
# df1 = topic descriptions, df2 = topics/term matrix, df3 = topics/doc matrix.
df1 = spark.read.json('/dlabdata1/youtube_large/olam/LDA_Model/describe_topics.json')
df2 = spark.read.json('/dlabdata1/youtube_large/olam/LDA_Model/topics_term_matrix.json')
df3 = spark.read.json('/dlabdata1/youtube_large/olam/LDA_Model/topics_doc_matrix.json')

In [117]:
df1

DataFrame[termIndices: array<bigint>, termWeights: array<double>, topic: bigint]

In [118]:
df2

DataFrame[Topic0: double, Topic1: double, Topic2: double, Topic3: double, Topic4: double, Topic5: double]

In [119]:
df3

DataFrame[features: struct<indices:array<bigint>,size:bigint,type:bigint,values:array<double>>, id: bigint, topicDistributionCol: struct<type:bigint,values:array<double>>]

## Understand results from LDA

### A) Get tokens of topics

In [30]:
# Load the dictionary of tokens (the `with` block closes the file itself).
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('/dlabdata1/youtube_large/olam/id2word_2.pickle', 'rb') as f:
    id2word = pickle.load(f)

In [31]:
# Top 5 terms per topic for the first model in `models`.
# NOTE(review): presumably models[0] is the model for numbers_topics[0];
# verify that this is the model meant to be inspected.
topics_decribe = models[0].describeTopics(maxTermsPerTopic=5)

In [32]:
topics_decribe.show()

+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[146119, 25193, 3...|[0.00155618833174...|
|    1|[598533, 69659, 6...|[0.00296299689040...|
|    2|[60788, 655263, 3...|[0.00601695425345...|
|    3|[118607, 627194, ...|[0.00109346516697...|
|    4|[561976, 39325, 6...|[0.00105658456549...|
|    5|[658082, 347101, ...|[0.00125926833773...|
+-----+--------------------+--------------------+



In [33]:
# Print every topic followed by its terms (looked up in id2word) and their weights.
for topic_row in topics_decribe.rdd.collect():
    print('Topic ' + str(topic_row.topic) + ': ')
    for token_id, weight in zip(topic_row.termIndices, topic_row.termWeights):
        print('   With weight of ' + str(weight) + ' : ' + id2word[token_id] )

Topic 0: 
   With weight of 0.0015561883317400421 : beat
   With weight of 0.0010150868096907332 : type
   With weight of 0.00043026050299143693 : monkey
   With weight of 0.00038950086709461386 : instrument
   With weight of 0.00032512815329500895 : galaxi
Topic 1: 
   With weight of 0.002962996890404119 : music
   With weight of 0.0007333861519990407 : song
   With weight of 0.0005652978340863234 : guitar
   With weight of 0.0004963401621737965 : new
   With weight of 0.0004889577751426235 : video
Topic 2: 
   With weight of 0.006016954253455886 : news
   With weight of 0.005686152362795667 : game
   With weight of 0.004070489911662575 : video
   With weight of 0.003236206563313916 : movi
   With weight of 0.002747477712124675 : gameplay
Topic 3: 
   With weight of 0.0010934651669757037 : pokemon
   With weight of 0.0006376327416773075 : fortnit
   With weight of 0.0005805983087450661 : asmr
   With weight of 0.00033741058447619727 : world
   With weight of 0.0003008724356315609 : na

### B) Get visualisation

In [138]:
# Topics matrix of the fitted model, kept for the visualisation step.
topicTermDist = model.topicsMatrix()

In [143]:
topicTermDist

DenseMatrix(663127, 20, [0.9083, 0.7525, 0.8744, 0.7795, 0.7803, 0.8172, 0.7363, 0.8225, ..., 0.8475, 0.7457, 0.5989, 0.9623, 0.588, 0.7651, 0.7585, 0.8564], 0)

## Issue: need to remove tokens of length 1 or 2, tokens written in another alphabet, and numerical tokens

Hence, save the filtered sparse matrix as `S_final2` and build the `id2word_2` dict that maps each token id to its token in the new matrix.

In [129]:
# Filled by the filtering loop: ids of dropped tokens, and the tokens that are kept.
id_token_to_remove = []
token_to_keep = []

In [130]:
def nonEnglishAlpha(s):
    '''Return True when `s` contains at least one non-ASCII character.

    The string is encoded to UTF-8 and decoded as ASCII; a decoding failure
    means some character lies outside the ASCII range.

    Parameters
    ----------
    s : str
        Token to check.

    Returns
    -------
    bool
        True if `s` is not pure ASCII, False otherwise (including for '').
    '''
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return True
    return False


# Backward-compatible alias: the function was originally defined under this
# name even though it reports NON-ASCII tokens.
englishAlpha = nonEnglishAlpha

In [131]:
# Split the vocabulary: a token is kept only if it has at least 3 characters,
# is not flagged by nonEnglishAlpha and is not numeric; otherwise its id is
# recorded for removal.
for token_id, token in id2word.items():
    if len(token) >= 3 and not nonEnglishAlpha(token) and not token.isnumeric():
        token_to_keep.append(token)
        continue
    id_token_to_remove.append(token_id)

In [132]:
len(id_token_to_remove)

81014

In [133]:
len(token_to_keep)

663127

In [134]:
len(token_to_keep) + len(id_token_to_remove) == len(id2word)

True

In [135]:
S

<68638982x744141 sparse matrix of type '<class 'numpy.uint8'>'
	with 1393937498 stored elements in Compressed Sparse Row format>

In [136]:
# Positions 0..len(id2word)-1 minus the positions listed in id_token_to_remove.
# assumes token ids are exactly the positions 0..len(id2word)-1 — TODO confirm
id_token_to_keep = (np.delete(np.arange(len(id2word)), id_token_to_remove))

In [137]:
# Keep only the columns of the retained tokens.
# NOTE: this rebinds S, so re-running the cell indexes the already-filtered matrix again.
S = S[:, id_token_to_keep]

In [138]:
S

<68638982x663127 sparse matrix of type '<class 'numpy.uint8'>'
	with 1251231562 stored elements in Compressed Sparse Row format>

In [139]:
# Convert the list to a set (note: the name is rebound from list to set).
token_to_keep = set(token_to_keep)

In [142]:
# Re-number the kept tokens 0..n-1 in the order of `id_token_to_keep`, the
# array used to select the columns of S, so new id k names column k of the
# filtered matrix whatever the iteration order of `id2word` is.
id2word_new = {
    new_id: id2word[int(old_id)]
    for new_id, old_id in enumerate(id_token_to_keep)
}

In [145]:
# Write the filtered matrix to S_final2.npz.
scipy.sparse.save_npz('/dlabdata1/youtube_large/olam/matrices/S_final2.npz', S)

In [146]:
# Pickle the re-numbered id -> token mapping; the `with` block closes the file
# on exit, so no explicit close() is needed.
with open('/dlabdata1/youtube_large/olam/id2word_2.pickle', 'wb') as f:
    pickle.dump(id2word_new, f)