## NOTEBOOK DESCRIPTION:

This notebook describes how we tried to perform dimensionality reduction on the bipartite graph using PySpark.

In [20]:
import json

import numpy as np
import scipy
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import SparseVector, Vectors, VectorUDT
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from scipy.sparse import dok_matrix

In [2]:
# Spark configuration for a local run on 4 threads.
conf = SparkConf().setMaster("local[4]").setAll([
    ('spark.executor.memory', '4g'),
    ('spark.driver.memory', '16g'),
    # NOTE(review): '0' is presumably "no limit" on results sent back to the driver -
    # confirm against the Spark configuration docs.
    ('spark.driver.maxResultSize', '0'),
])

# Session built from the configuration above (an already running session is reused)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# SparkContext attached to that session
sc = spark.sparkContext

In [3]:
def get_dict_for_row(row, S):
    '''
    Build the SparseVector bag-of-words representation of a single matrix row.

    PARAMETERS:
        - row: iterable of (key, value) pairs for the row being processed, where
               key[1] is the column index (e.g. the items() of a DOK row)
        - S: the sparse matrix (the data); only its number of columns is read
    RETURN:
        - A SparseVector of length S.shape[1] mapping each column index to its value
    '''
    # If a column index appears several times, the last value wins
    values_by_column = {position[1]: count for position, count in row}

    return SparseVector(S.shape[1], values_by_column)

In [4]:
# Load data
# Read the saved bipartite sparse matrix from disk
print('Loading data...')
S = scipy.sparse.load_npz(
    file='/dlabdata1/youtube_large/jouven/final_embedding_sparse_matrix/sparse_matrix_bipartite.npz'
)


Loading data...


In [15]:
# Keep only a subset of the feature columns so the whole pipeline can be tested locally
# before running it on the pyspark cluster.
# NOTE: Spark ML's PCA rejects inputs wider than 65535 columns (see the
# IllegalArgumentException at the end of this notebook), so this width has to be lowered
# for the PCA fit to succeed.
# The count is clamped to the real width of S so the cell does not raise an IndexError on
# a matrix with fewer than 1,000,000 columns.
features_relevant = np.arange(min(1000000, S.shape[1]))
# NOTE(review): channels_relevant is not applied in any visible cell, so every row
# (channel) of S is kept - confirm whether a row selection was intended.
channels_relevant = np.arange(100)

# Select features (columns); all channels (rows) are kept
S = S[:, features_relevant]


Process matrix ...


NameError: name 'S2' is not defined

In [16]:
print('Process matrix ...')

# One [channel_index, SparseVector] record per row of S
all_data = []
for i in range(S.shape[0]):

    # Progress message every 20000 rows
    if not i % 20000:
        print('{} channels processed...'.format(i))

    # (row, col) -> value entries of row i, turned into a SparseVector over the columns of S
    all_data.append([
        i,
        get_dict_for_row(S.getrow(i).todok().items(), S),
    ])


# Spark DataFrame that will be fed to the PCA
all_df = spark.createDataFrame(all_data, schema=["channel_index", "features"])

Process matrix ...
0 channels processed...
20000 channels processed...
40000 channels processed...
60000 channels processed...
80000 channels processed...
100000 channels processed...
120000 channels processed...
140000 channels processed...


In [21]:
# Persist the per-channel sparse vectors as gzip-compressed JSON.
# 'overwrite' replaces any previous export at this path; with the default save mode the
# cell fails on every re-run of the notebook because the path already exists.
all_df.write\
        .mode('overwrite')\
        .option('compression', 'gzip')\
        .json('/dlabdata1/youtube_large/jouven/pyspark_data.json')

In [24]:
# Read the JSON export back into a DataFrame for a local test run
df_load = spark.read.json(
    path='/dlabdata1/youtube_large/jouven/pyspark_data.json'
)

In [27]:
# Rebuild the SparseVector column from the struct that comes back from the JSON file
# (fields read: 'size', 'indices', 'values').
print('Processing data...')

# The conversion runs on the executors through a UDF, so rows are no longer collect()-ed
# to the driver and turned back into a DataFrame from a Python list.
to_sparse_vector = udf(
    lambda features: SparseVector(
        features['size'], features['indices'], features['values']),
    VectorUDT())

data_process = df_load.select(
    'channel_index', to_sparse_vector('features').alias('features'))

# NOTE: with the 1,000,000 feature columns selected earlier in this notebook, fit() fails
# with "IllegalArgumentException: Argument with more than 65535 cols"; the feature
# dimension must be at most 65535 for this PCA to run.
pca = PCA(k=100, inputCol="features", outputCol="pca_features")
model = pca.fit(data_process)
model.transform(data_process)

Loading data...
Processing data...


IllegalArgumentException: 'Argument with more than 65535 cols: 1000000'