In [1]:
from datetime import datetime
from pyspark.sql import SparkSession

spark = (SparkSession.builder.appName("pyspark-rdd-demo-{}".format(datetime.today()))
        .master("spark://spark-master:7077")      
        .getOrCreate())
# spark.sparkContext.getConf().getAll()

In [2]:
products = spark.read.format("parquet").load("s3a://warehouse/gold/tiki/products.parquet")
users = spark.read.format("parquet").load("s3a://warehouse/gold/tiki/users.parquet")
reviews = spark.read.format("parquet").load("s3a://warehouse/gold/tiki/reviews.parquet")

In [3]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from underthesea import word_tokenize
import re

@udf(StringType())  # Specify return type as String
def process_text(document):
    # Change to lowercase
    document = document.lower()
    
    # Remove HTTP links (using regular expression)
    document = re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*', '', document)
    
    # Remove line breaks (replace with space)
    document = re.sub(r'[\r\n]+', ' ', document)
    
    # Replace '/' and ',' with space
    document = document.replace('/', ' ').replace(',', ' ')
    
    # Remove punctuations using regular expression
    document = re.sub(r'[^\w\s]', '', document)
    
    # Remove extra spaces (replace multiple spaces with a single space)
    document = re.sub(r'[\s]{2,}', ' ', document)
    
    # Tokenize text using word_tokenize from underthesea
    document = word_tokenize(document, format="text")
    
    return document

# Assuming `products` is your DataFrame
df = products

# Create the 'info' column by concatenating product_name, description, and specifications
df = df.withColumn('info', df['product_name'] + ' ' + df['description'] + ' ' + df['specifications'])

# Apply the UDF to the 'info' column
df = df.withColumn('processed_info', process_text(df['info']))

# Show the results
df.select('processed_info').show()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1231, in main
    func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
                                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1067, in read_udfs
    udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/worker.py", line 529, in read_single_udf
    f, return_type = read_command(pickleSer, infile)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/worker.py", line 90, in read_command
    command = serializer._read_with_length(file)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 174, in _read_with_length
    return self.loads(obj)
           ^^^^^^^^^^^^^^^
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 472, in loads
    return cloudpickle.loads(obj, encoding=encoding)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ModuleNotFoundError: No module named 'underthesea'


In [58]:
from gensim import models
from gensim import similarities

def load_stopword(STOP_WORDS):
    with open(STOP_WORDS, 'r', encoding = 'utf-8') as file:
        stop_words = file.read()
    stop_words = stop_words.split('\n')
    return stop_words
    
def gensim_rcm(products, stop_words):
    # Tokenize the product content
    info = [[text for text in x.split()] for x in products['content']]
    
    # Create a dictionary mapping words to unique IDs
    dictionary = corpora.Dictionary(info)
    
    # Filter out stop words and words that appear only once in the corpus
    stop_ids = [dictionary.token2id[stopword] for stopword in stop_words if stopword in dictionary.token2id]
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    dictionary.filter_tokens(stop_ids + once_ids)
    dictionary.compactify()
    
    # Convert the product content into a Bag of Words (BoW) format
    corpus = [dictionary.doc2bow(text) for text in info]
    
    # Create a TF-IDF model from the corpus
    tfidf = models.TfidfModel(corpus)
    
    # The number of unique features (terms) in the dictionary
    feature_cnt = len(dictionary.token2id)
    
    # Convert the TF-IDF corpus into a sparse matrix for memory efficiency
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=feature_cnt)
    
    # Return the dictionary, TF-IDF model, and similarity index for further use
    dictionary.save('dictionary.gensim')
    tfidf.save('tfidf_model.gensim')
    index.save('similarity_index.gensim')
    return dictionary, tfidf, index

stop_words = load_stopword("vietnamese-stopwords.txt")
_ = gensim_rcm(products.toPandas(), stop_words)
tfidf = models.TfidfModel.load('tfidf_model.gensim')
index = similarities.SparseMatrixSimilarity.load('similarity_index.gensim')
dictionary = corpora.Dictionary.load('dictionary.gensim')

KeyError: 'content'

In [53]:
import pandas as pd
product_ID = 276854528
n = 5
product_result= df[df.product_id == product_ID].head(1)   
view_product = product_result['content'].to_string(index = False)
view_product = view_product.split()
view_product

['chổi',
 'vệ_sinh',
 'rửa',
 'xe',
 'ô_tô',
 'cây',
 'lau',
 'nhà',
 'đặc_biệt',
 'd...']

In [54]:
bow_vector = dictionary.doc2bow(view_product)
sim = index[tfidf[bow_vector]]

In [55]:
list_id = []
list_score = []
for i in range(len(sim)):
    list_id.append(i)
    list_score.append(sim[i])

df_result = pd.DataFrame({'id': list_id,
                            'score': list_score})

five_highest_score = df_result.sort_values(by = 'score', ascending = False).head(n +1)
idToList = list(five_highest_score['id'])

product_find = df[df.index.isin(idToList)]
result = product_find[['product_id','product_name']]
result = pd.concat([result, five_highest_score], axis = 1).sort_values(by = 'score', ascending = False)
result = result[result.product_id != product_ID]
result

Unnamed: 0,product_id,product_name,id,score
3804,203428870,"Chổi Lau Quét Bụi Xe Ô Tô, Chổi Lau Vệ Sinh Xe...",3804,0.770375
5026,275182599,Chổi Vệ Sinh Rửa Xe Ô Tô - Cây Lau Nhà Đặc Biệ...,5026,0.69924
9077,192706518,Khăn lau xe ô tô đa năng microfiber màu vàng -...,9077,0.613022
6932,71150931,Chổi Quét Bụi Ô TÔ Đa Năng NB 64cm,6932,0.601728
4328,270677877,Khăn lau xe ô tô đa năng microfiber xám không ...,4328,0.595468
