In [2]:
# Initialization
import os
import sys

# Point this kernel at the local Spark install. PYLIB is derived from
# SPARK_HOME, so SPARK_HOME has to be in the environment first.
os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ.update({
    "PYLIB": os.environ["SPARK_HOME"] + "/python/lib",
    # NOTE(review): workers are pinned to python3.6 while the driver uses
    # whatever /usr/bin/python3 resolves to -- confirm both are the same minor version.
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
})

# Put the PySpark and Py4J archives shipped with Spark at the front of the
# import path (pyspark.zip first, then py4j).
sys.path[:0] = [
    os.environ["PYLIB"] + "/pyspark.zip",
    os.environ["PYLIB"] + "/py4j-0.10.7-src.zip",
]

# Extra JVM packages requested at session start-up: spark-xml 0.6.0 and spark-avro 2.4.3.
# Alternatives kept for reference: spark-xml 0.6.0 alone, spark-avro 2.4.0 alone,
# or spark-xml 0.6.0 together with spark-avro 2.4.0.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'

In [3]:
# Entry point: get (or reuse) a SparkSession with Hive support enabled.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Spark SQL ")
    .enableHiveSupport()
    .getOrCreate()
)

# To run on YARN, add .master("yarn") to the builder chain, e.g.:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# SparkContext that belongs to the session above.
sc = spark.sparkContext

In [4]:
import pandas as pd
import numpy as np
import surprise
from surprise.model_selection import GridSearchCV, KFold
import pickle

In [14]:
# Function to preprocess data and build the recommendation model
def preprocess_and_build_model(df, model_path='recommendation_model.pkl',
                               id_mapping_path='id_mapping.pkl'):
    """Train a Surprise SVD recommender from review data and pickle it to disk.

    Steps: derive a readable ``product_name`` for every row, replace customer
    IDs with sequential integers, grid-search SVD hyper-parameters with 5-fold
    cross-validation, refit the best-RMSE estimator on the full data, then
    pickle the model and the customer-ID mapping.

    Args:
        df: pandas DataFrame with the columns ``customer_id``, ``product_id``,
            ``product_category_name_english`` and ``review_score``.
        model_path: File the fitted model is pickled to.
        id_mapping_path: File the ``{original customer_id: sequential int}``
            dictionary is pickled to.

    Raises:
        ValueError: If ``df`` has no non-null ``review_score`` values, so there
            is nothing to train on.

    Side effects:
        ``df`` is modified in place: ``product_id`` is cast to ``str``, a
        ``product_name`` column is added and ``customer_id`` is overwritten
        with the sequential integers. Calling this twice on the same frame
        would therefore build the second mapping from the already-mapped
        integers; reload the data before re-running.
    """
    # Convert 'product_id' column to string type
    df['product_id'] = df['product_id'].astype(str)

    def convert_product_name(category, product_id):
        """Return ``<CATEGORY>_<digits of product_id>`` for one row."""
        if isinstance(category, str):
            # Whole category, upper-cased, with whitespace removed.
            category_label = ''.join(word.upper() for word in category.split())
        else:
            # Null categories (None/NaN) have no .split(); use a placeholder.
            category_label = 'UNKNOWN'
        digits = ''.join(filter(str.isdigit, product_id))
        # int('') would raise for an ID without digits; fall back to the raw ID
        # so such products still get distinct names.
        suffix = int(digits) if digits else product_id
        return f'{category_label}_{suffix}'

    # Convert product IDs to product names. Only two columns are needed, so
    # iterate over them directly instead of a row-wise DataFrame.apply.
    df['product_name'] = [
        convert_product_name(category, product_id)
        for category, product_id in zip(df['product_category_name_english'], df['product_id'])
    ]

    # Fail early with a clear message instead of int(nan) further down.
    if not df['review_score'].notna().any():
        raise ValueError("df contains no non-null 'review_score' values to train on")

    # Create a dictionary to map unique customer IDs to sequential numbers (1-based)
    id_mapping = {id_: idx for idx, id_ in enumerate(df['customer_id'].unique(), start=1)}

    # Convert customer IDs to simple generic unique numbers
    df['customer_id'] = df['customer_id'].map(id_mapping)

    # Rows without a score cannot be used as ratings; leave them out of training.
    ratings = df[['customer_id', 'product_name', 'review_score']].dropna(subset=['review_score'])
    lowest_rating = ratings['review_score'].min()
    highest_rating = ratings['review_score'].max()

    # Define the Reader with the observed rating range
    reader = surprise.Reader(rating_scale=(int(lowest_rating), int(highest_rating)))
    data = surprise.Dataset.load_from_df(ratings, reader)

    # Grid search for best parameters
    param_grid = {'lr_all': np.linspace(0.001, 1, 3), 'reg_all': np.linspace(0.01, 0.8, 3),
                  'n_factors': [40, 30]}
    kfold = KFold(random_state=23, n_splits=5, shuffle=True)
    gs = GridSearchCV(surprise.SVD, param_grid, joblib_verbose=3, measures=['rmse', 'mae'], cv=kfold, n_jobs=-1)
    gs.fit(data)

    # Refit the configuration with the best RMSE on the complete data set
    algo = gs.best_estimator['rmse']
    algo.fit(data.build_full_trainset())

    # Dumping the model using pickle
    with open(model_path, 'wb') as model_file:
        pickle.dump(algo, model_file)

    # Dumping id_mapping for later use
    with open(id_mapping_path, 'wb') as id_mapping_file:
        pickle.dump(id_mapping, id_mapping_file)

if __name__ == "__main__":
    # Read only the four columns the model needs from the Hive table and
    # collect them into a pandas DataFrame.
    df = (
        spark.table("ecomm.olist_data_table_s")
        .select('customer_id', 'product_id', 'product_category_name_english', 'review_score')
        .toPandas()
    )
    # Train the recommender and write the model and ID mapping to disk.
    preprocess_and_build_model(df)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 17.6min finished
