In [1]:
# import libraries
import time
import os
import pandas as pd
import numpy as np
from pyspark import SparkContext
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as F
from pyspark.sql.functions import explode
from IPython.display import Image
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import clear_output

# Start up spark cluster

In [2]:
appName="Collaborative Filtering with PySpark"
# initialize the spark session
spark = SparkSession.builder\
    .appName(appName)\
    .config('spark.driver.memory', '15g')\
    .getOrCreate()
# get sparkcontext from the sparksession
sc = spark.sparkContext

# Define Data loading function

In [3]:
def load_dataset(path):
    #define schema
    schema = StructType([
        StructField("item", StringType(), True),
        StructField("user", StringType(), True),
        StructField("rating", StringType(), True),
        StructField("timestamp", IntegerType(), True)])

    df = spark.read.csv(path ,header=False,schema=schema)
    df = df.withColumn("rating", df["rating"].cast(IntegerType()))
    
    #provide index values for item and user to convert them into integers
    stringIndexer = StringIndexer(inputCols=["item","user"], outputCols=["itemIndex","userIndex"])
    model = stringIndexer.fit(df)
    df = model.transform(df)
    
    #df = df.withColumn("userIndex", df["userIndex"].cast(IntegerType()))
    
    return df

# Define ALS object

In [4]:
als = ALS(maxIter=15, 
          implicitPrefs=False,
          regParam=0.25,
          userCol="userIndex", 
          itemCol="itemIndex", 
          ratingCol="rating",
          coldStartStrategy="drop")

# Define Recommendation Algorithm

In [5]:
# display top n recommended artists for a user
def recommendedItems(userIndex, model, n):
    test = model.recommendForAllUsers(n)\
        .filter(col('userIndex')==userIndex)\
        .select(["recommendations.itemIndex","recommendations.rating"])\
        .withColumn("rec_exp", explode("itemIndex"))\
        .select(col('rec_exp'))\
        .withColumnRenamed('rec_exp', 'itemIndex')
        
    return test

# recs = recommendedItems(9386, n=6)
# recs.show()

## Print out n recommendations for user

In [6]:
def print_recs(recs_df):
    output_string = ''
    for i, row in recs_df.iterrows():
        output_string += f' {i+1}. {row.title}\n'
        output_string += ' '*5 + 'https://www.amazon.com/dp/{}'.format(row['item']) + '\n'
    return output_string


In [7]:
def get_recommendation(userID, model, item_df, meta_df, n=6):
    recs = recommendedItems(userID, model, n=6)
    nrecommendations_df = recs.join(item_df, on='itemIndex').toPandas()
    recs_df = nrecommendations_df.merge(meta_df)
    return print_recs(recs_df)
    
#get_recommendation(9386)

In [8]:
print(10 + 10)

20


In [8]:
def get_fnames(path=r'./ratings_data'):
    fnames = [f.split('.')[0] for f in os.listdir(path) if f.endswith('.csv')]
    return fnames

fnames = get_fnames()
dropdown_category = widgets.Dropdown(options = fnames)


#create display
output_recs = widgets.Output()

        
#get user id
userid_input = widgets.BoundedFloatText(
min=0, max=1000000, value=9386, step=1)


btn = widgets.Button(description='Recommend')

def btn_eventhandler(obj):
    #Load data
    print('Loading Data...')
    start_time = time.time()
    fname = dropdown_category.value +'.csv'
    df = load_dataset(r'./ratings_data/' + fname)
    
    test_fraction = .35
    training, test = df.randomSplit([1-test_fraction, test_fraction])
    
    #get items
    item_df = df.groupby("itemIndex")\
       .agg(F.min("item"))\
       .withColumnRenamed('min(item)', 'item')
    item_df = item_df.withColumn("itemIndex", item_df["itemIndex"].cast(IntegerType()))
    
    #get meta information
    meta_df = pd.read_csv(f'./meta_data/meta_{fname}').rename(columns={'asin':'item'})
    print(f' Time taken to load data: {time.time() - start_time:.1f} seconds.\n')
    
    print('Fitting Data...')
    start_time = time.time()
    model = als.fit(training)
    print(f' Time taken to fit data: {time.time() - start_time:.1f} seconds.\n')
    
    
    
    userid = int(userid_input.value)
    start_time = time.time()
    print('Calculating Recommendations...\n')
    
    #output
    title = f'{dropdown_category.value} recommendations for user {userid}:'
    print(title)
    print('-'*len(title))
    
    
    recs = get_recommendation(userid, model, item_df, meta_df)
    print(recs)
    print(f' Time taken to recommend: {time.time() - start_time:.1f} seconds.')
    
    

print('Category')
display(dropdown_category)
print('\nUserid')
display(userid_input)

display(btn)
btn.on_click(btn_eventhandler)

Category


Dropdown(options=('Software', 'Automotive', 'Movies_and_TV', 'Industrial_and_Scientific', 'Grocery_and_Gourmet…


Userid


BoundedFloatText(value=9386.0, max=1000000.0, step=1.0)

Button(description='Recommend', style=ButtonStyle())

If above list and button doesn't appear - run the following cell

In [9]:
def run_dist_als(fname, userid):
    print('Loading Data...')
    start_time = time.time()
    fname = fname +'.csv'
    df = load_dataset(r'./ratings_data/' + fname)
    
    test_fraction = .35
    training, test = df.randomSplit([1-test_fraction, test_fraction])
    
    #get items
    item_df = df.groupby("itemIndex")\
       .agg(F.min("item"))\
       .withColumnRenamed('min(item)', 'item')
    item_df = item_df.withColumn("itemIndex", item_df["itemIndex"].cast(IntegerType()))
    
    #get meta information
    meta_df = pd.read_csv(f'./meta_data/meta_{fname}').rename(columns={'asin':'item'})
    print(f' Time taken to load data: {time.time() - start_time:.1f} seconds.\n')
    
    print('Fitting Data...')
    start_time = time.time()
    model = als.fit(training)
    print(f' Time taken to fit data: {time.time() - start_time:.1f} seconds.\n')
    
    
    
    userid = int(userid_input.value)
    start_time = time.time()
    print('Calculating Recommendations...\n')
    
    #output
    title = f'{dropdown_category.value} recommendations for user {userid}:'
    print(title)
    print('-'*len(title))
    
    
    recs = get_recommendation(userid, model, item_df, meta_df)
    print(recs)
    print(f' Time taken to recommend: {time.time() - start_time:.1f} seconds.')

In [12]:
run_dist_als(fname='Appliances', userid=9386)

Loading Data...
 Time taken to load data: 3.4 seconds.

Fitting Data...
 Time taken to fit data: 81.3 seconds.

Calculating Recommendations...

Software recommendations for user 9386:
---------------------------------------
 1. Whirlpool Part Number UXA4905ADX: RACK-OVEN
     https://www.amazon.com/dp/B001RCIQS8
 2. Frigidaire PL30WC41EC30&quot; Stainless Steel Under Cabinet Range Hood
     https://www.amazon.com/dp/B000R9E5GY
 3. Ge WR17X11586 Refrigerator Door Bin Genuine Original Equipment Manufacturer (OEM) Part
     https://www.amazon.com/dp/B003BVZZSW
 4. Modern Maid 6&quot; Range Cooktop Stove Replacement Surface Burner Heating Element 31734601
     https://www.amazon.com/dp/B00DVKBSP2
 5. BERKEL 0827-00042 Capacitor
     https://www.amazon.com/dp/B00EN8OBWS
 6. Bake Element for General Electric, Hotpoint, WB44X195
     https://www.amazon.com/dp/B009PNOIK6

 Time taken to recommend: 223.9 seconds.
