In [2]:
import warnings
warnings.filterwarnings('ignore')

import math
import numpy as np
import pandas as pd
from collections import Counter

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
import pyspark as ps
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import Row

# Set up a SparkSession (getOrCreate reuses an already-active session if there is one)
spark = SparkSession.builder.getOrCreate()
# Handle to the session's underlying SparkContext
sc = spark.sparkContext

In [5]:
# Bare expression: shows the SparkContext as this cell's output.
sc

In [6]:
# Load the review data and discard the 'Unnamed: 0' column in a single chained
# expression, so no frame is mutated in place.
base_df = pd.read_csv('data/rtr_data.csv').drop(columns='Unnamed: 0')

In [7]:
# Bare expression: renders the loaded frame as the cell output (pandas elides the middle rows).
base_df

Unnamed: 0,fit,user_id,bust_size,item_id,weight_lbs,rating,rented_for,review_text,body_type,review_summary,category,height_inches,size,age,review_date,review-summary,keywords
0,fit,420272,34d,2260466,137,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,68.0,14,28.0,2016-04-20,An adorable romper! Belt and zipper were a lit...,"['adorable', 'romper', 'belt', 'zipper', 'litt..."
1,fit,273551,34b,153475,132,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,66.0,12,36.0,2013-06-18,I rented this dress for a photo shoot. The the...,"['photo', 'shoot', 'theme', 'hollywood', 'glam..."
2,fit,909926,34c,126335,135,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,65.0,8,34.0,2014-02-12,I rented this for my company's black tie award...,"['company', 'black', 'tie', 'awards', 'banquet..."
3,fit,151944,34b,616682,145,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,69.0,12,27.0,2016-09-26,I have always been petite in my upper body and...,"['always', 'petite', 'upper', 'body', 'extreme..."
4,fit,734848,32b,364092,138,8.0,date,Didn't actually wear it. It fit perfectly. The...,athletic,Traditional with a touch a sass,dress,68.0,8,45.0,2016-04-30,Didn't actually wear it. It fit perfectly. The...,"['actually', 'perfectly', 'fabric', 'little', ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146132,fit,66386,34dd,2252812,140,10.0,work,Fit like a glove!,hourglass,LOVE IT!!! First Item Im thinking of buying!,jumpsuit,69.0,8,42.0,2016-05-18,Fit like a glove! LOVE IT!!! First Item Im thi...,"['like', 'glove', 'love', 'first', 'item', 'im..."
146133,fit,118398,32c,682043,100,10.0,work,The pattern contrast on this dress is really s...,petite,LOVE it!,dress,61.0,4,29.0,2016-09-30,The pattern contrast on this dress is really s...,"['pattern', 'contrast', 'really', 'stunning', ..."
146134,fit,47002,36a,683251,135,6.0,everyday,"Like the other DVF wraps, the fit on this is f...",straight & narrow,"Loud patterning, flattering fit",dress,68.0,8,31.0,2016-03-04,"Like the other DVF wraps, the fit on this is f...","['like', 'dvf', 'wraps', 'fantastic', 'albeit'..."
146135,fit,961120,36c,126335,165,10.0,wedding,This dress was PERFECTION. it looked incredib...,pear,loved this dress it was comfortable and photog...,dress,66.0,16,31.0,2015-11-25,This dress was PERFECTION. it looked incredib...,"['perfection', 'looked', 'incredible', 'photos..."


In [8]:
# Keep only the (user, item, rating) triple that the recommender is trained on.
ratings_df = base_df.loc[:, ['user_id', 'item_id', 'rating']]

In [9]:
# Convert the pandas ratings frame into a Spark DataFrame for the Spark ML steps below.
spark_ratings = spark.createDataFrame(ratings_df)

In [10]:
# Hold out roughly 30% of the ratings for evaluation. The explicit seed pins the
# random sampling so that a fresh Restart & Run All selects the same rows instead
# of producing a different split (and a different RMSE) on every run.
train, test = spark_ratings.randomSplit([0.7, 0.3], seed=42)

In [11]:
# ALS matrix-factorization recommender over (user_id, item_id, rating) triples.
# Alternative configuration kept for reference: regParam=0.1, rank=10
# (same columns, nonnegative=True, other settings left at library defaults).
als_model = ALS(
    itemCol='item_id',
    userCol='user_id',
    ratingCol='rating',
    nonnegative=True,   # constrain the learned factors to be non-negative
    regParam=0.75,
    rank=5,             # number of latent factors
    # Predictions that cannot be made stay NaN rather than being dropped, so the
    # evaluation step has to handle them explicitly.
    coldStartStrategy='nan',
    seed=42)            # fixed seed so the fitted factors are reproducible across runs

In [12]:
# Fit the configured ALS estimator on the training split; `recommender` is the fitted model.
recommender = als_model.fit(train)

In [15]:
# Score the held-out split; the result keeps the input columns and adds a `prediction` column.
test_pred = recommender.transform(test)

In [16]:
# Preview the first 20 scored rows. NaN predictions are presumably cold-start rows
# (user or item absent from the training split), given coldStartStrategy='nan'.
test_pred.show()

+-------+-------+------+----------+
|user_id|item_id|rating|prediction|
+-------+-------+------+----------+
| 371542| 213089|  10.0|  8.717025|
|  52229| 213089|   8.0|       NaN|
| 768039| 213089|  10.0| 6.8804464|
| 752238| 213089|   8.0| 6.8368726|
| 818603| 277366|  10.0|  8.974133|
| 524777| 277366|  10.0|       NaN|
| 712415| 333479|   8.0|  8.570589|
| 260350| 333479|  10.0| 7.3483515|
| 585785| 333479|  10.0|  8.875811|
| 179367| 333479|  10.0|  8.426922|
| 214108| 333479|  10.0|   7.37896|
| 244376| 640839|  10.0|  9.104555|
| 240007| 640839|  10.0|  8.856159|
| 996705| 640839|   8.0|       NaN|
| 421056| 640839|   8.0|  8.287586|
| 366322| 640839|  10.0|  7.327038|
| 642748| 640839|  10.0|  9.122535|
| 962574| 640839|   8.0|  6.850277|
|  70480| 730008|   8.0|  7.952133|
| 614722| 730008|  10.0|  8.103798|
+-------+-------+------+----------+
only showing top 20 rows



In [17]:
from sklearn.metrics import mean_squared_error

In [23]:
test_true = test_pred.select('rating')
true_df = test_true.toPandas()

In [34]:
test_predicted = test_pred.select('prediction')
pred_df = test_predicted.toPandas()

In [35]:
pred_df.fillna(ratings_df['rating'].mean(), inplace=True)

In [36]:
np.sqrt(mean_squared_error(true_df, pred_df))

1.7479125880928894

In [None]:
# Surprise's SVD model reached an RMSE of 1.39 (vs. 1.75 for ALS above), so going with SVD.