# Spark Recommender Pair Exercise

## Loading  and Cleaning Data

We have two options here

1. Load data into a Pandas dataframe, convert to a Spark dataframe
    * Careful! This only works because our dataset is small. Usually when we use Spark our datasets are too large to fit in memory.
2. Load data into a Spark RDD, convert to a Spark dataframe

### Pandas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql.types import *
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Build our Spark Session and Context
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext
spark, sc;

# Read the ratings data into a Pandas DataFrame
ratings_pd_df = pd.read_csv('data/training.csv')

# Convert to a Spark DataFrame, dropping `timestamp` column will happend before the train
ratings_df = spark.createDataFrame(ratings_pd_df) #.drop('timestamp', axis=1))

In [2]:
ratings_df    = ratings_df.orderBy('timestamp')
ratings_pd_df = ratings_df.toPandas()

n_rows = ratings_pd_df.shape[0] # int(n_rows*.8)


train = ratings_pd_df.head(int(n_rows*.95))
test  = ratings_pd_df.tail(int(n_rows*.05))

sp_train = spark.createDataFrame(train.drop('timestamp', axis=1))
sp_test  = spark.createDataFrame(test.drop('timestamp', axis=1))

In [3]:
req_pd = pd.read_csv('data/requests.csv')
sp_req = spark.createDataFrame(req_pd)

In [4]:
als_model = ALS(userCol='user',
                itemCol='movie',
                ratingCol='rating',
                nonnegative=True,
                regParam=0.01,
                maxIter=20,
                rank=10,
                coldStartStrategy = "nan"
               )

recommender = als_model.fit(sp_train)

In [5]:
# Make predictions for the whole test set
prediction_sp = recommender.transform(sp_req)

## Evaluation

Time to evaluate our model. We'll calculate the RMSE of our predicted ratings and also look at a violin plot of true ratings (x-axis) vs the predicted ratings (y-axis).

In [None]:
# Dump the predictions to Pandas DataFrames to make our final calculations easier
# predictions_df = predictions.toPandas()
# train_df = sp_train.toPandas()

In [24]:
predictions_pd = prediction_sp.toPandas()

In [10]:
movie_list = predictions_pd['movie'].unique()

In [11]:
movie_means = predictions_pd.groupby('movie', sort=False).mean()[ 'prediction']

In [12]:
predictions_pd[predictions_pd.movie == 463].mean()['prediction']

2.3410957753658295

In [13]:
movie_means.head()

movie
148    2.689983
463    2.341096
471    3.785024
496    3.673284
833    2.741599
Name: prediction, dtype: float32

In [9]:
predictions_pd.head()

Unnamed: 0,user,movie,prediction
0,53,148,
1,4169,148,3.073922
2,5333,148,2.538544
3,4387,148,2.457482
4,840,148,


In [27]:
movie_means = predictions_pd.groupby('movie', sort=False).mean()[ 'prediction']

pred_nulls = predictions_pd[predictions_pd['prediction'].isna()]

indexed_means = pred_nulls.apply(lambda x: movie_means.loc[x['movie']], axis=1)

predictions_pd['prediction'].loc[pred_nulls.index] = indexed_means

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [28]:
predictions_pd.head(10)

Unnamed: 0,user,movie,prediction
0,53,148,2.689983
1,4169,148,3.073922
2,5333,148,2.538544
3,4387,148,2.457482
4,840,148,2.689983
5,216,148,2.689983
6,482,148,2.689983
7,752,148,2.689983
8,424,148,2.689983
9,970,463,3.042729


In [8]:
for ind, row in predictions_pd.head(10).iterrows():
#     print(np.isnan(row['prediction']))
    
    if np.isnan(row['prediction']):
#         row['prediction'] = predictions_pd[predictions_pd.movie == int(row['movie'])].mean()['prediction']
        predictions_pd.iloc[ind]['prediction'] = 3 #predictions_pd[predictions_pd.movie == int(row['movie'])].mean()['prediction']
        print("in if ", row['movie'], "p ", predictions_pd.iloc[ind]['prediction'])

in if  148.0 p  nan
in if  148.0 p  nan
in if  148.0 p  nan
in if  148.0 p  nan
in if  148.0 p  nan
in if  148.0 p  nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [None]:
for ind, row in predictions_pd.head(10).iterrows():
#     print(np.isnan(row['prediction']))
    
    if np.isnan(row['prediction']):
        row['prediction'] = predictions_pd[predictions_pd.movie == int(row['movie'])].mean()['prediction']
        print("in if ", row['movie'], predictions_pd.iloc[ind]['prediction'])

In [None]:
predictions_pd.head()

In [None]:
for ind, row in predictions_pd.iterrows():
    if np.isnan(row['prediction']):
        row['prediction'] = predictions_pd[predictions_pd.movie == row['movie']].mean()['prediction']

In [None]:
for ind, row in predictions_pd.iterrows():
    if np.isnan(row['prediction']):
        row['prediction'] = predictions_pd[predictions_pd.movie == row['movie']].mean()['prediction']

In [None]:
predictions_pd.loc['prediction'].iloc[ind] 

In [None]:
predictions_pd.head()

In [None]:
len(movie_list)

In [None]:
len(movie_means)

In [None]:
# for i, mInd in enumerate(movie_list):
i = 1
int(movie_list[i])

In [None]:
z = predictions_pd[predictions_pd.movie == int(movie_list[i])]
z.fillna(int(movie_means[i]), inplace=True)

In [None]:
predictions_pd[predictions_pd.movie == 148].fillna(3, inplace=True)

In [None]:
z = predictions_pd[predictions_pd.movie == 463]

In [None]:
z.fillna(3, inplace=True)

In [None]:
impute_val = movie_means.mean()
movie_means = movie_means.fillna(impute_val)

In [None]:
movie_means.isna().sum()

In [None]:
predictions_pd[predictions_pd.movie == movie_list[0]]['prediction'].mean()

In [None]:
predictions_pd.head()

In [None]:
movie_list = predictions_pd['movie'].unique()

In [None]:
for m_ind in movie_list:
    if predictions_pd.loc[m_ind].empty:
        predictions_pd.loc['movie_means'] = predictions_pd[predictions_pd.movie == m_ind]['prediction'].mean()
        

In [None]:
predictions_pd

In [None]:
# Fill any missing values with the mean rating
# There are multiple things you could fill with, this is just one example
predictions_pd = predictions_pd.fillna(4.5)

In [None]:
predictions_pd.head()

In [None]:
f, ax = plt.subplots(figsize=(12,8))
plt.hist(predictions_pd['movie'].value_counts(), bins=30)
plt.ylabel('Movie Counts', fontsize=12);
plt.xlabel('Movie ID', fontsize=12);

In [29]:
predictions_pd.head()

Unnamed: 0,user,movie,prediction
0,53,148,2.689983
1,4169,148,3.073922
2,5333,148,2.538544
3,4387,148,2.457482
4,840,148,2.689983


In [None]:
'''
mask = predictions_pd['rating'] > 3

mask.sum()

predictions_pd[['rating','prediction']][mask].head(10)



# predictions_pd['squared_error'] = (predictions_pd['rating'][mask] - predictions_pd['prediction'][mask])**2

predictions_pd['sq_err_45'] = (predictions_pd['rating'][mask] - predictions_pd['prediction'][mask])**2

predictions_pd.describe().T

predictions_pd.fillna(0, inplace=True)

# Calculate RMSE
np.sqrt(sum(predictions_pd['sq_err_45']) / mask.sum())
'''

In [30]:
# predictions_pd.to_csv('data/pred2.csv', sep=',', index=False)

1.4129060095515595

als_model = ALS(nonnegative=True, regParam=0.01, maxIter=20,
                rank=10, coldStartStrategy = "nan")
                
2.5177126250676602

als_model = ALS(nonnegative=True, regParam=0.01, maxIter=15,
                rank=10, coldStartStrategy = "nan")

In [None]:
# # Create array of predictions for violinplot
# data = [predictions_pd['prediction'][predictions_pd['rating'] == rating] for rating in range(1, 6)]

# plt.violinplot(data, range(1,6), showmeans=True)
# plt.xlabel('True Ratings')
# plt.ylabel('Predicted Ratings')
# plt.title('True vs. ALS Recommender Predicted Ratings')
# plt.show()