In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import datetime
plt.style.use('dark_background')
import sys
sys.path.insert(1, '/home/mauricio/code/mcr')
from mcr.util import glimpse, plot_value_counts, plot_value_counts_timeseries, missing_report, plot_missing, plot_unique, plot_duplicates, size

from pyspark import SparkContext
# SparkContext.getOrCreate(conf: Optional[pyspark.conf.SparkConf] = None) -> 'SparkContext'
sc = SparkContext.getOrCreate()
# sc.setLogLevel('DEBUG')

from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').appName('spark_application').getOrCreate()
print(spark.version)

from pyspark.sql import functions as F
from pyspark.sql.types import *

23/04/26 13:09:39 WARN Utils: Your hostname, rig resolves to a loopback address: 127.0.1.1; using 192.168.0.105 instead (on interface enp6s0)
23/04/26 13:09:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/26 13:09:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/26 13:09:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
3.3.2


# Matrix multiplication

In [2]:
U = np.array([[1,2,3],
              [4,5,6],
              [7,8,9]])
P = np.array([[9,8,7],
              [6,5,4],
              [3,2,1]])

In [3]:
NP = np.array([[1*9+2*6+3*3, 1*8+2*5+3*2, 1*7+2*4+3*1],
              [4*9+5*6+6*3, 4*8+5*5+6*2, 4*7+5*4+6*1],
              [7*9+8*6+9*3, 7*8+8*5+9*2, 7*7+8*4+9*1]])
NP

array([[ 30,  24,  18],
       [ 84,  69,  54],
       [138, 114,  90]])

In [4]:
NP = np.array([
    [U[0,0]*P[0,0] + U[0,1]*P[1,0] + U[0,2]*P[2,0], U[0,0]*P[0,1] + U[0,1]*P[1,1] + U[0,2]*P[2,1], U[0,0]*P[0,2] + U[0,1]*P[1,2] + U[0,2]*P[2,2]],
    [U[1,0]*P[0,0] + U[1,1]*P[1,0] + U[1,2]*P[2,0], U[1,0]*P[0,1] + U[1,1]*P[1,1] + U[1,2]*P[2,1], U[1,0]*P[0,2] + U[1,1]*P[1,2] + U[1,2]*P[2,2]],
    [U[2,0]*P[0,0] + U[2,1]*P[1,0] + U[2,2]*P[2,0], U[2,0]*P[0,1] + U[2,1]*P[1,1] + U[2,2]*P[2,1], U[2,0]*P[0,2] + U[2,1]*P[1,2] + U[2,2]*P[2,2]]
])
NP

array([[ 30,  24,  18],
       [ 84,  69,  54],
       [138, 114,  90]])

In [5]:
U @ P

array([[ 30,  24,  18],
       [ 84,  69,  54],
       [138, 114,  90]])

In [6]:
np.matmul(U, P)

array([[ 30,  24,  18],
       [ 84,  69,  54],
       [138, 114,  90]])

In [7]:
np.dot(U, P)

array([[ 30,  24,  18],
       [ 84,  69,  54],
       [138, 114,  90]])

# Overview of matrix factorization

Matrix factorization, or matrix decomposition, is essentially the opposite of matrix multiplication. Rather than multiplying two matrices together to get one new matrix.

Matrix factorization splits a matrix into two or more matrices which, when multiplied back together, produce an approximation of the original matrix.

There are several different mathematical approaches for this, each of which has a different application. We aren't going to go into any of that here, we are simply going to review the factorization that ALS performs.

Used in the context of collaborative filtering, ALS uses a factorization called non-negative matrix factorization (NMF). Because matrix factorization generally returns only approximations of the original matrix, in some cases, they can return negative values in the factor matrices, even when attempting to predict positive values. When predicting what rating a user will give to an item, negative values don't really make sense. Neither do they make sense in the context of latent features.

**For this reason, the version of ALS that we will use will require that the factorization return only positive values**

## Exercises

### Non-negative matrix factorization

It's possible for one matrix to have two equally close factorizations where one has all positive values and the other has some negative values.

The matrix M has been factored twice using two different factorizations. Take a look at each pair of factor matrices L and U, and W and H to see the differences. Then use their products to see that they produce essentially the same product.

In [8]:
def getRMSE(pred, actual):
    """
    Returns RMSE between predictions and actual observations
    
    Parameters: 
        predictions: pandas dataframe of value predictions
        actual values: pandas dataframe of actual values that predictions are trying to predict
        
    Returns: RMSE value in decimal format
    """
    RMSE =  (((pred - actual)**2).sum().sum()/(pred.shape[0]*pred.shape[1]))**.5
    return round(RMSE,3)

In [9]:
M = pd.DataFrame(np.array([[1, 2, 1, 2],
                           [0, 0, 0, 0],
                           [1, 2, 2, 1],
                           [0, 0, 0, 0]]))
L = pd.DataFrame(np.array([[ 1.        ,  0.        ,  0.        ,  0.        ],
                           [ 0.01      , -0.42105263,  0.09831579,  1.        ],
                           [ 1.        ,  0.        ,  1.        ,  0.        ],
                           [ 0.1       ,  1.        ,  0.        ,  0.        ]]))
U = pd.DataFrame(np.array([[ 1.        ,  2.        ,  1.        ,  2.        ],
                           [ 0.        , -0.19      , -0.099     , -0.198     ],
                           [ 0.        ,  0.        ,  1.        , -1.        ],
                           [ 0.        ,  0.        ,  0.        ,  0.19494737]]))
W = pd.DataFrame(np.array([[2.61, 0.24, 0.  , 0.12],
                           [0.  , 0.05, 0.02, 0.17],
                           [1.97, 0.  , 0.58, 0.83],
                           [0.05, 0.  , 0.  , 0.  ]]))
H = pd.DataFrame(np.array([[0.38, 0.65, 0.34, 0.41],
                           [0.  , 1.2 , 0.15, 3.72],
                           [0.42, 1.09, 1.38, 0.07],
                           [0.  , 0.11, 0.65, 0.17]]))

In [10]:
# View the L, U, W, and H matrices.
print("Matrices L and U:") 
print(L)
print(U)
print('Matrix multiplication LU:')
print(L@U)
print('Matrix M:')
print(M)
# Calculate RMSE between LU and M
print("RMSE of LU: ", getRMSE(L@U, M))

Matrices L and U:
      0         1         2    3
0  1.00  0.000000  0.000000  0.0
1  0.01 -0.421053  0.098316  1.0
2  1.00  0.000000  1.000000  0.0
3  0.10  1.000000  0.000000  0.0
     0     1      2         3
0  1.0  2.00  1.000  2.000000
1  0.0 -0.19 -0.099 -0.198000
2  0.0  0.00  1.000 -1.000000
3  0.0  0.00  0.000  0.194947
Matrix multiplication LU:
      0     1      2      3
0  1.00  2.00  1.000  2.000
1  0.01  0.10  0.150  0.200
2  1.00  2.00  2.000  1.000
3  0.10  0.01  0.001  0.002
Matrix M:
   0  1  2  3
0  1  2  1  2
1  0  0  0  0
2  1  2  2  1
3  0  0  0  0
RMSE of LU:  0.072


In [11]:
print("Matrices W and H:")
print(W)
print(H)
print('Matrix multiplication WH:')
print(W@H)
print('Matrix M:')
print(M)
# Calculate RMSE between LU and M
print("RMSE of WH: ", getRMSE(W@H, M))

Matrices W and H:
      0     1     2     3
0  2.61  0.24  0.00  0.12
1  0.00  0.05  0.02  0.17
2  1.97  0.00  0.58  0.83
3  0.05  0.00  0.00  0.00
      0     1     2     3
0  0.38  0.65  0.34  0.41
1  0.00  1.20  0.15  3.72
2  0.42  1.09  1.38  0.07
3  0.00  0.11  0.65  0.17
Matrix multiplication WH:
        0       1       2       3
0  0.9918  1.9977  1.0014  1.9833
1  0.0084  0.1005  0.1456  0.2163
2  0.9922  2.0040  2.0097  0.9894
3  0.0190  0.0325  0.0170  0.0205
Matrix M:
   0  1  2  3
0  1  2  1  2
1  0  0  0  0
2  1  2  2  1
3  0  0  0  0
RMSE of WH:  0.071


# How ALS alternates to generate predictions

In [12]:
# Read data from CSV file
ratings = spark.read.csv('ratings.csv',
                         sep=',',
                         header=True,
                         inferSchema=True)

# Get number of records
print("The data contain %d records." % ratings.count())

# View the first five records
ratings.show(5)

# Check column data types
print(ratings.dtypes)

The data contain 100004 records.
+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|     31|   2.5|1260759144|
|     1|   1029|   3.0|1260759179|
|     1|   1061|   3.0|1260759182|
|     1|   1129|   2.0|1260759185|
|     1|   1172|   4.0|1260759205|
+------+-------+------+----------+
only showing top 5 rows

[('userId', 'int'), ('movieId', 'int'), ('rating', 'double'), ('timestamp', 'int')]


> In total, there are 671 users and 9066 movies. Of the 6.1 million possible ratings we could have in this matrix with this many users and movies, we only have about 100,000. That means that 98% percent of the matrix is totally blank. This makes sense because 9066 movies is far too many movies for any normal person to watch in their lifetime. One of the benefits of ALS is that it works well with sparse matrices like this.

In [13]:
movie_count = ratings.select('movieId').distinct().count()
print(f'{movie_count=}')
user_count = ratings.select('userId').distinct().count()
print(f'{user_count=}')
print(f'Possible ratings {movie_count*user_count:,}')

movie_count=9066
user_count=671
Possible ratings 6,083,286


In [14]:
rating_count = ratings.count()
print(f'{rating_count=:,} ({rating_count/(movie_count*user_count):.2f}%)')

rating_count=100,004 (0.02%)


Now, the first thing ALS does with a matrix like this is factor it into two different matrices.

$$R-> U*P$$

Remember that factorizations like this produce two matrices which, when multiplied back together, produce an approximation of the original matrix.

In order to get the closest approximation of the original matrix R, ALS first fills in the factor matrices with random numbers and then makes slight adjustments to the matrices one at a time until it has the best approximation possible.

In other words, ALS holds the matrix R and the matrix U constant, and makes adjustments to the matrix P. It then multiplies the two factor matrices together to see how far the predictions are from the original matrix using root means squared error or RMSE as an error metric.

The RMSE basically tells you, on average, how far off your predictions are from the actual values. We'll talk more about this later in the course. Note that in calculating the RMSE, only the values that existed in the original matrix are considered. The missing values are not considered.

It then holds P and R constant and adjusts values in the matrix U. The RMSE is calculated again, and ALS again switches, and calculates the RMSE again. ALS will continue to iterate until instructed to stop, at which point, ALS has the best possible approximation of the original matrix R.

The beauty of all of this is that when the RMSE is fully minimized, ALS simply multiplies the matrices back together, and the blank cells are filled in with predictions.

In other words, when we take a sparse matrix and factor it into two matrices $R -> U*P$, every rating in the original matrix must have a respective row and column full of values in the respective factor matrices that can be multiplied back together to approximate that original value. And since there is at least one rating in every row and at least one rating in every column of the original matrix, when ALS creates the two respective factor matrices, there are values in every cell of the two factor matrices, which allows us to then create predictions for the previously blank spaces.

So when ALS iterates to make sure that it’s resulting product is as close to those original cells as possible, the result is that the previously blank cells are now filled in with values that are based on how each user has behaved in the past relative to the behavior of similar users. 

## Exercises

### Estimating recommendations

In [15]:
U = pd.DataFrame(np.array([[0.8, 0.01, 0.3, 0.8],
                           [0.4, 0.01, 0.06, 0.2],
                           [0.05, 2.1, 0.01, 2.2],
                           [0.3, 0.01, 0.2, 0.2],
                           [0.1, 1.5, 0.9, 0.0],
                           [0.0, 0.03, 0.4, 0.5],
                           [0.01, 0.02, 0.66, 0.4],
                           [0.9, 0.7, 0.0, 1.0],
                           [1.0, 2.0, 0.04, 0.2]]),
                 index=['User_1', 'User_2', 'User_3', 'User_4', 'User_5', 'User_6', 'User_7', 'User_8', 'User_9'],
                 columns=['U_LF_1', 'U_LF_2', 'U_LF_3', 'U_LF_4'])
U

Unnamed: 0,U_LF_1,U_LF_2,U_LF_3,U_LF_4
User_1,0.8,0.01,0.3,0.8
User_2,0.4,0.01,0.06,0.2
User_3,0.05,2.1,0.01,2.2
User_4,0.3,0.01,0.2,0.2
User_5,0.1,1.5,0.9,0.0
User_6,0.0,0.03,0.4,0.5
User_7,0.01,0.02,0.66,0.4
User_8,0.9,0.7,0.0,1.0
User_9,1.0,2.0,0.04,0.2


In [16]:
P = pd.DataFrame(np.array([[0.5, 0.1, 0.4, 1.1],
                           [0.2, 2.0, 0.0, 0.01],
                           [0.3, 1.9, 0.6, 0.9],
                           [1.0, 0.2, 1.0, 0.89]]),
                 index=["P_LF_1", "P_LF_2", "P_LF_3", "P_LF_4"],
                 columns=["Movie_1", "Movie_2", "Movie_3", "Movie_4"])
P

Unnamed: 0,Movie_1,Movie_2,Movie_3,Movie_4
P_LF_1,0.5,0.1,0.4,1.1
P_LF_2,0.2,2.0,0.0,0.01
P_LF_3,0.3,1.9,0.6,0.9
P_LF_4,1.0,0.2,1.0,0.89


Looking at U and P, which movie do you think will have the highest recommendation for User_3.

In [17]:
# Multiply factor matrices
UP = np.matmul(U.values, P.values)

# Convert to pandas DataFrame
pd.DataFrame(UP, columns = P.columns, index = U.index).style.background_gradient(axis=None)

Unnamed: 0,Movie_1,Movie_2,Movie_3,Movie_4
User_1,1.292,0.83,1.3,1.8621
User_2,0.42,0.214,0.396,0.6721
User_3,2.648,4.664,2.226,2.043
User_4,0.412,0.47,0.44,0.6881
User_5,0.62,4.72,0.58,0.935
User_6,0.626,0.92,0.74,0.8053
User_7,0.607,1.375,0.8,0.9612
User_8,1.59,1.69,1.36,1.887
User_9,1.112,4.216,0.624,1.334


### RMSE and ALS alternates

In [18]:
T = pd.DataFrame({0: {0: 1.292, 1: 0.42, 2: 0.08, 3: 0.412, 4: 0.62, 5: 0.626, 6: 0.0, 7: 1.59, 8: 0.0}, 1: {0: 0.0, 1: 0.0, 2: 4.664, 3: 0.47, 4: 0.0, 5: 0.0, 6: 1.375, 7: 1.69, 8: 4.216}, 2: {0: 1.3, 1: 0.396, 2: 2.226, 3: 0.0, 4: 0.58, 5: 0.0, 6: 0.8, 7: 1.36, 8: 0.624}, 3: {0: 0.0, 1: 0.6721, 2: 2.043, 3: 0.0, 4: 0.935, 5: 0.8053, 6: 0.9612, 7: 0.0, 8: 0.0}})
F1 = pd.DataFrame({0: {0: 2, 1: 1, 2: 1, 3: 1, 4: 3, 5: 4, 6: 2, 7: 4, 8: 4}, 1: {0: 4, 1: 3, 2: 4, 3: 4, 4: 3, 5: 2, 6: 4, 7: 3, 8: 1}, 2: {0: 3, 1: 2, 2: 4, 3: 4, 4: 1, 5: 4, 6: 3, 7: 3, 8: 3}, 3: {0: 3, 1: 1, 2: 3, 3: 3, 4: 3, 5: 1, 6: 4, 7: 4, 8: 2}})
F2 = pd.DataFrame({0: {0: 0.7276845605456583, 1: 0.2972162515949431, 2: 0.6684103841772332, 3: 0.10356292108571152, 4: 0.2788049127462368, 5: 0.1833175716784896, 6: 0.0, 7: 0.9819681824878523, 8: 0.0}, 1: {0: 0.0, 1: 0.0, 2: 5.133314382547687, 3: 0.4494655185366921, 4: 0.0, 5: 0.0, 6: 1.817580601657848, 7: 2.720610558441755, 8: 2.667366674847455}, 2: {0: 0.7816137663665544, 1: 0.2898334245675139, 2: 2.090947607279091, 3: 0.0, 4: 0.5643273601209946, 5: 0.0, 6: 0.7181012176124341, 7: 1.029218987922719, 8: 0.8200154534789706}, 3: {0: 0.0, 1: 0.1499720792763105, 2: 1.7276154180386931, 3: 0.0, 4: 0.32621995264108916, 5: 0.22984785075635933, 6: 0.6873269305813324, 7: 0.0, 8: 0.0}})
F3 = pd.DataFrame({0: {0: 1.492244544405576, 1: 0.3971790359091946, 2: 0.1412048503383586, 3: 0.1092555086372637, 4: 0.6192730100600089, 5: 0.37792092895633184, 6: 0.0, 7: 1.391170523003995, 8: 0.0}, 1: {0: 0.0, 1: 0.0, 2: 4.940441255371812, 3: 0.3430329338282129, 4: 0.0, 5: 0.0, 6: 1.3506756720195645, 7: 1.8109451554711358, 8: 3.820890212142733}, 2: {0: 0.9906046679345504, 1: 0.43573289233260093, 2: 1.8814918023645575, 3: 0.0, 4: 0.6868592910926297, 5: 0.0, 6: 0.6762102891509367, 7: 1.3902807929383285, 8: 1.1602964864686154}, 3: {0: 0.0, 1: 0.5634621047651994, 2: 1.9051201248296687, 3: 0.0, 4: 0.9732927573503258, 5: 0.6082012051404062, 6: 0.8832044907058253, 7: 0.0, 8: 0.0}})
F4 = pd.DataFrame({0: {0: 1.2328116960563509, 1: 0.3086750565288739, 2: 0.39914187738232604, 3: 0.2818453154193397, 4: 0.5314547994330507, 5: 0.3191773892685046, 6: 0.0, 7: 1.6153841993425948, 8: 0.0}, 1: {0: 0.0, 1: 0.0, 2: 4.868624864462818, 3: 0.41975332506620483, 4: 0.0, 5: 0.0, 6: 1.4883573519834, 7: 1.5536205404463495, 8: 4.01875305549703}, 2: {0: 0.9634322034866029, 1: 0.40656803594098884, 2: 1.9875103148593356, 3: 0.0, 4: 0.6268334544838284, 5: 0.0, 6: 0.7192897770862041, 7: 1.5268895114218426, 8: 0.8568217198229018}, 3: {0: 0.0, 1: 0.8277327520777978, 2: 1.6734205007276095, 3: 0.0, 4: 1.098207680580611, 5: 0.8558954578032543, 6: 0.8075196202801012, 7: 0.0, 8: 0.0}})
F5 = pd.DataFrame({0: {0: 1.1918276586761334, 1: 0.48296043557884366, 2: 0.27082246932783494, 3: 0.3648945271982186, 4: 0.7038666570625396, 5: 0.8291173813793535, 6: 0.0, 7: 1.4762825249091456, 8: 0.0}, 1: {0: 0.0, 1: 0.0, 2: 4.665225281243459, 3: 0.4663077770482654, 4: 0.0, 5: 0.0, 6: 1.350147885750201, 7: 1.6329724133262897, 8: 4.086277241219637}, 2: {0: 0.7569023614052337, 1: 0.46694562856708427, 2: 2.179591287033028, 3: 0.0, 4: 0.6615227944651105, 5: 0.0, 6: 0.78887516819961, 7: 1.4576973259168429, 8: 0.7357222893419664}, 3: {0: 0.0, 1: 0.6349129096467335, 2: 2.058922581112409, 3: 0.0, 4: 0.8960993494430277, 5: 0.41369289143304633, 6: 1.0276194917678194, 7: 0.0, 8: 0.0}})
F6 = pd.DataFrame({0: {0: 1.3925335702488837, 1: 0.4484334008021943, 2: 0.08601778758214597, 3: 0.24883534923607198, 4: 0.6595383383218987, 5: 0.611369099120054, 6: 0.0, 7: 1.5109305267866935, 8: 0.0}, 1: {0: 0.0, 1: 0.0, 2: 4.6288782331984715, 3: 0.43708137302705913, 4: 0.0, 5: 0.0, 6: 1.3386551785475418, 7: 1.681586567710403, 8: 4.21410842023286}, 2: {0: 1.1909538847011532, 1: 0.38891205376055604, 2: 2.2059278309456136, 3: 0.0, 4: 0.5709599432288439, 5: 0.0, 6: 0.8599375017858101, 7: 1.4450324091551967, 8: 0.6293274706160811}, 3: {0: 0.0, 1: 0.6541628872016827, 2: 2.0519239375916167, 3: 0.0, 4: 0.9099036222343462, 5: 0.8155947107816741, 6: 0.9447891763067716, 7: 0.0, 8: 0.0}})

In [19]:
{f'F{i+1}':getRMSE(preds, T) for i, preds in enumerate([F1,F2,F3,F4,F5,F6])}

{'F1': 2.479, 'F2': 0.439, 'F3': 0.176, 'F4': 0.152, 'F5': 0.132, 'F6': 0.045}

# Data preparation for Spark ALS

In [20]:
ratings.show(5)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|     31|   2.5|1260759144|
|     1|   1029|   3.0|1260759179|
|     1|   1061|   3.0|1260759182|
|     1|   1129|   2.0|1260759185|
|     1|   1172|   4.0|1260759205|
+------+-------+------+----------+
only showing top 5 rows



In [21]:
ratings.where('movieId < 7').groupBy('userId').pivot('movieId').agg(F.coalesce(F.first('rating'))).show(5)

+------+----+----+----+----+----+----+
|userId|   1|   2|   3|   4|   5|   6|
+------+----+----+----+----+----+----+
|   471| 3.5| 3.0|null|null|null|null|
|   496|null| 4.0| 3.0|null|null| 5.0|
|   463| 3.0|null|null|null|null|null|
|   623| 4.5|null|null|null|null|null|
|   516|null|null| 3.0|null| 3.0| 4.0|
+------+----+----+----+----+----+----+
only showing top 5 rows



## Steps to get integer ID's
1. Extract unique userIds and movieIds
2. Assign unique integers to each id
3. Rejoin unique integer id's back to the ratings data

In [22]:
# Extracting distinct user IDs
users = ratings.select('userId').distinct()
users.show(3)

# Coalesce method to avoid duplicate ids in distinct partitions
users = users.coalesce(1)

# Persist method to keep these values thwe same across all dataframe operations
users = users.withColumn("userIntId", F.monotonically_increasing_id()).persist()
users.show(3)

+------+
|userId|
+------+
|   148|
|   463|
|   471|
+------+
only showing top 3 rows

+------+---------+
|userId|userIntId|
+------+---------+
|   148|        0|
|   463|        1|
|   471|        2|
+------+---------+
only showing top 3 rows



## Movie integer IDs

In [23]:
movies = ratings.select("movieId").distinct()
movies = movies.coalesce(1)
movies = movies.withColumn("movieIdInt", F.monotonically_increasing_id()).persist()
movies.show(3)

+-------+----------+
|movieId|movieIdInt|
+-------+----------+
|   1580|         0|
|   2659|         1|
|   3794|         2|
+-------+----------+
only showing top 3 rows



## Joining UserIds and MovieIds

In [24]:
ratings.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|     31|   2.5|1260759144|
|     1|   1029|   3.0|1260759179|
|     1|   1061|   3.0|1260759182|
|     1|   1129|   2.0|1260759185|
|     1|   1172|   4.0|1260759205|
|     1|   1263|   2.0|1260759151|
|     1|   1287|   2.0|1260759187|
|     1|   1293|   2.0|1260759148|
|     1|   1339|   3.5|1260759125|
|     1|   1343|   2.0|1260759131|
|     1|   1371|   2.5|1260759135|
|     1|   1405|   1.0|1260759203|
|     1|   1953|   4.0|1260759191|
|     1|   2105|   4.0|1260759139|
|     1|   2150|   3.0|1260759194|
|     1|   2193|   2.0|1260759198|
|     1|   2294|   2.0|1260759108|
|     1|   2455|   2.5|1260759113|
|     1|   2968|   1.0|1260759200|
|     1|   3671|   3.0|1260759117|
+------+-------+------+----------+
only showing top 20 rows



In [25]:
ratings_w_int_ids = ratings.join(users, "userId", "left").join(movies, "movieId", "left")
ratings_w_int_ids.show()

+-------+------+------+----------+---------+----------+
|movieId|userId|rating| timestamp|userIntId|movieIdInt|
+-------+------+------+----------+---------+----------+
|     31|     1|   2.5|1260759144|      146|        88|
|   1029|     1|   3.0|1260759179|      146|      2919|
|   1061|     1|   3.0|1260759182|      146|      1484|
|   1129|     1|   2.0|1260759185|      146|      7456|
|   1172|     1|   4.0|1260759205|      146|      1955|
|   1263|     1|   2.0|1260759151|      146|      3636|
|   1287|     1|   2.0|1260759187|      146|      4852|
|   1293|     1|   2.0|1260759148|      146|      4161|
|   1339|     1|   3.5|1260759125|      146|        89|
|   1343|     1|   2.0|1260759131|      146|      5705|
|   1371|     1|   2.5|1260759135|      146|      8279|
|   1405|     1|   1.0|1260759203|      146|      5827|
|   1953|     1|   4.0|1260759191|      146|      7229|
|   2105|     1|   4.0|1260759139|      146|      3086|
|   2150|     1|   3.0|1260759194|      146|    

In [26]:
ratings_data = ratings_w_int_ids.select(F.col("userIntId").alias("userid"),
                                        F.col("movieIdInt").alias("movieId"),
                                        F.col("rating"))
ratings_data.show(3)

+------+-------+------+
|userid|movieId|rating|
+------+-------+------+
|   146|     88|   2.5|
|   146|   2919|   3.0|
|   146|   1484|   3.0|
+------+-------+------+
only showing top 3 rows



In [27]:
ratings_data.rdd.getNumPartitions()

1

## Exercises

In [28]:
def to_long(df, by=["User"]):
    """
    Converts traditional or "wide" dataframe into a "row-based" dataframe, also known as a "dense" or "long" dataframe.

    Parameters:
      - df: array of columns with column names
      - by: name of column which serves as

    Returns: Row-based dataframe with no null values
    """
    cols = [c for c in df.columns if c not in by]
    # Create and explode an array of (column_name, column_value) structs
    kvs = F.explode(
        F.array([F.struct(F.lit(c).alias("Movie"), F.col(c).alias("Rating")) for c in cols])
    ).alias("kvs")
    return (
        df.select(by + [kvs])
        .select(by + ["kvs.Movie", "kvs.Rating"])
        .filter("rating IS NOT NULL")
    )

### Correct format and distinct users

In [29]:
R = pd.DataFrame.from_dict({"Coco": {0: "4", 1: "5", 2: "2", 3: None},
                            "Shrek": {0: "3", 1: "4", 2: None, 3: "5"},
                            "Sneakers": {0: "3", 1: "2", 2: "5", 3: "2"},
                            "Swing Kids": {0: "4", 1: None, 2: "2", 3: "2"},
                            "User": {0: "James Alking",
                                     1: "Elvira Marroquin",
                                     2: "Jack Bauer",
                                     3: "Julia James"}})
R = spark.createDataFrame(R)
R.show()

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


+----+-----+--------+----------+----------------+
|Coco|Shrek|Sneakers|Swing Kids|            User|
+----+-----+--------+----------+----------------+
|   4|    3|       3|         4|    James Alking|
|   5|    4|       2|      null|Elvira Marroquin|
|   2| null|       5|         2|      Jack Bauer|
|null|    5|       2|         2|     Julia James|
+----+-----+--------+----------+----------------+



In [30]:
# Use the to_long() function to convert the dataframe to the "long" format.
ratings = to_long(R)
ratings.show()

+----------------+----------+------+
|            User|     Movie|Rating|
+----------------+----------+------+
|    James Alking|      Coco|     4|
|    James Alking|     Shrek|     3|
|    James Alking|  Sneakers|     3|
|    James Alking|Swing Kids|     4|
|Elvira Marroquin|      Coco|     5|
|Elvira Marroquin|     Shrek|     4|
|Elvira Marroquin|  Sneakers|     2|
|      Jack Bauer|      Coco|     2|
|      Jack Bauer|  Sneakers|     5|
|      Jack Bauer|Swing Kids|     2|
|     Julia James|     Shrek|     5|
|     Julia James|  Sneakers|     2|
|     Julia James|Swing Kids|     2|
+----------------+----------+------+



In [31]:
# Get unique users and repartition to 1 partition
users = ratings.select("User").distinct().coalesce(1)
users.show()

+----------------+
|            User|
+----------------+
|    James Alking|
|Elvira Marroquin|
|      Jack Bauer|
|     Julia James|
+----------------+



In [32]:
# Create a new column of unique integers called "userId" in the users dataframe.
users = users.withColumn("userId", F.monotonically_increasing_id()).persist()
users.show()

+----------------+------+
|            User|userId|
+----------------+------+
|Elvira Marroquin|     0|
|      Jack Bauer|     1|
|    James Alking|     2|
|     Julia James|     3|
+----------------+------+



### Assigning integer id's to movies

In [33]:
# Extract the distinct movie id's
movies = ratings.select("Movie").distinct()

# Repartition the data to have only one partition.
movies = movies.coalesce(1) 

# Create a new column of movieId integers. 
movies = movies.withColumn("movieId", F.monotonically_increasing_id()).persist() 

# Join the ratings, users and movies dataframes
movie_ratings = ratings.join(users, "User", "left").join(movies, "Movie", "left")
movie_ratings.show()

+----------+----------------+------+------+-------+
|     Movie|            User|Rating|userId|movieId|
+----------+----------------+------+------+-------+
|  Sneakers|    James Alking|     3|     2|      0|
|      Coco|    James Alking|     4|     2|      1|
|Swing Kids|    James Alking|     4|     2|      2|
|     Shrek|    James Alking|     3|     2|      3|
|  Sneakers|Elvira Marroquin|     2|     0|      0|
|      Coco|Elvira Marroquin|     5|     0|      1|
|     Shrek|Elvira Marroquin|     4|     0|      3|
|  Sneakers|      Jack Bauer|     5|     1|      0|
|      Coco|      Jack Bauer|     2|     1|      1|
|Swing Kids|      Jack Bauer|     2|     1|      2|
|  Sneakers|     Julia James|     2|     3|      0|
|Swing Kids|     Julia James|     2|     3|      2|
|     Shrek|     Julia James|     5|     3|      3|
+----------+----------------+------+------+-------+



# ALS parameters and hyperparameters

## Example ALS model code

In [34]:
from pyspark.ml.recommendation import ALS
?ALS

[0;31mInit signature:[0m
[0mALS[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrank[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m10[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmaxIter[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m10[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mregParam[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0.1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnumUserBlocks[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m10[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnumItemBlocks[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m10[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mimplicitPrefs[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0malpha[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m1.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muserCol[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'user'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0m

        als_model = ALS(
            userCol="userId",
            itemCol="movieId",
            ratingCol="rating",
            rank=25,
            maxIter=100,
            regParam=0.05,
            alpha=40,
            nonnegative=True,
            coldStartStrategy="drop",
            implicitPrefs=False,
        )

Arguments:
* userCol : Name of column that contains user id's
* itemCol : Name of column that contains item id's
* ratingCol : Name of column that contains ratings

Hyperparameters:
* rank , k: number of latent features
* maxIter : number of iterations
* regParam : Lambda is simply a number that is added to an error metric to keep the algorithm from converging too quickly and overfitting to the training data.
* alpha : Discussed later. Only used with implicit ratings, and not used with explicit ratings.

Additional Arguments:
* nonnegative = True : Ensures positive numbers (as ratings should be only positive in our case)
* coldStartStrategy = "drop" : Addresses issues with test/train split
* implicitPrefs = True : True/False depending on ratings type

## Cold start strategy

You might be familiar with the term coldStartStrategy already.

In the context of ALS, when splitting data into test and train sets, it's possible for a user to have all of their ratings inadvertantly put into the test set, leaving nothing in the train set to be used for making a prediction.

In this case, ALS can't make meaningful predictions for that user, or calculate an error metric.

To avoid this, we set the `coldStartStrategy="drop"` which tells Spark that when these situations arise, to not use them to calculate the RMSE, and to only use users that have ratings in both the test AND training set. 

## Exercises

### Build out an ALS model

In [35]:
# Read data from CSV file
ratings = spark.read.csv('ratings.csv',
                         sep=',',
                         header=True,
                         inferSchema=True).drop('timestamp')
ratings.show(3)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|     31|   2.5|
|     1|   1029|   3.0|
|     1|   1061|   3.0|
+------+-------+------+
only showing top 3 rows



In [36]:
# Split the ratings dataframe into training and test data
(training_data, test_data) = ratings.randomSplit([0.8, 0.2], seed=42)
training_data.show(3)
test_data.show(3)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|     31|   2.5|
|     1|   1029|   3.0|
|     1|   1129|   2.0|
+------+-------+------+
only showing top 3 rows

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|   1061|   3.0|
|     1|   1287|   2.0|
|     1|   1339|   3.5|
+------+-------+------+
only showing top 3 rows



In [37]:
# Set the ALS hyperparameters
from pyspark.ml.recommendation import ALS
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", rank =10, maxIter =15, regParam =.1,
          coldStartStrategy="drop", nonnegative=True, implicitPrefs=False)

In [38]:
# Fit the model to the training_data
model = als.fit(training_data)

In [39]:
# Generate predictions on the test_data
test_predictions = model.transform(test_data)
test_predictions.show(3)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   148|    185|   3.0| 3.1130092|
|   148|    364|   4.0|  4.105878|
|   148|    596|   4.5| 3.8943558|
+------+-------+------+----------+
only showing top 3 rows



In [40]:
test_predictions.describe().show()

+-------+------------------+------------------+-----------------+------------------+
|summary|            userId|           movieId|           rating|        prediction|
+-------+------------------+------------------+-----------------+------------------+
|  count|             19403|             19403|            19403|             19403|
|   mean|345.67453486574243|11384.022470751946|3.562232644436427| 3.386283258731197|
| stddev|193.14067426101184|24609.191823329118|1.050488277009394|0.7296737404411373|
|    min|                 1|                 1|              0.5|        0.08008587|
|    max|               671|            160563|              5.0|          5.791388|
+-------+------------------+------------------+-----------------+------------------+



### Build RMSE evaluator

In [41]:
# Import RegressionEvaluator
from pyspark.ml.evaluation import RegressionEvaluator

# Complete the evaluator code
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

# Extract the 3 parameters
print(evaluator.getMetricName())
print(evaluator.getLabelCol())
print(evaluator.getPredictionCol())

rmse
rating
prediction


### Get RMSE

In [42]:
# Evaluate the "test_predictions" dataframe
evaluator.evaluate(test_predictions)

0.9100062972496853

>  This RMSE means that on average, the model's test predictions are about .91 off from the true values.