# Matrix Factorization

In [2]:
import pandas as pd
import numpy as np

## Reading Ratings Data

In [10]:
ratings_df = pd.read_csv('u.data', sep = '\t')

In [11]:
ratings_df

Unnamed: 0,196,242,3,881250949
0,186,302,3,891717742
1,22,377,1,878887116
2,244,51,2,880606923
3,166,346,1,886397596
4,298,474,4,884182806
...,...,...,...,...
99994,880,476,3,880175444
99995,716,204,5,879795543
99996,276,1090,1,874795795
99997,13,225,2,882399156


In [12]:
ratings_df.columns = ['userid', 'movieid', 'rating', 'timestamp']

In [13]:
ratings_df

Unnamed: 0,userid,movieid,rating,timestamp
0,186,302,3,891717742
1,22,377,1,878887116
2,244,51,2,880606923
3,166,346,1,886397596
4,298,474,4,884182806
...,...,...,...,...
99994,880,476,3,880175444
99995,716,204,5,879795543
99996,276,1090,1,874795795
99997,13,225,2,882399156


In [14]:
len(ratings_df.userid.unique())

943

In [None]:
len(ratings_df.movieid.unique())

## Reading the movies metadata

In [15]:
movies_df = pd.read_csv('u.item', 
                        encoding = 'iso-8859-1', 
                        sep = '|', 
                        header = None, 
                        usecols=[0, 1])

In [16]:
movies_df

Unnamed: 0,0,1
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [17]:
movies_df.columns = ['movieid', 'moviename']

## Creating user-movies ratings matrix

In [18]:
user_movies_df = ratings_df.pivot( index='userid', 
                                 columns='movieid', 
                                 values = "rating" ).reset_index(drop=True)
user_movies_df.index = ratings_df.userid.unique()

In [19]:
user_movies_df

movieid,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
186,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
22,4.0,,,,,,,,,2.0,...,,,,,,,,,,
244,,,,,,,,,,,...,,,,,,,,,,
166,,,,,,,,,,,...,,,,,,,,,,
298,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
936,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
930,5.0,,,,,,4.0,,,,...,,,,,,,,,,
920,,,,,,,,,,,...,,,,,,,,,,


### Matrix Factorization Methods

In [20]:
import numpy as np

def als_matrix_factorization(R, num_features, lambda_reg, iterations):
    """
    Perform matrix factorization using Alternating Least Squares (ALS) on the
    incomplete matrix R with NaN values and return the error.
    
    R: the input matrix with NaNs
    num_features: the number of latent features
    lambda_reg: the regularization parameter
    iterations: the number of iterations to perform
    """
    
    num_users, num_items = R.shape
    W = np.random.rand(num_users, num_features)
    H = np.random.rand(num_items, num_features).T

    mask = ~np.isnan(R)
    
    errors = []

    for _ in range(iterations):
        # Update W
        for i in range(num_users):
            H_i = H[:, mask[i, :]]
            R_i = R[i, mask[i, :]]
            W[i, :] = np.linalg.solve(H_i @ H_i.T + lambda_reg * np.eye(num_features), H_i @ R_i)

        # Update H
        for j in range(num_items):
            W_j = W[mask[:, j], :]
            R_j = R[mask[:, j], j]
            H[:, j] = np.linalg.solve(W_j.T @ W_j + lambda_reg * np.eye(num_features), W_j.T @ R_j)

        # Calculate the reconstruction error
        R_hat = W @ H
        error = np.nansum((R - R_hat)**2 * mask)
        errors.append(np.sqrt(error))

    return W, H.T, np.round(np.sqrt(errors), 4)

## Factorizing User-Movies Ratings Matrix

In [21]:
num_features = 20
lambda_reg = 0.1
iterations = 200

W, H, errors = als_matrix_factorization(user_movies_df.to_numpy(), num_features, lambda_reg, iterations)

print("W (User Feature Matrix):")
print(W)
print("\nH (Item Feature Matrix):")
print(H)

W (User Feature Matrix):
[[ 0.4048701  -0.03897007  0.50488156 ...  0.66346704  0.69960462
   0.4948188 ]
 [ 0.85211913  0.15926745  0.34472401 ...  0.59342213  0.26531029
   0.67994607]
 [-1.2362301   0.95480561  2.09770818 ...  1.71715104 -1.23802506
   0.7432744 ]
 ...
 [ 0.84008572  1.64170859  0.55931652 ... -0.06492252  0.64971032
   0.21596466]
 [ 1.06164979  0.89937505  0.49527804 ...  0.34135776 -0.26156277
   0.82792881]
 [-1.16510724  1.58107538  1.03049978 ...  0.21606837  1.47092642
   0.29228967]]

H (Item Feature Matrix):
[[ 1.0803216   0.37125713  0.48613946 ...  0.40395634  0.55026939
   0.5834023 ]
 [ 0.42007843  0.23254991  1.05162204 ...  0.78337399  0.39997708
   0.41934519]
 [-0.30843466  0.37378707  0.49114105 ...  1.50745578 -0.63087839
  -0.2361849 ]
 ...
 [-0.13931799 -0.12655973  0.11257708 ...  0.06054179  0.0351915
   0.07101194]
 [ 0.49866402  0.15089852 -0.08598382 ...  0.23003688  0.29721033
   0.20645387]
 [-0.12266008  0.36536825  0.18735598 ...  0.303

In [22]:
errors

array([15.3957, 14.4045, 13.9902, 13.755 , 13.602 , 13.4942, 13.414 ,
       13.3512, 13.3003, 13.258 , 13.2222, 13.1914, 13.1649, 13.142 ,
       13.1221, 13.1046, 13.089 , 13.0749, 13.0621, 13.0502, 13.0392,
       13.029 , 13.0194, 13.0105, 13.0022, 12.9944, 12.987 , 12.9801,
       12.9736, 12.9674, 12.9615, 12.956 , 12.9508, 12.9458, 12.9411,
       12.9366, 12.9324, 12.9283, 12.9243, 12.9204, 12.9167, 12.913 ,
       12.9094, 12.9059, 12.9025, 12.8992, 12.896 , 12.8929, 12.89  ,
       12.8871, 12.8844, 12.8819, 12.8794, 12.877 , 12.8747, 12.8724,
       12.8703, 12.8682, 12.8662, 12.8643, 12.8625, 12.8607, 12.859 ,
       12.8573, 12.8557, 12.8541, 12.8525, 12.851 , 12.8496, 12.8481,
       12.8467, 12.8453, 12.8439, 12.8425, 12.8412, 12.8398, 12.8385,
       12.8372, 12.8359, 12.8346, 12.8334, 12.8322, 12.8309, 12.8297,
       12.8286, 12.8274, 12.8262, 12.8251, 12.824 , 12.8229, 12.8218,
       12.8207, 12.8196, 12.8186, 12.8176, 12.8165, 12.8155, 12.8146,
       12.8136, 12.8

In [23]:
W.shape

(943, 20)

In [24]:
H.shape

(1682, 20)

## Finding Similarity

In [25]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

movies_sim = 1 - pairwise_distances( H, metric="cosine" )
movies_sim_df = pd.DataFrame( movies_sim )

In [26]:
def get_similar_movies( movieid, topN = 5 ):
    movieidx = movies_df[movies_df.movieid == movieid].index[0]
    movies_df['similarity'] = movies_sim_df.iloc[movieidx]
    top_n = movies_df.sort_values( ["similarity"], ascending = False )[0:topN]   
    return top_n 

In [27]:
movies_sim_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,1.000000,0.723175,0.367188,0.746436,0.669365,0.307819,0.816264,0.855486,0.768247,0.568206,...,0.380636,0.607046,0.749838,0.749838,0.652782,0.461615,0.461615,0.461615,0.691825,0.682039
1,0.723175,1.000000,0.417101,0.594961,0.745721,0.037600,0.655613,0.647362,0.567495,0.472903,...,0.367450,0.504534,0.589281,0.589281,0.354466,0.270961,0.270961,0.270961,0.520745,0.486750
2,0.367188,0.417101,1.000000,0.370390,0.346888,0.323814,0.336741,0.182388,0.396329,0.130109,...,0.534088,0.392531,0.472827,0.472827,0.205100,0.481140,0.481140,0.481140,0.283053,0.470869
3,0.746436,0.594961,0.370390,1.000000,0.325153,0.173771,0.845970,0.720694,0.708542,0.568607,...,0.348235,0.522582,0.789448,0.789448,0.511646,0.526838,0.526838,0.526838,0.555941,0.728632
4,0.669365,0.745721,0.346888,0.325153,1.000000,0.139061,0.460681,0.558424,0.564397,0.520833,...,0.483753,0.485362,0.347313,0.347313,0.345799,0.327414,0.327414,0.327414,0.325104,0.473953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0.461615,0.270961,0.481140,0.526838,0.327414,0.582535,0.578398,0.369835,0.462095,0.281551,...,0.301722,0.339277,0.492706,0.492706,0.477775,1.000000,1.000000,1.000000,0.450609,0.641436
1678,0.461615,0.270961,0.481140,0.526838,0.327414,0.582535,0.578398,0.369835,0.462095,0.281551,...,0.301722,0.339277,0.492706,0.492706,0.477775,1.000000,1.000000,1.000000,0.450609,0.641436
1679,0.461615,0.270961,0.481140,0.526838,0.327414,0.582535,0.578398,0.369835,0.462095,0.281551,...,0.301722,0.339277,0.492706,0.492706,0.477775,1.000000,1.000000,1.000000,0.450609,0.641436
1680,0.691825,0.520745,0.283053,0.555941,0.325104,0.185853,0.636929,0.672852,0.609521,0.298040,...,0.182244,0.224495,0.467850,0.467850,0.607739,0.450609,0.450609,0.450609,1.000000,0.515208


## Finding Similar Movies

In [28]:
movies_df[movies_df.movieid == 127]

Unnamed: 0,movieid,moviename
126,127,"Godfather, The (1972)"


In [36]:
get_similar_movies(127)

Unnamed: 0,movieid,moviename,similarity
126,127,"Godfather, The (1972)",1.0
186,187,"Godfather: Part II, The (1974)",0.958585
179,180,Apocalypse Now (1979),0.937371
356,357,One Flew Over the Cuckoo's Nest (1975),0.93592
1481,1482,"Gate of Heavenly Peace, The (1995)",0.919086


In [37]:
get_similar_movies(222)

Unnamed: 0,movieid,moviename,similarity
221,222,Star Trek: First Contact (1996),1.0
209,210,Indiana Jones and the Last Crusade (1989),0.94011
163,164,"Abyss, The (1989)",0.919207
379,380,Star Trek: Generations (1994),0.905385
203,204,Back to the Future (1985),0.901694


In [59]:
get_similar_movies(118)

Unnamed: 0,movieid,moviename,similarity
117,118,Twister (1996),1.0
120,121,Independence Day (ID4) (1996),0.961934
221,222,Star Trek: First Contact (1996),0.897101
163,164,"Abyss, The (1989)",0.890772
392,393,Mrs. Doubtfire (1993),0.887616


In [65]:
get_similar_movies(465)

Unnamed: 0,movieid,moviename,similarity
464,465,"Jungle Book, The (1994)",1.0
417,418,Cinderella (1950),0.923353
141,142,Bedknobs and Broomsticks (1971),0.87428
587,588,Beauty and the Beast (1991),0.863984
70,71,"Lion King, The (1994)",0.852538
