In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF, PCA
#from lu_crtp import lu_crtp, compareApprox

In [7]:

def read_file():
    # Read data
    data = []
    with open('dataset/combined_data_1.txt', 'r') as file:
        current_movie_id = None
        for line in file:
            line = line.strip() 
            if ':' in line:
                # The file contains movies and their raiting from users
                # if there is a : in a line, it indiactes we reached the next movie
                current_movie_id = line.split(':')[0] 
            else:
                # Else the line contains a user id and a rating
                if line:
                    user_id, rating, _ = line.split(',')
                    data.append({
                        'MovieID': current_movie_id,
                        'UserID': user_id,
                        'Rating': rating
                    })

    # Create dataframe and change datatypes
    df = pd.DataFrame(data)
    df['MovieID'] = df['MovieID'].astype(int)
    df['UserID'] = df['UserID'].astype(int)
    df['Rating'] = df['Rating'].astype(float)
    return df

df = read_file()

In [8]:
# Change the structure to a user/item matrix
df = df.pivot(index='UserID', columns='MovieID', values='Rating')

# Fill NaN values
df.fillna(value=0, inplace=True)

In [35]:
from scipy.linalg import qr
import numpy as np

# input:
# A is a numpy matrix mxn
# P_c is a permutation matrix as an int ndarray of shape (N,)
# k is an integer
# output:
# Q_k is a numpy matrix
# R_k is a numpy matrix
def lu_crtp_4(A, P_c, k):
    AP_c = A[:, P_c]
    AP_c_selected_columns = AP_c[:, :k]
    Q_k, R_k = np.linalg.qr(AP_c_selected_columns)
    return Q_k, R_k

# input:
# a is a numpy matrix mxn
# k is an even integer
def lu_crtp(a, k):
    # According to https://epubs-siam-org.uaccess.univie.ac.at/doi/epdf/10.1137/13092157X QR_TP is the same
    # as RRQR. For a faster implementation, we therefore decided on using scipy's LAPACK interface.
    # Select k columns by using QR with tournament pivoting on A
    _, _, p_c = qr(a, pivoting=True)
    #p_c = p_c[:k]


    # Compute the thin QR factorization of the selected columns
    q_k, r_k = lu_crtp_4(a, p_c, k)
    print("q_k:")
    print(q_k.shape)
    print("r_k:")
    print(r_k.shape)


    # Select k rows by using QR with tournament pivoting on Q^T_k
    _, _, p_r = qr(q_k.T, pivoting=True)
    #p_r = p_r[:k]

    # Let A_ = P ...
    a_dash = a[p_r,:]
    a_dash = a_dash[:,p_c]
    rows, cols = a_dash.shape
    print("a_dash:")
    print(a_dash.shape)
    
    # Separate into block matrices
    a_dash_11 = a_dash[:k, :k]
    a_dash_21 = a_dash[k:, :k]
    a_dash_12 = a_dash[:k, k:]
    print("a_dash_11:")
    print(a_dash_11.shape)
    print("a_dash_21:")
    print(a_dash_21.shape)
    print("a_dash_12:")
    print(a_dash_12.shape)


    # Compute L_21
    inv_a_dash_11 = np.linalg.inv(a_dash_11)
    l_21 = np.dot(a_dash_21, inv_a_dash_11)
    print("l_21:")
    print(l_21.shape)

    
    # Stack the block matrices
    i = np.identity(k)
    l_k = np.vstack((i, l_21))
    print("l_k:")
    print(l_k.shape)
    print(l_k)
    u_k = np.hstack((a_dash_11, a_dash_12))
    print("u_k:")
    print(u_k.shape)
    print(u_k)
    return p_r, p_c, l_k, u_k, r_k

# returns true if A and the approximation is equal
def compareApprox(A, p_r, p_c, l_k, u_k):
    approx = np.dot(l_k, u_k)
    print("approx:")
    print(approx)
    PA = p_A = A[p_r, :]
    PA = p_A[:, p_c]
    print("PA:")
    print(PA)
    return np.allclose(approx, PA)



In [36]:
A = np.array([[0, 1, 0, 0, 1],
              [1, 0, 0, 1, 0],
              [0, 0, 0, 0, 1],
              [0, 0, 1, 2, 0],
              [2, 0, 0, 0, 0]])

A = np.array([[1, 2, 3, 4, 5],
                [6, 7, 8, 9, 10],
               [11, 12, 13, 14, 15],
               [16, 17, 18, 19, 20],
               [21, 22, 23, 24, 25]])


In [37]:
k = min(A.shape[0], A.shape[1])
print(f"mxn:{A.shape}")

while(k >= 1):
    print(f"\n{k=}:")
    p_r, p_c, l_k, u_k, r_k = lu_crtp(A, k)
    print(compareApprox(A, p_r, p_c, l_k, u_k))
    k = k - 1

mxn:(5, 5)

k=5:
q_k:
(5, 5)
r_k:
(5, 5)
a_dash:
(5, 5)
a_dash_11:
(5, 5)
a_dash_21:
(0, 5)
a_dash_12:
(5, 0)
l_21:
(0, 5)
l_k:
(5, 5)
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
u_k:
(5, 5)
[[25 21 24 22 23]
 [10  6  9  7  8]
 [15 11 14 12 13]
 [20 16 19 17 18]
 [ 5  1  4  2  3]]
approx:
[[25. 21. 24. 22. 23.]
 [10.  6.  9.  7.  8.]
 [15. 11. 14. 12. 13.]
 [20. 16. 19. 17. 18.]
 [ 5.  1.  4.  2.  3.]]
PA:
[[25 21 24 22 23]
 [10  6  9  7  8]
 [15 11 14 12 13]
 [20 16 19 17 18]
 [ 5  1  4  2  3]]
True

k=4:
q_k:
(5, 4)
r_k:
(4, 4)
a_dash:
(5, 5)
a_dash_11:
(4, 4)
a_dash_21:
(1, 4)
a_dash_12:
(4, 1)
l_21:
(1, 4)
l_k:
(5, 4)
[[   1.    0.    0.    0.]
 [   0.    1.    0.    0.]
 [   0.    0.    1.    0.]
 [   0.    0.    0.    1.]
 [   0.  -32.   -2. -128.]]
u_k:
(4, 5)
[[ 5  1  4  2  3]
 [20 16 19 17 18]
 [25 21 24 22 23]
 [10  6  9  7  8]]
approx:
[[ 5.000e+00  1.000e+00  4.000e+00  2.000e+00  3.000e+00]
 [ 2.000e+01  1.600e+01  1.900e+01  

In [10]:
n_components = 3

model = NMF(n_components=n_components, init='random', random_state=42)
W = model.fit_transform(A)
H = model.components_

approximation_matrix = np.dot(W, H)

print("Original Matrix:")
print(A)
print("\nApproximation Matrix (Rank {}):".format(n_components))
print(approximation_matrix)

Original Matrix:
[[0 1 0 0 1]
 [1 0 0 1 0]
 [0 0 0 0 1]
 [0 0 1 2 0]
 [2 0 0 0 0]]

Approximation Matrix (Rank 3):
[[0.00000000e+00 7.23610135e-01 0.00000000e+00 0.00000000e+00
  1.17082206e+00]
 [1.00522663e+00 0.00000000e+00 3.43730687e-01 8.55867430e-01
  1.46082005e-12]
 [2.62884237e-11 4.47211927e-01 0.00000000e+00 9.54715126e-13
  7.23601792e-01]
 [0.00000000e+00 0.00000000e+00 8.63110472e-01 2.05742113e+00
  0.00000000e+00]
 [1.99736957e+00 0.00000000e+00 0.00000000e+00 7.25383522e-02
  2.90262658e-12]]


In [11]:
n_components = 3

pca = PCA(n_components=n_components)
data_transformed = pca.fit_transform(A)

approximation_matrix = np.dot(data_transformed, pca.components_) + pca.mean_

# Print the original and approximation matrices
print("Original Matrix:")
print(A)
print("\nApproximation Matrix (Rank {}):".format(n_components))
print(approximation_matrix)

Original Matrix:
[[0 1 0 0 1]
 [1 0 0 1 0]
 [0 0 0 0 1]
 [0 0 1 2 0]
 [2 0 0 0 0]]

Approximation Matrix (Rank 3):
[[ 4.02744740e-04  9.99930996e-01  2.88912811e-03 -1.09518431e-03
   1.00034622e+00]
 [ 1.04011704e+00 -6.87344053e-03  2.87783476e-01  8.90909667e-01
   3.44866440e-02]
 [-2.42346703e-03  4.15223919e-04 -1.73849737e-02  6.59013709e-03
   9.97916665e-01]
 [-1.68624841e-02  2.88912811e-03  8.79035350e-01  2.04585418e+00
  -1.44958456e-02]
 [ 1.97876616e+00  3.63809264e-03 -1.52322980e-01  5.77412045e-02
  -1.82536831e-02]]
