In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF, PCA

from lu_crtp import lu_crtp, compareApprox

In [2]:

def read_file():
    # Read data
    data = []
    with open('dataset/combined_data_1.txt', 'r') as file:
        current_movie_id = None
        for line in file:
            line = line.strip() 
            if ':' in line:
                # The file contains movies and their raiting from users
                # if there is a : in a line, it indiactes we reached the next movie
                current_movie_id = line.split(':')[0] 
            else:
                # Else the line contains a user id and a rating
                if line:
                    user_id, rating, _ = line.split(',')
                    data.append({
                        'MovieID': current_movie_id,
                        'UserID': user_id,
                        'Rating': rating
                    })

    # Create dataframe and change datatypes
    df = pd.DataFrame(data)
    df['MovieID'] = df['MovieID'].astype(int)
    df['UserID'] = df['UserID'].astype(int)
    df['Rating'] = df['Rating'].astype(float)
    return df

df = read_file()

In [3]:
# Change the structure to a user/item matrix
df = df.pivot(index='UserID', columns='MovieID', values='Rating')

# Fill NaN values
df.fillna(value=0, inplace=True)

In [5]:
data = df.to_numpy()

data = data[:10]

print(data.shape)

(10, 4499)


In [25]:
A = np.array([[4, 5, 4, 5],
              [9, 1, 2, 8],
              [6, 8, 5, 9],
              [1, 8, 6, 3]])

In [23]:
A = np.array([[0, 0, 4, 0],
              [0, 3, 0, 0],
              [1, 0, 0, 0],
              [0, 0, 0, 0]])

In [30]:
p_r, p_c, l_k, u_k, r_k = lu_crtp(A, 3)

print(compareApprox(A, p_r, p_c, l_k, u_k))

print(l_k.shape, u_k.shape)

approx:
[[9.         8.         5.        ]
 [8.         7.11111111 4.44444444]
 [3.         2.66666667 1.66666667]]
PA:
[[9 8 5]
 [8 1 2]
 [3 8 6]]
False
(3, 1) (1, 3)


In [16]:
n_components = 3

model = NMF(n_components=n_components, init='random', random_state=42)
W = model.fit_transform(A)
H = model.components_

approximation_matrix = np.dot(W, H)

print("Original Matrix:")
print(A)
print("\nApproximation Matrix (Rank {}):".format(n_components))
print(approximation_matrix)

Original Matrix:
[[4 5 4 5]
 [9 1 2 8]
 [6 8 5 9]
 [1 8 6 3]]

Approximation Matrix (Rank 3):
[[3.99888991 5.00011862 3.99824385 5.00075978]
 [9.00033833 0.99996772 2.00053161 7.9997678 ]
 [6.00014823 7.99998281 5.00023703 8.99989892]
 [1.00050338 7.99994844 6.0007895  2.99965455]]


In [22]:
n_components = 3

pca = PCA(n_components=n_components)
data_transformed = pca.fit_transform(A)

approximation_matrix = np.dot(data_transformed, pca.components_) + pca.mean_

# Print the original and approximation matrices
print("Original Matrix:")
print(A)
print("\nApproximation Matrix (Rank {}):".format(n_components))
print(approximation_matrix)

Original Matrix:
[[4 5 4 5]
 [9 1 2 8]
 [6 8 5 9]
 [1 8 6 3]]

Approximation Matrix (Rank 3):
[[4. 5. 4. 5.]
 [9. 1. 2. 8.]
 [6. 8. 5. 9.]
 [1. 8. 6. 3.]]


In [92]:
from scipy.linalg import qr

def lu_crtp_4_new(A, P_c, k):
    AP_c = A[:, P_c]
    AP_c_selected_columns = AP_c[:, :k]
    Q_k, R_k = np.linalg.qr(AP_c_selected_columns)
    return Q_k, R_k

# input:
# a is a numpy matrix mxn
# k is an even integer
def lu_crtp_new(a, k):
    # According to https://epubs-siam-org.uaccess.univie.ac.at/doi/epdf/10.1137/13092157X QR_TP is the same
    # as RRQR. For a faster implementation, we therefore decided on using scipy's LAPACK interface.
    # Select k columns by using QR with tournament pivoting on A
    _, _, p_c = qr(a, pivoting=True)
    # p_c = p_c[:k]


    # Compute the thin QR factorization of the selected columns
    q_k, r_k = lu_crtp_4_new(a, p_c, k)

    # Select k rows by using QR with tournament pivoting on Q^T_k
    _, _, p_r = qr(q_k.T, pivoting=True)
    # p_r = p_r[:k]

    # Let A_ = P ...
    a_dash = a[p_r,:]
    a_dash = a_dash[:,p_c]
    rows, cols = a_dash.shape
    # print(a_dash)
    
    # Separate into block matrices
    a_dash_11 = a_dash[:k, :k]
    a_dash_21 = a_dash[k:, :k]
    a_dash_12 = a_dash[:k, k:]
    # print(a_dash_11)
    # print(a_dash_21)
    # print(a_dash_12)

    # Compute L_21
    inv_a_dash_11 = np.linalg.inv(a_dash_11)
    l_21 = np.dot(a_dash_21, inv_a_dash_11)
    print(l_21)
    
    # Stack the block matrices
    i = np.identity(k)
    l_k = np.vstack((i, l_21))
    print("l_k:")
    print(l_k.shape)
    u_k = np.hstack((a_dash_11, a_dash_12))
    print("u_k:")
    print(u_k.shape)
    return p_r, p_c, l_k, u_k, r_k

In [89]:
array = np.random.randint(0, 100, size=(10, 10))

p_r, p_c, l_k, u_k, r_k = lu_crtp_new(array, 5)

print(compareApprox(array, p_r, p_c, l_k, u_k))

print(l_k.shape, u_k.shape)

[[ 0.20853279  0.41774971  0.56462198 -0.18526555  0.12817519]
 [ 0.30873109 -0.75704093 -0.04489362  0.8245574   0.31267936]
 [-0.00452472  0.23581918 -0.11873141  0.15308768  0.64773754]
 [-0.8329817  -0.15218214  0.62530036  0.51023859  0.56967122]
 [-0.19194485 -0.29792153  0.80175121 -0.40140453  0.86917022]]
l_k:
(10, 5)
u_k:
(5, 10)
approx:
[[ 19.          72.          69.          36.          80.
    2.          11.          39.          33.           8.        ]
 [ 53.          88.          77.           0.          15.
   28.          44.          51.           5.          26.        ]
 [ 67.          97.          19.          74.          47.
   83.          83.          61.          63.          75.        ]
 [ 99.          72.          53.           1.          84.
   42.          91.          29.          52.          46.        ]
 [ 50.          14.          90.          46.          24.
   72.           6.          27.          68.          69.        ]
 [ 52.         

In [95]:
A = np.array([[1, 2, 3, 4, 5],
                   [6, 7, 8, 9, 10],
                   [11, 12, 13, 14, 15],
                   [16, 17, 18, 19, 20],
                   [21, 22, 23, 24, 25]])
k = 5

while(k >= 1):
    print(f"\n{k=}:")
    p_r, p_c, l_k, u_k, r_k = lu_crtp_new(A, k)
    print(compareApprox(A, p_r, p_c, l_k, u_k))
    k = k - 1



k=5:
[]
l_k:
(5, 5)
u_k:
(5, 5)
approx:
[[15. 11. 14. 12. 13.]
 [20. 16. 19. 17. 18.]
 [25. 21. 24. 22. 23.]
 [10.  6.  9.  7.  8.]
 [ 5.  1.  4.  2.  3.]]
PA:
[[15 11 14 12 13]
 [20 16 19 17 18]
 [25 21 24 22 23]
 [10  6  9  7  8]
 [ 5  1  4  2  3]]
True

k=4:
[[-7.5 -6.   3.   2. ]]
l_k:
(5, 4)
u_k:
(4, 5)
approx:
[[  5.    1.    4.    2.    3. ]
 [ 15.   11.   14.   12.   13. ]
 [ 10.    6.    9.    7.    8. ]
 [ 25.   21.   24.   22.   23. ]
 [-47.5 -13.5 -39.  -22.  -30.5]]
PA:
[[ 5  1  4  2  3]
 [15 11 14 12 13]
 [10  6  9  7  8]
 [25 21 24 22 23]
 [20 16 19 17 18]]
False

k=3:
[[8. 0. 0.]
 [8. 8. 0.]]
l_k:
(5, 3)
u_k:
(3, 5)
approx:
[[  5.   1.   4.   2.   3.]
 [ 10.   6.   9.   7.   8.]
 [ 25.  21.  24.  22.  23.]
 [ 40.   8.  32.  16.  24.]
 [120.  56. 104.  72.  88.]]
PA:
[[ 5  1  4  2  3]
 [10  6  9  7  8]
 [25 21 24 22 23]
 [20 16 19 17 18]
 [15 11 14 12 13]]
False

k=2:
[[0.5  0.5 ]
 [0.25 0.75]
 [0.75 0.25]]
l_k:
(5, 2)
u_k:
(2, 5)
approx:
[[ 5.  1.  4.  2.  3.]
 [25. 21