In [20]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF, PCA

from lu_crtp import lu_crtp, compareApprox

In [7]:

def read_file(path='dataset/combined_data_1.txt'):
    """Parse a movie-ratings text file into a long-format DataFrame.

    The file is a sequence of sections. Each section starts with a header
    line of the form ``<movie id>:`` and is followed by one line per rating
    of the form ``<user id>,<rating>,<third field>``. The third field is
    ignored. Blank lines are skipped.

    Parameters
    ----------
    path : str, optional
        Location of the ratings file. Defaults to the path the notebook
        originally hard-coded, so ``read_file()`` keeps working unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per rating with columns ``MovieID`` (int), ``UserID`` (int)
        and ``Rating`` (float), in file order. An empty file yields an empty
        frame that still has these three columns.

    Raises
    ------
    ValueError
        If a rating line appears before any movie header, if a rating line
        does not have exactly three comma-separated fields, or if an id or
        rating cannot be converted to a number.
    """
    # Collect one plain list per column and convert values while parsing;
    # this is far lighter than building one dict of strings per rating.
    movie_ids, user_ids, ratings = [], [], []

    with open(path, 'r') as file:
        current_movie_id = None
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:
                continue

            if line.endswith(':'):
                # A trailing ':' marks the start of the next movie's section.
                current_movie_id = int(line[:-1])
                continue

            # Otherwise the line holds a user id and that user's rating.
            if current_movie_id is None:
                raise ValueError(
                    "line {}: rating found before any movie header".format(line_number)
                )
            user_id, rating, _ = line.split(',')
            movie_ids.append(current_movie_id)
            user_ids.append(int(user_id))
            ratings.append(float(rating))

    # The explicit astype pins the dtypes even when the lists are empty
    # (pandas would otherwise infer a generic dtype for empty columns).
    df = pd.DataFrame({
        'MovieID': movie_ids,
        'UserID': user_ids,
        'Rating': ratings,
    })
    return df.astype({'MovieID': int, 'UserID': int, 'Rating': float})

# Load the ratings as a long-format frame: one row per rating, with
# MovieID, UserID and Rating columns.
df = read_file()

In [8]:
# Reshape the long ratings table into a user/item matrix: one row per UserID,
# one column per MovieID, each cell holding that user's Rating for the movie.
# User/movie pairs without a rating come out of the pivot as NaN and are
# replaced by 0.
df = (
    df
    .pivot(index='UserID', columns='MovieID', values='Rating')
    .fillna(value=0)
)

In [5]:
# Small dense 4x4 integer matrix used as a toy input for the factorisation
# experiments in the following cells.
A = np.array([
    [4, 5, 4, 5],
    [9, 1, 2, 8],
    [6, 8, 5, 9],
    [1, 8, 6, 3],
])

In [18]:
# Factorise A with the project-local lu_crtp helper, passing 3 as its second
# argument.
# NOTE(review): presumably 3 is the target rank and the results are row/column
# permutations plus truncated L and U factors -- confirm against lu_crtp.
(p_r, p_c, l_k, u_k, r_k) = lu_crtp(A, 3)

# Check the factorisation against A and echo the returned flag.
# NOTE(review): the "approx:" / "PA:" matrices in the recorded output are not
# printed by this cell, so they presumably come from inside the helpers; the
# recorded flag is False, which is worth investigating.
approx_matches = compareApprox(A, p_r, p_c, l_k, u_k)
print(approx_matches)

approx:
[[9.         8.         5.        ]
 [8.         7.11111111 4.44444444]
 [3.         2.66666667 1.66666667]]
PA:
[[9 8 5]
 [8 1 2]
 [3 8 6]]
False


In [16]:
# Non-negative matrix factorisation of A with 3 components: A is approximated
# by the product of the two factors W and H.
n_components = 3

# Random initialisation with a fixed seed keeps the factors reproducible.
model = NMF(n_components=n_components, init='random', random_state=42)
W = model.fit_transform(A)
H = model.components_

# Reconstruct the approximation from the two factors.
approximation_matrix = W @ H

print("Original Matrix:", A, sep="\n")
print("\nApproximation Matrix (Rank {}):".format(n_components), approximation_matrix, sep="\n")

Original Matrix:
[[4 5 4 5]
 [9 1 2 8]
 [6 8 5 9]
 [1 8 6 3]]

Approximation Matrix (Rank 3):
[[3.99888991 5.00011862 3.99824385 5.00075978]
 [9.00033833 0.99996772 2.00053161 7.9997678 ]
 [6.00014823 7.99998281 5.00023703 8.99989892]
 [1.00050338 7.99994844 6.0007895  2.99965455]]


In [22]:
# PCA of A keeping 3 principal components, followed by a reconstruction in the
# original feature space.
n_components = 3

pca = PCA(n_components=n_components)
data_transformed = pca.fit_transform(A)

# Project the scores back through the components and add back the per-column
# mean that PCA removed before fitting. With only 4 rows the centred data has
# rank at most 3, which is why the recorded output reproduces A exactly.
approximation_matrix = data_transformed @ pca.components_ + pca.mean_

# Show the original and the reconstructed matrices one after the other.
print("Original Matrix:", A, sep="\n")
print("\nApproximation Matrix (Rank {}):".format(n_components), approximation_matrix, sep="\n")

Original Matrix:
[[4 5 4 5]
 [9 1 2 8]
 [6 8 5 9]
 [1 8 6 3]]

Approximation Matrix (Rank 3):
[[4. 5. 4. 5.]
 [9. 1. 2. 8.]
 [6. 8. 5. 9.]
 [1. 8. 6. 3.]]
