# Singular Value Decomposition

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D

## 1. Load the dataset

In [None]:
# Path to the ratings file; change this if u.data lives somewhere else
file_path = 'u.data'

# The file is tab-separated. Column names are passed explicitly, so pandas
# treats every line (including the first) as data rather than as a header.
rating_columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(file_path, sep='\t', names=rating_columns)

# Preview the first few rows of the ratings table
print(df.head())

## 2. Create the User-Item matrix

We will use the `pivot` method in pandas and fill the missing (NaN) entries with 0.

In [None]:
# Build the user-item matrix: one row per user_id, one column per item_id,
# each cell holding that user's rating. (user, item) pairs with no rating
# come out of the pivot as NaN and are replaced by 0.
user_item_matrix = (
    df.pivot(index='user_id', columns='item_id', values='rating')
      .fillna(0)
)
user_item_matrix.head()

## 3. Decompose the user-item matrix with SVD

In [None]:
# Factor the ratings matrix as U @ Sigma @ Vt.
# With full_matrices=True, U and Vt are square (users x users and items x items)
# and sigma is a 1-D array of singular values in descending order.
ratings_array = user_item_matrix.to_numpy()
U, sigma, Vt = np.linalg.svd(ratings_array, full_matrices=True)

## 4. Reduce the dimensionality to capture $85\%$ of the variance

In [None]:
# Rectangular (rows of U x rows of Vt) matrix carrying the singular values on
# its main diagonal, so that U @ Sigma @ Vt is shape-compatible.
Sigma = np.zeros((len(U), len(Vt)))
np.fill_diagonal(Sigma, sigma)

# "Variance" carried by each component is its squared singular value
squared_sigma = sigma**2
total_variance = sum(squared_sigma)

# Fraction of the total captured by the first 1, 2, 3, ... components
variance_explained = np.cumsum(squared_sigma) / total_variance

# First position where the cumulative fraction reaches 85%; +1 turns the
# zero-based position into a count of components.
positions_at_target = np.flatnonzero(variance_explained >= 0.85)
num_singular_values = positions_at_target[0] + 1
print('Number of Singular values for 85% of variance:')
print(num_singular_values)

# Truncated factors: keep only the leading num_singular_values components
U_reduced = U[:, :num_singular_values]
sigma_reduced = np.diag(sigma[:num_singular_values])
Vt_reduced = Vt[:num_singular_values, :]

### 4a. Plot the cumulative variance and the singular value magnitudes

In [None]:
# Two side-by-side panels: the singular value spectrum (left) and the
# cumulative variance explained with the 85% target marked (right)
fig, (ax_spectrum, ax_cumulative) = plt.subplots(1, 2, figsize=(10, 4))

ax_spectrum.plot(sigma)
ax_spectrum.set_title("Singular Values")
ax_spectrum.set_xlabel("Singular Value Index")
ax_spectrum.set_ylabel("Singular Value Magnitude")

ax_cumulative.plot(variance_explained)
# Dashed red line marks the 85% variance target
ax_cumulative.axhline(y=0.85, color='r', linestyle='--')
ax_cumulative.set_title("Cumulative Variance Explained")
ax_cumulative.set_xlabel("Number of Singular Values")
ax_cumulative.set_ylabel("Cumulative Variance")

plt.show()

### 4b. Plot the chosen singular values

In [None]:
# Bar chart of just the singular values retained by the 85% cut-off
chosen_singular_values = sigma[:num_singular_values]

fig, ax = plt.subplots(figsize=(8, 4))
bar_positions = range(len(chosen_singular_values))
ax.bar(bar_positions, chosen_singular_values)
ax.set_title("Chosen Singular Values (85% Variance)")
ax.set_xlabel("Singular Value Index")
ax.set_ylabel("Singular Value Magnitude")
plt.show()

## 5. Generate 10 random user profiles and recommend 5 movies for each user

In [None]:
k = Vt_reduced.shape[0]  # Number of latent features kept after truncation
num_users = 10
np.random.seed(0)  # For reproducible results

# Draw a random integer weight in 0..5 for each of the k latent features.
# NOTE(review): the profiles are sampled directly in latent-feature space, not
# as ratings over movies that are then projected -- confirm this is intended.
new_user_profiles = np.random.randint(0, 6, size=(num_users, k))

# Score every movie for every new user: (num_users x k) @ (k x num_movies)
predicted_ratings = np.dot(new_user_profiles, Vt_reduced)

# Column positions of the top-scoring movies per user, best first
num_recommendations = 5
top_movie_indices = np.argsort(predicted_ratings, axis=1)[:, -num_recommendations:][:, ::-1]

# Columns of Vt follow the column order of user_item_matrix, whose labels are
# the item ids. Looking the ids up (instead of adding 1 to the position) stays
# correct even if the item ids are not contiguous or do not start at 1.
item_ids = user_item_matrix.columns.to_numpy()

# Print recommended movie ids for each new user
for user_number, movie_indices in enumerate(top_movie_indices, start=1):
    print(f"Recommended Movies for New User {user_number}: {item_ids[movie_indices]}")

## 6. Find the top 5 users most similar to user 10

In [None]:

# Scale each user's latent coordinates by the retained singular values,
# giving one row of latent features per user
USigma = U_reduced @ sigma_reduced

# User to find neighbours for: user 10 sits at zero-based row 9
selected_user_index = 9
selected_user_features = USigma[selected_user_index]

# Function to compute cosine similarity
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two 1-D vectors.

    Parameters
    ----------
    vec1, vec2 : array-like
        Vectors of equal length.

    Returns
    -------
    float
        dot(vec1, vec2) / (||vec1|| * ||vec2||), or 0.0 when either vector has
        zero norm. Without that guard the division is 0/0 = NaN, and NaN
        entries corrupt any ranking built from the similarities.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

# Compute similarity between the selected user and every user (including themselves)
similarities = np.array([cosine_similarity(selected_user_features, user_features) for user_features in USigma])

# Rank all users from most to least similar
ranked_users = np.argsort(similarities)[::-1]

# Drop the selected user by index rather than assuming they are ranked first:
# a tie (or a NaN similarity) can put another row in first place, in which
# case slicing off position 0 would keep the selected user in the list.
similar_users = ranked_users[ranked_users != selected_user_index][:5]

# Rows of USigma follow the row order of user_item_matrix, whose labels are the
# user ids, so look the ids up instead of assuming they equal position + 1.
user_ids = user_item_matrix.index.to_numpy()
print(f"Top 5 similar users to User {user_ids[selected_user_index]}: {user_ids[similar_users]}")


## 7. Recommend a movie that user 10 has not watched but that at least one of the most similar users has watched

In [None]:
R = user_item_matrix.to_numpy()

# A rating > 0 means "watched": 0 is the fill value used for missing ratings
user_10_index = 9  # User 10 in zero-based indexing
watched_by_user_10 = R[user_10_index, :] > 0
user_10_watched = np.flatnonzero(watched_by_user_10)  # Movies that user 10 has rated

# Ratings given by the similar users (one row per similar user)
neighbour_ratings = R[similar_users, :]
watched_by_neighbours = (neighbour_ratings > 0).any(axis=0)
similar_users_movies = set(np.flatnonzero(watched_by_neighbours))

# Candidates: watched by at least one similar user, not watched by user 10
candidates = np.flatnonzero(watched_by_neighbours & ~watched_by_user_10)

# Rank candidates by the total rating the similar users gave them (highest
# first) instead of taking whichever element a set happens to yield first.
# The stable sort keeps the lower column position first when scores tie.
candidate_scores = neighbour_ratings[:, candidates].sum(axis=0)
ranking = np.argsort(-candidate_scores, kind="stable")
recommendation = candidates[ranking].tolist()

if recommendation:
    recommended_movie = recommendation[0]  # Best-ranked candidate (column position)
    # Column labels of user_item_matrix are the item ids
    recommended_movie_id = user_item_matrix.columns[recommended_movie]
    print(f"Recommended Movie for User 10: {recommended_movie_id}")
else:
    print("No suitable movie found to recommend.")


## 8. Examine the $V^T$ matrix obtained from SVD, where each row represents a latent feature and each column represents a movie. Choose a few latent features and analyze their movie associations.

In [None]:

# The first three rows of Vt are the three strongest latent features
first_three_features = Vt[:3, :]

# One bar chart per feature: bar height is that feature's weight on each movie
for feature_number, movie_weights in enumerate(first_three_features, start=1):
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(range(len(movie_weights)), movie_weights)
    ax.set_title(f'Feature {feature_number}')
    ax.set_xlabel('Movie Index')
    ax.set_ylabel('Feature Strength')
    plt.show()

Interpreting these plots can provide insights into what each latent feature might represent. For example, if a particular feature has strong positive associations with a group of movies, and those movies share common characteristics (like being of the same genre), it's likely that the feature captures this characteristic.

Keep in mind, the interpretation of these latent features is not always straightforward and might require domain knowledge or additional data. The movie indices on the x-axis would need to be mapped back to actual movie titles for a more meaningful analysis.

### 3D plot of the latent features

In [None]:
# Vt comes from the SVD above; its first three rows are the leading latent features
first_three_features = Vt[:3, :]

# Movies as rows, features as columns
movies_in_feature_space = first_three_features.T

# Each row of first_three_features holds one coordinate for every movie
feature_1, feature_2, feature_3 = first_three_features

# 3D scatter: one point per movie, positioned by its three feature weights
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(feature_1, feature_2, feature_3)

ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_zlabel('Feature 3')
ax.set_title('Movies in the First Three Latent Feature Space')

plt.show()