<a href="https://colab.research.google.com/github/mohammed21kamall/Graduation-Project/blob/main/collaborative.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

def dcg(relevance_scores, b=2):
    dcg_value = 0
    for i, rel in enumerate(relevance_scores):
        if i < b:
            dcg_value += rel
        else:
            dcg_value += rel / np.log2(i + 1)
    return dcg_value

def ndcg(relevance_scores):
    ideal_relevance = sorted(relevance_scores, reverse=True)
    return dcg(relevance_scores) / dcg(ideal_relevance)

def hlu(relevance_scores, d, h=2):
    hlu_value = 0
    for i, score in enumerate(relevance_scores):
        r_id = max(score - d, 0)
        hlu_value += r_id / (2 ** (i / h))
    return hlu_value / len(relevance_scores)

# Example user ratings
user_ratings = [1, 0, 1, 1, 0]  # For DCG and NDCG
average_rating = 3  # For HLU example
relevance_scores = [3, 4, 2, 5, 1]  # Example relevance scores for HLU
d = np.mean(relevance_scores)

# Calculations
print("DCG:", dcg(user_ratings))
print("NDCG:", ndcg(user_ratings))
print("HLU:", hlu(relevance_scores, d))


DCG: 2.1309297535714578
NDCG: 0.8099531166420328
HLU: 0.282842712474619


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Example user-item rating matrix
data = {
    'User1': {'Item1': 5, 'Item2': 3, 'Item3': 4, 'Item4': np.nan},
    'User2': {'Item1': 3, 'Item2': 1, 'Item3': 2, 'Item4': 3},
    'User3': {'Item1': 4, 'Item2': 2, 'Item3': np.nan, 'Item4': 5},
    'User4': {'Item1': 3, 'Item2': 3, 'Item3': 1, 'Item4': 4},
    'User5': {'Item1': np.nan, 'Item2': 4, 'Item3': 5, 'Item4': 2}
}

df = pd.DataFrame(data).T
print("User-Item Rating Matrix:")
print(df)

# Fill missing values with the user's average rating
df_filled = df.apply(lambda row: row.fillna(row.mean()), axis=1)
print("\nUser-Item Matrix with Filled NaN values:")
print(df_filled)

# Calculate cosine similarity between users
user_similarity = cosine_similarity(df_filled)
user_similarity_df = pd.DataFrame(user_similarity, index=df.index, columns=df.index)
print("\nUser Similarity Matrix:")
print(user_similarity_df)

def predict_rating(user, item, k=2):
    if np.isnan(df.loc[user, item]):
        # Get the indices of the k most similar users
        similar_users = user_similarity_df[user].sort_values(ascending=False).index[1:k+1]
        # Calculate the predicted rating using weighted sum
        numerator = sum(user_similarity_df.loc[user, other_user] * df.loc[other_user, item]
                        for other_user in similar_users if not np.isnan(df.loc[other_user, item]))
        denominator = sum(user_similarity_df.loc[user, other_user]
                          for other_user in similar_users if not np.isnan(df.loc[other_user, item]))
        if denominator == 0:
            return df.loc[user].mean()  # Fallback to the user's average rating if no similar users have rated the item
        return numerator / denominator
    else:
        return df.loc[user, item]

# Predict ratings for all missing values
predictions = df.apply(lambda row: row.index.map(lambda item: predict_rating(row.name, item)), axis=1)
predicted_ratings = pd.DataFrame(predictions.tolist(), index=df.index, columns=df.columns)

print("\nPredicted Ratings:")
print(predicted_ratings)


User-Item Rating Matrix:
       Item1  Item2  Item3  Item4
User1    5.0    3.0    4.0    NaN
User2    3.0    1.0    2.0    3.0
User3    4.0    2.0    NaN    5.0
User4    3.0    3.0    1.0    4.0
User5    NaN    4.0    5.0    2.0

User-Item Matrix with Filled NaN values:
          Item1  Item2     Item3  Item4
User1  5.000000    3.0  4.000000    4.0
User2  3.000000    1.0  2.000000    3.0
User3  4.000000    2.0  3.666667    5.0
User4  3.000000    3.0  1.000000    4.0
User5  3.666667    4.0  5.000000    2.0

User Similarity Matrix:
          User1     User2     User3     User4     User5
User1  1.000000  0.975321  0.976802  0.915475  0.939233
User2  0.975321  1.000000  0.990991  0.916380  0.845524
User3  0.976802  0.990991  1.000000  0.921262  0.872624
User4  0.915475  0.916380  0.921262  1.000000  0.795970
User5  0.939233  0.845524  0.872624  0.795970  1.000000

Predicted Ratings:
          Item1  Item2     Item3     Item4
User1  5.000000    3.0  4.000000  4.000759
User2  3.000000    1.0

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error

# Example user-item rating matrix
data = {
    'User1': {'Item1': 5, 'Item2': 3, 'Item3': 4, 'Item4': np.nan},
    'User2': {'Item1': 3, 'Item2': 1, 'Item3': 2, 'Item4': 3},
    'User3': {'Item1': 4, 'Item2': 2, 'Item3': np.nan, 'Item4': 5},
    'User4': {'Item1': 3, 'Item2': 3, 'Item3': 1, 'Item4': 4},
    'User5': {'Item1': np.nan, 'Item2': 4, 'Item3': 5, 'Item4': 2}
}

df = pd.DataFrame(data).T
print("User-Item Rating Matrix:")
print(df)

# Fill NaN values with the mean of each user's ratings
df_filled = df.apply(lambda x: x.fillna(x.mean()), axis=1)
print("\nUser-Item Matrix with Filled NaN values:")
print(df_filled)

# Convert the DataFrame to a NumPy array
R = df_filled.values

# Perform Singular Value Decomposition
U, sigma, Vt = svds(R, k=2)  # k is the number of latent factors

# Convert sigma to a diagonal matrix
sigma = np.diag(sigma)

# Reconstruct the predicted rating matrix
predicted_ratings = np.dot(np.dot(U, sigma), Vt)

# Convert the predicted ratings to a DataFrame
predicted_df = pd.DataFrame(predicted_ratings, columns=df.columns, index=df.index)
print("\nPredicted Ratings:")
print(predicted_df)

# Function to predict rating
def predict_rating(user, item):
    if np.isnan(df.loc[user, item]):
        return predicted_df.loc[user, item]
    else:
        return df.loc[user, item]

# Predict ratings for all missing values
predictions = df.apply(lambda row: row.index.map(lambda item: predict_rating(row.name, item)), axis=1)
predicted_ratings = pd.DataFrame(predictions.tolist(), index=df.index, columns=df.columns)

print("\nPredicted Ratings with Filled NaNs:")
predicted_ratings.head()


User-Item Rating Matrix:
       Item1  Item2  Item3  Item4
User1    5.0    3.0    4.0    NaN
User2    3.0    1.0    2.0    3.0
User3    4.0    2.0    NaN    5.0
User4    3.0    3.0    1.0    4.0
User5    NaN    4.0    5.0    2.0

User-Item Matrix with Filled NaN values:
          Item1  Item2     Item3  Item4
User1  5.000000    3.0  4.000000    4.0
User2  3.000000    1.0  2.000000    3.0
User3  4.000000    2.0  3.666667    5.0
User4  3.000000    3.0  1.000000    4.0
User5  3.666667    4.0  5.000000    2.0

Predicted Ratings:
          Item1     Item2     Item3     Item4
User1  4.563644  3.251073  4.068051  4.213206
User2  2.699448  1.584915  1.774392  3.092043
User3  4.318044  2.655268  3.062645  4.733084
User4  3.223603  1.692475  1.744820  4.047576
User5  3.850677  3.652193  5.131313  1.942277

Predicted Ratings with Filled NaNs:


Unnamed: 0,Item1,Item2,Item3,Item4
User1,5.0,3.0,4.0,4.213206
User2,3.0,1.0,2.0,3.0
User3,4.0,2.0,3.062645,5.0
User4,3.0,3.0,1.0,4.0
User5,3.850677,4.0,5.0,2.0


In [1]:
import numpy as np

def dcg_at_k(r, k, method=1):
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
    return 0.0

def ndcg_at_k(r, k, method=1):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.0
    return dcg_at_k(r, k, method) / dcg_max

def hlu(r, h=5, d=2):
    r = np.array(r)
    relevances = np.maximum(r - d, 0)
    discounts = np.power(2, (np.arange(len(r)) / h))
    return np.sum(relevances / discounts)

# مثال للتطبيق
# تقييمات حقيقية وتقييمات متوقعة
true_ratings = [3, 2, 3, 0, 1, 2]
predicted_ratings = [2, 3, 2, 0, 1, 3]

# حساب DCG و NDCG
k = len(true_ratings)
dcg = dcg_at_k(predicted_ratings, k)
ndcg = ndcg_at_k(predicted_ratings, k)
print(f"DCG: {dcg}, NDCG: {ndcg}")

# حساب HLU
hlu_score = hlu(predicted_ratings)
print(f"HLU: {hlu_score}")


DCG: 6.348263629272981, NDCG: 0.888988644525587
HLU: 1.3705505632961241


In [2]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

# 1. تحضير البيانات
# افتراض: لدينا بيانات تقييمات على شكل مصفوفة حيث الصفوف هي المستخدمين والأعمدة هي العناصر
user_item_matrix = np.array([
    [4, 0, 0, 5, 1, 0, 0],
    [5, 5, 4, 0, 0, 0, 1],
    [0, 0, 0, 2, 4, 5, 0],
    [0, 3, 0, 0, 0, 0, 5],
    [5, 4, 4, 0, 0, 0, 3],
])

# 2. توقع التوصيات باستخدام KNN
def predict_ratings(user_item_matrix, n_neighbors=2):
    model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
    model_knn.fit(user_item_matrix)
    distances, indices = model_knn.kneighbors(user_item_matrix, n_neighbors=n_neighbors)

    predicted_ratings = np.zeros(user_item_matrix.shape)
    for i, neighbors in enumerate(indices):
        neighbor_ratings = user_item_matrix[neighbors]
        predicted_ratings[i] = neighbor_ratings.mean(axis=0)

    return predicted_ratings

predicted_ratings = predict_ratings(user_item_matrix)

# 3. حساب المقاييس
def dcg_at_k(r, k, method=1):
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
    return 0.0

def ndcg_at_k(r, k, method=1):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.0
    return dcg_at_k(r, k, method) / dcg_max

def hlu(r, h=5, d=2):
    r = np.array(r)
    relevances = np.maximum(r - d, 0)
    discounts = np.power(2, (np.arange(len(r)) / h))
    return np.sum(relevances / discounts)

# مثال للتطبيق
true_ratings = user_item_matrix[0]
pred_ratings = predicted_ratings[0]

# حساب DCG و NDCG
k = len(true_ratings)
dcg = dcg_at_k(pred_ratings, k)
ndcg = ndcg_at_k(pred_ratings, k)
print(f"DCG: {dcg}, NDCG: {ndcg}")

# حساب HLU
hlu_score = hlu(pred_ratings)
print(f"HLU: {hlu_score}")

# 4. تحسين التوصيات
# يمكن تحسين التوصيات عن طريق تعديل المعاملات في خوارزمية التوصية أو تجربة خوارزميات مختلفة.


DCG: 8.53197730594367, NDCG: 0.9810185289329865
HLU: 2.8298769776932238


In [3]:
true_ratings

array([4, 0, 0, 5, 1, 0, 0])

In [4]:
pred_ratings

array([4.5, 2. , 2. , 2.5, 0.5, 0. , 1.5])

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the MovieLens dataset (small dataset for simplicity)
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# Use the genres column for feature extraction
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string
movies['genres'] = movies['genres'].fillna('')

# Compute TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Create a reverse map of indices and movie titles
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies['title'].iloc[movie_indices]

# Example usage: Get recommendations for a specific movie
print(get_recommendations('Toy Story (1995)'))


FileNotFoundError: [Errno 2] No such file or directory: 'ml-latest-small/movies.csv'

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load the MovieLens dataset (small dataset for simplicity)
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# Create a pivot table with users as rows and movies as columns
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')

# Fill NaN values with 0 (assuming no rating given as 0)
user_item_matrix.fillna(0, inplace=True)

# Compute the cosine similarity matrix
item_similarity = cosine_similarity(user_item_matrix.T)

# Convert to a DataFrame for easier handling
item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix.columns, columns=user_item_matrix.columns)

def get_item_based_recommendations(user_id, user_item_matrix, item_similarity_df, top_n=10):
    # Get the ratings given by the user
    user_ratings = user_item_matrix.loc[user_id]

    # Find the movies the user has rated
    rated_items = user_ratings[user_ratings > 0].index

    # Initialize an empty dictionary to store scores
    scores = {}

    # Loop through the rated items
    for item in rated_items:
        # Get the similarity scores for the current item
        similar_items = item_similarity_df[item]

        # Loop through the similar items
        for similar_item, score in similar_items.iteritems():
            if similar_item in rated_items:
                # Skip items the user has already rated
                continue

            if similar_item not in scores:
                scores[similar_item] = 0

            # Add the weighted score to the score dictionary
            scores[similar_item] += score * user_ratings[item]

    # Convert the scores dictionary to a series
    scores_series = pd.Series(scores)

    # Sort the scores in descending order and get the top N items
    recommended_items = scores_series.sort_values(ascending=False).head(top_n)

    # Return the recommended item IDs
    return recommended_items.index

# Example usage: Get recommendations for a specific user
user_id = 1
recommended_item_ids = get_item_based_recommendations(user_id, user_item_matrix, item_similarity_df)

# Map the item IDs to movie titles
recommended_movies = movies[movies['movieId'].isin(recommended_item_ids)]
print(recommended_movies[['movieId', 'title']])
