In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np

**charger les données des évaluations**

In [None]:
rating_data = pd.read_csv('https://files.grouplens.org/datasets/movielens/ml-100k/u1.base', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])

**Charger les donner des films**

In [None]:
movies_data = pd.read_csv('https://files.grouplens.org/datasets/movielens/ml-100k/u.item', sep='|', encoding='latin-1', usecols=[0, 1], names=['item_id', 'title'])

In [None]:
movie_ratings = pd.merge(movies_data, rating_data, on='item_id').pivot_table(index=['user_id'], columns=['title'], values='rating')

**Remplir les valeurs manquantes**

In [None]:
movie_ratings = movie_ratings.fillna(0)

**Normalisation des évaluation**

In [None]:
def normalize(row):
    new_row = (row - row.mean()) / (row.max() - row.min())
    return new_row

In [None]:
user_ratings = movie_ratings.apply(normalize)

**Appliquer k-means clustering**

In [None]:
kmeans = KMeans(n_clusters=5)
kmeans.fit(user_ratings)



**Obtenir les étiquttes des clusters**

In [None]:
cluster_labels = kmeans.predict(user_ratings)

**Ajouter les étiquettes des cluster sur dataframe**

In [None]:
user_ratings['cluster'] = cluster_labels

**Faire les recommendations**

In [None]:
user_id = 1

In [None]:
user_cluster = user_ratings.loc[user_id]['cluster']

In [None]:
cluster_users = user_ratings[user_ratings['cluster'] == user_cluster].index

In [None]:
cluster_ratings = movie_ratings.loc[cluster_users].fillna(0)

In [None]:
cluster_mean_ratings = cluster_ratings.mean()

In [None]:
rated_movies = rating_data[rating_data['user_id'] == user_id]['item_id'].unique()

In [None]:
unrated_movies = cluster_mean_ratings[~cluster_mean_ratings.index.isin(rated_movies)]

In [None]:
recommended_movies = unrated_movies.sort_values(ascending=False)[:10].index.tolist()

In [None]:
print(recommended_movies)

['Star Wars (1977)', 'Raiders of the Lost Ark (1981)', 'Return of the Jedi (1983)', 'Empire Strikes Back, The (1980)', 'Pulp Fiction (1994)', 'Indiana Jones and the Last Crusade (1989)', 'Fugitive, The (1993)', 'Back to the Future (1985)', 'Braveheart (1995)', 'Silence of the Lambs, The (1991)']


**calculer la précision**

In [None]:
precision = 0.0
num_recommended = len(recommended_movies)
if num_recommended > 0:
    num_true_positives = 0
    for movie_id in recommended_movies:
        if movie_ratings.loc[user_id][movie_id] >= 4.0:
            num_true_positives += 1
    precision = num_true_positives / num_recommended


In [None]:
print(precision)

0.5


### **PCA+ K-means **

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np


In [None]:
rating_data = pd.read_csv('https://files.grouplens.org/datasets/movielens/ml-100k/u1.base', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])

In [None]:
movies_data = pd.read_csv('https://files.grouplens.org/datasets/movielens/ml-100k/u.item', sep='|', encoding='latin-1', usecols=[0, 1], names=['item_id', 'title'])

In [None]:
movies_data = pd.read_csv('https://files.grouplens.org/datasets/movielens/ml-100k/u.item', sep='|', encoding='latin-1', usecols=[0, 1], names=['item_id', 'title'])

In [None]:
movie_ratings = pd.merge(movies_data, rating_data, on='item_id').pivot_table(index=['user_id'], columns=['title'], values='rating')

In [None]:
movie_ratings = movie_ratings.fillna(0)

In [None]:
def normalize(row):
    new_row = (row - row.mean()) / (row.max() - row.min())
    return new_row


In [None]:
user_ratings = movie_ratings.apply(normalize)

# **Appliquer PCA**

In [None]:
pca = PCA(n_components=50)

In [None]:
pca.fit(user_ratings)

In [None]:
user_ratings_pca = pca.transform(user_ratings)

***appliquer k-means clustering***

In [None]:
kmeans = KMeans(n_clusters=5)

In [None]:
kmeans.fit(user_ratings_pca)



**Obtenir les étiquettes des clusters**


In [None]:
cluster_labels = kmeans.predict(user_ratings_pca)

**Ajouter les étiquettes sur le dataframe**

In [None]:
user_ratings['cluster'] = cluster_labels

**Faire des recommendations**

In [None]:
user_id = 1

In [None]:
user_cluster = user_ratings.loc[user_id]['cluster']

In [None]:
cluster_users = user_ratings[user_ratings['cluster'] == user_cluster].index

In [None]:
cluster_ratings = movie_ratings.loc[cluster_users].fillna(0)

In [None]:
cluster_mean_ratings = cluster_ratings.mean()

In [None]:
rated_movies = rating_data[rating_data['user_id'] == user_id]['item_id'].unique()

In [None]:
unrated_movies = cluster_mean_ratings[~cluster_mean_ratings.index.isin(rated_movies)]

In [None]:
recommended_movies = unrated_movies.sort_values(ascending=False)[:10].index.tolist()

In [None]:
print(recommended_movies)

['Star Wars (1977)', 'Raiders of the Lost Ark (1981)', 'Return of the Jedi (1983)', 'Silence of the Lambs, The (1991)', 'Pulp Fiction (1994)', 'Empire Strikes Back, The (1980)', 'Back to the Future (1985)', 'Godfather, The (1972)', 'Indiana Jones and the Last Crusade (1989)', "Schindler's List (1993)"]


# **calculer precision**

In [None]:
precision = 0.0
num_recommended = len(recommended_movies)
if num_recommended > 0:
    num_true_positives = 0
    for movie_id in recommended_movies:
        if movie_ratings.loc[user_id][movie_id] >= 4.0:
            num_true_positives += 1
    precision = num_true_positives / num_recommended


In [None]:
print(recommended_movies)

['Star Wars (1977)', 'Raiders of the Lost Ark (1981)', 'Return of the Jedi (1983)', 'Empire Strikes Back, The (1980)', 'Pulp Fiction (1994)', 'Indiana Jones and the Last Crusade (1989)', 'Silence of the Lambs, The (1991)', 'Back to the Future (1985)', 'Princess Bride, The (1987)', 'Fugitive, The (1993)']


In [None]:
print(precision)

0.6


## **The mean square error**

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error


**Charger les données **

In [None]:
names = ['user_id', 'item_id', 'rating', 'timestamp']
train_data = pd.read_csv('https://files.grouplens.org/datasets/movielens/ml-100k/u1.base', sep='\t', names=names)
test_data = pd.read_csv('https://files.grouplens.org/datasets/movielens/ml-100k/u1.test', sep='\t', names=names)

**Calculer le nombre d’utilisateurs et d’éléments uniques**

In [None]:
n_users = max(train_data.user_id.unique())


In [None]:
n_items = max(train_data.item_id.unique())

# **Initialisation des baies pour la formation et le test des données**

In [None]:
train_data_array = np.zeros((n_users, n_items))


In [None]:
test_data_array = np.zeros((n_users, n_items))

**Fill the train_data_array with evaluations from the training database**

In [None]:
for row in train_data.itertuples():
    train_data_array[row.user_id-1, row.item_id-1] = row.rating

**Remplir le tableau test_data_array avec les évaluations de l'ensemble de données de test**

In [None]:
for row in test_data.itertuples():
    test_data_array[row.user_id-1, row.item_id-1] = row.rating

**Normalisation des données d'apprentissage**

In [None]:
means = np.nanmean(train_data_array, axis=1)

In [None]:
user_ratings_train_norm = train_data_array - means.reshape(-1, 1)

**Appliquer k-means clustering aux évaluations des utilisateurs train norm**

In [None]:
kmeans = KMeans(n_clusters=5)

In [None]:
kmeans.fit(user_ratings_train_norm)



**Prédire les clusters pour les évaluations des utilisateurs test norm***

In [None]:
user_ratings_test_norm = test_data_array - means.reshape(-1, 1)

In [None]:
predicted_clusters = kmeans.predict(user_ratings_test_norm)

**calculer les notes prédites pour chaque élément**

In [None]:
predicted_ratings = np.zeros(test_data_array.shape)
for i in range(test_data_array.shape[0]):
    for j in range(test_data_array.shape[1]):
        if test_data_array[i, j] != 0:
            cluster = predicted_clusters[i]
            predicted_ratings[i, j] = kmeans.cluster_centers_[cluster][j] + means[i]

In [None]:
actual_ratings = np.zeros(test_data_array.shape)
for i in range(test_data_array.shape[0]):
    for j in range(test_data_array.shape[1]):
        if test_data_array[i, j] != 0:
            actual_ratings[i, j] = test_data_array[i, j]

In [None]:
mse = mean_squared_error(actual_ratings, predicted_ratings)

In [None]:
print("Mean squared error:", mse)

Mean squared error: 0.12475166738920163
