In [None]:
import os
import random

import numpy as np
import pandas as pd
# Seed Python's built-in random generator so that re-running the project
# always reproduces the same errors.
random.seed(a=9001)

In [None]:
from sqlalchemy import create_engine

# Database URL. It is read from the PASS_CULTURE_DATABASE_URL environment
# variable when that variable is set, so real credentials do not have to be
# committed with the notebook; otherwise the local development database is used.
# The scheme is spelled "postgresql://" because SQLAlchemy 1.4+ no longer
# accepts the legacy "postgres://" alias (older versions accept both).
DATABASE_URL = os.environ.get(
    'PASS_CULTURE_DATABASE_URL',
    'postgresql://pass_culture:passq@localhost:5434/pass_culture?sslmode=prefer',
)
engine = create_engine(DATABASE_URL)
# Single connection reused by every query cell below.
connection = engine.connect()

In [None]:
# Load the id of every row of the "user" table (exposed as user_id), ordered by id.
user = pd.read_sql_query(
    sql='SELECT id as user_id FROM "user" ORDER BY id',
    con=connection,
)

In [None]:
# Load the id of every row of the offer table (exposed as offer_id), ordered by id.
offer = pd.read_sql_query(
    sql='SELECT id as offer_id FROM offer ORDER BY id',
    con=connection,
)

In [None]:
# user_id: sorted distinct user ids; freq_users: how many times each id occurs.
user_id, freq_users = np.unique(user['user_id'], return_counts=True)
# offre_id: sorted distinct offer ids; freq_offre: how many times each id occurs.
offre_id, freq_offre = np.unique(offer['offer_id'], return_counts=True)
# Number of distinct users and of distinct offers.
n_users = user_id.shape[0]
n_offre = offre_id.shape[0]
print("Le nombre des utilisateurs est : ", n_users, sep="")
print("Le nombre des offres est : ", n_offre, sep="")

In [None]:
# (user, offer) pairs for bookings that were used and not cancelled,
# ordered by user id. These interactions get the weight rate = 6.
achete = pd.read_sql_query(
    sql='SELECT "user".id as user_id, offer.id as offer_id\
                       FROM "booking" \
                       INNER JOIN "user" ON "user".id=booking."userId" \
                       INNER JOIN stock ON booking."stockId"=stock.id \
                       INNER JOIN offer ON offer.id=stock."offerId" \
                       WHERE booking."isUsed"=True AND booking."isCancelled"=False \
                       ORDER BY user_id',
    con=connection,
).assign(rate=6)
achete

In [None]:
# (user, offer) pairs for bookings that are neither used nor cancelled
# (bought but not consumed). These interactions get the weight rate = 5.
pas_consome = pd.read_sql_query(
    sql='SELECT "user".id as user_id, offer.id as offer_id\
                       FROM "booking" \
                       INNER JOIN "user" ON "user".id=booking."userId" \
                       INNER JOIN stock ON booking."stockId"=stock.id \
                       INNER JOIN offer ON offer.id=stock."offerId" \
                       WHERE booking."isUsed"=False AND booking."isCancelled"=False',
    con=connection,
).assign(rate=5)

In [None]:
# (user, offer) pairs for bookings that were cancelled without being used.
# These interactions get the weight rate = 4.
annule = pd.read_sql_query(
    sql='SELECT "user".id as user_id, offer.id as offer_id\
                       FROM "booking" \
                       INNER JOIN "user" ON "user".id=booking."userId" \
                       INNER JOIN stock ON booking."stockId"=stock.id \
                       INNER JOIN offer ON offer.id=stock."offerId" \
                       WHERE booking."isUsed"=False AND booking."isCancelled"=True',
    con=connection,
).assign(rate=4)

In [None]:
# (user, offer) pairs read from the favorite table (offers the user liked).
# These interactions get the weight rate = 3.
mis_en_fav = pd.read_sql_query(
    sql='SELECT "userId" as user_id, "offerId" as offer_id \
                          FROM favorite',
    con=connection,
).assign(rate=3)

In [None]:
# (user, offer) pairs from recommendation rows flagged as clicked.
# These interactions get the weight rate = 2.
clic = pd.read_sql_query(
    sql='SELECT "userId" AS user_id, "offerId" AS offer_id \
                          FROM recommendation \
                          WHERE "isClicked"=True',
    con=connection,
).assign(rate=2)

In [None]:
# (user, offer) pairs from recommendation rows that were not clicked.
# These interactions get the weight rate = 1.
ignore = pd.read_sql_query(
    sql='SELECT "userId" AS user_id, "offerId" AS offer_id \
                          FROM recommendation \
                          WHERE "isClicked"=False',
    con=connection,
).assign(rate=1)

In [None]:
# Stack the six interaction tables (each carrying its own 'rate' weight)
# into a single frame, strongest interaction type first.
result = pd.concat(
    objs=[achete, pas_consome, annule, mis_en_fav, clic, ignore],
)

In [None]:
# For every (user_id, offer_id) pair keep only the row with the highest rate:
# sort ascending by rate, then keep the last occurrence of each pair.
result = (
    result
    .sort_values(by='rate')
    .drop_duplicates(subset=['user_id', 'offer_id'], keep='last')
)
# Display the deduplicated interactions ordered by user.
result.sort_values(by='user_id')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the interactions for testing; the fixed random_state makes
# the split reproducible.
train_data, test_data = train_test_split(
    result.loc[:, ["user_id", "offer_id", "rate"]],
    random_state=123,
    test_size=0.20,
)

In [None]:
# With a high sparsity (few observed interactions), the similarity between two
# users cannot be computed reliably (e.g. when every user liked different offers
# from all the others), and model-based approaches will be more effective.
# So compute the sparsity: the share of (user, offer) pairs with no interaction.
sparsity = round(1.0 - len(result) / float(n_users*n_offre), 3)
# Round the percentage as well: multiplying the rounded ratio by 100 can
# reintroduce binary floating-point noise (e.g. 56.89999999999999).
print ('The sparsity level of our data base is ' +  str(round(sparsity*100, 1)) + '%')
print('Le pourcentage de sparsity est grand donc, on peut confirmer que les modèles Model Based seront les \
modèles plus efficaces')

In [None]:
from scipy.sparse import coo_matrix
# Rows are indexed by raw user_id and columns by raw offer_id.
# Give both matrices the same explicit shape, derived from the full interaction
# table: without it, coo_matrix infers the shape from the largest ids present
# in each split, so the train and test matrices could end up with different
# dimensions.
matrix_shape = (int(result['user_id'].max()) + 1, int(result['offer_id'].max()) + 1)

train_data_matrix = coo_matrix(
    (train_data['rate'], (train_data['user_id'], train_data['offer_id'])),
    shape=matrix_shape,
)
# CSR copy of the training matrix.
train_data_matrix_by_row = train_data_matrix.tocsr()

test_data_matrix = coo_matrix(
    (test_data['rate'], (test_data['user_id'], test_data['offer_id'])),
    shape=matrix_shape,
)
# CSR copy of the test matrix.
test_data_matrix_by_row = test_data_matrix.tocsr()


In [None]:
# Build the model: pairwise cosine metric between offers, i.e. between the
# columns of the user x offer training matrix (hence the transpose).
# NOTE(review): pairwise_distances with metric='cosine' yields the cosine
# *distance* (1 - cosine similarity) although the result is stored under the
# name item_similarity - confirm this is the intended weighting.
from sklearn.metrics.pairwise import pairwise_distances
item_similarity = pairwise_distances(X=train_data_matrix.transpose(), metric='cosine')


In [None]:
def predict(ratings, similarity, type='user'):
    """Predict ratings by weighted neighbourhood averaging, then rescale each column.

    Parameters
    ----------
    ratings : scipy sparse matrix or numpy array, shape (n_rows, n_cols)
        Observed ratings.
    similarity : numpy.ndarray
        Square weight matrix: (n_rows, n_rows) when ``type='user'``,
        (n_cols, n_cols) when ``type='item'``.
    type : {'user', 'item'}
        'user': each row's mean rating plus the weighted average of the other
        rows' mean-centred ratings.
        'item': weighted average of the ratings over the columns.

    Returns
    -------
    numpy.ndarray of float, same shape as the raw prediction (n_rows, n_cols)
        Raw predictions rescaled column by column onto [0, 5]. As in the
        original formula, the largest raw value of a column maps to 0 and the
        smallest to 5. A column whose raw values are all equal maps to 0
        instead of producing NaN.
        NOTE(review): the high-raw-value -> low-score direction is kept as is;
        confirm it is intended for the weight matrix passed in.

    Raises
    ------
    ValueError
        If ``type`` is neither 'user' nor 'item'.
    """
    if type == 'user':
        # Densify once; the centred ratings are dense anyway.
        if hasattr(ratings, 'toarray'):
            dense_ratings = ratings.toarray()
        else:
            dense_ratings = np.asarray(ratings)
        # Force a column vector so the row means broadcast along the rows for
        # dense arrays as well as for sparse matrices.
        mean_user_rating = np.asarray(ratings.mean(axis=1)).reshape(-1, 1)
        ratings_diff = dense_ratings - mean_user_rating
        pred = mean_user_rating + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    else:
        raise ValueError("type must be 'user' or 'item', got %r" % (type,))

    # Work on a plain float ndarray whatever the branch produced (np.matrix, ...).
    pred = np.asarray(pred, dtype=float)
    if pred.size == 0:
        return np.zeros(pred.shape)

    # Column-wise rescaling, vectorised form of
    #     x[j, i] = (pred[j, i] - (a - c)) * d / (b - a + c)
    # with a = column max, b = column min, c = 0, d = 5.
    c = 0
    d = 5
    col_max = pred.max(axis=0)
    col_min = pred.min(axis=0)
    denominator = col_min - col_max + c
    # Constant column: the numerator is 0 there, so dividing by 1 yields 0
    # rather than 0/0 = NaN.
    denominator = np.where(denominator == 0, 1.0, denominator)
    return (pred - (col_max - c)) * d / denominator

In [None]:
# Run the prediction: the transposed training matrix (offers as rows) is
# combined with the offer-to-offer matrix using the row-wise ('user') formula.
item_prediction = predict(
    ratings=train_data_matrix_by_row.transpose(),
    similarity=item_similarity,
    type='user',
)