In [None]:
import os
import time

import numpy as np
import pandas as pd
from stop_words import get_stop_words

from sqlalchemy import create_engine
# Connection URL: read from the DATABASE_URL environment variable when it is
# set, so real credentials never have to be written in the notebook; otherwise
# fall back to the local development database.
database_url = os.environ.get(
    'DATABASE_URL',
    'postgresql://pass_culture:passq@localhost:5434/pass_culture?sslmode=prefer')
# SQLAlchemy 1.4+ rejects the legacy "postgres://" scheme, so normalise it to
# "postgresql://", which every SQLAlchemy version accepts.
if database_url.startswith('postgres://'):
    database_url = 'postgresql://' + database_url[len('postgres://'):]
engine = create_engine(database_url)
connection = engine.connect()

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

### We get the offers reserved by users

In [None]:
debut = time.time()

# Used, non-cancelled bookings joined to their stock and offer, leaving out the
# two activation offer types.
bookings_query = """SELECT booking."userId" as user_id, stock."offerId" as offer_id, type, description, offer.name 
                       FROM booking 
                       LEFT JOIN stock ON booking."stockId" = stock.id 
                       LEFT JOIN offer ON stock."offerId"=offer."id"
                       WHERE booking."isUsed"=True AND booking."isCancelled"=False 
                       AND offer.type!='EventType.ACTIVATION' AND offer.type != 'ThingType.ACTIVATION'
                       """
offers_reserved_by_the_users = pd.read_sql_query(bookings_query, connection)

# Elapsed time of the query, in minutes.
fin = time.time()
temps = (fin - debut)/60
print(temps)

In [None]:
# Display the rows returned by the bookings query.
offers_reserved_by_the_users

In [None]:
# Number of distinct offers among the rows returned by the query.
offers_reserved_by_the_users['offer_id'].nunique()

### We only keep offers whose description is present and longer than 600 characters

In [None]:
# First drop the rows without a description, then keep only the rows whose
# description is longer than 600 characters.
has_description = offers_reserved_by_the_users['description'].notnull()
offers_with_description = offers_reserved_by_the_users[has_description]
is_long_enough = offers_with_description['description'].map(len) > 600
offers_reserved_by_the_users = offers_with_description[is_long_enough]

In [None]:
# Renumber the rows 0..n-1 after filtering: later cells use the row label as a
# position into the TF-IDF and similarity matrices.
offers_reserved_by_the_users = offers_reserved_by_the_users.reset_index(drop=True)

In [None]:
# Display the rows that remain after the description filter.
offers_reserved_by_the_users

In [None]:
# Number of distinct offers that remain after the description filter.
offers_reserved_by_the_users['offer_id'].nunique()

### We calculate the TF-IDF of the offers

In [None]:
debut = time.time()

# The vectorizer strips accents from the descriptions *before* it filters stop
# words, so the stop-word list has to go through the same normalisation;
# otherwise any accented stop word never matches a token and stays in the
# vocabulary.
french_stop_words = sorted({
    sklearn.feature_extraction.text.strip_accents_ascii(word)
    for word in get_stop_words('french')
})

vectorizer = TfidfVectorizer(analyzer='word',
                             stop_words=french_stop_words,
                             strip_accents='ascii',
                             lowercase=True)

# One TF-IDF row per row of the frame, in the same order.
tfidf_matrix = vectorizer.fit_transform(offers_reserved_by_the_users['description'])

# Elapsed time of the vectorisation, in minutes.
fin = time.time()
temps = (fin - debut)/60
print(temps)

### We calculate the similarity between the offers

In [None]:
debut = time.time()

# linear_kernel computes plain dot products between the TF-IDF rows. With the
# default L2 row normalisation of TfidfVectorizer (no `norm` argument is passed
# when the vectorizer is built) that dot product is the cosine similarity, and
# it is faster than cosine_similarity on a very large amount of data.
cosinus_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)

# Elapsed time of the similarity computation, in minutes.
fin = time.time()
temps = (fin - debut)/60
print(temps)

### For each offer, we retrieve the offers that are most similar

In [None]:
debut = time.time()

# Number of similar offers stored for each offer.
NB_SIMILAR_OFFERS = 4

# Results as {offer_id: [(score, similar_offer_id), ...]}, best score first.
result = {}

offer_ids = offers_reserved_by_the_users['offer_id'].values

# Walk the rows by position so that the row number always matches the row of
# the similarity matrix, whatever the index labels of the frame are.
for position, current_offer_id in enumerate(offer_ids):
    # The frame holds one row per booking, so an offer can occur many times;
    # every occurrence has the same description, hence the same similarities.
    if current_offer_id in result:
        continue

    scores = cosinus_similarity[position]
    similar_offers = []
    # Ids that must not be recommended: the offer itself (it can occupy any of
    # the top ranks when several rows tie at the maximum score) and the offers
    # already selected.
    already_seen = {current_offer_id}

    # Candidates from the highest to the lowest score.
    for candidate in scores.argsort()[::-1]:
        candidate_offer_id = offer_ids[candidate]
        if candidate_offer_id in already_seen:
            continue
        already_seen.add(candidate_offer_id)
        similar_offers.append((scores[candidate], candidate_offer_id))
        if len(similar_offers) == NB_SIMILAR_OFFERS:
            break

    result[current_offer_id] = similar_offers

# Elapsed time of the loop, in minutes.
fin = time.time()
temps = (fin - debut)/60
print(temps)

In [None]:
# Start of the timed section of this cell.
debut = time.time()

# Name of an offer
def nom(id):
    """Return the name found on the first row whose ``offer_id`` equals ``id``.

    Raises IndexError when no row of ``offers_reserved_by_the_users`` matches.
    """
    matching_rows = offers_reserved_by_the_users['offer_id'] == id
    names = offers_reserved_by_the_users.loc[matching_rows, 'name']
    return names.tolist()[0]

# Description of an offer
def description(id):
    """Return the description found on the first row whose ``offer_id`` equals ``id``.

    Raises IndexError when no row of ``offers_reserved_by_the_users`` matches.
    """
    matching_rows = offers_reserved_by_the_users['offer_id'] == id
    descriptions = offers_reserved_by_the_users.loc[matching_rows, 'description']
    return descriptions.tolist()[0]

# Id of an offer, as stored in the frame
def offer_id(id):
    """Return the ``offer_id`` value of the first row whose ``offer_id`` equals ``id``.

    Raises IndexError when no row of ``offers_reserved_by_the_users`` matches.
    """
    matching_rows = offers_reserved_by_the_users['offer_id'] == id
    ids = offers_reserved_by_the_users.loc[matching_rows, 'offer_id']
    return ids.tolist()[0]

# Print 'num' offers similar to the offer 'id'
def similar(id, num):
    """Print the offer ``id`` followed by up to ``num`` of its similar offers.

    Parameters
    ----------
    id : offer id; it is looked up in the global ``result`` dictionary and
        passed to ``nom``, ``description`` and ``offer_id``.
    num : int, maximum number of recommendations to print. With ``0`` only a
        message is printed.

    The recommendations are read from ``result[id]``, a list of
    ``(score, offer_id)`` tuples, and printed in the order in which they are
    stored. An offer that occurs several times in that list is printed once.

    Raises
    ------
    KeyError
        If ``num`` is not 0 and ``id`` is not a key of ``result``.
    """
    if num == 0:
        print("Cannot recommend an offer because you did not choose the number of offers to recommend")
        # Nothing to list: stop here instead of falling through to the lookup.
        return

    separator = "\n" + "###########################################################################################"

    print("We recommend " + str(num) + " similar offers to " + nom(id) + " \n"
          + "Description : " + description(id) + "\n"
          + "Id : " + str(offer_id(id)))
    print(separator)

    # Remove repeated offers *before* truncating, and keep the stored order:
    # going through set() would shuffle the ranking and could leave fewer than
    # 'num' offers even though more distinct ones are available.
    recs = []
    seen_offer_ids = set()
    for rec in result[id]:
        if rec[1] in seen_offer_ids:
            continue
        seen_offer_ids.add(rec[1])
        recs.append(rec)
    recs = recs[:num]

    for rec in recs:
        print(" \n You would also like the offer: " + nom(rec[1]) + " (score: %.3f" % rec[0] + ") \n"
              + "Description : " + description(rec[1]) + "\n"
              + "Id : " + str(offer_id(rec[1])))
        print(separator)
        
# NOTE(review): this cell only defines functions, so the elapsed time printed
# here (in minutes) is that of the definitions, not of a call to similar().
fin = time.time()
temps = (fin - debut)/60
print(temps)

### Offers that give very good examples:
- Michael Jackson's offer: id = 188929
- Dictionary's offer : id = 314706

In [None]:
# Example: print up to 4 recommendations for the offer with id 314706.
id_offre = 314706
similar(id_offre, 4)

### We retrieve the TF-IDF vector of an offer

In [None]:
# Label of the first row carrying the example offer. It is also used as the row
# number in the TF-IDF matrix, which requires the frame to have a 0..n-1 index.
index = offers_reserved_by_the_users[offers_reserved_by_the_users['offer_id']==id_offre].index[0]
tfidf_of_the_offer = tfidf_matrix[index]

print("The title of the offer is: ", offers_reserved_by_the_users['name'][index], "\n" )

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(): use whichever one this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Put the TF-IDF weights of the offer in a dataframe, one row per vocabulary
# word, and show the 20 heaviest words.
df_tfidf = pd.DataFrame(tfidf_of_the_offer.T.todense(), index=feature_names, columns=["tfidf"])
df_tfidf = df_tfidf.sort_values(by=["tfidf"], ascending=False)
print(df_tfidf.head(20))

### All the words in the vocabulary (built from all the descriptions):

In [None]:
print('All the words present in the description: ')
# Vocabulary learned by the vectorizer, as a plain list of strings.
# scikit-learn 1.0 introduced get_feature_names_out() (it returns an array) and
# 1.2 removed get_feature_names(): use whichever one this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    all_the_words_in_the_description = vectorizer.get_feature_names_out().tolist()
else:
    all_the_words_in_the_description = vectorizer.get_feature_names()
print(all_the_words_in_the_description)

In [None]:
# Size of the vocabulary learned by the vectorizer.
len(all_the_words_in_the_description)