In [None]:
from passculture_recommendations.features.feature_engineering import get_featuresframe_of_all_the_interactions_from_0_to_5
from passculture_recommendations.features.feature_engineering import get_offers_that_interest_and_dont_interest_the_users
from passculture_recommendations.features.distribution_of_ratings_by_category import distribution_of_ratings
from passculture_recommendations.personalisation.svd import svd_for_the_recommendation_testing
from passculture_recommendations.personalisation.svd import svd_for_the_recommendation_training
from passculture_recommendations.personalisation.metrics import accuracy_recall_precision_f1
from passculture_recommendations.personalisation.metrics import plot_confusion_matrix
from passculture_recommendations.personalisation.metrics import plot_roc_curve
from passculture_recommendations.personalisation.metrics import find_the_thresholds_to_have_a_good_recall
from passculture_recommendations.features.feature_engineering import get_a_df_from_sql_query
from passculture_recommendations.features.add_support import add_support_in_type
from passculture_recommendations.features.recommendable_offers import get_all_the_recommendable_offers
from passculture_recommendations.features.recommendable_offers import get_all_the_recommendable_offers_from_bretagne

import os
import time

import numpy as np
import pandas as pd

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sqlalchemy import create_engine
# Read the database URL from the environment so that real credentials never have
# to be written in the notebook; the fallback is the original local development URL.
# The scheme is "postgresql://": SQLAlchemy 1.4+ no longer accepts the legacy
# "postgres://" alias.
DATABASE_URL = os.environ.get(
    'PASS_CULTURE_DATABASE_URL',
    'postgresql://pass_culture:passq@localhost:5434/pass_culture?sslmode=prefer',
)
engine = create_engine(DATABASE_URL)
connection = engine.connect()

# Récupération des données

### Offres notées par les utilisateurs de 0 à 5
- Offres achetées : 5
- Offres achetées et pas consommées : 4 
- Offres achetées et annulées : 3 
- Offres mises en favoris : 2
- Offres cliquées : 1 
- Offres ignorées : 0 

In [None]:
# Disabled cell: the extraction of all the interactions is kept inside a string
# literal, so it is never executed; a later cell loads the cached CSV instead.
# NOTE(review): the call below uses `get_dataframe_of_all_the_interactions_from_0_to_5`,
# while the import cell brings in `get_featuresframe_of_all_the_interactions_from_0_to_5`
# — confirm which name the module actually exposes before re-enabling this cell.
"""debut = time.time()

offers_graded_from_0_to_5 = get_dataframe_of_all_the_interactions_from_0_to_5()

fin = time.time()
temps = (fin - debut)/60
print(temps)"""

In [None]:
# Disabled cell (string literal, never executed): would save the computed ratings
# as a tab-separated CSV file.
"""#On enregistre en csv 
offers_graded_from_0_to_5.to_csv('offers_graded_from_0_to_5.csv', sep = '\t', index=False)"""

In [None]:
# Load the cached 0-5 ratings (tab-separated file) instead of recomputing them.
offers_graded_from_0_to_5 = pd.read_csv(
    'offers_graded_from_0_to_5.csv',
    sep='\t',
)

### Offres notées par les utilisateurs (notes binaires)
- Note 1 : offres mises en favoris, achetées et annulées, achetées et pas consommées, achetées 
- Note 0 : offres ignorées ou juste cliquées

In [None]:
# Convert the 0-5 ratings into binary ratings and print the elapsed time in minutes.
debut = time.time()
offers_graded_from_0_to_1 = get_offers_that_interest_and_dont_interest_the_users(
    offers_graded_from_0_to_5
)
fin = time.time()

temps = (fin - debut) / 60
print(temps)

In [None]:
# Number of distinct user ids in the binary ratings (a missing id counts as one value).
offers_graded_from_0_to_1['user_id'].nunique(dropna=False)

In [None]:
# Bar chart of the binary ratings; each bar is labelled with its share of all the ratings.
data = offers_graded_from_0_to_1['note'].value_counts().sort_index(ascending=False)
number_of_ratings = offers_graded_from_0_to_1.shape[0]
share_of_each_rating = data.values / number_of_ratings * 100

trace = go.Bar(
    x=data.index,
    y=data.values,
    text=['{:.1f} %'.format(val) for val in share_of_each_rating],
    textposition='auto',
    textfont=dict(color='#000000'),
)

layout = dict(
    title='Distribution de {} notes'.format(number_of_ratings),
    xaxis=dict(title='Notes'),
    yaxis=dict(title="Nombre de notes"),
)

fig = go.Figure(data=[trace], layout=layout)
fig.show()

### Distribution des notes par type et par isVirtual

In [None]:
# Rating distribution grouped by the 'type' column; the charts below read its
# 'type', 'total' and 'pourcentage_note1' columns.
number_of_rates_per_type = distribution_of_ratings(offers_graded_from_0_to_1, 'type')
number_of_rates_per_type

In [None]:
# Dual-axis chart: 'total' per type as bars (left axis) and 'pourcentage_note1'
# per type as a line (right axis).
fig = make_subplots(specs=[[{"secondary_y": True}]])

bars_of_the_totals = go.Bar(
    x=number_of_rates_per_type['type'],
    y=number_of_rates_per_type['total'],
    name="Nombre de notes",
)
line_of_the_percentages = go.Scatter(
    x=number_of_rates_per_type['type'],
    y=number_of_rates_per_type['pourcentage_note1'],
    name="%tage note 1",
)
fig.add_trace(bars_of_the_totals, secondary_y=False)
fig.add_trace(line_of_the_percentages, secondary_y=True)

# Chart title
fig.update_layout(title_text='Distribution des notes par type')

# Title of the x axis
fig.update_xaxes(title_text="types")

# Titles of the two y axes
fig.update_yaxes(title_text="Nombre de notes", secondary_y=False)
fig.update_yaxes(title_text="Pourcentage dans la note 1", secondary_y=True)

fig.show()

In [None]:
# Rating distribution grouped by the 'isVirtual' column; the chart below reads its
# 'isVirtual', 'total' and 'pourcentage_note1' columns.
number_of_rates_per_isVirtual = distribution_of_ratings(offers_graded_from_0_to_1, 'isVirtual')
number_of_rates_per_isVirtual

In [None]:
# Dual-axis chart: 'total' per isVirtual value as bars (left axis) and
# 'pourcentage_note1' per isVirtual value as a line (right axis).
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Bar(x=number_of_rates_per_isVirtual['isVirtual'],
           y=number_of_rates_per_isVirtual['total'],
           name="Nombre de notes"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=number_of_rates_per_isVirtual['isVirtual'],
               y=number_of_rates_per_isVirtual['pourcentage_note1'],
               name="%tage note 1"),
    secondary_y=True,
)

# Chart title
fig.update_layout(title_text='Distribution des notes par isVirtual')

# Title of the x axis: the axis shows isVirtual values, not offer types
# (the original "Types" label was copied from the per-type chart).
fig.update_xaxes(title_text="isVirtual")

# Titles of the two y axes
fig.update_yaxes(title_text="Nombre de notes", secondary_y=False)
fig.update_yaxes(title_text="Pourcentage dans la note 1", secondary_y=True)

fig.show()

# Prédictions
### Pour chaque couple (user, offer) du testset, on prédit une note entre 0 et 1
- Plus le score est proche de 1, plus l'utilisateur est susceptible d'être intéressé par l'offre
- On utilise l'algorithme SVD pour faire ces prédictions

In [None]:
# Disabled cell (string literal, never executed): SVD training and testing.
# NOTE(review): this is the only place where `algo` is assigned, yet a later cell
# passes `algo` to get_the_prediction_for_one_user — on a fresh kernel that name is
# presumably undefined; confirm before relying on Restart & Run All.
"""debut = time.time()

algo = svd_for_the_recommendation_training(offers_graded_from_0_to_1)
predictions_of_the_grades = svd_for_the_recommendation_testing(algo)

fin = time.time()
temps = (fin - debut)/60
print(temps)"""

In [None]:
# Disabled cell (string literal, never executed): would save the predictions as a
# tab-separated CSV file.
"""#On enregistre en csv 
predictions_of_the_grades.to_csv('predictions_of_the_grades.csv', sep = '\t', index=False)"""

In [None]:
# Load the cached predictions (tab-separated file) instead of retraining the model.
predictions_of_the_grades = pd.read_csv(
    'predictions_of_the_grades.csv',
    sep='\t',
)

In [None]:
# Bar chart of the predicted scores after rounding them with Python's round();
# each bar is labelled with its share of all the predictions.
rounded_scores = predictions_of_the_grades['score'].apply(round)
data = rounded_scores.value_counts().sort_index(ascending=False)
number_of_predictions = predictions_of_the_grades.shape[0]
share_of_each_rating = data.values / number_of_predictions * 100

trace = go.Bar(
    x=data.index,
    y=data.values,
    text=['{:.1f} %'.format(val) for val in share_of_each_rating],
    textposition='auto',
    textfont=dict(color='#000000'),
)

layout = dict(
    title='Prédiction : Distribution de {} notes'.format(number_of_predictions),
    xaxis=dict(title='Notes'),
    yaxis=dict(title="Nombre de notes"),
)

fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [None]:
# Bring back the offer types and the isVirtual column by joining the predictions
# with the binary ratings on (user_id, offer_id, note).
predictions_of_the_grades = predictions_of_the_grades.merge(
    offers_graded_from_0_to_1,
    on=['user_id', 'offer_id', 'note'],
)
predictions_of_the_grades.head()

### Distribution des notes par type et par isVirtual

In [None]:
# Same per-'type' distribution as above, computed on the predictions frame.
number_of_rates_per_type_in_the_predictions = distribution_of_ratings(predictions_of_the_grades, 'type')
number_of_rates_per_type_in_the_predictions

In [None]:
# Dual-axis chart for the predictions: 'total' per type as bars (left axis) and
# 'pourcentage_note1' per type as a line (right axis).
fig = make_subplots(specs=[[{"secondary_y": True}]])

bars_of_the_totals = go.Bar(
    x=number_of_rates_per_type_in_the_predictions['type'],
    y=number_of_rates_per_type_in_the_predictions['total'],
    name="Nombre de notes",
)
line_of_the_percentages = go.Scatter(
    x=number_of_rates_per_type_in_the_predictions['type'],
    y=number_of_rates_per_type_in_the_predictions['pourcentage_note1'],
    name="%tage note 1",
)
fig.add_trace(bars_of_the_totals, secondary_y=False)
fig.add_trace(line_of_the_percentages, secondary_y=True)

# Chart title
fig.update_layout(title_text='Distribution des notes par type dans la prédiction')

# Title of the x axis
fig.update_xaxes(title_text="types")

# Titles of the two y axes
fig.update_yaxes(title_text="Nombre de notes", secondary_y=False)
fig.update_yaxes(title_text="Pourcentage dans la note 1", secondary_y=True)

fig.show()

In [None]:
# Same per-'isVirtual' distribution as above, computed on the predictions frame.
number_of_rates_per_isVirtual_in_the_predictions = distribution_of_ratings(predictions_of_the_grades, 'isVirtual')
number_of_rates_per_isVirtual_in_the_predictions

In [None]:
# Dual-axis chart for the predictions: 'total' per isVirtual value as bars (left
# axis) and 'pourcentage_note1' per isVirtual value as a line (right axis).
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Bar(x=number_of_rates_per_isVirtual_in_the_predictions['isVirtual'],
           y=number_of_rates_per_isVirtual_in_the_predictions['total'],
           name="Nombre de notes"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=number_of_rates_per_isVirtual_in_the_predictions['isVirtual'],
               y=number_of_rates_per_isVirtual_in_the_predictions['pourcentage_note1'],
               name="%tage note 1"),
    secondary_y=True,
)

# Chart title
fig.update_layout(title_text='Distribution des notes par isVirtual dans la prédiction')

# Title of the x axis: the axis shows isVirtual values, not offer types
# (the original "Types" label was copied from the per-type chart).
fig.update_xaxes(title_text="isVirtual")

# Titles of the two y axes
fig.update_yaxes(title_text="Nombre de notes", secondary_y=False)
fig.update_yaxes(title_text="Pourcentage dans la note 1", secondary_y=True)

fig.show()

### Metrics
- Accuracy / Recall / Precision / F1
- Confusion matrix
- ROC curve
- Find the threshold to have a good recall
- New ROC curve with the threshold

In [None]:
# Metrics on hard predictions: the true label is 'note', the predicted label is
# 'score' passed through Python's round().
y_true = predictions_of_the_grades['note']
y_pred = predictions_of_the_grades['score'].apply(round)
accuracy_recall_precision_f1(y_true, y_pred)

In [None]:
# Confusion matrix for the y_true / y_pred pair defined in the previous cell.
plot_confusion_matrix(y_true, y_pred)

In [None]:
# ROC curve: this time y_pred holds the raw (unrounded) 'score' values.
y_true = predictions_of_the_grades['note']
y_pred = predictions_of_the_grades['score']
plot_roc_curve(y_true, y_pred)

In [None]:
# Result of the threshold search; the next cell reads the first value of its
# 'thresholds' column to binarise the scores.
roc = find_the_thresholds_to_have_a_good_recall(predictions_of_the_grades)
roc

In [None]:
# Binarise the predicted scores with the selected threshold: a score strictly
# above the threshold becomes 1, anything else becomes 0.
# The threshold is looked up once and the comparison is vectorised, instead of
# indexing `roc` again for every row inside a lambda.
threshold = roc['thresholds'].values[0]
predictions_of_the_grades['score_avec_seuil'] = (
    predictions_of_the_grades['score'] > threshold
).astype('int64')

y_true = predictions_of_the_grades['note']
y_pred = predictions_of_the_grades['score_avec_seuil']

plot_confusion_matrix(y_true, y_pred)

In [None]:
# Same metrics as before, computed on the thresholded predictions ('score_avec_seuil').
y_true = predictions_of_the_grades['note']
y_pred = predictions_of_the_grades['score_avec_seuil']
accuracy_recall_precision_f1(y_true, y_pred)

# Recommendation 

In [None]:
# Recommendable offers as returned by the project helper (called without arguments).
recommendable_offers_to_all_the_users = get_all_the_recommendable_offers()

### Get all the users/offers from Bretagne

In [None]:
# Subset for Bretagne, derived from the full set of recommendable offers
# (filtering rule lives in the helper — TODO confirm what it filters on).
recommendable_offers_to_all_the_users_in_bretagne = get_all_the_recommendable_offers_from_bretagne(recommendable_offers_to_all_the_users)

### Prediction for one user

In [None]:
# Ask the helper for the predictions of a single user over the recommendable offers.
# NOTE(review): `get_the_prediction_for_one_user` is not among the imports of the first
# cell, and `algo` is only assigned inside the disabled (string-quoted) training cell —
# on a fresh kernel this cell presumably raises NameError; confirm and restore both.
user_id = 25549
offers_recommended_to_a_user = get_the_prediction_for_one_user(user_id, recommendable_offers_to_all_the_users, algo)

In [None]:
# Display the predictions obtained for the selected user.
offers_recommended_to_a_user

# Algorithme Swap
## Diversification à travers tous les types
- Distance entre les offres - Réservation 

In [None]:
# Similarity matrix between types, loaded from a tab-separated file and indexed
# by its 'type' column (which is removed from the columns).
similarity_matrix = pd.read_csv(
    '../matrix_type_type_correlation_physique_numerique.csv',
    sep='\t',
).set_index('type')
similarity_matrix

In [None]:
# Ids of the offers listed in discovery_view.
query = """SELECT id as offer_id FROM discovery_view"""
discovery_view = get_a_df_from_sql_query(query, connection)

# Id and url of every row of the offer table.
query = """ SELECT "offer".id as offer_id, url FROM "offer" """

offers_with_url = get_a_df_from_sql_query(query, connection)

In [None]:
# Inner join on offer_id: keep only the predictions whose offer is in discovery_view.
predictions_of_the_grades = predictions_of_the_grades.merge(discovery_view, on='offer_id')

In [None]:
 predictions_of_the_grades = predictions_of_the_grades.merge(offers_with_url, on='offer_id')

In [None]:
# Predictions of user 63068, passed through add_support_in_type and ordered from
# the highest to the lowest score.
is_the_selected_user = predictions_of_the_grades['user_id'] == 63068
offers_recommended_to_a_user = add_support_in_type(
    predictions_of_the_grades[is_the_selected_user]
)
offers_recommended_to_a_user = offers_recommended_to_a_user.sort_values(
    by=['score'],
    ascending=False,
)
offers_recommended_to_a_user

In [None]:
#%pdb

In [None]:
# Swap-based diversification: start from the K best-scored offers, run K exchanges
# against the remaining offers and record, after each exchange, the average score,
# the similarity of the set and the number of offers per type.
# NOTE(review): create_dataframe_of_the_name_of_all_the_types,
# replace_dot_with_a_dash_in_the_column_type,
# add_one_offer_that_diversifies_the_recommended_offers, compute_similarity_of_the_set
# and number_of_offer_per_category are not imported in the first cell — confirm
# which modules provide them.
K = 50
N = len(offers_recommended_to_a_user)

# The frame was sorted by decreasing score in the previous cell: the first K rows
# form the current recommendation, every remaining row is an exchange candidate.
most_relevant_offers_recommended_to_a_user = offers_recommended_to_a_user[0:K]
# Fix: candidates start at position K; the original `K + 1` silently dropped the
# offer ranked right after the top K.
least_relevant_offers_recommended_to_a_user = offers_recommended_to_a_user[K:N]

average_of_the_score = []
similarity_of_the_set = []

number_of_offers_per_type_at_each_iteration = create_dataframe_of_the_name_of_all_the_types(
        offers_recommended_to_a_user)
number_of_offers_per_type_at_each_iteration = replace_dot_with_a_dash_in_the_column_type(
        number_of_offers_per_type_at_each_iteration)

number_of_exchanges = K

for i in range(0, number_of_exchanges):

    (most_relevant_offers_recommended_to_a_user,
     least_relevant_offers_recommended_to_a_user,
     number_of_offers_per_type) = add_one_offer_that_diversifies_the_recommended_offers(
        most_relevant_offers_recommended_to_a_user,
        least_relevant_offers_recommended_to_a_user,
        similarity_matrix)

    # number_of_exchanges equals K here, so this is the sum of the scores divided by K.
    average_of_the_score.append(sum(most_relevant_offers_recommended_to_a_user['score']) / number_of_exchanges)

    similarity_of_the_set.append(compute_similarity_of_the_set(number_of_offers_per_type, similarity_matrix, number_of_exchanges)[0])

    number_of_offers_per_type_for_this_iteration = number_of_offer_per_category.compute_number_of_offer_per_category(
                most_relevant_offers_recommended_to_a_user, 'type', i)
    number_of_offers_per_type_for_this_iteration = replace_dot_with_a_dash_in_the_column_type(
                number_of_offers_per_type_for_this_iteration)

    # Outer join on 'type' so that a type absent from this iteration is kept.
    number_of_offers_per_type_at_each_iteration = number_of_offers_per_type_at_each_iteration.merge(
                number_of_offers_per_type_for_this_iteration, how='outer', left_on='type', right_on='type')


### Moyenne des scores après chaque échange

In [None]:
# Line chart of the average score recorded after each exchange.
x = np.arange(number_of_exchanges)
y = average_of_the_score

layout = dict(
    title="Average scores after each exchange of offers",
    xaxis=dict(title="Number of exchanges"),
    yaxis=dict(title="Average scores"),
)
curve_of_the_average_scores = go.Scatter(x=x, y=y)

fig = go.Figure(data=curve_of_the_average_scores, layout=layout)
fig.show()

### Similarité de l'ensemble après chaque échange 

In [None]:
# Line chart of the similarity of the set recorded after each exchange.
x = np.arange(number_of_exchanges)
y = similarity_of_the_set

layout = dict(
    title="Similarity of the set after each exchange of offers",
    xaxis=dict(title="Number of exchanges"),
    yaxis=dict(title="Similarity of the set"),
)
curve_of_the_similarities = go.Scatter(x=x, y=y)

fig = go.Figure(data=curve_of_the_similarities, layout=layout)
fig.show()

### Similarité de l'ensemble avec le best et worst case

In [None]:
# Similarity of the set after each exchange, with two horizontal reference lines
# for the worst and the best case.
# NOTE(review): worst_case_in_the_diversification_per_type and
# best_case_in_the_diversification_per_type are not imported in the first cell —
# confirm which module provides them.
# Fix: x follows number_of_exchanges (like the two previous charts) instead of a
# hard-coded 70, which no longer matches the number of recorded similarities.
x = np.arange(number_of_exchanges)
y = similarity_of_the_set

similarity_of_the_set_in_the_worst_case = worst_case_in_the_diversification_per_type(number_of_exchanges)
list_of_similarities_in_the_best_case = best_case_in_the_diversification_per_type(most_relevant_offers_recommended_to_a_user, number_of_exchanges, similarity_matrix)

layout = dict(title = "Similarity of the set after each exchange of offers with the best and worst case",
              xaxis = dict(title = "Number of exchanges"),
              yaxis = dict(title = "Similarity of the set"))

fig = go.Figure(layout=layout)

fig.add_trace(go.Scatter(x=x, y=y, mode='lines'))

# Horizontal line --> worst case
fig.add_shape(
    type="line",
            x0=0,
            y0=similarity_of_the_set_in_the_worst_case,
            x1=number_of_exchanges,
            y1=similarity_of_the_set_in_the_worst_case,
            line=dict(
                color="LightSeaGreen",
                width=4,
                dash="dashdot",
            ),
)

# Horizontal line --> best case (smallest similarity of the best-case list)
fig.add_shape(
    type="line",
            x0=0,
            y0=min(list_of_similarities_in_the_best_case),
            x1=number_of_exchanges,
            y1=min(list_of_similarities_in_the_best_case),
            line=dict(
                color="LightSeaGreen",
                width=4,
                dash="dashdot",
            ),
)

fig.show()

### Distribution des types après la diversification 

In [None]:
# Drop the types whose per-iteration columns are all empty.
# Fix: the per-iteration columns are derived from number_of_exchanges instead of a
# hard-coded 70, which no longer matches the K = 50 exchanges performed above.
# NOTE(review): assumes the per-iteration columns are labelled with the integer
# loop index passed to compute_number_of_offer_per_category — TODO confirm.
number_of_offers_per_type_at_each_iteration.dropna(
    axis=0,
    how='all',
    subset=list(range(number_of_exchanges)),
    inplace=True,
)
number_of_offers_per_type_at_each_iteration

In [None]:
#data = pd.read_csv('number_of_offers_per_type_in_the_diversification_per_type.csv', sep = '\t')
data = number_of_offers_per_type_at_each_iteration
# Fix: the last frame is derived from number_of_exchanges instead of the stale "69"
# (there are only number_of_exchanges iterations, labelled from 0).
# NOTE(review): `make_bar_chart` is not imported in the first cell, and the frame
# labels are passed as strings as in the original call — confirm both against the helper.
make_bar_chart(data, "type", "0", str(number_of_exchanges - 1),
               title = "Distribution of the types after each exchange", frame_rate = 1)

## Diversification au sein d'un même type
### Distribution des types dans la table discovery_view pour voir au sein de quel type on va diversifier 

In [None]:
# Type of every offer listed in discovery_view.
# Fix: call the imported function directly — the `feature_engineering` module name
# is never imported in this notebook, only `get_a_df_from_sql_query` is.
query = """SELECT type  FROM discovery_view"""
discovery_view = get_a_df_from_sql_query(query, connection)
discovery_view

In [None]:
# Bar chart of the number of offers per type in discovery_view, each bar labelled
# with its count.
number_of_offers_per_type_in_discovery_view = discovery_view['type'].value_counts()

layout = dict(
    title='Distribution of {} offers'.format(len(discovery_view['type'])),
    xaxis=dict(title='Types'),
    yaxis=dict(title="Number of offers"),
)

bars_of_the_types = go.Bar(
    x=number_of_offers_per_type_in_discovery_view.index,
    y=number_of_offers_per_type_in_discovery_view.values,
    text=number_of_offers_per_type_in_discovery_view.values,
    textposition='auto',
)

fig = go.Figure(data=[bars_of_the_types], layout=layout)
fig.show()

### On récupère la colonne ExtraData de la table offer

In [None]:
# Type and extraData of every row of the offer table.
# Fix: call the imported function directly — the `feature_engineering` module name
# is never imported in this notebook, only `get_a_df_from_sql_query` is.
query = """SELECT type, "extraData"  FROM offer """
extradata = get_a_df_from_sql_query(query, connection)
extradata

### Diversification au sein du type LIVRE_EDITION

In [None]:
# Id, type and extraData of the offers whose type is 'ThingType.LIVRE_EDITION'.
# Fix: call the imported function directly — the `feature_engineering` module name
# is never imported in this notebook, only `get_a_df_from_sql_query` is.
query = """SELECT id, type, "extraData" FROM offer WHERE type = 'ThingType.LIVRE_EDITION' """
extradata_livres = get_a_df_from_sql_query(query, connection)
extradata_livres

In [None]:
# Features extracted from the book offers by the project helper.
# NOTE(review): `get_feature_of_the_book` is not imported in the first cell —
# confirm which module provides it.
feature_of_the_book = get_feature_of_the_book(extradata_livres)
feature_of_the_book

In [None]:
# Add the feature columns to the book offers; the following cells read the
# resulting 'rayon' column.
# NOTE(review): `add_columns_of_the_features_in_df_extradata_livre` is not imported
# in the first cell — confirm which module provides it.
extradata_livres = add_columns_of_the_features_in_df_extradata_livre(extradata_livres, feature_of_the_book)
extradata_livres

### On diversifie en fonction de la caractéristique "rayon"

In [None]:
# Pie chart of the book offers with and without a 'rayon' value.
number_of_none = extradata_livres['rayon'].isna().sum()
values = [len(extradata_livres) - number_of_none, number_of_none]
labels = ["Not None", "None"]

fig = go.Figure(data=[go.Pie(labels=labels, values=values,
                             insidetextorientation='radial'
                            )])

# Fix: 'lightblue' is the CSS colour name; the original 'light blue' (with a space)
# is not a valid colour, so the first sector did not get the intended colour.
fig.update_traces(hoverinfo='label+percent', textinfo='value+percent',
                  marker=dict(colors=['lightblue', 'red']))

fig.show()




In [None]:
# Distinct 'rayon' values of the book offers, without the missing value and with a
# fresh 0..n-1 index.
data_in_rayon = (
    pd.DataFrame(extradata_livres['rayon'].unique(), columns=['rayon'])
    .dropna()
    .reset_index(drop=True)
)
data_in_rayon

In [None]:
# Similarity between the 'rayon' labels, computed by the project helper
# (cosine similarity on TF-IDF, per the helper's name — TODO confirm).
# NOTE(review): `get_cosinus_similarity_using_the_tfidf` is not imported in the first cell.
cosinus_similarity = get_cosinus_similarity_using_the_tfidf(data_in_rayon['rayon'])


In [None]:
# Most similar 'rayon' labels according to the similarity computed above.
# NOTE(review): `get_the_most_similar` is not imported in the first cell; the "ten"
# in the variable name is presumably the helper's default count — TODO confirm.
the_ten_most_similar_rayon = get_the_most_similar(data_in_rayon, cosinus_similarity)
the_ten_most_similar_rayon

## Diversification des offres en fonction du TF-IDF de leur description

In [None]:
# Disabled block (string literal, never executed): would recompute the predictions.
# NOTE(review): it calls `svd.svd_for_the_recommendation`, which is not imported and
# does not match the svd_for_the_recommendation_training / _testing helpers imported
# in the first cell — confirm before re-enabling.
"""debut = time.time()

predictions_of_the_grades = svd.svd_for_the_recommendation(offers_graded_from_0_to_1)

fin = time.time()
temps = (fin - debut)/60
print(temps)"""

# Reload the cached predictions, discarding the columns merged in by earlier cells.
predictions_of_the_grades = pd.read_csv('predictions_of_the_grades.csv', sep = '\t') 

In [None]:
# Bring back the offer types and the isVirtual column by joining the predictions
# with the binary ratings on (user_id, offer_id, note).
predictions_of_the_grades = predictions_of_the_grades.merge(
    offers_graded_from_0_to_1,
    on=['user_id', 'offer_id', 'note'],
)

In [None]:
# Fix (both calls): use the imported function directly — the `feature_engineering`
# module name is never imported in this notebook, only `get_a_df_from_sql_query` is.

# Ids of the offers listed in discovery_view.
query = """SELECT id as offer_id FROM discovery_view"""
discovery_view = get_a_df_from_sql_query(query, connection)

# Id and url of every row of the offer table.
query = """ SELECT "offer".id as offer_id, url FROM "offer" """
offers_with_url = get_a_df_from_sql_query(query, connection)

In [None]:
# Two inner joins on offer_id: keep the predictions whose offer is in
# discovery_view, then attach the url of each offer.
predictions_of_the_grades = (
    predictions_of_the_grades
    .merge(discovery_view, on='offer_id')
    .merge(offers_with_url, on='offer_id')
)

In [None]:
# Predictions of user 63068, ordered from the highest to the lowest score, then
# passed through add_support_in_type.
# Fix: call the imported function directly — the `add_support` module name is never
# imported in this notebook, only `add_support_in_type` is.
offers_recommended_to_a_user = predictions_of_the_grades[predictions_of_the_grades['user_id']==63068]
offers_recommended_to_a_user = offers_recommended_to_a_user.sort_values(by=['score'], ascending=False)
offers_recommended_to_a_user = add_support_in_type(offers_recommended_to_a_user)

In [None]:
# Display the predictions kept for the selected user.
offers_recommended_to_a_user

### On récupère la description des offres 

In [None]:
# Id and description of every row of the offer table.
# Fix: call the imported function directly — the `feature_engineering` module name
# is never imported in this notebook, only `get_a_df_from_sql_query` is.
query = """SELECT id as offer_id, description FROM offer """
all_the_offers = get_a_df_from_sql_query(query, connection)

In [None]:
# Attach the description of each offer (inner join on offer_id).
offers_recommended_to_a_user = offers_recommended_to_a_user.merge(
    all_the_offers,
    on="offer_id",
)
offers_recommended_to_a_user

In [None]:
# Keep only the offers that have a description and renumber the rows from 0.
offers_recommended_to_a_user = (
    offers_recommended_to_a_user
    .dropna(subset=['description'])
    .reset_index(drop=True)
)
offers_recommended_to_a_user

In [None]:
# Similarity between the offer descriptions, computed by the project helper
# (cosine similarity on TF-IDF, per the helper's name — TODO confirm).
# NOTE(review): `get_cosinus_similarity_using_the_tfidf` is not imported in the first cell.
cosinus_similarity = get_cosinus_similarity_using_the_tfidf(offers_recommended_to_a_user['description'])

### On fait K échanges pour diversifier l'ensemble 

In [None]:
# Run the description-based diversification; the helper returns four values.
# NOTE(review): the helper is not imported in the first cell — confirm its module.
(
    most_relevant_offers_recommended_to_a_user,
    number_of_offers_per_type,
    sum_of_the_score,
    similarity_of_the_set,
) = add_x_offers_that_diversifies_the_recommended_offers_using_the_description(
    offers_recommended_to_a_user,
    cosinus_similarity,
)


### On regarde le TF-IDF des mots de la description des offres que l'on recommande à un utilisateur 

In [None]:
# Words describing the recommended offers, computed by the project helper from the
# 'description' column.
# NOTE(review): `get_the_words_that_describe` is not imported in the first cell —
# confirm which module provides it.
tfidf_of_the_offers = get_the_words_that_describe(most_relevant_offers_recommended_to_a_user, 'description')
tfidf_of_the_offers