In [None]:
import math
import os
import time

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sqlalchemy import create_engine
from stop_words import get_stop_words

# Connection URL of the pass_culture PostgreSQL database. It can be overridden
# with the PASS_CULTURE_DATABASE_URL environment variable so that credentials
# do not have to be written in the notebook; the fallback is the URL this
# notebook has always used (localhost, port 5434).
# The scheme is "postgresql://": SQLAlchemy 1.4+ rejects the old "postgres://" alias.
DATABASE_URL = os.environ.get(
    'PASS_CULTURE_DATABASE_URL',
    'postgresql://pass_culture:passq@localhost:5434/pass_culture?sslmode=prefer',
)
engine = create_engine(DATABASE_URL)
# Connection used by the SQL query cells of this notebook.
connection = engine.connect()

# Diversification within the same type
## Based on the Extradata column

### Distribution of offer types in the `discovery_view` table, to decide within which type we will diversify

In [None]:
# Load the "type" column of every row of discovery_view.
discovery_view = pd.read_sql_query(
    sql="""SELECT type  FROM discovery_view""",
    con=connection,
)
discovery_view

In [None]:
# Number of offers per type, most frequent first. value_counts() is computed
# once and reused for the x axis, the bar heights and the bar labels.
type_counts = discovery_view['type'].value_counts()

layout = dict(title = 'Distribution of {} offers'.format(len(discovery_view['type'])),
              xaxis = dict(title = 'Types'),
              yaxis = dict(title = "Number of offers"))

fig = go.Figure(data=[go.Bar(
            x = type_counts.index,
            y = type_counts.values,
            text = type_counts.values,  # print the count on each bar
            textposition = 'auto',
        )], layout=layout)

fig.show()

### We get the extradata column from the offer table

In [None]:
# Load the type and the "extraData" payload of every offer.
extradata = pd.read_sql_query(
    sql="""SELECT type, "extraData"  FROM offer """,
    con=connection,
)
extradata

## Diversification within the ThingType.LIVRE_EDITION type

In [None]:
# Load id, type and "extraData" for the offers of type ThingType.LIVRE_EDITION only.
extradata_livres = pd.read_sql_query(
    sql="""SELECT id, type, "extraData"  
                                        FROM offer 
                                        WHERE type = 'ThingType.LIVRE_EDITION' """,
    con=connection,
)
extradata_livres

In [None]:
# Every key found in at least one "extraData" dictionary, without duplicates and
# in order of first appearance. Rows whose extraData is None are skipped.
feature_of_the_book = list(dict.fromkeys(
    key
    for data in extradata_livres['extraData']
    if data is not None
    for key in data.keys()
))

print("The characteristics that can be recovered from the Extradata column are: ")
feature_of_the_book

In [None]:
# Expand the "extraData" dictionaries into one column per characteristic.
# A row gets None for a characteristic when it has no extraData at all, or when
# its dictionary does not contain that key (dict.get returns None in that case).
for feature in feature_of_the_book:
    extradata_livres[feature] = extradata_livres['extraData'].apply(
        # `is not None` is the idiomatic (PEP 8) test for the None singleton.
        lambda x: x.get(feature) if x is not None else None
    )
extradata_livres

### We diversify according to the "rayon" characteristic

In [None]:
# Share of books whose "rayon" characteristic is filled in versus missing.
number_of_none = extradata_livres['rayon'].isna().sum()
values = [len(extradata_livres) - number_of_none, number_of_none]
labels = ["Not None", "None"]

fig = go.Figure(data=[go.Pie(labels=labels, values=values,
                             insidetextorientation='radial'
                            )])

# The CSS colour name is 'lightblue' (one word); 'light blue' with a space is
# not a valid colour name.
fig.update_traces(hoverinfo='label+percent', textinfo='value+percent',
                  marker=dict(colors=['lightblue', 'red']))

fig.show()

In [None]:
#We remove books that do not have the specified "rayon" characteristic
data_in_rayon = pd.DataFrame(extradata_livres['rayon'].unique(), columns=['rayon'])
data_in_rayon.dropna(inplace=True)
data_in_rayon.reset_index(drop=True, inplace=True)
data_in_rayon

In [None]:
debut = time.time()

# The vectorizer lowercases the text and strips accents *before* it filters stop
# words, so the stop list must go through the same normalisation. Otherwise any
# accented stop word (for instance "été" would be seen as "ete") is never
# removed, and scikit-learn warns that the stop words are inconsistent with the
# preprocessing. Running each stop word through an analyzer configured like the
# vectorizer yields exactly the tokens the vectorizer will compare against.
normalise = TfidfVectorizer(analyzer='word',
                            strip_accents='ascii',
                            lowercase=True).build_analyzer()
french_stop_words = sorted({token
                            for word in get_stop_words('french')
                            for token in normalise(word)})

vectorizer = TfidfVectorizer(analyzer='word',
                             stop_words=french_stop_words,
                             strip_accents='ascii',
                             lowercase=True)

# One TF-IDF row per distinct "rayon" label.
tfidf_matrix = vectorizer.fit_transform(data_in_rayon['rayon'])

fin = time.time()
temps = (fin - debut)/60  # elapsed time, in minutes
print(temps)

In [None]:
debut = time.time()

# Pairwise similarity between all rayons. linear_kernel (plain dot product) is
# used in place of cosine_similarity because it is faster on large amounts of data.
cosinus_similarity = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

fin = time.time()
temps = (fin - debut) / 60  # elapsed time, in minutes
print(temps)

In [None]:
debut = time.time()

# Number of neighbours kept for each rayon.
TOP_N = 10

# Rayon labels as an array, so that they are addressed by position exactly like
# the rows and columns of cosinus_similarity.
rayons = data_in_rayon['rayon'].to_numpy()

# Results, in the form {rayon: [(score, similar rayon), ...]}, each list being
# sorted by decreasing similarity score.
the_ten_most_similar_rayon = {}
for idx, rayon in enumerate(rayons):
    # Positions of all rayons, from the most to the least similar to this one.
    ranked_indexes = cosinus_similarity[idx].argsort()[::-1]
    # Remove the rayon itself by position rather than assuming it is ranked
    # first: when scores are tied (identical or all-zero TF-IDF vectors) the
    # self-match is not guaranteed to come first.
    neighbour_indexes = ranked_indexes[ranked_indexes != idx][:TOP_N]
    the_ten_most_similar_rayon[rayon] = [
        (cosinus_similarity[idx][i], rayons[i]) for i in neighbour_indexes
    ]

fin = time.time()
temps = (fin - debut)/60  # elapsed time, in minutes
print(temps)

In [None]:
# Display the mapping from each rayon to its list of (similarity score, rayon) neighbours.
the_ten_most_similar_rayon

## Diversification within another type

In [None]:
# Distinct offer types present in the offer table.
extradata['type'].unique()

In [None]:
# Load id, type and "extraData" for the offers of type EventType.CINEMA only.
extradata_of_a_type = pd.read_sql_query(
    sql="""SELECT id, type, "extraData"  
                                        FROM offer 
                                        WHERE type = 'EventType.CINEMA' """,
    con=connection,
)
extradata_of_a_type

In [None]:
# Share of the selected offers that have an "extraData" payload versus none.
number_of_none = extradata_of_a_type['extraData'].isna().sum()
labels = ["Not none", "None"]
values = [len(extradata_of_a_type) - number_of_none, number_of_none]

pie_trace = go.Pie(
    labels=labels,
    values=values,
    textinfo='value+percent',
    insidetextorientation='radial',
)
fig = go.Figure(data=[pie_trace])
fig.show()

In [None]:
# Every key found in at least one "extraData" dictionary, without duplicates and
# in order of first appearance. Rows whose extraData is None are skipped.
feature_of_a_type = list(dict.fromkeys(
    key
    for data in extradata_of_a_type['extraData']
    if data is not None
    for key in data.keys()
))

print("Les caractéristiques que l'on peut récupérer de la colonne Extradata sont : ")
feature_of_a_type

In [None]:
# Expand the "extraData" dictionaries into one column per characteristic.
# A row gets None for a characteristic when it has no extraData at all, or when
# its dictionary does not contain that key (dict.get returns None in that case).
for feature in feature_of_a_type:
    extradata_of_a_type[feature] = extradata_of_a_type['extraData'].apply(
        # `is not None` is the idiomatic (PEP 8) test for the None singleton.
        lambda x: x.get(feature) if x is not None else None
    )
extradata_of_a_type

In [None]:
# Distinct values taken by the "showSubType" characteristic among the selected offers.
extradata_of_a_type['showSubType'].unique()