In [None]:
import time 
import pandas as pd 
import numpy as np
import math

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from stop_words import get_stop_words

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sqlalchemy import create_engine
# NOTE(review): the database credentials are hard-coded in this URL; prefer
# loading them from the environment before sharing or committing the notebook.
# The legacy "postgres://" scheme is rejected by SQLAlchemy 1.4+, whereas
# "postgresql://" is accepted by every SQLAlchemy version.
engine = create_engine('postgresql://pass_culture:passq@localhost:5434/pass_culture?sslmode=prefer')
connection = engine.connect()

# Diversification of offers according to the TF-IDF of their description

### Get the recommended offers

In [None]:
# Load the tab-separated export of the offers recommended to one user.
offers_recommended_to_a_user = pd.read_csv(
    '../offers_recommended_to_a_user.csv',
    sep='\t',
)

In [None]:
# Discard exact duplicate rows before working with the recommendations.
offers_recommended_to_a_user = offers_recommended_to_a_user.drop_duplicates()
offers_recommended_to_a_user

### Get the description of the offers

In [None]:
# Fetch the id, description and type of every row of the `offer` table.
offers_with_the_description = pd.read_sql_query(
    """SELECT id as offer_id, description, type FROM offer """,
    connection,
)
offers_with_the_description

In [None]:
# Attach the description and type to each recommended offer (default inner
# join on the shared `offer_id` column).
offers_recommended_to_a_user = offers_recommended_to_a_user.merge(
    offers_with_the_description,
    on="offer_id",
)
offers_recommended_to_a_user

In [None]:
# Offers without a description cannot be vectorised, so drop them, then
# renumber the rows 0..n-1.
offers_recommended_to_a_user = (
    offers_recommended_to_a_user
    .dropna(subset=['description'])
    .reset_index(drop=True)
)
offers_recommended_to_a_user

In [None]:
debut = time.time()

# Word-level TF-IDF of the descriptions: lowercased, accents stripped to ASCII,
# French stop words removed.
vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words=get_stop_words('french'),
    strip_accents='ascii',
    lowercase=True,
)

# One row per recommended offer, one column per vocabulary term.
tfidf_matrix = vectorizer.fit_transform(offers_recommended_to_a_user['description'])

fin = time.time()
temps = (fin - debut) / 60  # elapsed time in minutes
print(temps)

### Calculate the similarity between the offers

In [None]:
debut = time.time()

# Pairwise similarity between all offer descriptions. On TF-IDF vectors the
# linear kernel is used in place of cosine_similarity because it is faster on
# large amounts of data.
cosinus_similarity = linear_kernel(tfidf_matrix)

fin = time.time()
temps = (fin - debut) / 60  # elapsed time in minutes
print(temps)

In [None]:
def compute_similarity_of_the_offer_with_offers_using_the_index(index_of_the_offer, index_of_the_other_offers, similarity_matrix):
    """Return the mean similarity between one offer and a group of other offers.

    Args:
        index_of_the_offer: row of ``similarity_matrix`` for the offer.
        index_of_the_other_offers: columns of ``similarity_matrix`` for the
            offers it is compared with.
        similarity_matrix: 2-D array of pairwise similarities.

    Returns:
        The mean of the selected similarities (``np.mean``).
    """
    similarities_with_the_other_offers = similarity_matrix[index_of_the_offer, index_of_the_other_offers]
    return np.mean(similarities_with_the_other_offers)
    
def compute_similarity_for_each_offer(most_relevant_offers_recommended_to_a_user, similarity_matrix=None):
    """Store, for every offer, its mean similarity with the other offers of the set.

    The result is written to the ``'similarite'`` column of the DataFrame that
    is passed in (it is modified in place and also returned).

    Args:
        most_relevant_offers_recommended_to_a_user: DataFrame with an
            ``'index'`` column holding each offer's row in the similarity matrix.
        similarity_matrix: 2-D array of pairwise similarities. When omitted,
            the notebook-level ``cosinus_similarity`` matrix is used, which is
            what this function always read before the parameter existed.

    Returns:
        The same DataFrame, with the ``'similarite'`` column filled in.
    """
    if similarity_matrix is None:
        similarity_matrix = cosinus_similarity

    # The matrix positions do not change during the loop, so read them once.
    matrix_indices = most_relevant_offers_recommended_to_a_user['index'].to_numpy()
    row_labels = most_relevant_offers_recommended_to_a_user.index

    for label, index_of_the_offer in zip(row_labels, matrix_indices):
        # Compare the offer with every other offer of the set, never with itself.
        index_of_the_other_offers = matrix_indices[matrix_indices != index_of_the_offer]
        similarite = np.mean(similarity_matrix[index_of_the_offer, index_of_the_other_offers])
        most_relevant_offers_recommended_to_a_user.loc[label, 'similarite'] = similarite
    return most_relevant_offers_recommended_to_a_user

def get_real_index_of_the_offer_with_the_highest_similarity(dataframe_of_most_relevant_offers):
    """Return the ``'index'`` values of the offers whose ``'similarite'`` is maximal.

    Returns:
        A NumPy array with one entry per row that reaches the maximum.
    """
    similarities = dataframe_of_most_relevant_offers['similarite']
    reaches_the_maximum = similarities == similarities.max()
    return dataframe_of_most_relevant_offers.loc[reaches_the_maximum, 'index'].values

def get_index_of_the_offer_with_the_highest_similarity(dataframe_of_most_relevant_offers):
    """Return the DataFrame row label of the first offer with maximal ``'similarite'``.

    The offer is first identified by its ``'index'`` column value, then the
    label of the first row carrying that value is returned.
    """
    similarities = dataframe_of_most_relevant_offers['similarite']
    reaches_the_maximum = similarities == similarities.max()
    matrix_indices_of_the_most_similar = dataframe_of_most_relevant_offers.loc[reaches_the_maximum, 'index'].values

    # Keep only the first offer when several share the maximum.
    holds_that_offer = dataframe_of_most_relevant_offers['index'] == matrix_indices_of_the_most_similar[0]
    return dataframe_of_most_relevant_offers.loc[holds_that_offer].index[0]

def get_index_to_exchange_in_the_least_relevant_offers(dataframe_of_least_relevant_offers, dataframe_of_most_relevant_offers, real_index_of_the_offer_with_the_highest_similarity, index_of_the_offer_with_the_highest_similarity, similarity_matrix):
    """Find the first less relevant offer that would lower the similarity of the set.

    Candidates are examined in the row order of
    ``dataframe_of_least_relevant_offers``. A candidate is accepted when its
    mean similarity with the most relevant offers (excluding the offer being
    replaced) is strictly lower than the ``'similarite'`` of the offer being
    replaced.

    Args:
        dataframe_of_least_relevant_offers: candidate offers; needs an
            ``'index'`` column. The accepted candidate's similarity is written
            to its ``'similarite'`` column (in-place side effect).
        dataframe_of_most_relevant_offers: current set; needs ``'index'`` and
            ``'similarite'`` columns.
        real_index_of_the_offer_with_the_highest_similarity: sequence whose
            first element is the ``'index'`` value of the offer to replace.
        index_of_the_offer_with_the_highest_similarity: row label of the offer
            to replace in ``dataframe_of_most_relevant_offers``.
        similarity_matrix: 2-D array of pairwise similarities.

    Returns:
        The ``'index'`` value of the accepted candidate, or ``None`` when no
        candidate qualifies.
    """
    value_of_the_highest_similarity = dataframe_of_most_relevant_offers.loc[index_of_the_offer_with_the_highest_similarity, 'similarite']

    index_of_the_most_relevant_offers = dataframe_of_most_relevant_offers['index'].values
    index_of_the_most_relevant_offers_without_the_offer_with_the_highest_similarity = index_of_the_most_relevant_offers[
        index_of_the_most_relevant_offers != real_index_of_the_offer_with_the_highest_similarity[0]
    ]

    # Iterate over the column itself rather than iterrows(): iterrows() builds
    # one Series per row and upcasts integers to float when every column is
    # numeric, and a float cannot be used to index the similarity matrix.
    for candidate_index in dataframe_of_least_relevant_offers['index'].tolist():
        similarity_of_offer_in_the_least_relevant_offers = np.mean(
            similarity_matrix[candidate_index, index_of_the_most_relevant_offers_without_the_offer_with_the_highest_similarity]
        )
        if similarity_of_offer_in_the_least_relevant_offers < value_of_the_highest_similarity:
            is_the_candidate = dataframe_of_least_relevant_offers['index'] == candidate_index
            dataframe_of_least_relevant_offers.loc[is_the_candidate, 'similarite'] = similarity_of_offer_in_the_least_relevant_offers
            return candidate_index
    return None
        
def exchange_offers_from_one_df_to_another(df1, df2, index1, index2):
    """Overwrite the row labelled ``index1`` of ``df1`` with row ``index2`` of ``df2``.

    ``df1`` is modified in place and returned.
    """
    incoming_offer = df2.loc[index2]
    df1.loc[index1] = incoming_offer
    return df1

def drop_the_exchange_offer_from_the_least_relevant_offers(dataframe_of_offers, index_of_offer_to_drop):
    """Remove the row labelled ``index_of_offer_to_drop`` from ``dataframe_of_offers``.

    The DataFrame is modified in place and returned.
    """
    dataframe_of_offers.drop(index=index_of_offer_to_drop, inplace=True)
    return dataframe_of_offers

def compute_combination_of_k_among_n(k, n):
    """Return the number of ways to choose ``k`` items among ``n``, as a float.

    Computed as ``n! / (k! * (n - k)!)`` using true division.
    """
    arrangements_of_all = math.factorial(n)
    arrangements_of_the_chosen = math.factorial(k)
    arrangements_of_the_rest = math.factorial(n - k)
    return arrangements_of_all / (arrangements_of_the_chosen * arrangements_of_the_rest)

def compute_similarity_of_the_set(df1, similarity_matrix):
    """Return the average pairwise similarity of the offers listed in ``df1``.

    Every unordered pair of rows is counted once and the total is divided by
    the number of pairs actually present, ``n * (n - 1) / 2`` with
    ``n = len(df1)``. The result therefore stays a true average for sets of
    any size instead of being tied to the notebook-level ``K``, and rows are
    addressed by position, so ``df1`` does not need labels 0..n-1.

    Args:
        df1: DataFrame with an ``'index'`` column holding each offer's row in
            ``similarity_matrix``.
        similarity_matrix: 2-D array of pairwise similarities.

    Returns:
        The mean similarity over all pairs, or ``0.0`` when ``df1`` has fewer
        than two offers (there is no pair to average).
    """
    matrix_indices = df1['index'].to_numpy()
    number_of_offers = len(matrix_indices)
    if number_of_offers < 2:
        return 0.0

    similarity_of_the_set = 0.
    for position, index_of_the_offer in enumerate(matrix_indices[:-1]):
        # Only pair the offer with those after it, so each pair is counted once.
        index_of_the_following_offers = matrix_indices[position + 1:]
        similarity_of_the_set += similarity_matrix[index_of_the_offer, index_of_the_following_offers].sum()

    number_of_pairs = number_of_offers * (number_of_offers - 1) / 2
    return similarity_of_the_set / number_of_pairs

In [None]:
def create_dataframe_of_the_name_of_all_the_types(df):
    """Return a one-column DataFrame listing the distinct values of ``df['type']``.

    The values appear in the order produced by ``value_counts()``.
    """
    type_names = df['type'].value_counts().index
    return pd.DataFrame({'type': type_names})

def replace_dot_with_a_dash_in_the_column_type(df):
    """Replace every ``.`` with ``_`` in the ``'type'`` column of ``df``.

    Each value is converted with ``str()`` first. Despite the function name,
    the replacement character is an underscore. ``df`` is modified in place
    and returned.
    """
    df['type'] = [str(type_name).replace(".", "_") for type_name in df['type']]
    return df

In [None]:
def compute_number_of_offers_per_category(dataframe_of_offers, category, total):
    """Count the rows of ``dataframe_of_offers`` for each value of ``category``.

    Args:
        dataframe_of_offers: DataFrame containing the ``category`` column.
        category: name of the column to count; also the name of the first
            output column.
        total: name given to the output column holding the counts.

    Returns:
        A two-column DataFrame (``category``, ``total``) in ``value_counts()`` order.
    """
    occurrences = dataframe_of_offers[category].value_counts()
    counts_per_category = pd.DataFrame(columns=[category, total])
    counts_per_category[category] = occurrences.index
    counts_per_category[total] = occurrences.array
    return counts_per_category

### We look at the TF-IDF of the words in the description of the offers that we recommend to a user

In [None]:
# Size of the recommended set that is inspected and later diversified.
K = 40
# Take the first K recommended offers as the most relevant ones.
most_relevant_offers_recommended_to_a_user = offers_recommended_to_a_user.iloc[:K]
most_relevant_offers_recommended_to_a_user

In [None]:
# The vocabulary is the same for every offer, so fetch it once before the loop.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One column per offer, holding its 10 terms with the highest TF-IDF weight.
tfidf_of_the_offers = pd.DataFrame(columns = np.arange(len(most_relevant_offers_recommended_to_a_user)))
for i, row in most_relevant_offers_recommended_to_a_user.iterrows():
    # The row label doubles as the row of the TF-IDF matrix: the frame was
    # re-indexed from 0 before the vectorizer was fitted on it.
    tfidf_de_loffre = tfidf_matrix[i]

    # Put the TF-IDF weights of this offer in a DataFrame indexed by term.
    df_tfidf = pd.DataFrame(tfidf_de_loffre.T.toarray(), index = feature_names, columns=["tfidf"])
    df_tfidf = df_tfidf.sort_values(by=["tfidf"], ascending=False).head(10)

    tfidf_of_the_offers[i] = df_tfidf.index
    # Label the column with the offer's type to make the table readable.
    tfidf_of_the_offers = tfidf_of_the_offers.rename(columns={i: row['type']})

tfidf_of_the_offers

### We do K exchanges to diversify the set

In [None]:
debut = time.time()

K = 40
N = len(offers_recommended_to_a_user)

# Remember each offer's row in the similarity matrix before the frames are split.
offers_recommended_to_a_user['index'] = offers_recommended_to_a_user.index

# Work on copies: both frames are modified below (new column, replaced and
# dropped rows) and must not be views of the original frame.
most_relevant_offers_recommended_to_a_user = offers_recommended_to_a_user[0:K].copy()
# Start at K, not K+1, so the offer ranked right after the top K stays a candidate.
least_relevant_offers_recommended_to_a_user = offers_recommended_to_a_user[K:N].copy()

sum_of_the_score = []
similarity_of_the_set = []

number_of_offers_per_type = create_dataframe_of_the_name_of_all_the_types(offers_recommended_to_a_user)
number_of_offers_per_type = replace_dot_with_a_dash_in_the_column_type(number_of_offers_per_type)

number_of_exchanges = 40

for i in range(0,number_of_exchanges):

    # Average score of the current set, recorded before attempting the exchange.
    sum_of_the_score.append(sum(most_relevant_offers_recommended_to_a_user['score'])/K)

    most_relevant_offers_recommended_to_a_user = compute_similarity_for_each_offer(most_relevant_offers_recommended_to_a_user)

    # The offer most similar to the rest of the set is the one to replace:
    # first as its similarity-matrix index, then as its row label.
    real_index_of_the_offer_with_the_highest_similarity = get_real_index_of_the_offer_with_the_highest_similarity(most_relevant_offers_recommended_to_a_user)

    index_of_the_offer_with_the_highest_similarity = get_index_of_the_offer_with_the_highest_similarity(most_relevant_offers_recommended_to_a_user)

    index_to_exchange_in_the_least_relevant_offers = get_index_to_exchange_in_the_least_relevant_offers(least_relevant_offers_recommended_to_a_user, most_relevant_offers_recommended_to_a_user, real_index_of_the_offer_with_the_highest_similarity, index_of_the_offer_with_the_highest_similarity, cosinus_similarity)

    # None means no remaining candidate lowers the similarity: nothing to exchange.
    if index_to_exchange_in_the_least_relevant_offers is not None :

        most_relevant_offers_recommended_to_a_user = exchange_offers_from_one_df_to_another(most_relevant_offers_recommended_to_a_user, least_relevant_offers_recommended_to_a_user, index_of_the_offer_with_the_highest_similarity, index_to_exchange_in_the_least_relevant_offers)

        least_relevant_offers_recommended_to_a_user = drop_the_exchange_offer_from_the_least_relevant_offers(least_relevant_offers_recommended_to_a_user, index_to_exchange_in_the_least_relevant_offers)

        similarity_of_the_set.append(compute_similarity_of_the_set(most_relevant_offers_recommended_to_a_user, cosinus_similarity))

        # Per-type counts of the set after this exchange, stored in column `i`.
        number_of_offers_per_type_for_this_iteration = compute_number_of_offers_per_category(most_relevant_offers_recommended_to_a_user, 'type', i)
        number_of_offers_per_type_for_this_iteration = replace_dot_with_a_dash_in_the_column_type(number_of_offers_per_type_for_this_iteration)

        number_of_offers_per_type = number_of_offers_per_type.merge(number_of_offers_per_type_for_this_iteration, how='outer', left_on='type', right_on='type')

fin = time.time()
temps = (fin - debut)/60  # elapsed time in minutes
print(temps)

### Average scores after each exchange

In [None]:
# One point per loop iteration: the average score of the set at that iteration.
x = np.arange(number_of_exchanges)
y = sum_of_the_score

layout = {
    "title": "Average scores after each exchange ",
    "xaxis": {"title": "Number of exchanges"},
    "yaxis": {"title": "Average scores"},
}

fig = go.Figure(
    data=go.Scatter(x=x, y=y),
    layout=layout,
)
fig.show()

### Similarity of the set after each exchange

In [None]:
# Average pairwise similarity of the set, recorded after each exchange.
x = np.arange(number_of_exchanges)
y = similarity_of_the_set

layout = {
    "title": "Similarity after each exchange",
    "xaxis": {"title": "Number of exchanges"},
    "yaxis": {"title": "Similarity"},
}

fig = go.Figure(
    data=go.Scatter(x=x, y=y),
    layout=layout,
)
fig.show()

### Best/Worst case

### Best case
- We draw 40 offers at random from the discovery view
- We compute the similarity of the set
- We repeat this 1000 times and keep the smallest similarity

In [None]:
# Id, description and type of every offer that appears in `discovery_view`.
discovery_view = pd.read_sql_query(sql="""SELECT offer.id as offer_id, offer.description, offer.type 
                                      FROM discovery_view 
                                      INNER JOIN offer 
                                      ON offer.id = discovery_view.id
                                    """, con=connection)
discovery_view

In [None]:
# Keep only offers with a description and renumber the rows 0..n-1.
discovery_view = discovery_view.dropna(subset=['description']).reset_index(drop=True)
# Store the row position as a column; it is used as the row of the similarity matrix.
discovery_view['index'] = discovery_view.index
discovery_view

In [None]:
debut = time.time()

# Same TF-IDF settings as for the recommended offers, fitted on the
# descriptions of the discovery view.
vectorizer_discovery_view = TfidfVectorizer(
    analyzer='word',
    stop_words=get_stop_words('french'),
    strip_accents='ascii',
    lowercase=True,
)

tfidf_matrix_discovery_view = vectorizer_discovery_view.fit_transform(discovery_view['description'])

fin = time.time()
temps = (fin - debut) / 60  # elapsed time in minutes
print(temps)

In [None]:
debut = time.time()

# Pairwise similarity between all discovery-view descriptions. The linear
# kernel is used in place of cosine_similarity because it is faster on large
# amounts of data.
cosinus_similarite_discovery_view = linear_kernel(tfidf_matrix_discovery_view)

fin = time.time()
temps = (fin - debut) / 60  # elapsed time in minutes
print(temps)

In [None]:
debut = time.time()

from random import sample, shuffle

# Similarity of 1000 random sets of K offers drawn from the discovery view.
similarite_ensemble_best_case = []

for i in range(1000):
    # Renumber the sampled rows 0..K-1 before computing the similarity of the set.
    offers_sample = discovery_view.sample(K).reset_index(drop=True)
    similarity_of_the_sample = compute_similarity_of_the_set(offers_sample, cosinus_similarite_discovery_view)
    similarite_ensemble_best_case.append(similarity_of_the_sample)

fin = time.time()
temps = (fin - debut) / 60  # elapsed time in minutes
print(temps)

### Worst case
### In the worst case, all our offers "are similar" and therefore there is no diversification

- We pick an offer at random
- We add the offer most similar to the first one
- We add the offer most similar to the second one
- And so on until we have 40 offers
- We compute the similarity of the set
- We repeat this 1000 times and keep the largest similarity

In [None]:
def get_the_most_similar_offers_by_starting_with_a_random_offer(number_of_offers=None):
    """Build a chain of highly similar offers starting from a random offer.

    Starting from one random row of ``discovery_view``, repeatedly append the
    not-yet-selected offer that is most similar to the last offer added.

    The shared ``cosinus_similarite_discovery_view`` matrix is only read.
    Already selected offers are excluded through a local mask rather than by
    zeroing matrix entries, so the matrix stays valid for computing the
    similarity of the set afterwards and for later calls.

    Args:
        number_of_offers: size of the set to build. Defaults to the
            notebook-level ``K``.

    Returns:
        A DataFrame of the selected ``discovery_view`` rows, in selection
        order, re-indexed from 0. It holds fewer than ``number_of_offers``
        rows only when every available offer has been used.
    """
    if number_of_offers is None:
        number_of_offers = K

    first_offer = discovery_view.sample(1)
    chain_of_indices = [int(first_offer['index'].iloc[0])]

    already_selected = np.zeros(cosinus_similarite_discovery_view.shape[0], dtype=bool)
    already_selected[chain_of_indices[0]] = True

    while len(chain_of_indices) < number_of_offers:
        similarities_with_the_last_offer = cosinus_similarite_discovery_view[chain_of_indices[-1]]
        # Exclude the offer itself and everything already in the chain, so the
        # search never bounces back to an earlier offer.
        candidates = np.where(already_selected, -np.inf, similarities_with_the_last_offer)
        index_of_the_following_offer = int(np.argmax(candidates))
        if already_selected[index_of_the_following_offer]:
            # Every offer has been used: nothing left to add.
            break
        already_selected[index_of_the_following_offer] = True
        chain_of_indices.append(index_of_the_following_offer)

    # The 'index' column equals the row position in discovery_view, so the
    # chain can be selected positionally.
    return discovery_view.iloc[chain_of_indices].reset_index(drop=True)

In [None]:
debut = time.time()

from random import sample, shuffle

# Similarity of 1000 sets built by chaining the most similar offers together.
similarite_ensemble_worst_case = []

for i in range(1000):
    most_similar_offers = get_the_most_similar_offers_by_starting_with_a_random_offer()
    similarity_of_the_chain = compute_similarity_of_the_set(most_similar_offers, cosinus_similarite_discovery_view)
    similarite_ensemble_worst_case.append(similarity_of_the_chain)

fin = time.time()
temps = (fin - debut) / 60  # elapsed time in minutes
print(temps)

### Graph of the similarity of the set with the best and worst case

In [None]:
y = similarity_of_the_set
# One x position per recorded similarity instead of a hard-coded length.
x = np.arange(len(y))

layout = dict(title = "Similarity after each exchange of offers",
              xaxis = dict(title = "Number of exchanges"),
              yaxis = dict(title = "Similarity"))

fig = go.Figure(layout=layout)

fig.add_trace(go.Scatter(x=x, y=y, mode='lines'))

# Horizontal line --> worst case: the largest similarity among the chained sets.
fig.add_shape(
    type="line",
            x0=0,
            y0=max(similarite_ensemble_worst_case),
            x1=number_of_exchanges,
            y1=max(similarite_ensemble_worst_case),
            line=dict(
                color="LightSeaGreen",
                width=4,
                dash="dashdot",
            ),
)

# Horizontal line --> best case: the smallest similarity among the random sets.
fig.add_shape(
    type="line",
            x0=0,
            y0=min(similarite_ensemble_best_case),
            x1=number_of_exchanges,
            y1=min(similarite_ensemble_best_case),
            line=dict(
                color="LightSeaGreen",
                width=4,
                dash="dashdot",
            ),
)

fig.show()

### Distribution of types after diversification

In [None]:
# Drop the types that never appear in the set after any exchange. The subset is
# built from the count columns that actually exist: an iteration without an
# exchange adds no column, and naming a missing column raises a KeyError.
exchange_columns = [column for column in number_of_offers_per_type.columns if column != 'type']
if exchange_columns:
    number_of_offers_per_type.dropna(axis=0, how='all', subset=exchange_columns, inplace=True)
number_of_offers_per_type

In [None]:
# Save the per-type counts as a tab-separated file, without the row index.
number_of_offers_per_type.to_csv(
    'number_of_offers_per_type_in_diversification_TF_IDF.csv',
    sep='\t',
    index=False,
)

In [None]:
number_of_iteration = 40

# Bar chart of the per-type counts stored in column 0 (first loop iteration).
fig = go.Figure(
    data=[
        go.Bar(
            x=number_of_offers_per_type['type'],
            y=number_of_offers_per_type[0],
            text=number_of_offers_per_type[0],
            textposition='auto',
        )
    ]
)

# Chart title, then the x-axis and y-axis titles.
fig.update_layout(title_text='Distribution of the types before diversification')
fig.update_xaxes(title_text="Types")
fig.update_yaxes(title_text="Number of offers")

fig.show()

In [None]:
number_of_iteration = 40

# Bar chart of the per-type counts stored in column 15.
# NOTE(review): 15 is hard-coded; confirm it is the iteration meant to
# represent the state "after diversification".
fig = go.Figure(
    data=[
        go.Bar(
            x=number_of_offers_per_type['type'],
            y=number_of_offers_per_type[15],
            text=number_of_offers_per_type[15],
            textposition='auto',
        )
    ]
)

# Chart title, then the x-axis and y-axis titles.
fig.update_layout(title_text='Distribution of the types after diversification')
fig.update_xaxes(title_text="Types")
fig.update_yaxes(title_text="Number of offers")

fig.show()

In [None]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import plotly.io as pio

# Use the "simple_white" plotly template for the figures created from here on.
pio.templates.default = "simple_white"

# Reload the per-type counts exported earlier (tab-separated file).
data = pd.read_csv(
    'number_of_offers_per_type_in_diversification_TF_IDF.csv',
    sep='\t',
)

def make_bar_chart(dataset, categrical_col, start_year, end_year, title, frame_rate = 3):
    """Show an animated bar chart with one frame per selected column of ``dataset``.

    ``dataset`` has one categorical column (the bar labels); its other columns
    are named after consecutive integers (e.g. years or iteration numbers) and
    hold the bar heights for each animation frame.

    Args:
        dataset: DataFrame holding the categorical column and the value columns.
        categrical_col: name of the column providing the bar labels.
        start_year: name of the first value column, shown before "Play" is pressed.
        end_year: name of the last value column (inclusive).
        title: prefix of the chart title; " For: <column>" is appended to it.
        frame_rate: step between two consecutive frames.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    names = dataset[categrical_col]

    # One random colour per category, drawn once so that each bar keeps the
    # same colour across all frames.
    colors = []
    for _ in range(len(names)):
        red = np.random.randint(1,255)
        green = np.random.randint(1,255)
        blue = np.random.randint(1,255)
        colors.append("rgb(" + str(red) + ","+ str(green) + ","+ str(blue) + ")")

    def build_frame_data(values):
        """Combine labels, bar heights and colours into one DataFrame."""
        return pd.DataFrame({"names":names, "pop":values, "color":colors})

    def build_bar(frame_data):
        """Create the bar trace for one frame."""
        return go.Bar(x = frame_data["names"], y = frame_data["pop"],
                      marker_color = frame_data["color"], text = frame_data["names"],
                      hoverinfo = "none", textposition = "outside",
                      texttemplate = "%{x}<br>%{y:s}", cliponaxis = False)

    def build_layout(frame_title, **extra):
        """Create the layout shared by the initial figure and every frame."""
        return go.Layout(
            title = frame_title,
            font = {"size":20},
            height = 700,
            xaxis = {"showline":False,"tickangle":-90, "visible":False},
            yaxis = {"showline":False, "visible":False},
            **extra)

    listOfFrames = []
    for i in range(int(start_year),int(end_year)+1,frame_rate):
        # Read from the `dataset` argument, not from a notebook-level variable,
        # so the function works with any DataFrame it is given.
        frame_data = build_frame_data(dataset.loc[:,str(i)])
        listOfFrames.append(go.Frame(data = [build_bar(frame_data)],
                                     layout = build_layout(title + " For: "+ str(i))))

    fData = build_frame_data(dataset.loc[:,start_year])

    fig = go.Figure(
        data = [build_bar(fData)],
        layout = build_layout(
            title + " For: "+str(start_year),
            updatemenus=[dict(
                type="buttons",
                buttons=[dict(label="Play",
                              method="animate",
                              args=[None])])]
        ),
        frames = list(listOfFrames)
    )
    fig.show()

# Animate the distribution of types over the 40 exchanges (columns "0" to "39").
make_bar_chart(
    dataset=data,
    categrical_col="type",
    start_year="0",
    end_year="39",
    title="Distribution of types after each exchange",
    frame_rate=1,
)