In [None]:
import time 
import pandas as pd 
import numpy as np
import pickle
import math

from itertools import combinations

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sqlalchemy import create_engine
# NOTE(review): the connection URL embeds the database user and password in
# the notebook source; consider reading it from an environment variable.
# NOTE(review): recent SQLAlchemy releases may reject the 'postgres://' scheme
# in favour of 'postgresql://' — confirm against the installed version.
engine = create_engine('postgres://pass_culture:passq@localhost:5434/pass_culture?sslmode=prefer')
# Module-level connection reused by the SQL helper functions defined below.
connection = engine.connect()

# Diversification according to the types of the offers


### Distance between types, based on the users' reservations

In [None]:
# Load the type-to-type correlation matrix from a tab-separated file.
similarity_matrix = pd.read_csv('../matrix_type_type_correlation.csv', sep = '\t') 

In [None]:
# Index the matrix by type label so that similarity_matrix.loc[type1, type2]
# works. The guard keeps the cell re-runnable: once 'type' has become the
# index, running the cell again must not raise a KeyError.
if 'type' in similarity_matrix.columns:
    similarity_matrix = similarity_matrix.set_index('type')
similarity_matrix

### Recommendable offers for one user in Bretagne

In [None]:
# Load the offers recommended to one user from a tab-separated file.
offers_recommended_to_a_user = pd.read_csv('../offers_recommended_to_a_user.csv', sep = '\t') 

In [None]:
# Remove exact duplicate rows before working with the recommendations.
offers_recommended_to_a_user = offers_recommended_to_a_user.drop_duplicates()
offers_recommended_to_a_user

In [None]:
def add_support_to_the_type(df):
    """Suffix the type of every offer with its support.

    The offers of `df` are joined on 'offer_id' with the `url`, `type` and
    `name` columns read from the "offer" table. An offer without url gets
    the '_physique' suffix, any other offer gets '_numerique'.

    Args:
        df: DataFrame with an 'offer_id' column.

    Returns:
        The merged DataFrame, with the suffixed 'type' column.
    """
    offers_with_url = pd.read_sql_query(""" SELECT "offer".id as offer_id, url, type, name FROM "offer" """, connection)
    df = df.merge(offers_with_url, left_on='offer_id', right_on='offer_id')

    # isna() catches both None and NaN, whereas an `is None` test on each row
    # would label a NaN url as digital. The suffix is applied to the whole
    # column at once instead of row by row.
    offer_has_no_url = df['url'].isna()
    df['type'] = np.where(offer_has_no_url, df['type'] + '_physique', df['type'] + '_numerique')

    return df

In [None]:
def add_the_type_and_the_description_of_the_offer(df):
    """Join the offers of `df` with their url, type and name.

    The extra columns are read from the "offer" table through the
    module-level `connection` and matched on 'offer_id'.

    Args:
        df: DataFrame with an 'offer_id' column.

    Returns:
        A new DataFrame holding the rows of `df` that match an offer,
        completed with the `url`, `type` and `name` columns.
    """
    query = """ SELECT "offer".id as offer_id, url, type, name FROM "offer" """
    offer_details = pd.read_sql_query(query, connection)
    return df.merge(offer_details, left_on='offer_id', right_on='offer_id')

In [None]:
# Attach url/type/name to every recommended offer, then rank the offers by
# decreasing score so that the best recommendations come first.
offers_recommended_to_a_user = (
    add_the_type_and_the_description_of_the_offer(offers_recommended_to_a_user)
    .sort_values(by='score', ascending=False)
)
offers_recommended_to_a_user

## Diversification - Algorithm swap

### We calculate the similarity value for each type with the other types

In [None]:
def compute_similarity_of_each_type(number_of_offer_per_type, similarity_matrix):
    """Compute, for each type, its similarity with the whole set of offers.

    For a type t the value is
        sum over every type u of the frame of total(u) * similarity(t, u), minus 1
    and it is stored in the 'similarity_between_types' column.

    Args:
        number_of_offer_per_type: DataFrame with one row per type and the
            columns 'type' and 'total'. It is modified in place.
        similarity_matrix: DataFrame indexed by type label on both axes.

    Returns:
        The same `number_of_offer_per_type` object, with the
        'similarity_between_types' column added or replaced.
    """
    type_labels = [str(type_label) for type_label in number_of_offer_per_type['type']]
    totals = number_of_offer_per_type['total'].to_numpy(dtype=float)

    # Sub-matrix restricted to the types of the frame, rows and columns in
    # the order of the frame, so that row i holds similarity(type_i, .).
    pairwise_similarity = similarity_matrix.loc[type_labels, type_labels].to_numpy(dtype=float)

    # A single matrix-vector product replaces the nested loop over the types.
    # The column is created directly as float instead of being initialised
    # with the integer 0 and then filled with floats.
    number_of_offer_per_type['similarity_between_types'] = pairwise_similarity @ totals - 1

    return number_of_offer_per_type

### We retrieve the offer that most diversifies the set of offers to recommend

In [None]:
def get_the_index_of_the_offer_in_the_least_relevant_offers_that_diversifies_the_set \
        (least_relevant_offers_recommended_to_a_user, similarity_matrix,
         number_of_offers_per_type, type_with_the_highest_similarity):
    """Return the index label of the first candidate that diversifies the set.

    The candidates are scanned in the order of the frame. For each one, the
    similarity of its type after the exchange is estimated as
        similarity_between_types(type) + 1 - similarity(type, removed type)
    and the first candidate whose estimate is strictly below the highest
    'similarity_between_types' of the set is returned.

    Args:
        least_relevant_offers_recommended_to_a_user: DataFrame of candidate
            offers with a 'type' column.
        similarity_matrix: DataFrame indexed by type label on both axes.
        number_of_offers_per_type: DataFrame with the columns 'type' and
            'similarity_between_types'.
        type_with_the_highest_similarity: type of the offer that is going to
            be removed from the set.

    Returns:
        The index label of the selected candidate. When no candidate
        qualifies, the label of the last candidate is returned.

    Raises:
        ValueError: if there is no candidate offer at all.
    """
    if len(least_relevant_offers_recommended_to_a_user.index) == 0:
        raise ValueError(
            "no candidate offer left to exchange: the least relevant offers are empty")

    # Loop invariant: the threshold does not depend on the candidate.
    highest_similarity = number_of_offers_per_type['similarity_between_types'].max()

    for index_of_the_offer_that_diversifies_the_set, type_of_the_offer in \
            least_relevant_offers_recommended_to_a_user['type'].items():
        type_of_the_offer = str(type_of_the_offer)

        similarity_of_the_type = number_of_offers_per_type.loc[
            number_of_offers_per_type['type'] == type_of_the_offer,
            'similarity_between_types'].to_numpy()

        if similarity_of_the_type.size == 0:
            # The type of the candidate is not in the set yet, so it has no
            # similarity value. Skip it explicitly instead of testing the
            # truth value of an empty array.
            # NOTE(review): such candidates might be the most diversifying
            # ones; consider scoring them from the similarity matrix.
            continue

        similarity_of_the_offer = (
            similarity_of_the_type[0]
            + 1  # because we add this type to the list of recommendation
            - similarity_matrix.loc[type_of_the_offer, type_with_the_highest_similarity]
        )  # the last term: because we delete the type that we exchange

        if similarity_of_the_offer < highest_similarity:
            break

    return index_of_the_offer_that_diversifies_the_set


def exchange_an_offer_from_one_df_to_another(df1, df2, index1, index2):
    """Overwrite the row `index1` of `df1` with the row `index2` of `df2`.

    `df1` is modified in place and the replaced row keeps the label `index1`.

    Returns:
        The same `df1` object.
    """
    incoming_offer = df2.loc[index2]
    df1.loc[index1] = incoming_offer
    return df1


def drop_the_exchange_offer_from_the_least_relevant_offers(dataframe_of_offers, index_of_offer_to_drop):
    """Remove, in place, the row labelled `index_of_offer_to_drop`.

    Returns:
        The same `dataframe_of_offers` object, without the dropped row.
    """
    dataframe_of_offers.drop(index=index_of_offer_to_drop, inplace=True)
    return dataframe_of_offers


In [None]:
def create_dataframe_of_the_name_of_all_the_types(df):
    """Build a one-column frame listing the distinct values of df['type'].

    The values come from `value_counts()`, so they are ordered from the most
    frequent type to the least frequent one.

    Returns:
        A new DataFrame with a single 'type' column.
    """
    distinct_types = df['type'].value_counts().index
    return pd.DataFrame({'type': distinct_types})

def replace_dot_with_a_dash_in_the_column_type(df):
    """Replace every '.' with '_' in the 'type' column of `df`, in place.

    Each value is converted to `str` first. Despite the name of the
    function, the replacement character is an underscore.

    Returns:
        The same `df` object.
    """
    df['type'] = [str(type_label).replace(".", "_") for type_label in df['type']]
    return df

In [None]:
def compute_number_of_offers_per_category(dataframe_of_offers, category, total):
    """Count the offers of each distinct value of the `category` column.

    Args:
        dataframe_of_offers: DataFrame holding a `category` column.
        category: name of the column to count; also the name of the first
            column of the result.
        total: name given to the column of counts.

    Returns:
        A new DataFrame with one row per distinct value, ordered by
        decreasing count, and the two columns `category` and `total`.
    """
    counts = dataframe_of_offers[category].value_counts()
    return pd.DataFrame({category: counts.index, total: counts.array})

### We calculate the similarity of the set
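To compute the similarity of the set, we need to:
- For each type, compute the binomial coefficient $C_n^k$ with $k = 2$ and $n$ = the number of offers of that type, i.e. the number of pairs of offers that share this type.
- For each pair of distinct types (type1, type2), compute nb_type1 × nb_type2 × correlation(type1, type2).

The sum of all these terms is then divided by the number of pairs of offers in the set.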

In [None]:
def compute_combination_of_k_among_n(k, n):
    """Return the binomial coefficient "k among n" as a float.

    Raises:
        ValueError: if `k`, `n` or `n - k` is negative (raised by
            `math.factorial`).
    """
    number_of_arrangements = math.factorial(n)
    number_of_orderings = math.factorial(k) * math.factorial(n - k)
    return number_of_arrangements / number_of_orderings


def compute_similarity_between_offers_of_same_type(number_of_offers_per_type: pd.DataFrame):
    """Count the pairs of offers that share the same type.

    For each type with n > 1 offers, the number of pairs is n * (n - 1) / 2
    (the binomial coefficient "2 among n"); it is 0 otherwise. The per-type
    values are stored in the 'number_of_possible_pairs_of_offers' column.

    Args:
        number_of_offers_per_type: DataFrame with a 'total' column holding
            the number of offers of each type. It is modified in place.

    Returns:
        The sum of the number of pairs over all the types (0 for an empty
        frame).
    """
    totals = number_of_offers_per_type['total'].to_numpy(dtype=float)

    # Computed for the whole column at once, so the column also exists when
    # the frame has no row.
    number_of_offers_per_type['number_of_possible_pairs_of_offers'] = np.where(
        totals > 1, totals * (totals - 1) / 2, 0.0)

    return number_of_offers_per_type['number_of_possible_pairs_of_offers'].sum()


def compute_similarity_between_offers_of_different_types(type1, type2, number_of_offers_per_type, similarity_matrix):
    """Return total(type1) * total(type2) * similarity(type1, type2).

    Args:
        type1, type2: type labels; they are converted to `str`.
        number_of_offers_per_type: DataFrame with the columns 'type' and
            'total'.
        similarity_matrix: DataFrame indexed by type label on both axes.

    Returns:
        A numpy array with one value per matching pair of rows (a single
        value when each type appears once in the frame).
    """
    first_label, second_label = str(type1), str(type2)

    def totals_of(label):
        # Array of the 'total' values of the rows whose type is `label`.
        rows_of_the_type = number_of_offers_per_type['type'] == label
        return number_of_offers_per_type.loc[rows_of_the_type, 'total'].values

    total_of_first = totals_of(first_label)
    total_of_second = totals_of(second_label)
    correlation = similarity_matrix.loc[first_label, second_label]
    return (total_of_first * total_of_second) * correlation


def compute_similarity_of_the_set(number_of_offers_per_type, similarity_matrix, number_of_exchanges):
    """Compute the normalised similarity of a set of offers.

    The similarity is the number of pairs of offers sharing a type, plus
    total(type1) * total(type2) * similarity(type1, type2) for every pair of
    distinct types, divided by the binomial coefficient
    "2 among `number_of_exchanges`".

    Args:
        number_of_offers_per_type: DataFrame with the columns 'type' and
            'total'; a 'number_of_possible_pairs_of_offers' column is added
            to it.
        similarity_matrix: DataFrame indexed by type label on both axes.
        number_of_exchanges: value used for the normalisation; the callers
            of this notebook pass the number of offers of the set.

    Returns:
        A 1-D numpy array holding the similarity, so that `result[0]` is
        valid whatever the number of types in the set.
    """
    similarity_of_the_set = compute_similarity_between_offers_of_same_type(
        number_of_offers_per_type)

    # The helper converts the labels to str itself.
    for type1, type2 in combinations(number_of_offers_per_type['type'], 2):
        similarity_of_the_set = similarity_of_the_set + compute_similarity_between_offers_of_different_types(
            type1, type2, number_of_offers_per_type, similarity_matrix)

    similarity_of_the_set = similarity_of_the_set / compute_combination_of_k_among_n(2, number_of_exchanges)

    # With a single type the loop above never runs and the value is still a
    # scalar: wrap it so that callers can always index the result with [0].
    return np.atleast_1d(similarity_of_the_set)

In [None]:
#%pdb

In [None]:
# Exploratory preview with a small set; K and N are redefined by the
# diversification cell further down.
K = 5
N = len(offers_recommended_to_a_user)

# The offers are sorted by decreasing score, so the first K rows are the
# top-scored ones.
most_relevant_offers_recommended_to_a_user = offers_recommended_to_a_user[0:K]
most_relevant_offers_recommended_to_a_user



In [None]:
# Preview of the rows that follow the top K (positions K to 9).
least_relevant_offers_recommended_to_a_user = offers_recommended_to_a_user[K:10]
least_relevant_offers_recommended_to_a_user

In [None]:
#%pdb

In [None]:
# Diversification by successive exchanges: at each step one offer of the most
# similar type is replaced, in the top K, by an offer taken from the rest of
# the recommendations.
debut = time.time()

K = 40
N = len(offers_recommended_to_a_user)

# Work on copies: both frames are modified in place by the exchange and drop
# helpers, and modifying a plain slice could write through to
# offers_recommended_to_a_user (SettingWithCopyWarning) and make a re-run of
# this cell start from altered data.
most_relevant_offers_recommended_to_a_user = offers_recommended_to_a_user[0:K].copy()
least_relevant_offers_recommended_to_a_user = offers_recommended_to_a_user[K:N].copy()

# One value per exchange, recorded before the exchange is applied.
sum_of_the_score = []
similarity_of_the_set = []

# Frame that collects one column of type counts per exchange.
number_of_offers_per_type_at_each_iteration = create_dataframe_of_the_name_of_all_the_types(offers_recommended_to_a_user)
number_of_offers_per_type_at_each_iteration = replace_dot_with_a_dash_in_the_column_type(number_of_offers_per_type_at_each_iteration)

number_of_exchanges = K

for i in range(0, number_of_exchanges):

    # Despite its name, this list stores the average score of the top K.
    sum_of_the_score.append(sum(most_relevant_offers_recommended_to_a_user['score'])/K)

    number_of_offers_per_type = compute_number_of_offers_per_category(most_relevant_offers_recommended_to_a_user, 'type',
                                                                     'total')

    similarity_of_the_set.append(compute_similarity_of_the_set(number_of_offers_per_type, similarity_matrix, number_of_exchanges)[0])

    # Computed once per iteration: the function fills the
    # 'similarity_between_types' column of number_of_offers_per_type and
    # returns that same frame.
    similarity_of_each_type = compute_similarity_of_each_type(number_of_offers_per_type, similarity_matrix)

    type_with_the_highest_similarity = similarity_of_each_type.loc[
        similarity_of_each_type['similarity_between_types'] == similarity_of_each_type[
            'similarity_between_types'].max(), 'type'].values[0]

    type_with_the_highest_similarity = str(type_with_the_highest_similarity)

    # NOTE(review): .index[0] takes the first row of that type, which in the
    # initial score-descending order is its best-scored offer — confirm that
    # this is the offer that should leave the set.
    index_of_the_offer_to_exchange_in_the_most_relevant_offers = most_relevant_offers_recommended_to_a_user[
        most_relevant_offers_recommended_to_a_user['type'] == type_with_the_highest_similarity].index[0]

    index_of_the_offer_to_exchange_in_the_least_relevant_offers = get_the_index_of_the_offer_in_the_least_relevant_offers_that_diversifies_the_set(
        least_relevant_offers_recommended_to_a_user, similarity_matrix,
        number_of_offers_per_type, type_with_the_highest_similarity)

    most_relevant_offers_recommended_to_a_user = exchange_an_offer_from_one_df_to_another(
        most_relevant_offers_recommended_to_a_user,
        least_relevant_offers_recommended_to_a_user,
        index_of_the_offer_to_exchange_in_the_most_relevant_offers,
        index_of_the_offer_to_exchange_in_the_least_relevant_offers)

    least_relevant_offers_recommended_to_a_user = drop_the_exchange_offer_from_the_least_relevant_offers(
        least_relevant_offers_recommended_to_a_user, index_of_the_offer_to_exchange_in_the_least_relevant_offers)

    # Column i holds the type counts of the top K after exchange number i.
    number_of_offers_per_type_for_this_iteration = compute_number_of_offers_per_category(most_relevant_offers_recommended_to_a_user, 'type', i)

    number_of_offers_per_type_for_this_iteration = replace_dot_with_a_dash_in_the_column_type(number_of_offers_per_type_for_this_iteration)

    number_of_offers_per_type_at_each_iteration = number_of_offers_per_type_at_each_iteration.merge(number_of_offers_per_type_for_this_iteration, how='outer', left_on='type', right_on='type')

# Elapsed time, in minutes.
fin = time.time()
temps = (fin - debut)/60
print(temps)

### Average scores after each exchange

In [None]:
# Average score of the selected offers, one point per exchange.
x = np.arange(number_of_exchanges)
y = sum_of_the_score

layout = {
    "title": "Average scores after each exchange",
    "xaxis": {"title": "Number of exchanges"},
    "yaxis": {"title": "Average scores"},
}

fig = go.Figure(data=go.Scatter(x=x, y=y), layout=layout)
fig.show()

### Similarity of the set after each exchange

In [None]:
# Similarity of the selected set, one point per exchange.
x = np.arange(number_of_exchanges)
y = similarity_of_the_set

layout = {
    "title": "Similarity after each exchange",
    "xaxis": {"title": "Number of exchanges"},
    "yaxis": {"title": "Similarity"},
}

fig = go.Figure(data=go.Scatter(x=x, y=y), layout=layout)
fig.show()

### Best/Worst case

- Worst case 

In [None]:
# Worst case: the K offers of the set all share one single type, so every
# pair of offers is a pair of the same type.
# .iloc[0] takes the first row by position; a lookup by the label 0 fails
# whenever that label is not part of the selected offers.
worst_case = pd.DataFrame({
    'type': [most_relevant_offers_recommended_to_a_user['type'].iloc[0]],
    'total': [K],
})

# Pairs of offers sharing a type: n * (n - 1) / 2 for each type with n > 1.
number_of_pairs_of_same_type = 0
for number_of_offers in worst_case['total']:
    number_of_offers = int(number_of_offers)  # plain int, not a 1-element array
    if number_of_offers > 1:
        number_of_pairs_of_same_type += number_of_offers * (number_of_offers - 1) / 2

# Normalise once, after the loop, by the number of pairs among the K offers.
number_of_pairs_in_the_set = K * (K - 1) / 2
if number_of_pairs_in_the_set > 0:
    similarite_ensemble_worst_case = number_of_pairs_of_same_type / number_of_pairs_in_the_set
else:
    similarite_ensemble_worst_case = 0
similarite_ensemble_worst_case

- Best case

In [None]:
# Pool of type labels used to draw random sets of offers. Some labels appear
# more than once in the list and are therefore drawn more often.
types = ['EventType.SPECTACLE_VIVANT',
       'EventType.PRATIQUE_ARTISTIQUE',
       'ThingType.MUSIQUE', 'ThingType.LIVRE_AUDIO',
       'ThingType.PRESSE_ABO',
       'ThingType.LIVRE_EDITION',
       'ThingType.MUSEES_PATRIMOINE_ABO',
       'ThingType.LIVRE_EDITION', 'EventType.MUSIQUE',
       'ThingType.AUDIOVISUEL', 'ThingType.JEUX_VIDEO',
       'ThingType.INSTRUMENT', 'ThingType.AUDIOVISUEL',
       'ThingType.MUSIQUE',
       'EventType.MUSEES_PATRIMOINE', 'EventType.JEUX',
       'ThingType.PRATIQUE_ARTISTIQUE_ABO',
       'ThingType.CINEMA_ABO', 'EventType.CINEMA',
       'EventType.CONFERENCE_DEBAT_DEDICACE',
       'ThingType.MUSIQUE_ABO',
       'ThingType.SPECTACLE_VIVANT_ABO',
       'ThingType.JEUX_VIDEO_ABO',
       'ThingType.OEUVRE_ART',
       'ThingType.JEUX_VIDEO_ABO',
       'ThingType.CINEMA_CARD', 'EventType.JEUX']

# Nine copies of the list, in the same order. List repetition avoids a loop
# variable named `type`, which would shadow the builtin for the rest of the
# notebook.
group_of_types = types * 9
    

In [None]:
# Estimate of the best case: similarity of many random sets of types; the
# lowest value found is used as the reference further down.
debut = time.time()

from random import sample
from random import shuffle
from random import seed

NUMBER_OF_RANDOM_SETS = 1000
# Kept in its own name so that the global K of the diversification is not
# overwritten by this cell.
# NOTE(review): the random sets hold 50 offers while the diversified set
# holds K offers — confirm that the two similarities are meant to be compared.
SIZE_OF_A_RANDOM_SET = 50

# Fixed seed so that a re-run of the notebook gives the same estimate.
seed(42)

list_of_similarities = []

for _ in range(NUMBER_OF_RANDOM_SETS):
    random_types = sample(group_of_types, SIZE_OF_A_RANDOM_SET)
    dataframe_of_random_types = pd.DataFrame(random_types, columns=['type'])
    number_of_offers_per_type = compute_number_of_offers_per_category(dataframe_of_random_types, 'type', 'total')
    list_of_similarities.append(
        compute_similarity_of_the_set(number_of_offers_per_type, similarity_matrix, SIZE_OF_A_RANDOM_SET)[0])

# Elapsed time, in minutes.
fin = time.time()
temps = (fin - debut)/60
print(temps)

### Graph of the similarity of the set with the best and worst case

In [None]:
# Similarity after each exchange, with two horizontal reference lines: the
# worst case and the best case found by random sampling.
y = similarity_of_the_set
# One abscissa per recorded similarity, so that x and y always have the same
# length.
x = np.arange(len(y))

layout = dict(title = "Similarité après chaque échange d'offre",
              xaxis = dict(title = "Nombre d'échanges"),
              yaxis = dict(title = "Similarité"))

fig = go.Figure(layout=layout)

fig.add_trace(go.Scatter(x=x, y=y, mode='lines'))

# Line Horizontal --> Worst case
fig.add_shape(
    type="line",
            x0=0,
            y0=similarite_ensemble_worst_case,
            x1=number_of_exchanges,
            y1=similarite_ensemble_worst_case,
            line=dict(
                color="LightSeaGreen",
                width=4,
                dash="dashdot",
            ),
)

# Line Horizontal --> Best case
# NOTE(review): both reference lines share the same colour and dash style;
# only their height tells them apart.
fig.add_shape(
    type="line",
            x0=0,
            y0=min(list_of_similarities),
            x1=number_of_exchanges,
            y1=min(list_of_similarities),
            line=dict(
                color="LightSeaGreen",
                width=4,
                dash="dashdot",
            ),
)

fig.show()

### Distribution of types after diversification

In [None]:
# Keep only the types that appear in the selected set for at least one
# exchange. The count columns are named 0 .. number_of_exchanges - 1, so the
# subset follows number_of_exchanges instead of a hard-coded 40.
columns_of_the_exchanges = list(range(number_of_exchanges))
number_of_offers_per_type_at_each_iteration = number_of_offers_per_type_at_each_iteration.dropna(
    axis=0, how='all', subset=columns_of_the_exchanges)
number_of_offers_per_type_at_each_iteration

In [None]:
# Write the per-exchange type counts to a tab-separated file; the animation
# cell at the end of the notebook reads this file back.
number_of_offers_per_type_at_each_iteration.to_csv('number_of_offers_per_type_in_the_diversification_per_type.csv', index=False, sep = '\t') 

In [None]:
# Bar chart of the number of offers per type for the column 0.
# NOTE(review): column 0 is filled after the first exchange has been applied,
# so this chart already includes one swap — confirm that this is acceptable
# for a "before" picture.
fig = go.Figure(data=[go.Bar(
            x = number_of_offers_per_type_at_each_iteration['type'],
            y = number_of_offers_per_type_at_each_iteration[0],
            text = number_of_offers_per_type_at_each_iteration[0],
            textposition = 'auto',
        )])

#We add the title
fig.update_layout(title_text='Distribution of types before diversification')

#Title axis x
fig.update_xaxes(title_text="Types")

#Title axis y
fig.update_yaxes(title_text="Number of offers")

fig.show()

In [None]:
# Bar chart of the number of offers per type for the column 15.
# NOTE(review): 15 is not the last exchange of the loop — confirm that this
# is the step meant by "after diversification".
number_of_offers_after_exchange = number_of_offers_per_type_at_each_iteration[15]

bars = go.Bar(
    x=number_of_offers_per_type_at_each_iteration['type'],
    y=number_of_offers_after_exchange,
    text=number_of_offers_after_exchange,
    textposition='auto',
)
fig = go.Figure(data=[bars])

# Title of the figure
fig.update_layout(title_text='Distribution of types after diversification')

# Titles of the x and y axes
fig.update_xaxes(title_text="Types")
fig.update_yaxes(title_text="Number of offers")

fig.show()

### Animation of the distribution of types after each exchange

In [None]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import plotly.io as pio

# Use the "simple_white" plotly template as the default from here on.
pio.templates.default = "simple_white"

# Reload the per-exchange type counts written to CSV earlier in the notebook.
data = pd.read_csv('number_of_offers_per_type_in_the_diversification_per_type.csv', sep = '\t')

def make_bar_chart(dataset, categrical_col, start_year, end_year, title , frame_rate = 3):
    """Show an animated bar chart with one frame per selected column.

    This function can be used with a dataset whose one column is categorical
    (one bar per category) and whose other columns are named after
    consecutive integers; each of these columns gives one frame.

    Args:
        dataset: DataFrame holding the categorical column and the columns of
            values. The frames read the columns named str(i).
        categrical_col: name of the categorical column.
        start_year: label of the column shown first; `int(start_year)` is
            also the first frame.
        end_year: `int(end_year)` is the last frame (inclusive).
        title: prefix of the title of the figure and of every frame.
        frame_rate: step between two consecutive frames.

    Returns:
        None. The figure is displayed with `fig.show()`.
    """
    names = dataset[categrical_col]

    def random_rgb_color():
        # Random colour, each channel drawn with np.random.randint(1, 255).
        red = np.random.randint(1,255)
        green = np.random.randint(1,255)
        blue = np.random.randint(1,255)
        return "rgb(" + str(red) + ","+ str(green) + ","+ str(blue) + ")"

    # One colour per category, shared by every frame.
    colors = [random_rgb_color() for _ in range(len(names))]

    def bar_trace(values):
        # Bar trace of `values`, one bar per category.
        pdata = pd.DataFrame({"names":names, "pop":values, "color":colors})
        return go.Bar(x = pdata["names"], y = pdata["pop"],
                      marker_color = pdata["color"], text = pdata["names"],
                      hoverinfo = "none", textposition = "outside",
                      texttemplate = "%{x}<br>%{y:s}", cliponaxis = False)

    def chart_layout(label, **extra_options):
        # Layout shared by the initial figure and by every frame.
        return go.Layout(
            title = title + " For: " + str(label),
            font = {"size":20},
            height = 700,
            xaxis = {"showline":False,"tickangle":-90, "visible":False},
            yaxis = {"showline":False, "visible":False},
            **extra_options)

    # The frames read the `dataset` argument, not a global frame, so the
    # function works with whatever frame it is given.
    listOfFrames = [
        go.Frame(data = [bar_trace(dataset.loc[:,str(i)])], layout = chart_layout(i))
        for i in range(int(start_year), int(end_year)+1, frame_rate)
    ]

    play_button = dict(
        type="buttons",
        buttons=[dict(label="Play",
                      method="animate",
                      args=[None])])

    fig = go.Figure(
        data = [bar_trace(dataset.loc[:,start_year])],
        layout = chart_layout(start_year, updatemenus=[play_button]),
        frames = listOfFrames,
    )
    fig.show()

# Animate the columns "0" to "39", one frame per column.
make_bar_chart(data, "type", "0", "39",title = "Distribution of types after each exchange", frame_rate = 1)