In [94]:
import pandas as pd

def load_data():
    """
    Charge les fichiers CSV de données (services_activities, providers, venues) dans des DataFrames pandas.
    """
    services_activities = pd.read_csv('../data/services_activities.csv')
    service_providers = pd.read_csv('../data/service_providers.csv')
    venues = pd.read_csv('../data/venues.csv')
    
    return services_activities, service_providers, venues

def clean_data(services_activities, service_providers, venues):
    """
    Effectue un nettoyage basique des données si nécessaire. Par exemple, enlever les doublons,
    vérifier les valeurs manquantes, ou normaliser les chaînes de texte.
    """

    services_activities.drop_duplicates(inplace=True)
    service_providers.drop_duplicates(inplace=True)
    venues.drop_duplicates(inplace=True)
    
    services_activities = services_activities.applymap(lambda x: x.lower() if isinstance(x, str) else x)
    service_providers = service_providers.applymap(lambda x: x.lower() if isinstance(x, str) else x)
    venues = venues.applymap(lambda x: x.lower() if isinstance(x, str) else x)
    
    # D'autres opérations de nettoyage peuvent être ajoutées ici si nécessaire
    return services_activities, service_providers, venues

# Example of usage
# services_activities, providers, venues = load_data()
# services_activities, providers, venues = clean_data(services_activities, providers, venues)

In [95]:
services_activities, service_providers, venues = load_data()

**services_activities**

In [96]:
services_activities.isnull().sum()

service_id               0
service_provider_id    484
venue_id               537
service_category         0
type                     0
title                    0
description            528
dtype: int64

In [97]:
# any service is strictly either a service provider or a venue
print(services_activities[(services_activities['service_provider_id'].isna()) | (services_activities['venue_id'].isna())].nunique())
print(services_activities[(services_activities['service_provider_id'].isna()) & (services_activities['venue_id'].isna())].nunique())
print(services_activities['service_id'].nunique())

service_id             1021
service_provider_id      64
venue_id                 67
service_category          5
type                     43
title                   644
description             438
dtype: int64
service_id             0
service_provider_id    0
venue_id               0
service_category       0
type                   0
title                  0
description            0
dtype: int64
1021


In [98]:
# Most service provider don't have a description. Most venue have but a few don't.
print(services_activities[(services_activities['description'].isna()) & (services_activities['service_provider_id'].isna())].nunique())
print(services_activities[(services_activities['description'].isna()) & (services_activities['venue_id'].isna())].nunique())
print(services_activities['service_id'].nunique())

service_id             452
service_provider_id      0
venue_id                65
service_category         4
type                    26
title                  238
description              0
dtype: int64
service_id             76
service_provider_id    30
venue_id                0
service_category        2
type                    4
title                  64
description             0
dtype: int64
1021


**service_providers cleaning**

In [99]:
service_providers.isnull().sum() 

id                         0
title                      0
description               46
latitude                   0
longitude                  0
city                       3
region                     0
postal_code                2
department                 0
country                    0
category                   0
max_distance_in_meters     2
dtype: int64

In [100]:
service_providers[service_providers['max_distance_in_meters'].isna()]

Unnamed: 0,id,title,description,latitude,longitude,city,region,postal_code,department,country,category,max_distance_in_meters
15,444c2c41-2843-40f4-8d06-3818c6fe6754,Prestataire 15,,48.84713,2.293058,Paris,Île-de-France,75015,Département de Paris,France,ACTIVITY,
86,2ee1b2c4-8bd7-48c3-8e4d-c60f1b0064c6,Prestataire 86,,48.755099,2.445362,Villeneuve-Saint-Georges,Île-de-France,94190,Val-de-Marne,France,TRANSPORT,


In [101]:
# It seems that some service providers are not linked to activities. Those we can ignore.
services_activities[services_activities['service_provider_id'] == '444c2c41-2843-40f4-8d06-3818c6fe6754']

Unnamed: 0,service_id,service_provider_id,venue_id,service_category,type,title,description


**Venues cleaning**

In [102]:
# they are potentially a lot more null cases in the venue data set. On the other hand, for an mvp at first glance I don't see any blocker for keeping those value null for a simple first iteration.
venues.isnull().sum() 

id                        0
title                     0
introduction             11
access                   40
food                    108
activities              109
latitude                  0
longitude                 0
city                     11
region                   34
postal_code               1
department               35
country                   0
ambiance                 27
capacity                  0
number_of_bedrooms        0
number_of_bathrooms       0
number_of_beds            0
number_of_workspaces      0
house_type                0
dtype: int64

In [103]:
services_activities, service_providers, venues = clean_data(services_activities, service_providers, venues)

  services_activities = services_activities.applymap(lambda x: x.lower() if isinstance(x, str) else x)
  service_providers = service_providers.applymap(lambda x: x.lower() if isinstance(x, str) else x)
  venues = venues.applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [104]:
from transformers import pipeline


pipeline_model = pipeline("text2text-generation", model="google/flan-t5-base")

info = {}
set_of_requirements = ['city', 'participants number']
text = "Événement pour 50 participants avec hébergement en chambres simples et dortoirs. Dates prévues du 27 au 28 janvier 2024, avec flexibilité d'une semaine. Localisation à une heure de Paris. Restauration incluse avec un petit déjeuner, deux déjeuners et un dîner."
brief = text

for requirement in set_of_requirements:

    instruction_city = f"Extract the {requirement} from the following text."

    prompt_city = f"""
        {instruction_city}

        Text: {text}
    """
    result_city = pipeline_model(prompt_city, max_length=100, clean_up_tokenization_spaces=True)
    
    output_city = result_city[0]["generated_text"]

    info[requirement] = output_city.lower()
    
info

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Device set to use 0


{'city': 'paris', 'participants number': '50'}

**Building operatable data requirements**

I'd like to use a certain levelof abstraction to start building my recommendation system. In the end it doesn't matter if the data is a venue or an activity in the end its all just a requirement. For my mvp version I guess what matters the most is the place to host the event also the complexity of handeling the details of the activity seems much more advanced than findig a potential spot. Therefor I'll start by focusing on the location and number of participent.

In [105]:
requirements = pd.read_csv('../data/requirements.csv')

type_mapping = {
    'int': int,
    'float': float,
    'str': str,
}

recommendations = venues

for index, requirement in requirements.iterrows():

    recommendations['score'] = 0

    requirement_name = requirement['requirement_name']
    requirement_textual_name = requirement['requirement_textual_name']
    operator = requirement['operation']
    type = requirement['type']
    requirement_value = info[requirement_textual_name]

    converted_value = type_mapping[type](requirement_value)

    if requirement['priority'] == 0:

        filter_operation = f"recommendations[(recommendations['{requirement_name}']) {operator} converted_value]"
        recommendations = eval(filter_operation)

    if requirement['priority'] == 1:

        filter_operation = f"(row['{requirement_name}'] {operator} converted_value)"
        recommendations['score'] = recommendations.apply(lambda row: row['score'] + 1 if eval(filter_operation) else row['score'], axis = 1)

recommendations = recommendations.sort_values(by='score', ascending=False)
recommendations


200
96
96


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendations['score'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendations['score'] = recommendations.apply(lambda row: row['score'] + 1 if eval(filter_operation) else row['score'], axis = 1)


Unnamed: 0,id,title,introduction,access,food,activities,latitude,longitude,city,region,...,department,country,ambiance,capacity,number_of_bedrooms,number_of_bathrooms,number_of_beds,number_of_workspaces,house_type,score
1,3b33e55f-1d30-46e1-bdda-3fe094f54ab9,lieu 1,situé au cœur d'une ville réputée pour son ric...,cet établissement est idéalement situé au cœur...,,,51.202087,3.225383,brugge,vlaams gewest,...,west-vlaanderen,belgique,urban,90,90,90,90,12,hotel,1
5,50a04801-91c5-4e32-8344-25e75d2783ca,lieu 5,"situé le long d'une côte pittoresque, cet espa...",l'espace est conçu pour être accessible à tous...,l'établissement propose une expérience culinai...,les invités peuvent profiter d'une variété d'a...,43.765906,13.145343,marotta,marche,...,provincia di pesaro e urbino,italy,beach,96,51,0,0,0,hotel,1
149,50f6212d-39e9-46a7-8317-67b70d323060,lieu 149,"situé dans une région viticole réputée, cet ét...",le lieu est facilement accessible en taxi (env...,,,48.881194,4.004706,bergères-lès-vertus,grand est,...,marne,france,countryside,122,61,61,61,4,hotel,1
146,f0c7d9f3-4bb3-47c5-88d1-ad4a814a8d11,lieu 146,"situé dans une région de france, le lieu bénéf...",ce lieu est facilement accessible en véhicule ...,,,47.464473,-0.559608,"angers, france",,...,,france,urban,172,86,86,172,3,hotel,1
145,2eaddded-73ba-4e17-9a0a-9842b05e3fe4,lieu 145,découvrez un lieu accueillant situé dans une r...,,,,46.712331,7.702537,gunten,,...,,suisse,,102,51,51,0,0,hotel,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,32942dd4-ed01-4aca-bb78-8dd7722642ab,lieu 66,"situé dans une région pittoresque, ce lieu est...",le lieu propose des caractéristiques d'accessi...,l'établissement propose une gamme de services ...,l'établissement propose une variété d'activité...,39.983059,15.730503,maratea,basilicata,...,provincia di potenza,italy,mountain,151,151,0,0,0,hotel,1
64,3ae69313-49e4-420a-b8cf-93d280eae3cb,lieu 64,"situé dans une région dynamique, cet espace pr...",l'espace est conçu pour être accessible à tous...,l'établissement propose une large gamme de ser...,l'établissement propose une gamme d'activités ...,40.561332,-3.631245,san sebastián de los reyes,comunidad de madrid,...,madrid,spain,urban,142,72,0,0,0,hotel,1
59,33e33125-df5d-4940-9a55-ee8bcfb20592,lieu 59,"situé dans une région dynamique, cet espace pr...",l'établissement propose des caractéristiques d...,l'établissement propose une large gamme de ser...,le lieu propose une gamme d'activités récréati...,40.491120,-3.656183,madrid,comunidad de madrid,...,madrid,spain,urban,97,97,0,0,0,hotel,1
57,3590a750-6b47-479c-bda5-9cd441eadbf4,lieu 57,"situé dans une région pittoresque, cet établis...",l'espace est conçu pour être accessible à tous...,l'établissement propose une large gamme de ser...,l'établissement propose une variété d'activité...,39.812609,3.106371,platja de muro,illes balears,...,illes balears,spain,beach,130,130,0,0,0,hotel,1
