<a href="https://colab.research.google.com/github/mouha07/abdou-ahad/blob/main/PROJET_WEB_SEMANTIQUE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Projet Web Sémantique

Présentation des membres du groupe:



*   Mamadou Yaya MANE                       P32 3752
*   Mouhamadou DIALLO                       P32 3743
*   Cheikh Abdoul Ahad Mbacké DIOP          P32 3752



# Data Collection

In [None]:
!pip install sparqlwrapper



Importation

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
import csv
import pandas as pd
from datetime import datetime


In [None]:
def execute_sparql_query(endpoint, query):
    # Fonction pour exécuter une requête SPARQL sur le point d'accès SPARQL spécifié
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return results


In [None]:
def generate_transactional_file(results, output_file_path):
    # Écrire les résultats SPARQL dans un fichier CSV
    with open(output_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
        fieldnames = ["entity", "itemset"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        # Écrire l'en-tête du fichier CSV
        writer.writeheader()

        # Écrire chaque ligne de résultats dans le fichier CSV
        for result in results:
            # Assurez-vous que les données sont encodées en utf-8 avant l'écriture
            encoded_result = {key: value.encode('utf-8').decode('utf-8') if isinstance(value, str) else value for key, value in result.items()}
            writer.writerow(encoded_result)



In [None]:
# Obtenir la date et l'heure actuelles
maintenant = datetime.now()

# Formater la date selon le format spécifié (dd-mm-yyyy H:m:s)
date_formattee = maintenant.strftime("%d-%m-%Y %H:%M:%S")


def collect_artist_data():
    endpoint = "http://dbpedia.org/sparql"
    query = """
    PREFIX dbo: <http://dbpedia.org/ontology/>
    SELECT ?entity (GROUP_CONCAT(DISTINCT ?predicate; separator=",") AS ?itemset)
    WHERE {
      ?entity a dbo:Artist .
      ?entity ?predicate ?object .
      FILTER(isURI(?object) || isLiteral(?object))
      FILTER(contains(str(?predicate), str(dbo:)))
      FILTER(!contains(str(?predicate), "wikiPage"))
      FILTER(!contains(str(?predicate), "abstract"))
      FILTER(!contains(str(?predicate), "thumbnail"))
      FILTER(!contains(str(?predicate), "birthDate"))
      FILTER(!contains(str(?predicate), "birthPlace"))
      FILTER(!contains(str(?predicate), "birthYear"))
    } GROUP BY ?entity
    """

    results = execute_sparql_query(endpoint, query)
    artist_data = []

    for result in results["results"]["bindings"]:
        entity = result["entity"]["value"]
        itemset = result["itemset"]["value"]
        artist_data.append({"entity": entity, "itemset": itemset})

    # Appel de la fonction pour générer le fichier transactionnel En spécifiant la date sur le nom du fichier
    generate_transactional_file(artist_data, f"TRANSACTION_{date_formattee}.csv")

    return artist_data

In [None]:
artist_data = collect_artist_data()

# Afficher les données des artistes
for data in artist_data:
    print(f"Entity: {data['entity']}, Itemset: {data['itemset']}")

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Entity: http://dbpedia.org/resource/Ferron, Itemset: http://dbpedia.org/ontology/activeYearsStartYear,http://dbpedia.org/ontology/background,http://dbpedia.org/ontology/genre
Entity: http://dbpedia.org/resource/Filous, Itemset: http://dbpedia.org/ontology/activeYearsStartYear
Entity: http://dbpedia.org/resource/Flood_(producer), Itemset: http://dbpedia.org/ontology/activeYearsStartYear,http://dbpedia.org/ontology/associatedBand,http://dbpedia.org/ontology/associatedMusicalArtist,http://dbpedia.org/ontology/background,http://dbpedia.org/ontology/genre,http://dbpedia.org/ontology/occupation
Entity: http://dbpedia.org/resource/Flora_Macdonald_Reid, Itemset: http://dbpedia.org/ontology/field,http://dbpedia.org/ontology/nationality
Entity: http://dbpedia.org/resource/Florence_Pash, Itemset: http://dbpedia.org/ontology/birthName,http://dbpedia.org/ontology/country,http://dbpedia.org/ontology/field
Ent

# Association rule mining

In [None]:
#Importation du fichier Texte contenant Les transactions
transactional_data = pd.read_csv(f"TRANSACTION_{date_formattee}.csv", delimiter=",")

In [None]:
transactional_data

Unnamed: 0,entity,itemset
0,http://dbpedia.org/resource/Cadet_(rapper),http://dbpedia.org/ontology/activeYearsEndYear...
1,http://dbpedia.org/resource/Caitlin_Rose,http://dbpedia.org/ontology/activeYearsStartYe...
2,http://dbpedia.org/resource/Caleb_Schaber,"http://dbpedia.org/ontology/birthName,http://d..."
3,http://dbpedia.org/resource/Calle_Kristiansson,http://dbpedia.org/ontology/activeYearsEndYear...
4,http://dbpedia.org/resource/Call_Me_Ace,http://dbpedia.org/ontology/activeYearsStartYe...
...,...,...
9995,http://dbpedia.org/resource/Lloyd_Goodson,"http://dbpedia.org/ontology/deathDate,http://d..."
9996,http://dbpedia.org/resource/Lokomotif,http://dbpedia.org/ontology/activeYearsEndYear...
9997,http://dbpedia.org/resource/Mina_Forsyth,"http://dbpedia.org/ontology/award,http://dbped..."
9998,http://dbpedia.org/resource/Minnita_Daniel-Cox,http://dbpedia.org/ontology/nationality


In [None]:
#Récupération des itemsets
transactional_itemset = transactional_data.iloc[:,-1].values

transactions = []
for result in transactional_itemset:
    itemset = result.split(',')
    transactions.append(itemset)


# Parcourir chaque transaction et enlever la partie http://dbpedia.org/ontology/
for transaction in transactions:
    for i in range(len(transaction)):
        transaction[i] = transaction[i].replace('http://dbpedia.org/ontology/', '')

In [None]:
# Afficher les transactions après suppression de la partie spécifiée
transactions

[['activeYearsEndYear',
  'activeYearsStartYear',
  'associatedBand',
  'associatedMusicalArtist',
  'deathDate',
  'deathPlace',
  'genre',
  'imdbId'],
 ['activeYearsStartYear',
  'associatedBand',
  'associatedMusicalArtist',
  'genre',
  'hometown',
  'recordLabel'],
 ['birthName', 'deathDate', 'deathPlace', 'training'],
 ['activeYearsEndYear',
  'activeYearsStartYear',
  'genre',
  'hometown',
  'instrument',
  'occupation',
  'recordLabel'],
 ['activeYearsStartYear', 'alias', 'genre', 'occupation'],
 ['activeYearsStartYear',
  'associatedBand',
  'associatedMusicalArtist',
  'genre',
  'hometown'],
 ['activeYearsStartYear',
  'alias',
  'associatedBand',
  'associatedMusicalArtist',
  'hometown',
  'recordLabel'],
 ['alias', 'associatedBand', 'associatedMusicalArtist', 'genre', 'hometown'],
 ['birthName'],
 ['activeYearsStartYear',
  'associatedBand',
  'associatedMusicalArtist',
  'genre',
  'hometown',
  'recordLabel'],
 ['activeYearsStartYear',
  'associatedBand',
  'associate

In [None]:
# Convertissez chaque transaction en un ensemble
transactions = [set(transaction) for transaction in transactions]

# Créez un dictionnaire pour mapper les prédicats aux entiers
predicate_to_int = {predicate: i for i, predicate in enumerate(set.union(*transactions))}

# Traduction inverse de int_to_predicate
int_to_predicate = {value: key for key, value in predicate_to_int.items()}

# Mappez les prédicats aux entiers dans les transactions
transaction_data_for_Apriori = [[predicate_to_int[predicate] for predicate in transaction] for transaction in transactions]

In [None]:
transaction_data_for_Apriori

[[53, 0, 24, 7, 19, 10, 11, 26],
 [0, 7, 40, 10, 11, 52],
 [26, 24, 27, 60],
 [0, 28, 7, 19, 40, 12, 52],
 [0, 20, 12, 7],
 [0, 7, 10, 11, 52],
 [10, 7, 40, 20, 11, 52],
 [0, 10, 20, 11, 52],
 [60],
 [0, 7, 40, 10, 11, 52],
 [0, 7, 40, 10, 11, 52],
 [19, 10, 11, 7],
 [20, 40, 7],
 [0, 7, 10, 11, 12],
 [0, 52, 28, 7],
 [35, 12, 60, 7],
 [0, 20, 40, 12],
 [0, 20, 52],
 [0, 24, 7, 19, 20, 40, 26],
 [0, 7],
 [35, 27, 60],
 [27],
 [28],
 [0, 10, 11, 7],
 [0, 10, 11, 7],
 [0, 7, 40, 10, 11, 52],
 [0, 52, 7],
 [0, 40, 52, 28],
 [27, 30, 33, 24, 50, 60, 26],
 [19, 0, 7],
 [0, 7, 40, 10, 11],
 [0, 7, 40, 10, 11],
 [0, 7, 40, 10, 11],
 [30, 24, 50, 60, 26],
 [24, 6, 35, 7, 19, 12, 26],
 [0, 28, 7, 40, 10, 11, 52],
 [0, 24, 7, 40, 19, 10, 11, 26],
 [0, 24, 10, 7, 40, 19, 20, 11, 26],
 [26, 24, 27, 60],
 [33, 24, 35, 50, 60, 26],
 [0, 7],
 [0, 10, 11, 40],
 [0, 40],
 [24],
 [33, 30, 24, 27],
 [0, 40, 7],
 [33, 35, 24, 26],
 [24],
 [0, 20, 7],
 [0, 20, 52],
 [35],
 [0, 24, 7, 40, 19, 10, 11, 26],
 

In [None]:
# Écrivez les transactions Mapper dans un fichier
with open('transaction_data_for_Apriori.txt', 'w') as f:
    for transaction in transaction_data_for_Apriori:
        f.write(' '.join(map(str, transaction)) + '\n')

In [None]:
# Installation de Java
!apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

# Téléchargement de SPMF
!wget http://www.philippe-fournier-viger.com/spmf/spmf.jar

--2023-12-05 21:28:04--  http://www.philippe-fournier-viger.com/spmf/spmf.jar
Resolving www.philippe-fournier-viger.com (www.philippe-fournier-viger.com)... 104.21.33.228, 172.67.193.154, 2606:4700:3035::ac43:c19a, ...
Connecting to www.philippe-fournier-viger.com (www.philippe-fournier-viger.com)|104.21.33.228|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12692580 (12M) [application/java-archive]
Saving to: ‘spmf.jar’


2023-12-05 21:28:04 (40.7 MB/s) - ‘spmf.jar’ saved [12692580/12692580]



In [None]:
# Exécution de SPMF
!java -jar spmf.jar run Apriori_association_rules transaction_data_for_Apriori.txt output.txt 0.01 0.7

>/content/spmf.jar
 Candidates count : 368
 The algorithm stopped at size 5
 Frequent itemsets count : 114
 Maximum memory usage : 4.5508575439453125 mb
 Total time ~ 211 ms
 Number of association rules generated : 45
 Total time ~ 4 ms


In [None]:
# Lecture des résultats
with open('output.txt', 'r') as f:
    rules = f.read().split('\n')

In [None]:
rules

['15 ==> 0 #SUP: 1436 #CONF: 0.7409700722394221',
 '28 ==> 0 #SUP: 1199 #CONF: 0.7545626179987414',
 '40 ==> 0 #SUP: 2351 #CONF: 0.8496566678713408',
 '52 ==> 0 #SUP: 1653 #CONF: 0.8035974720466699',
 '7 10 ==> 0 #SUP: 131 #CONF: 0.9924242424242424',
 '7 11 ==> 0 #SUP: 131 #CONF: 0.9924242424242424',
 '7 19 ==> 0 #SUP: 113 #CONF: 0.9495798319327731',
 '7 40 ==> 0 #SUP: 533 #CONF: 0.9586330935251799',
 '7 52 ==> 0 #SUP: 389 #CONF: 0.9534313725490197',
 '0 11 ==> 10 #SUP: 438 #CONF: 1.0',
 '10 20 ==> 0 #SUP: 100 #CONF: 0.9259259259259259',
 '10 40 ==> 0 #SUP: 146 #CONF: 0.7604166666666666',
 '10 52 ==> 0 #SUP: 217 #CONF: 0.8509803921568627',
 '11 52 ==> 0 #SUP: 116 #CONF: 0.8',
 '15 24 ==> 0 #SUP: 216 #CONF: 0.7422680412371134',
 '15 28 ==> 0 #SUP: 434 #CONF: 0.8714859437751004',
 '15 40 ==> 0 #SUP: 704 #CONF: 0.9166666666666666',
 '15 52 ==> 0 #SUP: 456 #CONF: 0.8702290076335878',
 '20 40 ==> 0 #SUP: 125 #CONF: 0.8333333333333334',
 '20 52 ==> 0 #SUP: 178 #CONF: 0.8476190476190476',
 '0

In [None]:
#Dans le code suivant nous récupérions les rules générés en récupérant le support et le confidence
rule_generate = []
sup_conf_generated = []
for rule in rules:
  if rule!="":
    rule_generate.append(rule.split(" #SUP: ")[0])
    sup_conf_generated.append(rule.split(" #SUP: ")[1].split(" #CONF: "))

In [None]:
# Parcourir les règles et effectuer la traduction inverse
tab_rules_generate = []
for rule in rule_generate:
    if rule!="":
        items = rule.split(" ==> ")  # Trouver tous les entiers dans la règle
        antecedent = items[0]  # Le premier élément est l'antécédent
        consequent = items[1]  # Le deuxième élément est le conséquent

        # Traduire les entiers aux prédicats dans l'antécédent et le conséquent
        antecedent_predicates = [int_to_predicate[int(item)] for item in antecedent.split(" ")]
        consequent_predicates = [int_to_predicate[int(item)] for item in consequent.split(" ")]

        # Afficher la règle avec les prédicats
        rules_generates_format = f"{{{', '.join(antecedent_predicates)}}} -> {{{', '.join(consequent_predicates)}}}"
        tab_rules_generate.append(rules_generates_format)
        print(f"{', '.join(antecedent_predicates)} => {', '.join(consequent_predicates)}")

background => genre
instrument => genre
recordLabel => genre
hometown => genre
activeYearsStartYear, associatedMusicalArtist => genre
activeYearsStartYear, associatedBand => genre
activeYearsStartYear, activeYearsEndYear => genre
activeYearsStartYear, recordLabel => genre
activeYearsStartYear, hometown => genre
genre, associatedBand => associatedMusicalArtist
associatedMusicalArtist, alias => genre
associatedMusicalArtist, recordLabel => genre
associatedMusicalArtist, hometown => genre
associatedBand, hometown => genre
background, deathDate => genre
background, instrument => genre
background, recordLabel => genre
background, hometown => genre
alias, recordLabel => genre
alias, hometown => genre
genre, deathPlace => deathDate
deathDate, recordLabel => genre
instrument, recordLabel => genre
instrument, hometown => genre
recordLabel, hometown => genre
activeYearsStartYear, associatedBand => associatedMusicalArtist
activeYearsStartYear, associatedMusicalArtist => associatedBand
associatedB

# Query Generation

In [None]:
#Dans cette fonction nous avons implémenter les trois cas de query mise dans le PROJET.
#Par défaut nombre de cas est égale à 1, ce qui veut dire qu'un seul query sera renvoyer dans un tableau
def generate_sparql_queries(input_string, int_to_predicate, nombre_de_cas = 1):
    sparql_queries = []
    prefix = "PREFIX dbo: <http://dbpedia.org/ontology/>\n"
    rules = input_string.split(" -> ")
    if len(rules) != 2:
        print("Format d'entrée invalide.")
        return sparql_queries

    if nombre_de_cas >= 1:
      antecedent_predicates = rules[0][1:-1].split(', ')
      consequent_predicates = rules[1][1:-1].split(', ')

      # Generate SPARQL query clauses for antecedent and consequent
      antecedent_clause_case1 = '.\n'.join([f"?subject dbo:{pred} ?ant{i}" for i, pred in enumerate(antecedent_predicates, start=1)])
      consequent_clause_case1 = '.\n'.join([f"?subject dbo:{pred} ?cons{i}" for i, pred in enumerate(consequent_predicates, start=1)])

      select_clause_case1 = f"SELECT DISTINCT * WHERE {{\n {antecedent_clause_case1}.\n"
      where_clause_case1 = f"{{\n SELECT DISTINCT * {{\n {consequent_clause_case1}.\n}}\n}}"

      # Create the final SPARQL query
      sparql_query_case1 = f"{prefix}{select_clause_case1}{where_clause_case1}\n}}"
      sparql_queries.append(sparql_query_case1)

    if nombre_de_cas >= 2:
      # Generate SPARQL query clauses for antecedent and consequent
      antecedent_clause_case_2 = '.\n'.join([f"?subject dbo:{pred} ?const{i}" for i, pred in enumerate(consequent_predicates, start=1)])
      consequent_clause_case_2 = '.\n'.join([f"?subject dbo:{pred} ?ant{i}" for i, pred in enumerate(antecedent_predicates, start=1)])

      select_clause_case_2 = f"SELECT DISTINCT * WHERE {{\n {antecedent_clause_case_2}.\n"
      where_clause_case_2 = f"{{\n SELECT DISTINCT * {{\n {consequent_clause_case_2}.\n}}\n}}"

      # Create the final SPARQL query
      sparql_query_case_2 = f"{prefix}{select_clause_case_2}{where_clause_case_2}\n}}"
      sparql_queries.append(sparql_query_case_2)


    if nombre_de_cas >= 3:
      # Generate SPARQL query clauses for antecedent and consequent
      antecedent_clause_case_3 = '.\n'.join([f"?subject dbo:{pred} ?const{i}" for i, pred in enumerate(consequent_predicates, start=1)])
      consequent_clause_case_3 = '.\n'.join([f"?subject dbo:{pred} ?ant{i}" for i, pred in enumerate(antecedent_predicates, start=1)])

      select_clause_case_3 = f"SELECT DISTINCT * WHERE {{\n{antecedent_clause_case_3}.\n"
      where_clause_case_3 = f"{consequent_clause_case_3}.\n}}"

      # Create the final SPARQL query
      sparql_query_case_3 = f"{prefix}{select_clause_case_3}{where_clause_case_3}\n"
      sparql_queries.append(sparql_query_case_3)

    return sparql_queries



In [None]:
query_generates = []
for rule in tab_rules_generate:
    # Utilisation de la fonction pour générer les requêtes SPARQL
    generated_sparql_querie = generate_sparql_queries(rule, int_to_predicate)
    query_generates.append(generated_sparql_querie[0])
    # Affichage des requêtes SPARQL générées
    print(f"Requêtes SPARQL générées à partir de l'entrée {rule} :")
    print(generated_sparql_querie[0])
    print("\n\n")

Requêtes SPARQL générées à partir de l'entrée {background} -> {genre} :
PREFIX dbo: <http://dbpedia.org/ontology/>
SELECT DISTINCT * WHERE {
 ?subject dbo:background ?ant1.
{
 SELECT DISTINCT * {
 ?subject dbo:genre ?cons1.
}
}
}



Requêtes SPARQL générées à partir de l'entrée {instrument} -> {genre} :
PREFIX dbo: <http://dbpedia.org/ontology/>
SELECT DISTINCT * WHERE {
 ?subject dbo:instrument ?ant1.
{
 SELECT DISTINCT * {
 ?subject dbo:genre ?cons1.
}
}
}



Requêtes SPARQL générées à partir de l'entrée {recordLabel} -> {genre} :
PREFIX dbo: <http://dbpedia.org/ontology/>
SELECT DISTINCT * WHERE {
 ?subject dbo:recordLabel ?ant1.
{
 SELECT DISTINCT * {
 ?subject dbo:genre ?cons1.
}
}
}



Requêtes SPARQL générées à partir de l'entrée {hometown} -> {genre} :
PREFIX dbo: <http://dbpedia.org/ontology/>
SELECT DISTINCT * WHERE {
 ?subject dbo:hometown ?ant1.
{
 SELECT DISTINCT * {
 ?subject dbo:genre ?cons1.
}
}
}



Requêtes SPARQL générées à partir de l'entrée {activeYearsStartYear, a

# Query validation


In [None]:
for i in range(3):
    endpoint = "http://dbpedia.org/sparql"
    results = execute_sparql_query(endpoint, query_generates[i])
    resultat_ent= results["results"]["bindings"]
    print(f"Query{i}, Number résultat : {len(resultat_ent)}\n")

Query0, Number résultat : 10000

Query1, Number résultat : 10000

Query2, Number résultat : 10000



# the list of top-10 most frequent

In [None]:
top_10_frequent_queries = query_generates[:10]
top_10_frequent_queries_sup_conf = sup_conf_generated[:10]
top_10_tab_rules_generate = tab_rules_generate[:10]

In [None]:
#Cette fonction nous permettra de calculer le support de Y par rapport aux règles générés par l'algo de Apriori
def calcul_support_Y(rules, Y):
  somme = 0
  for rule in rules:
    if str(Y) == rule.split(" -> ")[1]:
      somme+=1
  return somme

In [None]:
dic = {}

for i in range(len(top_10_frequent_queries_sup_conf)):

    support_y = calcul_support_Y(tab_rules_generate, top_10_tab_rules_generate[i].split(" -> ")[1]) /len(tab_rules_generate)
    conf_x_y = top_10_frequent_queries_sup_conf[i][1]
    lift = float(conf_x_y) / support_y

    dic[i] = {
        "X->Y"  : top_10_tab_rules_generate[i],
        "conf" : conf_x_y,
        "lift" : lift
    }

In [None]:
dataframe = pd.DataFrame(dic)
dataframe = dataframe.T
dataframe

Unnamed: 0,X->Y,conf,lift
0,{background} -> {genre},0.7409700722394221,1.075602
1,{instrument} -> {genre},0.7545626179987414,1.095333
2,{recordLabel} -> {genre},0.8496566678713408,1.233373
3,{hometown} -> {genre},0.8035974720466699,1.166512
4,"{activeYearsStartYear, associatedMusicalArtist...",0.9924242424242424,1.440616
5,"{activeYearsStartYear, associatedBand} -> {genre}",0.9924242424242424,1.440616
6,"{activeYearsStartYear, activeYearsEndYear} -> ...",0.9495798319327732,1.378422
7,"{activeYearsStartYear, recordLabel} -> {genre}",0.95863309352518,1.391564
8,"{activeYearsStartYear, hometown} -> {genre}",0.9534313725490196,1.384013
9,"{genre, associatedBand} -> {associatedMusicalA...",1.0,7.5
