In [1]:
import csv
import json
import random
import re
import time

import requests
from tqdm import tqdm

from extraction_croissement import *

In [2]:
def _parse_rdf_file(rdf_path):
    """Create an empty rdflib graph and parse ``rdf_path`` into it.

    Returns a ``(graph, parse_result)`` pair, where ``parse_result`` is
    whatever ``Graph.parse`` returned for that file.
    """
    graph = rdflib.Graph()
    parse_result = graph.parse(rdf_path)
    return graph, parse_result


start_loading_data_time = time.time()

# Load the saved data into RAM.
# --- ADP: one graph for the books, one for the authors ---
g_book_ADP, ADP_book_graph = _parse_rdf_file("../Graphes/grapheADPLivres.rdf")
g_author_ADP, ADP_author_graph = _parse_rdf_file("../Graphes/grapheADPAuteurs.rdf")
ADP_books = get_ADP_books(g_book_ADP, g_author_ADP)
ADP_loading_time = time.time()
print("ADP_loading_time: ", ADP_loading_time - start_loading_data_time)

# --- Depot legal ---
g_item_DL, book_graph_DL = _parse_rdf_file("../Graphes/grapheDepotLegal.rdf")
DL_books = get_depot_legal_book(g_item_DL)
DL_loading_time = time.time()
print("DL_loading_time: ", DL_loading_time - ADP_loading_time)

# --- ILE ---
g_item_ILE, item_graph_ILE = _parse_rdf_file("../Graphes/grapheILE.rdf")
ILE_books = get_ILE_book(g_item_ILE)
ILE_loading_time = time.time()
print("ILE_loading time: ", ILE_loading_time - DL_loading_time)

# Every file below is opened in a ``with`` block so that the handle is released
# even when a reader or a project helper raises half-way through.

# Hurtubise catalogue: the column names are supplied explicitly to DictReader.
with open("./Hurtubise/Exportation-Hurtubise.csv", "r", encoding='ISO-8859-1') as books_Hurtubise_file:
    csv_reader = csv.DictReader(books_Hurtubise_file, delimiter=',', fieldnames=[
        "Editeur", "ISBN Papier", "ISBN PDF", "ISBN epub", "Titre", "Sous - titre", "Titre de la serie",
        "Contributeurs", "Contributeur(premier)", "Langue", "Langue Origine", "Resume", "Nombre de pages",
        "Date de parution", "Annee de parution", "Sujet  THEMA principal", "Sujet THEMA",
        "Quantificateur Georaphique", "Quantificateur de langue", "Quantificateur Historique", "Niveau soclaire FR",
        "Niveau scolaire QC", "Cycle scolaire FR", "Niveau de lecture", "Echele CECR", "Quantificateur d'interet",
        "Quantificateur d'age", "Quantificateur de style", "Classification Editoriale", "Mots cles"

    ])
    # The reader is consumed here, so the call must stay inside the ``with``.
    Hurtubise_books = get_Hurtubise_books(csv_reader)

# ILE authors (comma separated).
with open("./ILE/auteurs_ILE_comma_separated.csv", 'r', encoding='ISO-8859-1') as authors_ILE_file:
    csv_reader = csv.DictReader(authors_ILE_file, delimiter=',', fieldnames=[
        'uri', 'nom', 'bio', 'genres', 'site', 'pseudonyme'])
    authors_ILE = list(csv_reader)

# Wikidata writers (comma separated).
with open("./Wikidata/ecrivains_wikidata_comma_separated.csv", 'r', encoding='ISO-8859-1') as authors_wikidata_file:
    csv_reader = csv.DictReader(authors_wikidata_file, delimiter=',', fieldnames=[
        'nom', 'uri'])
    authors_wikidata = list(csv_reader)

# DBpedia writers (semicolon separated, uri first).
with open("./DBpedia/ecrivains_dbpedia_fr.txt", "r", encoding='ISO-8859-1') as authors_DBpedia_file:
    csv_reader = csv.DictReader(authors_DBpedia_file, delimiter=';', fieldnames=[
        'uri', 'nom'])
    authors_DBpedia = list(csv_reader)

# Babelio books.
with open("./Babelio/babelio_livres.json", "r") as babelioJsonBooks:
    Babelio_books = get_Babelio_books(json.load(babelioJsonBooks))

# Babelio authors.
# NOTE(review): the *authors* file is passed to get_Babelio_books, the same
# helper used for the books file just above. This looks like a copy-paste;
# confirm whether a dedicated author helper exists in extraction_croissement.
with open("./Babelio/babelio_auteurs.json", "r") as babelioJsonAuthor:
    Babelio_authors = get_Babelio_books(json.load(babelioJsonAuthor))

loading_data_time = time.time()
print("loading_data_time: ", loading_data_time - start_loading_data_time)

# Merge the book lists of every source and index them by author name.
all_books = ADP_books + ILE_books + Hurtubise_books + Babelio_books + DL_books

# The per-source counters below credit each author to the source of the first
# book encountered, so they depend on the shuffle order. A dedicated, seeded
# generator makes the numbers reproducible from one run to the next.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(all_books)

# author_ls maps a normalised author name to the list of books by that author.
author_ls = {}
# counter tracks how many distinct authors were first seen in each source.
counter = {'total':0,
           'ADP':0,
           'ILE':0,
           'Hurtubise':0,
           'Babelio':0,
           'Depot_legal':0,
           }
print('recoupement des auteurs')
for book in tqdm(all_books, total=len(all_books)):
    raw_authors = book['author_raw']
    # Some records wrap the author list in an extra list: unwrap it in place.
    if len(raw_authors) == 1 and isinstance(raw_authors[0], list):
        raw_authors = book['author_raw'] = raw_authors[0]
    for author in raw_authors:
        # Normalise once: double quotes are stripped from the name used as key.
        author_name = author.replace('"', '')
        if author_name not in author_ls:
            author_ls[author_name] = [book]
            counter['total'] += 1
            counter[book['data_base']] += 1
        else:
            author_ls[author_name].append(book)
print(counter)

ADP_loading_time:  9.702158212661743
DL_loading_time:  38.9461030960083
ILE_loading time:  12.5650794506073
loading_data_time:  61.99634766578674
recoupement des auteurs
{'total': 21515, 'ADP': 5195, 'ILE': 0, 'Hurtubise': 884, 'Babelio': 1651, 'Depot_legal': 13785}


100%|██████████| 102985/102985 [00:00<00:00, 428015.23it/s]


In [3]:
# Sanity check of the author cross-referencing: look for a few well-known
# authors among the names collected in author_ls.
famous = ['Michel Tremblay', 'Anne Hébert', 'Gabrielle Roy', 'Marie Cardinal', 'Réjean Ducharme',
                'Jacques Ferron', 'Victor-Lévy Beaulieu', 'Marcel Dubé', 'Yves Thériault', 'Jacques Poulin',
                'André Langevin']
for famous_author in famous:
    # Split and compile once per famous author instead of once per candidate.
    # Taking the first and last token avoids an unpacking error for names that
    # are not made of exactly two space-separated parts.
    name_parts = famous_author.split()
    famous_author_part_1, famous_author_part_2 = name_parts[0], name_parts[-1]
    # re.escape keeps the names literal, so characters such as '.' or '('
    # in a name are not interpreted as regex syntax.
    part_1_pattern = re.compile(re.escape(famous_author_part_1), flags=re.IGNORECASE)
    part_2_pattern = re.compile(re.escape(famous_author_part_2), flags=re.IGNORECASE)
    full_pattern = re.compile(re.escape(famous_author), flags=re.IGNORECASE)
    for author in author_ls:
        re_part_1 = part_1_pattern.search(author)
        re_part_2 = part_2_pattern.search(author)
        re_full = full_pattern.search(author)
        if re_full:
            # The complete name appears as-is inside the candidate.
            print("full find: ", famous_author, " with ", author, " in the db")
        elif re_part_1 and re_part_2:
            # Both parts appear, but not as the exact full string
            # (different spacing, order, or extra tokens in between).
            print("reverse find: ", famous_author, " with ", author, " in the db")
        # A match on a single part only is deliberately not reported.

full find:  Michel Tremblay  with  Michel Tremblay  in the db
reverse find:  Michel Tremblay  with  Michel  Tremblay  in the db
reverse find:  Michel Tremblay  with  Michelle Tremblay-Lacoursière  in the db
reverse find:  Michel Tremblay  with  MICHEL G. TREMBLAY  in the db
full find:  Michel Tremblay  with  Pierre-Michel Tremblay  in the db
reverse find:  Michel Tremblay  with  Michelle  Tremblay Lacoursière  in the db
full find:  Anne Hébert  with  Anne Hébert  in the db
reverse find:  Anne Hébert  with  Anne  Hébert  in the db
full find:  Gabrielle Roy  with  Gabrielle Roy  in the db
reverse find:  Gabrielle Roy  with  Gabrielle  Roy  in the db
full find:  Marie Cardinal  with  Marie Cardinal  in the db
reverse find:  Marie Cardinal  with  Marie-Josée Cardinal  in the db
reverse find:  Réjean Ducharme  with  Réjean  Ducharme  in the db
full find:  Jacques Ferron  with  Jacques Ferron  in the db
reverse find:  Jacques Ferron  with  Jacques  Ferron  in the db
full find:  Victor-Lévy B

In [4]:
"""
Important property numbers:
- sex or gender (P21)
- country of citizenship (P27)
- name in native language (P1559)
- birth name (P1477)
-  given name  (P735)
- family name (P734)
- date of birth (P569)
- place of birth (P19)
- date of death  (P570)
- place of death  (P20)
- occupation (P106)
    - author (Q482980)
    - writer (Q36180)
- notable work (P800)
- genre (P136)
- award received (P166)
- nominated for (P1411)
- country of citizenship (P27)

first query: Canadian writers speaking/writing in French => 1232 results
SELECT ?item ?itemLabel
WHERE
{{

    ?item wdt:P31 wd:Q5 .
    ?item wdt:P106 wd:Q36180 .
    ?item wdt:P27 wd:Q16 .
    ?item wdt:P1412 wd:Q150

    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".}
}}

second query: Canadian writers speaking/writing in French who have a notable work => 82 results
SELECT ?item ?itemLabel ?bookLabel
WHERE
{

  ?item wdt:P31 wd:Q5 .
  ?item wdt:P106 wd:Q36180 .
  ?item wdt:P27 wd:Q16 .
  ?item wdt:P1412 wd:Q150 .
  ?item wdt:P800 ?book

  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".}
}

"""

# SPARQL template used to look up one author by name on Wikidata.
#
# - It is a str.format template: "{0}" is replaced by the author name, and
#   every literal SPARQL brace is doubled ("{{" / "}}") so that format() leaves
#   it in place.
# - ?s must have P31 = Q5 and an occupation (P106) equal to one of the listed
#   Q-ids. Q36180 is "writer" according to the notes above; the other ids are
#   presumably further writing-related occupations (TODO confirm each one).
# - The name must equal the French or English rdfs:label, or a French
#   skos:altLabel, of ?s.
# - The notable work (P800) is optional, so authors without one still match.
# - NOTE(review): the name is inserted verbatim between double quotes, so a
#   name containing '"' or '\' yields an invalid query unless the caller
#   escapes it first.
q = """
SELECT DISTINCT ?s ?book ?bookLabel
WHERE
{{
    ?s wdt:P31 wd:Q5 .
    {{ ?s wdt:P106  wd:Q36180 }} UNION
    {{ ?s wdt:P106  wd:Q49757 }} UNION
    {{ ?s wdt:P106  wd:Q214917 }} UNION
    {{ ?s wdt:P106  wd:Q381353 }} UNION
    {{ ?s wdt:P106  wd:Q6625963 }} UNION
    {{ ?s wdt:P106  wd:Q4853732 }} UNION
    {{ ?s wdt:P106  wd:Q5434338 }} UNION
    {{ ?s wdt:P106  wd:Q1626130 }} UNION
    {{ ?s wdt:P106  wd:Q10297252 }} UNION
    {{ ?s wdt:P106  wd:Q18844224 }} UNION
    {{ ?s wdt:P106  wd:Q15980158 }} UNION
    {{ ?s wdt:P106  wd:Q26203955 }} UNION
    {{ ?s wdt:P106  wd:Q11774202 }} UNION
    {{ ?s wdt:P106  wd:Q1930187 }} UNION
    {{ ?s wdt:P106  wd:Q487596 }} UNION
    {{ ?s wdt:P106  wd:Q201788 }} .

    {{ ?s rdfs:label "{0}"@fr }} UNION {{ ?s rdfs:label "{0}"@en }} UNION {{ ?s skos:altLabel "{0}"@fr }} .
    OPTIONAL {{ ?s wdt:P800 ?book . }}

    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "fr,en".}}
}}
"""

In [12]:
# Smoke test of the SPARQL template on the short list of famous authors.
res = {}
print('tset query')
count = 0
for author in tqdm(famous, total=len(famous)):
    # Stay well below the rate limit of the public endpoint.
    time.sleep(5)
    # Escape backslashes and double quotes so the name cannot terminate the
    # SPARQL string literal it is inserted into.
    safe_author = author.replace('\\', '\\\\').replace('"', '\\"')
    # Let requests URL-encode the query (via params) instead of concatenating
    # raw SPARQL into the URL, and bound the wait with a timeout.
    rep_wikidata_ecrivain = requests.get(
        'https://query.wikidata.org/sparql',
        params={'format': 'json', 'query': q.format(safe_author)},
        timeout=60,
    )
    if rep_wikidata_ecrivain.status_code != 200:
        continue
    # Parse the JSON body once and reuse it.
    bindings = rep_wikidata_ecrivain.json()['results']['bindings']
    if len(bindings) == 0:
        continue
    count += 1
    # The entity of the first row is kept as the author id; every row that
    # carries a notable work contributes one entry to 'books'.
    res[author] = {'id': bindings[0]['s']['value'], 'books': []}
    for resultat in bindings:
        if 'book' in resultat:
            res[author]['books'].append({'book_id': resultat['book']['value'], 'book_label': resultat['bookLabel']['value']})
print("Nombre d'auteurs trouvés: ", count, " soit ", count * 100/len(famous), "%")

100%|██████████| 11/11 [00:56<00:00,  5.13s/it]


tset query
Nombre d'auteurs trouvés:  9  soit  81.81818181818181 %


In [14]:
def query_wikidata_author(author, max_tries=5, default_wait=10):
    """Run the SPARQL template ``q`` for one author name.

    The name is escaped for use inside a SPARQL string literal and the query is
    sent through ``params`` so that requests URL-encodes it.

    A 429 answer or a network error is retried up to ``max_tries`` times. For a
    429 the wait is the ``Retry-After`` header when it is a number of seconds,
    otherwise ``default_wait`` seconds.

    Returns the list of result bindings (possibly empty) on HTTP 200, or
    ``None`` when no usable answer was obtained.
    """
    safe_author = author.replace('\\', '\\\\').replace('"', '\\"')
    for _ in range(max_tries):
        try:
            response = requests.get(
                'https://query.wikidata.org/sparql',
                params={'format': 'json', 'query': q.format(safe_author)},
                timeout=60,
            )
        except requests.RequestException as error:
            # A transient network failure must not abort a run lasting hours.
            print("erreur réseau: ", error)
            time.sleep(default_wait)
            continue
        if response.status_code == 429:
            print("trop rapide !!!")
            try:
                wait = int(response.headers.get('Retry-After', default_wait))
            except ValueError:
                # Retry-After may also be an HTTP date; fall back to the default.
                wait = default_wait
            time.sleep(wait)
            continue
        if response.status_code == 200:
            return response.json()['results']['bindings']
        # Any other status is not retried.
        return None
    return None


res = {}
# Authors for which no usable answer was obtained, kept so they can be re-run.
failed_authors = []
print('query')
count = 0
# list() takes a snapshot of the keys of author_ls before the long loop starts.
for author in tqdm(list(author_ls), total=len(author_ls)):
    time.sleep(1)
    bindings = query_wikidata_author(author)
    if bindings is None:
        failed_authors.append(author)
        continue
    if len(bindings) == 0:
        continue
    count += 1
    # The entity of the first row is kept as the author id; every row that
    # carries a notable work contributes one entry to 'books'.
    res[author] = {'id': bindings[0]['s']['value'], 'books': []}
    for resultat in bindings:
        if 'book' in resultat:
            res[author]['books'].append({'book_id': resultat['book']['value'], 'book_label': resultat['bookLabel']['value']})
print("Nombre d'auteurs trouvés: ", count, " soit ", count * 100/len(author_ls), "%")
if failed_authors:
    print("Nombre d'auteurs sans réponse exploitable: ", len(failed_authors))

with open('wikidata_author_books_list.json', 'w') as outfile:
    json.dump(res, outfile)

100%|██████████| 21515/21515 [7:14:56<00:00,  1.21s/it]   


query
trop rapide !!!
trop rapide !!!
trop rapide !!!
trop rapide !!!
trop rapide !!!
trop rapide !!!
trop rapide !!!
trop rapide !!!
trop rapide !!!
Nombre d'auteurs trouvés:  2582  soit  12.000929584011155 %


In [10]:
# Single bulk query: every human (P31 = Q5) whose country of citizenship has
# P2936 = Q150 and whose occupation is a direct subclass (P279) of Q36180,
# together with optional works and awards (P166).
#
# NOTE(review): in the first OPTIONAL, "?list_of_work_sub_class wdt:P27 wd:P1455"
# applies wdt:P27 (listed as "country of citizenship" in the property notes of
# this notebook) to a property variable, with a property id in the wd:
# namespace as object. This looks unintended; confirm which predicate was
# meant. The query text is left unchanged here.
all_authors_query = """
SELECT DISTINCT ?author ?authorLabel ?bookLabel ?awardLabel
WHERE
{
    ?author wdt:P31 wd:Q5 .
    ?author wdt:P27 ?country .
    ?country wdt:P2936 wd:Q150 .
    ?author wdt:P106 ?occup .
    ?occup wdt:P279 wd:Q36180

    OPTIONAL { ?author ?list_of_work_sub_class ?book .
                ?list_of_work_sub_class wdt:P27 wd:P1455}
    OPTIONAL { ?author wdt:P800 ?book . }
    OPTIONAL { ?author wdt:P166 ?award . }

    SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,en".}

}"""
# Send the query through params so that requests URL-encodes it, and set a
# timeout so that a stalled connection cannot block the notebook forever.
rep_wikidata_ecrivain = requests.get(
    'https://query.wikidata.org/sparql',
    params={'format': 'json', 'query': all_authors_query},
    timeout=120,
)
print('code_status: ', rep_wikidata_ecrivain.status_code)

code_status:  200


In [9]:
# Group the flat result rows by author URI: one record per author holding the
# author label and the de-duplicated lists of work and award labels.
cleaned_data = {}
if rep_wikidata_ecrivain.status_code == 200:
    # Parse the JSON body once and reuse it.
    bindings = rep_wikidata_ecrivain.json()['results']['bindings']
    if len(bindings) > 0:
        print('exemples: ')
        print(json.dumps(bindings[:2], indent=2))
        for result in tqdm(bindings):
            # Create the record on first sight of the author, then treat the
            # first row like every other row. The previous code tested the
            # misspelt key 'bookLable' on the first row, so the first work of
            # every author was always dropped.
            record = cleaned_data.setdefault(result['author']['value'], {
                'author_label': result['authorLabel']['value'],
                'works': [],
                'awards': [],
            })
            for label_key, bucket in (('bookLabel', 'works'), ('awardLabel', 'awards')):
                # Both labels are optional in a row; skip values already stored.
                if label_key in result and result[label_key]['value'] not in record[bucket]:
                    record[bucket].append(result[label_key]['value'])


100%|██████████| 30872/30872 [00:00<00:00, 203243.69it/s]


exemples: 
[
  {
    "author": {
      "type": "uri",
      "value": "http://www.wikidata.org/entity/Q1276"
    },
    "authorLabel": {
      "xml:lang": "fr",
      "type": "literal",
      "value": "Leonard Cohen"
    },
    "bookLabel": {
      "xml:lang": "en",
      "type": "literal",
      "value": "Parasites of Heaven"
    },
    "awardLabel": {
      "xml:lang": "fr",
      "type": "literal",
      "value": "prix du Gouverneur g\u00e9n\u00e9ral"
    }
  },
  {
    "author": {
      "type": "uri",
      "value": "http://www.wikidata.org/entity/Q1276"
    },
    "authorLabel": {
      "xml:lang": "fr",
      "type": "literal",
      "value": "Leonard Cohen"
    },
    "bookLabel": {
      "xml:lang": "fr",
      "type": "literal",
      "value": "Songs from a Room"
    },
    "awardLabel": {
      "xml:lang": "fr",
      "type": "literal",
      "value": "Rock and Roll Hall of Fame"
    }
  }
]


In [None]:
# Check which of the famous authors appear among the authors found on Wikidata
# (the keys of res), and show what was collected for them.

# Compile the three patterns of each famous author once, up front.
# - first/last token instead of a strict two-way unpacking, so a name that is
#   not made of exactly two space-separated parts does not raise;
# - re.escape keeps the names literal inside the regular expressions.
famous_patterns = []
for famous_author in famous:
    name_parts = famous_author.split()
    famous_patterns.append((
        famous_author,
        re.compile(re.escape(name_parts[0]), flags=re.IGNORECASE),
        re.compile(re.escape(name_parts[-1]), flags=re.IGNORECASE),
        re.compile(re.escape(famous_author), flags=re.IGNORECASE),
    ))

for author in res:
    for famous_author, part_1_pattern, part_2_pattern, full_pattern in famous_patterns:
        re_part_1 = part_1_pattern.search(author)
        re_part_2 = part_2_pattern.search(author)
        re_full = full_pattern.search(author)
        if re_full:
            # The complete name appears as-is inside the candidate.
            print("full find: ", famous_author, " with ", author, " in the db")
            print("infos recoltées: ", json.dumps(res[author]))
        elif re_part_1 and re_part_2:
            # Both parts appear, but not as the exact full string.
            print("reverse find: ", famous_author, " with ", author, " in the db")
            print("infos recoltées: ", json.dumps(res[author]))
        # A match on a single part only is deliberately not reported.