## Scraping i przygotowanie danych

In [1]:
from bs4 import BeautifulSoup
import requests
import isbnlib

In [76]:
# Catalogue index page that lists every book, and the site root that is later
# prepended to the relative hrefs found on it.
URL = "https://wolnelektury.pl/katalog/"
baseURL = "https://wolnelektury.pl/"
# requests applies no timeout by default, so a stalled connection would hang
# the kernel; raise on HTTP errors instead of parsing an error page.
page = requests.get(URL, timeout=30)
page.raise_for_status()

In [77]:
soup = BeautifulSoup(page.content, 'html.parser')

In [78]:
catalog = soup.find('div', class_='plain-list-container')

In [79]:
book_links = catalog.find_all('a')

In [80]:
book_links[0]['href']

'/katalog/lektura/jego-zasady/'

In [83]:
# Absolute URL of every book page: drop the leading character of each href
# (the "/" in the hrefs shown above) and prepend the site root.
links = [baseURL + anchor['href'][1:] for anchor in book_links]

In [84]:
links[0]

'https://wolnelektury.pl/katalog/lektura/jego-zasady/'

In [85]:
# Fetch and parse the first book page to explore its structure.
# Note: this rebinds `soup`, which previously held the parsed catalogue page.
subURL = links[0]
subpage = requests.get(subURL)
soup = BeautifulSoup(subpage.content, 'html.parser')

In [102]:
# Extract the fields of interest from the single book page parsed above.
author = soup.find(class_='author').text.strip()
title = soup.find(class_='title').text.strip()
# ISBN is looked up from the title alone via isbnlib.
isbn = isbnlib.isbn_from_words(title)
# The PDF href is site-relative; strip its first character and prepend the root.
link = baseURL + soup.find('a', string='PDF')['href'][1:]
tags_raw = soup.find_all(class_='book-box-tag')
tags = [tag_node.text.strip() for tag_node in tags_raw]

In [106]:
# No ISBN found: fall back to the string placeholder '0'. The bulk scraping
# loop stores '0' as a string too, so the ISBN field keeps one type; `not isbn`
# also covers a None result, not only the empty string.
if not isbn:
    isbn = '0'

In [107]:
print(author, '\n', title, '\n', isbn, '\n', tags, '\n', link)

Adolf Abrahamowicz 
 Jego zasady 
 0 
 ['Pozytywizm', 'Dramat', 'Komedia'] 
 https://wolnelektury.pl/media/book/pdf/jego-zasady.pdf


In [123]:
books = []

In [124]:
# Visit every book page and append [title, author, isbn, tag set, pdf url]
# to `books`. Pages without a PDF link, pages that return an HTTP error and
# pages lacking the author/title elements are skipped.
for link in links:
    subURL = link
    # requests has no default timeout; without one a single stalled
    # connection would block the whole crawl indefinitely.
    subpage = requests.get(subURL, timeout=30)
    if not subpage.ok:
        # Do not parse an error page as if it were a book page.
        continue
    soup = BeautifulSoup(subpage.content, 'html.parser')
    pdf_link = soup.find('a', string='PDF')
    if pdf_link is None:
        continue
    pdf_link = baseURL + pdf_link['href'][1:]
    author_node = soup.find(class_='author')
    title_node = soup.find(class_='title')
    if author_node is None or title_node is None:
        # find() returns None when the element is absent; skip such pages
        # instead of failing on `.text` in the middle of the crawl.
        continue
    author = author_node.text.strip()
    title = title_node.text.strip()
    isbn = isbnlib.isbn_from_words(title)
    if not isbn:
        # Placeholder for "no ISBN found"; kept as a string like real ISBNs.
        isbn = '0'

    tags_raw = soup.find_all(class_='book-box-tag')
    tags = [tag_node.text.strip() for tag_node in tags_raw]
    books.append([title, author, isbn, set(tags), pdf_link])

### Zapis pobranych danych w pliku pickle

In [3]:
import pickle

In [131]:
# Persist the scraped list so later cells can reload it without re-crawling.
with open('books.pkl', 'wb') as f:
    pickle.dump(books, f)

In [136]:
# Write the plain str() representation of `books` to a UTF-8 text file
# (presumably for eyeballing the scraped data; nothing in this notebook reads it back).
with open("books_look.txt", "w", encoding="utf-8") as output:
    output.write(str(books))

In [140]:
len(books)

2310

In [4]:
# Reload the scraped list written earlier by this notebook.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('books.pkl', 'rb') as f:
    books = pickle.load(f)

In [14]:
tagset = set()

In [26]:
# Some scraped tags are comma-separated lists. Replace each such composite
# tag in a book's tag set with its individual, whitespace-stripped parts, then
# add the book's resulting tags to the global `tagset`.
# (The original cell also built `set(splitlist)` and discarded the result; that no-op is gone.)
for book in books:
    # Materialise the composite tags first so the set is not mutated while iterated.
    rmtag = {tag for tag in book[3] if ',' in tag}
    addtag = {part.strip() for tag in rmtag for part in tag.split(',')}
    book[3].difference_update(rmtag)
    book[3].update(addtag)
    tagset.update(book[3])
        

In [27]:
books[0]

['Jego zasady',
 'Adolf Abrahamowicz',
 '0',
 {'Dramat', 'Komedia', 'Pozytywizm'},
 'https://wolnelektury.pl/media/book/pdf/jego-zasady.pdf']

In [40]:
# Strip the empty tag and the 'nie dotyczy' placeholder from every book's tag set.
for record in books:
    record[3].difference_update(('', 'nie dotyczy'))

In [41]:
# Same clean-up for the global tag set: no empty tag, no 'nie dotyczy' placeholder.
tagset.difference_update(('', 'nie dotyczy'))

In [44]:
# One INSERT statement per distinct tag. Single quotes inside a tag are
# doubled so the generated SQL string literal stays valid (same escaping as
# the id-carrying INSERT generation later in this notebook).
InsertTags = ''.join(
    "INSERT INTO tags (Tag) VALUES ('{}');\n".format(tag.replace("'", "''"))
    for tag in tagset
)

In [None]:
authorset = set()

In [47]:
books[0]

['Jego zasady',
 'Adolf Abrahamowicz',
 '0',
 {'Dramat', 'Komedia', 'Pozytywizm'},
 'https://wolnelektury.pl/media/book/pdf/jego-zasady.pdf']

In [59]:
# Rebuild the set of distinct author strings (index 1 of each book record).
authorset.clear()
authorset.update(record[1] for record in books)

In [61]:
# Author strings containing a newline, i.e. entries that need correcting.
authcorrect = {name for name in authorset if '\n' in name}

In [62]:
authcorrect

set()

In [58]:
# Drop every book whose author string is flagged in `authcorrect`.
# The list is rebuilt instead of calling books.remove() while iterating over
# `books`: removing during iteration shifts the remaining items and silently
# skips the element right after each removed one. Slice assignment keeps the
# same list object.
books[:] = [book for book in books if book[1] not in authcorrect]
    

In [64]:
# One INSERT statement per distinct author. Single quotes inside a name are
# doubled so the generated SQL string literal stays valid (same escaping as
# the id-carrying INSERT generation later in this notebook).
InsertAuthors = ''.join(
    "INSERT INTO authors (Author) VALUES ('{}');\n".format(author.replace("'", "''"))
    for author in authorset
)

In [66]:
books[0]

['Jego zasady',
 'Adolf Abrahamowicz',
 '0',
 {'Dramat', 'Komedia', 'Pozytywizm'},
 'https://wolnelektury.pl/media/book/pdf/jego-zasady.pdf']

In [69]:
# Save the generated SQL scripts as UTF-8 text files.
for sql_path, sql_text in (("InsertTags.txt", InsertTags),
                           ("InsertAuthors.txt", InsertAuthors)):
    with open(sql_path, "w", encoding="utf-8") as sql_file:
        sql_file.write(str(sql_text))


In [70]:
# Checkpoint the cleaned book list (tags split, placeholder tags removed).
with open('books_cleaned_1.pkl', 'wb') as f:
    pickle.dump(books, f)

In [71]:
# Give every book a 1-based numeric id stored at index 5 of its record.
# Slice assignment writes the id at position 5 whether or not it already
# exists, so re-running this cell replaces the id instead of appending a
# second one and shifting the fields added by later cells.
for bookId, book in enumerate(books, start=1):
    book[5:6] = [bookId]

In [72]:
books[0]

['Jego zasady',
 'Adolf Abrahamowicz',
 '0',
 {'Dramat', 'Komedia', 'Pozytywizm'},
 'https://wolnelektury.pl/media/book/pdf/jego-zasady.pdf',
 1]

In [79]:
# Number the distinct tags from 1: a list of [tag, id] pairs.
tags = [[tag, tagId] for tagId, tag in enumerate(tagset, start=1)]

In [81]:
# Number the distinct authors from 1: a list of [author, id] pairs.
authors = [[author, authorId] for authorId, author in enumerate(authorset, start=1)]

In [84]:
AuthorBook = []
TagBook = []

In [85]:
# Checkpoint the three id-carrying lists before the relation tables are built.
for pkl_path, payload in (('books_indexed_no_relation.pkl', books),
                          ('tags_indexed_no_relation.pkl', tags),
                          ('authors_indexed_no_relation.pkl', authors)):
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(payload, pkl_file)

In [44]:
# Reload the [tag, id] and [author, id] lists from their checkpoints.
# `books` is not reloaded here, so it must already be in the kernel.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
import pickle
with open('tags_indexed_no_relation.pkl', 'rb') as f:
    tags = pickle.load(f)
with open('authors_indexed_no_relation.pkl', 'rb') as f:
    authors = pickle.load(f)

In [90]:
# Tag <-> book relation rows: one [tag id, book id] pair for every tag that
# occurs in a book's tag set, ordered by tag and then by book.
# The list is rebuilt from scratch so that re-running this cell does not
# append a second copy of every row.
TagBook = [
    [tag[1], book[5]]
    for tag in tags
    for book in books
    if tag[0] in book[3]
]

In [94]:
print(books[833][3], tags[0])

{'Proza poetycka', 'Dwudziestolecie międzywojenne', 'Nowela', 'Felieton', 'Epika', 'Dramat współczesny', 'Pogadanka'} ['Proza poetycka', 1]


In [95]:
# Author <-> book relation rows: one [author id, book id] pair per match,
# ordered by author and then by book.
# The list is rebuilt from scratch so that re-running this cell does not
# append a second copy of every row.
# NOTE(review): both operands of `in` are strings, so this is a substring
# test - an author whose name is contained in another book's author string is
# linked to that book as well. Confirm this is intended before switching to ==.
AuthorBook = [
    [author[1], book[5]]
    for author in authors
    for book in books
    if author[0] in book[1]
]

In [96]:
AuthorBook[20]

[2, 252]

In [97]:
print(authors[1], books[251][1])

['Tadeusz Boy-Żeleński', 2] Tadeusz Boy-Żeleński


In [45]:
# INSERT statements for the tags table with explicit ids; single quotes in
# the tag text are doubled for SQL.
InsertTags = ''.join(
    "INSERT INTO tags (TagId, Tag) VALUES ({},'{}');\n".format(tag_id, tag_text.replace("'","''"))
    for tag_text, tag_id in tags
)

In [46]:
# INSERT statements for the authors table with explicit ids; single quotes in
# the name are doubled for SQL.
InsertAuthors = ''.join(
    "INSERT INTO authors (AuthorId, Author) VALUES ({},'{}');\n".format(author_id, author_name.replace("'","''"))
    for author_name, author_id in authors
)

In [47]:
# Save the id-carrying SQL scripts as UTF-8 text files.
for sql_path, sql_text in (("InsertTags1.txt", InsertTags),
                           ("InsertAuthors1.txt", InsertAuthors)):
    with open(sql_path, "w", encoding="utf-8") as sql_file:
        sql_file.write(str(sql_text))

In [105]:
books[0]

['Jego zasady',
 'Adolf Abrahamowicz',
 '0',
 {'Dramat', 'Komedia', 'Pozytywizm'},
 'https://wolnelektury.pl/media/book/pdf/jego-zasady.pdf',
 1]

In [106]:
# INSERT statements for the books table (id, title, ISBN, PDF URL).
# Single quotes in the title are doubled; without that, any title containing
# an apostrophe produces a broken SQL string literal.
InsertBooks = ''.join(
    "INSERT INTO books (BookId, Title, ISBN, URL) VALUES ({},'{}','{}','{}');\n".format(
        book[5], book[0].replace("'", "''"), book[2], book[4])
    for book in books
)

In [108]:
print(InsertBooks[0:1200])

INSERT INTO books (BookId, Title, ISBN, URL) VALUES (1,'Jego zasady','0','https://wolnelektury.pl/media/book/pdf/jego-zasady.pdf');
INSERT INTO books (BookId, Title, ISBN, URL) VALUES (2,'Po burzy','9788364980978','https://wolnelektury.pl/media/book/pdf/po-burzy.pdf');
INSERT INTO books (BookId, Title, ISBN, URL) VALUES (3,'Oresteja','9788328853553','https://wolnelektury.pl/media/book/pdf/ajschylos-oresteja.pdf');
INSERT INTO books (BookId, Title, ISBN, URL) VALUES (4,'Prometeusz skowany','9788366837201','https://wolnelektury.pl/media/book/pdf/ajschylos-prometeusz-skowany.pdf');
INSERT INTO books (BookId, Title, ISBN, URL) VALUES (5,'Małe kobietki','9788377795712','https://wolnelektury.pl/media/book/pdf/alcott-male-kobietki.pdf');
INSERT INTO books (BookId, Title, ISBN, URL) VALUES (6,'Boska Komedia','9788361060901','https://wolnelektury.pl/media/book/pdf/boska-komedia.pdf');
INSERT INTO books (BookId, Title, ISBN, URL) VALUES (7,'Serce','9788374851480','https://wolnelektury.pl/media/b

In [111]:
# INSERT statements for the tag <-> book relation table.
InsertTagBook = ''.join(
    "INSERT INTO tagbook (TagId, BookId) VALUES ({},{});\n".format(pair[0], pair[1])
    for pair in TagBook
)

In [112]:
print(InsertTagBook[0:200])

INSERT INTO tagbook (TagId, BookId) VALUES (1,833);
INSERT INTO tagbook (TagId, BookId) VALUES (1,834);
INSERT INTO tagbook (TagId, BookId) VALUES (2,1923);
INSERT INTO tagbook (TagId, BookId) VALUES 


In [114]:
# INSERT statements for the author <-> book relation table.
InsertAuthorBook = ''.join(
    "INSERT INTO authorbook (AuthorId, BookId) VALUES ({},{});\n".format(pair[0], pair[1])
    for pair in AuthorBook
)

In [115]:
print(InsertAuthorBook[0:200])

INSERT INTO authorbook (AuthorId, BookId) VALUES (1,193);
INSERT INTO authorbook (AuthorId, BookId) VALUES (1,194);
INSERT INTO authorbook (AuthorId, BookId) VALUES (2,234);
INSERT INTO authorbook (Au


In [49]:
# Longest string stored at index 4 of a book record. Despite the variable
# name, index 4 holds the PDF URL in the records built by this notebook.
maxtitle = max((len(record[4]) for record in books), default=0)
print(maxtitle)

123


In [48]:
# Longest tag text (the variable name is reused from the previous measurement).
maxtitle = max((len(entry[0]) for entry in tags), default=0)
print(maxtitle)

29


In [118]:
# Longest author string.
maxauthor = max((len(entry[0]) for entry in authors), default=0)
print(maxauthor)

58


In [119]:
# Write every generated SQL script to its own UTF-8 text file.
sql_scripts = (
    ("InsertTags.txt", InsertTags),
    ("InsertAuthors.txt", InsertAuthors),
    ("InsertBooks.txt", InsertBooks),
    ("InsertTagBook.txt", InsertTagBook),
    ("InsertAuthorBook.txt", InsertAuthorBook),
)
for sql_path, sql_text in sql_scripts:
    with open(sql_path, "w", encoding="utf-8") as sql_file:
        sql_file.write(str(sql_text))



## Uczenie, test i zapis modelu doc2vec

In [131]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [127]:
# One training "document" per book: the author string followed by the
# book's tags, separated by single spaces.
data = [' '.join([record[1], *record[3]]) for record in books]

In [128]:
data[0]

'Adolf Abrahamowicz Pozytywizm Komedia Dramat'

In [132]:
# Lower-case and tokenise each document; its position in `data` (as a string)
# becomes the document tag.
tagged_data = [
    TaggedDocument(words=word_tokenize(text.lower()), tags=[str(position)])
    for position, text in enumerate(data)
]

In [135]:
# Doc2Vec configuration: 3-dimensional document vectors, learning rate
# decaying from alpha to min_alpha, every word kept (min_count=1), dm=1.
# Training is stochastic; a fixed seed together with a single worker thread
# makes the learned vectors repeatable between runs (gensim additionally
# requires a fixed PYTHONHASHSEED for repeatability across interpreter launches).
model = Doc2Vec(vector_size=3,
                alpha=0.025,
                min_alpha=0.00025,
                min_count=1,
                dm=1,
                seed=42,
                workers=1)

model.build_vocab(tagged_data)

In [137]:
# Train for 100 epochs over the tagged corpus, then save the model to disk.
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=100)

model.save("d2v.model")

In [138]:
# Smoke test: infer a vector for a hand-written list of tags, tokenised and
# lower-cased the same way as the training documents.
test_data = word_tokenize("Epos rycerski Legenda".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

V1_infer [-0.7152114  1.2028822  1.6474428]


### Rozszerzenie listy książek o wektor uzyskany z przygotowanego modelu

In [144]:
# Infer a vector for every book from its "author + tags" text and store it at
# index 6 of the record.
# Slice assignment writes the vector at position 6 whether or not it already
# exists, so re-running this cell replaces the vector instead of appending
# another one behind it.
for book in books:
    bookstr = ' '.join([book[1], *book[3]])
    book_tokenized = word_tokenize(bookstr.lower())
    book_vector = model.infer_vector(book_tokenized)
    book[6:7] = [book_vector]

In [150]:
books[0]

['Jego zasady',
 'Adolf Abrahamowicz',
 '0',
 {'Dramat', 'Komedia', 'Pozytywizm'},
 'https://wolnelektury.pl/media/book/pdf/jego-zasady.pdf',
 1,
 array([0.08220517, 0.15564835, 1.0348116 ], dtype=float32)]

In [176]:
# Checkpoint the book list now that each record carries its inferred vector.
with open('books_vectors.pkl', 'wb') as f:
    pickle.dump(books, f)

In [2]:
# Reload the vector-carrying book list from its checkpoint.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
import pickle
with open('books_vectors.pkl', 'rb') as f:
    books = pickle.load(f)

In [12]:
import pandas as pd
import numpy as np
from scipy import spatial

In [26]:
# Query vector: component-wise mean of the vectors of books 0 and 50.
vect_input = [np.mean([books[0][6][axis], books[50][6][axis]]) for axis in range(3)]

# Linear scan for the book whose vector has the highest cosine similarity to
# the query; the first book reaching the maximum wins.
maxresult = -1
for candidate in books:
    similarity = 1 - spatial.distance.cosine(vect_input, candidate[6])
    if similarity > maxresult:
        book_output, maxresult = candidate, similarity
print(maxresult)

0.9992067217826843


In [27]:
books[0]

['Jego zasady',
 'Adolf Abrahamowicz',
 '0',
 {'Dramat', 'Komedia', 'Pozytywizm'},
 'https://wolnelektury.pl/media/book/pdf/jego-zasady.pdf',
 1,
 array([0.08220517, 0.15564835, 1.0348116 ], dtype=float32)]

In [28]:
books[50]

['Kiejstut',
 'Adam Asnyk',
 '9788328534124',
 {'Dramat', 'Pozytywizm', 'Tragedia'},
 'https://wolnelektury.pl/media/book/pdf/asnyk-kiejstut.pdf',
 51,
 array([-0.47235325, -0.2805281 ,  0.6188072 ], dtype=float32)]

In [29]:
book_output

['Synowie ziemi',
 'Stanisław Przybyszewski',
 '9788328807723',
 {'Epika', 'Powieść', 'Pozytywizm'},
 'https://wolnelektury.pl/media/book/pdf/synowie-ziemi-.pdf',
 1712,
 array([-0.200813  , -0.07436049,  0.74203247], dtype=float32)]

In [39]:
# INSERT statements for the books table including the three vector
# components as X, Y, Z; single quotes in the title are doubled for SQL.
book_row_template = "INSERT INTO books (BookId, Title, ISBN, URL, X, Y, Z) VALUES ({}, '{}', '{}', '{}', {}, {}, {});\n"
InsertBooks = ''.join(
    book_row_template.format(
        record[5], record[0].replace("'","''"), record[2], record[4],
        record[6][0], record[6][1], record[6][2])
    for record in books
)

In [40]:
print(InsertBooks[0:1200])

INSERT INTO books (BookId, Title, ISBN, URL, X, Y, Z) VALUES (1, 'Jego zasady', '0', 'https://wolnelektury.pl/media/book/pdf/jego-zasady.pdf', 0.08220516890287399, 0.1556483507156372, 1.0348116159439087);
INSERT INTO books (BookId, Title, ISBN, URL, X, Y, Z) VALUES (2, 'Po burzy', '9788364980978', 'https://wolnelektury.pl/media/book/pdf/po-burzy.pdf', 0.02022506110370159, 0.16492147743701935, 0.9576689004898071);
INSERT INTO books (BookId, Title, ISBN, URL, X, Y, Z) VALUES (3, 'Oresteja', '9788328853553', 'https://wolnelektury.pl/media/book/pdf/ajschylos-oresteja.pdf', 0.1115083247423172, 0.3541731536388397, 1.2940075397491455);
INSERT INTO books (BookId, Title, ISBN, URL, X, Y, Z) VALUES (4, 'Prometeusz skowany', '9788366837201', 'https://wolnelektury.pl/media/book/pdf/ajschylos-prometeusz-skowany.pdf', 0.2508053183555603, 0.3743506669998169, 1.2772467136383057);
INSERT INTO books (BookId, Title, ISBN, URL, X, Y, Z) VALUES (5, 'Małe kobietki', '9788377795712', 'https://wolnelektury.pl

In [42]:
# Overwrite InsertBooks.txt with the vector-carrying INSERT script.
with open("InsertBooks.txt", "w", encoding="utf-8") as sql_file:
    sql_file.write(str(InsertBooks))

In [38]:
len('3.3743506669998169')

18