This code is a web scraper written in Python using the Requests and BeautifulSoup libraries to extract data from the Skolska knjiga website, specifically from the "Svjetska knjizevnost" category. The code loops through each page of the category and extracts book data such as the book's URL, author, title, price in both euros and Croatian kuna, image URL, and additional information such as the book's code, publisher, binding, number of pages, year of publishing, and format.

The code then stores this data in a dictionary and appends it to a list. Once all the data has been collected, a Pandas DataFrame is created from the list of dictionaries and displayed.

In [3]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Create an empty list to store book data
data = []

# Loop over each page of the website to scrape book data
for broj in range(1,52):
    # Construct the URL for the current page
    url = 'https://shop.skolskaknjiga.hr/knjizevnost/knjizevnost/svjetska.html?p=' + str(broj)
    # Send a GET request to the page
    request = requests.get(url)
    # Use BeautifulSoup to parse the HTML content of the page
    bs = BeautifulSoup(request.content, 'html.parser')
    # Find all div elements with the class 'product-item'
    knjige = bs.find_all("div", class_='product-item')
    
    # Loop over each book div and extract relevant information
    for knjiga in knjige:
        # Extract the book's URL
        anchor = knjiga.find("a", class_="product-image")
        link = anchor.get('href')
        
        # Extract the book's author
        autor = knjiga.find('span', class_='autor').text
        # Extract the book's title and price
        title = knjiga.find_all('div', class_='top-actions-inner')
        for linija in title:
            naslov = linija.find('a').text
            price = knjiga.find_all('div', class_='price-box')
        for b in price:
            cijena = b.find('span', class_='price').text.split()[0]
            price_kn = knjiga.find_all('div', class_='price-box')
        for c in price_kn:
            cijena_tag = c.find('span', class_='regular-price')
            cijena_kn = cijena_tag.text.split()[2][1:] if cijena_tag else None
        # Extract the book's image URL    
        img_tag = knjiga.find_all('img', class_='img-responsive lazy')
        img_urls = [tag.get('src', '') for tag in img_tag]
        img_url = img_urls[0] if img_urls else ''       
        
        # Follow the book's URL to extract additional information
        response2 = requests.get(link)
        otvorena_knjiga = BeautifulSoup(response2.content, 'html.parser')
        
        
        table = otvorena_knjiga.find("table", {"id": "product-attribute-specs-table"})
        if table:
            tbody = table.find("tbody")
            
        else: 
            continue

        # Extract the book's code, publisher, binding, number of pages, year of publishing, and format
        sifra_td = tbody.find("th", text="Šifra")
        izdavac_td = tbody.find("th", text="Izdavač")
        uvez_td = tbody.find("th", text="Uvez")
        broj_stranica_td = tbody.find("th", text="Broj stranica")
        godina_izdanja_td = tbody.find("th", text="Godina izdanja")
        format_td = tbody.find("th", text="Format")

        
        sifra = sifra_td.find_next_sibling("td").text.strip() if sifra_td else None
        izdavac = izdavac_td.find_next_sibling("td").text.strip() if izdavac_td else None
        uvez = uvez_td.find_next_sibling("td").text.strip() if uvez_td else None
        broj_stranica = broj_stranica_td.find_next_sibling("td").text.strip() if broj_stranica_td else None
        godina_izdanja = godina_izdanja_td.find_next_sibling("td").text.strip() if godina_izdanja_td else None
        format = format_td.find_next_sibling("td").text.strip() if format_td else None
        
        # add book title, autor,price_eur, price_kn, code, publisher, binding, number_of_pages, year_of-publishing and format to data dictionary
        data.append({
            'title': naslov,
            'autor': autor,
            'price_eur': float(cijena.replace(',', '.')), 
            'price_kn': float(cijena.replace(',', '.')), 
            'code': sifra,
        
            'publisher': izdavac,
            'binding': uvez,
            'number_of_pages': broj_stranica,
            'year_of_publishing': godina_izdanja,
            'format': format
     })
    
        
        
# create a pandas dataframe from the data dictionary    
df = pd.DataFrame(data)
display(df)             # display the dataframe in a jupyter notebook or similar environment


Unnamed: 0,title,autor,price_eur,price_kn,code,publisher,binding,number_of_pages,year_of_publishing,format
0,Komplet Žar u pepelu ...,,75.12,75.12,U17519,,,,,
1,Komplet Svjedočanstva i Sluškinjina priča ...,,29.07,29.07,U20453,,,,,
2,"Sve je bilo savršeno, a onda je nestao ...",Rosie Walsh,18.45,18.45,170576,Lumen,meki,244,2018,15 x 23
3,Mjesec u palači ...,Weina Dai Randel,18.45,18.45,160775,Lumen,meki,316,2018,15 x 23
4,Deset tisuća života ...,Michael Poore,18.45,18.45,150243,Lumen,meki,312,2018,15 x 23
...,...,...,...,...,...,...,...,...,...,...
2023,Alfabetist ...,Torsten Petterson,17.12,17.12,061809,Lumen,meki,300,2011,15 x 23
2024,Alex Cross ...,James Patterson,17.12,17.12,156358,Lumen,tvrdi,384,2012,18 x 24
2025,Nobelova nagrada za književnost 1957. - Kuga;...,Albert Camus,14.47,14.47,060632,Školska knjiga,tvrdi,355,1995,14.5 x 20
2026,500 \n ...,Matthew Quirk,18.45,18.45,156346,Lumen,tvrdi,292,2012,16 x 24


This code is written in Python and uses the psycopg2 library to establish a connection to a PostgreSQL database and create a new database. Here is a step-by-step explanation of what the code does:

Import the psycopg2 library - this library provides a way to connect to a PostgreSQL database and execute SQL statements.
Establish a connection to the PostgreSQL database by providing the necessary connection parameters such as host, user, password, and port. In this case, the host is "localhost", the user is "postgres", the password is "1111", and the port is "5432".
Set autocommit to True - this means that every SQL statement will be automatically committed to the database without needing to explicitly call the commit() method.
Create a new database named "skolska_knjiga" using the cursor object's execute() method. This SQL statement creates a new database with the specified name.
Close the cursor and connection to the database using the close() method.

In [19]:
import psycopg2

# Establish a connection to the database
conn = psycopg2.connect(
    host="localhost",
    user="postgres",
    password="1111",
    port="5432"
)

# Set autocommit to True
conn.autocommit = True

# Create a new database named "skolska_knjiga"
cur = conn.cursor()
cur.execute("CREATE DATABASE skolska_knjiga;")

# Close the cursor and connection
cur.close()
conn.close()


This code is an example of using the psycopg2 library to create a new table named "books" in a PostgreSQL database named "skolska_knjiga", and inserting data into it using Pandas DataFrame. Here's a breakdown of what the code is doing:
The psycopg2.connect() function is used to create a connection to the PostgreSQL database, with the specified host, database name, username, and password. This connection is assigned to the variable conn.
A cursor object is created using conn.cursor(). A cursor is used to execute commands in the PostgreSQL database.
The cur.execute() function is called to create the "books" table in the database. The table has several columns with different data types such as ID, title, author, price in EUR, price in KN, code, publisher, binding, number of pages, year of publishing, and format.
Then, the data is read from a Pandas DataFrame named df. The data is extracted from each row of the DataFrame and inserted into a list of tuples named data.
The cur.executemany() function is called to insert the data into the "books" table using the list of tuples. This function inserts multiple rows at once.
Finally, the conn.commit() function is called to commit the changes to the database, and the cursor and connection are closed using cur.close() and conn.close(), respectively.

In [23]:
# import the necessary libraries, psycopg2 and pandas
import psycopg2
import pandas as pd

# Connect to PostgreSQL database
conn = psycopg2.connect(
    host="localhost",
    database="skolska_knjiga",
    user="postgres",
    password="1111"
)

# Create a cursor object
cur = conn.cursor()

# Create the "books" table
cur.execute("""
    CREATE TABLE IF NOT EXISTS books (
        id SERIAL PRIMARY KEY,
        title TEXT NOT NULL,
        author TEXT NOT NULL,
        price_eur FLOAT NOT NULL,
        price_kn FLOAT NOT NULL,
        code TEXT,
        publisher TEXT,
        binding TEXT,
        number_of_pages INTEGER,
        year_of_publishing INTEGER,
        format TEXT
    );
""")

# Insert data into the "books" table
data = []
for index, row in df.iterrows():
    book = row.to_dict()
    data.append((
        book['title'],
        book['autor'],
        book['price_eur'],
        book['price_kn'],
        book['code'],
        book['publisher'],
        book['binding'],
        book['number_of_pages'],
        book['year_of_publishing'],
        book['format']
    ))

cur.executemany("""
    INSERT INTO books(title, author, price_eur, price_kn, code, publisher, binding, number_of_pages, year_of_publishing, format)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
""", data)

# Commit the changes and close the connection
conn.commit()
cur.close()
conn.close()
