In [1]:
import scrapy
import re

In [2]:
class Scraper_dela(scrapy.Spider):
    """Spider that scrapes student job listings from studentski-servis.com.

    Each ``article.job-item`` element on a listing page is turned into one
    dict item with a fixed set of keys, and the items are written to
    ``dela.csv`` through the ``FEEDS`` setting. Pagination links are followed
    for page numbers below ``max_page``.
    """

    name = 'studentska_dela_spider'
    allowed_domains = ['studentski-servis.com']
    start_urls = ['https://www.studentski-servis.com/studenti/prosta-dela/']
    custom_settings = {
        'FEEDS': {
            'dela.csv': {
                'format': 'csv',
                'overwrite': True
            }
        },
        # Browser identification that Scrapy sends along with every request.
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Pagination links whose page number is >= max_page are not followed.
    max_page = 80

    # Label text inside a job-attribute <li> mapped to the key of the exported
    # item. Labels are checked in this order; the first one contained in the
    # <li> text wins.
    _attribute_keys = {
        'Šifra': 'id',
        'Trajanje': 'trajanje',
        'Delovnik': 'delovnik',
        'Prosta mesta': 'prosta_mesta',
    }

    @staticmethod
    def _first_text(selector, query):
        """Return the first match of ``query`` under ``selector``, stripped.

        Returns an empty string when nothing matches, so callers never have to
        deal with ``None``.
        """
        return (selector.css(query).get() or '').strip()

    def _parse_job(self, delo):
        """Build the item dict for a single ``article.job-item`` selector.

        The returned dict always contains the same keys in the same order.
        The CSV feed derives its columns from the first exported item when no
        explicit field list is configured, so optional keys must be present
        (with a default) on every item.
        """
        # The location is the second text node found in <p> elements; fall
        # back to an empty string when it is absent.
        paragraphs = delo.css('p::text').getall()
        lokacija = paragraphs[1] if len(paragraphs) > 1 else ''

        bruto = self._first_text(delo, 'li.job-payment a::text')
        neto = self._first_text(delo, 'li.job-payment a strong::text')

        d = {
            'naziv': delo.css('h5.mb-0::text').get(),
            'lokacija': re.sub(r'[\n\t]', '', lokacija),
            # The patterns are character classes: every listed character is
            # removed individually. Strip afterwards so no stray whitespace is
            # left where the removed unit text used to be.
            'placa_bruto': re.sub(r'[()h/bruto€]', '', bruto).strip(),
            'placa_neto': re.sub(r'[h/neto€]', '', neto).strip(),
            'placa_po_dogovoru': False,
            'prosta_mesta': '1',  # default when the listing shows no count
            'id': '',
            'trajanje': '',
            'delovnik': '',
        }

        # Listings without a fixed wage are flagged and their wage fields are
        # replaced with a placeholder.
        if 'PO DOGOVORU' in d['placa_neto']:
            d['placa_po_dogovoru'] = True
            d['placa_neto'] = '/'
            d['placa_bruto'] = '/'

        # Each attribute <li> carries a label in its first text node and the
        # value inside a <strong> child.
        for attribute in delo.css('ul.job-attributes li'):
            attribute_text = self._first_text(attribute, '::text')
            for label, key in self._attribute_keys.items():
                if label in attribute_text:
                    value = self._first_text(attribute, 'strong::text')
                    if value:  # keep the default when the value is missing
                        d[key] = value
                    break

        return d

    def parse(self, response):
        """Yield one item per job on the page, then requests for further pages.

        Args:
            response: the Scrapy response of a job listing page.

        Yields:
            dict items describing the jobs, followed by ``scrapy.Request``
            objects (with this method as callback) for every pagination link
            whose page number is below ``max_page``.
        """
        for delo in response.css('article.job-item'):
            yield self._parse_job(delo)

        # Follow the pagination links; link texts that are not page numbers
        # (e.g. arrows) are skipped instead of aborting the callback.
        for page_label in response.css('div.page-items a.page-link::text').getall():
            try:
                page_number = int(page_label)
            except ValueError:
                continue
            if page_number < self.max_page:
                url = f'https://www.studentski-servis.com/studenti/prosta-dela/?page={page_number}'
                yield scrapy.Request(url, callback=self.parse)


In [3]:
# Run the spider from inside the notebook.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(Scraper_dela)

# Blocks until the crawl has finished. The runtime/kernel has to be restarted
# before this cell can be run again.
process.start()

2024-06-28 21:24:37 [scrapy.utils.log] INFO: Scrapy 2.11.0 started (bot: scrapybot)
2024-06-28 21:24:37 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.11.0 (main, Oct 24 2022, 18:26:48) [MSC v.1933 64 bit (AMD64)], pyOpenSSL 23.2.0 (OpenSSL 3.1.3 19 Sep 2023), cryptography 41.0.4, Platform Windows-10-10.0.19045-SP0
2024-06-28 21:24:37 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-06-28 21:24:37 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2024-06-28 21:24:37 [scrapy.extensions.telnet] INFO: Telnet Password: 151d407ac79a8438
2024-06-28 21:24:37 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExp