In [1]:
import time #usado para esperar um certo tempo para executar um comando
from threading import Thread #usado para executar funções ao mesmo tempo
import re #regex
import pickle #usado para salvar variáveis na memória hd
from pathlib import Path #usado para acessar pastas na memória
import sqlite3 #conexão com o banco sqlite
import string 
import random
import requests #usado para fazer requisições http
from bs4 import BeautifulSoup #transformar texto html em estrutura pyhton
import pandas as pd
from datetime import datetime, timedelta #usado para tratar datas

In [2]:

class BackupApiResp:
    """SQLite-backed storage for scraped OLX ad summaries and scraping progress.

    Two tables are maintained:

    * ``anuncios_resumo`` - one row per ad found on a listing page.
    * ``scraping_control`` - one row per listing URL, recording which session
      is scraping it and when the scraping started/finished.

    Every statement uses bound parameters (``?`` placeholders), so values that
    contain quotes (ad titles, URLs) are stored verbatim and can never change
    the SQL that is executed.
    """

    def __init__(self,path_file_bk):
        """Open (or create) the database at ``path_file_bk`` and ensure the schema exists."""
        self.__con = sqlite3.connect(path_file_bk)
        self.__con.row_factory = sqlite3.Row  # lets rows be read by column name
        # Timestamp (year-month-day hours:minutes) taken once at construction;
        # it is written as data_coleta_dados for every row inserted by this object.
        self.today = datetime.now().strftime("%Y-%m-%d %H:%M")
        self.__init_db()

    @staticmethod
    def _sql_number(value, cast):
        """Convert a scraped numeric field to ``cast`` (int/float) or ``None``.

        Scraped rows carry numbers as text; blank strings, the literal text
        ``'NULL'`` and anything ``cast`` rejects are stored as SQL NULL.
        """
        if value is None:
            return None
        if isinstance(value, str):
            value = value.strip()
            if value == '' or value.upper() == 'NULL':
                return None
        try:
            return cast(value)
        except (TypeError, ValueError):
            return None

    def __init_db(self):
        """Create the tables and indexes if they do not exist yet."""
        sql_table_def = (
            'CREATE TABLE IF NOT EXISTS '
            'anuncios_resumo ('
                'url_lista TEXT,'
                'numero_pagina_lista INTEGER,'
                'url_anuncio TEXT,'
                'categoria_completa TEXT,'
                'categoria_atual TEXT,'
                'titulo_anuncio TEXT,'
                'detalhes_anuncio TEXT,'
                'preco_anuncio FLOAT,'
                'url_img_principal TEXT,'
                'qt_img_anuncio INTEGER,'
                'localizacao_completa TEXT,'
                'localizacao_complemento TEXT,'
                'anuncio_profissional INTEGER,'
                'data_publicacao_anuncio DATETIME,'
                'data_coleta_dados DATETIME'
            ');'
        )

        # Index on url_anuncio.
        sql_index1_def = (
            'CREATE INDEX IF NOT EXISTS '
            'index_url_anuncio_ar ON '
            'anuncios_resumo ('
                'url_anuncio ASC'
            ');'
        )

        # Index on url_lista.
        sql_index2_def = (
            'CREATE INDEX IF NOT EXISTS '
            'index_url_lista_ar ON '
            'anuncios_resumo ('
                'url_lista ASC'
            ');'
        )

        # Composite index on (categoria_atual, titulo_anuncio, url_anuncio).
        sql_index3_def = (
            'CREATE INDEX IF NOT EXISTS '
            'index_categoria_titulo_url ON '
            'anuncios_resumo ('
                'categoria_atual ASC,'
                'titulo_anuncio ASC,'
                'url_anuncio ASC'
            ');'
        )

        sql_table_control = (
            'CREATE TABLE IF NOT EXISTS '
            'scraping_control ('
                'url_base TEXT,'
                'qt_last_scraping INTEGER,'
                'id_session TEXT,'
                'date_init_scraping DATETIME,'
                'date_end_scraping DATETIME'
            ');'
        )

        # Index on url_base; must run after scraping_control has been created.
        sql_index_control = (
            'CREATE INDEX IF NOT EXISTS '
            'index_table_control ON '
            'scraping_control ('
                'url_base ASC'
            ');'
        )

        for statement in (sql_table_def, sql_index1_def, sql_index2_def,
                          sql_index3_def, sql_table_control, sql_index_control):
            self.__con.execute(statement)
        self.__con.commit()

    def add_rows_anuncios_resumo(self,rows):
        """Insert scraped ad rows into ``anuncios_resumo``.

        ``rows`` must be a non-empty list of dicts keyed by column name;
        anything else is ignored. Missing text fields are stored as ``"erro"``,
        missing numeric fields as ``-1`` and a missing publication date as
        ``self.today``. The dicts passed in are not modified.
        """
        if not isinstance(rows,list) or len(rows) == 0 or not isinstance(rows[0],dict):
            return

        sql_insert_data = """INSERT INTO anuncios_resumo (
                                            url_lista,
                                            numero_pagina_lista,
                                            url_anuncio,
                                            categoria_completa,
                                            categoria_atual,
                                            titulo_anuncio,
                                            detalhes_anuncio,
                                            preco_anuncio,
                                            url_img_principal,
                                            qt_img_anuncio,
                                            localizacao_completa,
                                            localizacao_complemento,
                                            anuncio_profissional,
                                            data_publicacao_anuncio,
                                            data_coleta_dados )
                             VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""

        to_number = BackupApiResp._sql_number
        sql_rows = [
            (
                row.get('url_lista',"erro"),
                to_number(row.get('numero_pagina_lista',-1), int),
                row.get('url_anuncio',"erro"),
                row.get('categoria_completa',"erro"),
                row.get('categoria_atual',"erro"),
                row.get('titulo_anuncio',"erro"),
                row.get('detalhes_anuncio',"erro"),
                to_number(row.get('preco_anuncio',-1), float),
                row.get('url_img_principal',"erro"),
                to_number(row.get('qt_img_anuncio',-1), int),
                row.get('localizacao_completa',"erro"),
                row.get('localizacao_complemento',"erro"),
                to_number(row.get('anuncio_profissional',-1), int),
                row.get('data_publicacao_anuncio',self.today),
                self.today,
            )
            for row in rows
        ]
        self.__con.executemany(sql_insert_data, sql_rows)
        self.__con.commit()

    def add_url_scraping_control(self,url_base):
        """Register ``url_base`` in ``scraping_control`` (all other columns start as NULL)."""
        self.__con.execute(
            "INSERT INTO scraping_control (url_base) VALUES (?);",
            (url_base,),
        )
        self.__con.commit()

    def init_scraping_url_base(self,url_base,id_session='-'):
        """Claim ``url_base`` for ``id_session`` and stamp the start time.

        The claim only succeeds when no session currently holds the URL, or
        when the previous claim is more than one day old.
        """
        # NOTE(review): julianday('now') is UTC in SQLite while date_init_scraping
        # is written from datetime.now() (local time) - confirm the one-day
        # staleness window behaves as intended on non-UTC machines.
        sql_update_data = """UPDATE scraping_control
                                SET date_init_scraping = ?,
                                    id_session = ?,
                                    date_end_scraping = NULL
                              WHERE url_base = ? AND
                              ((id_session is NULL) OR
                              COALESCE(julianday('now') - julianday(date_init_scraping),99)>1)
                              """
        started_at = datetime.now().strftime("%Y-%m-%d %H:%M")
        self.__con.execute(sql_update_data, (started_at, id_session, url_base))
        self.__con.commit()

    def verify_id_session(self,url_base,id_session): #useful when the db is shared (e.g. a cloud db)
        """Return True when ``url_base`` is currently claimed by ``id_session``."""
        sql = "SELECT url_base FROM scraping_control where url_base = ? and id_session = ?"
        result = self.__con.execute(sql, (url_base, id_session)).fetchone()
        return result is not None

    def get_urls_base_in_db(self,only_not_scraped = False):
        """Return the registered listing URLs.

        With ``only_not_scraped=True`` only URLs whose ``date_end_scraping`` is
        NULL are returned.
        """
        sql_data = "SELECT url_base FROM scraping_control"
        if only_not_scraped:
            sql_data += " where date_end_scraping is null order by id_session, date_init_scraping desc "
        else:
            sql_data += " order by id_session, qt_last_scraping desc"

        result = self.__con.execute(sql_data).fetchall()
        return [row['url_base'] for row in result]

    def end_scraping_url_base(self,url_base,qt_last_scraping):
        """Mark ``url_base`` as finished: stamp the end time, store the count and release the session."""
        sql_update_data = """UPDATE scraping_control
                                SET date_end_scraping = ?,
                                    qt_last_scraping = ?,
                                    id_session = NULL
                              WHERE url_base = ? """
        ended_at = datetime.now().strftime("%Y-%m-%d %H:%M")
        self.__con.execute(sql_update_data, (ended_at, qt_last_scraping, url_base))
        self.__con.commit()

    def get_all_urls_ads(self,url_lista):
        """Return every ``url_anuncio`` already stored for the listing ``url_lista``."""
        sql_data = "SELECT url_anuncio FROM anuncios_resumo where url_lista = ?"
        result = self.__con.execute(sql_data, (url_lista,)).fetchall()
        return [row['url_anuncio'] for row in result]

    def has_url_ad_in_anuncio_resumo(self,url_ad):
        """Return True when ``url_ad`` is already present in ``anuncios_resumo``."""
        sql = """ select url_anuncio from anuncios_resumo
                    where url_anuncio = ?
                    limit 1 """
        result = self.__con.execute(sql, (url_ad,)).fetchone()
        return result is not None


In [3]:
def print_verbose_function(verbose):
    """Return a logging callable: ``print`` when ``verbose`` is truthy, else a no-op.

    The no-op accepts the same positional/keyword arguments as ``print`` so
    call sites behave identically in both modes.
    """
    if verbose:
        return print

    def _silent(*args, **kwargs):
        return None

    return _silent

In [4]:
def convert_date_olx_to_datetime_str(texto_data, now=None):
    """Convert an OLX listing date such as ``"5 jan, 14:30"`` to ``"YYYY-MM-DD HH:MM"``.

    Accepted date parts (case-insensitive): ``hoje`` (today), ``ontem``
    (yesterday) or ``"<day> <month abbreviation>"``. The year is not present in
    the text: a month later than the current month is taken to be last year.

    Parameters:
        texto_data: text in the form ``"<date>, <time>"``.
        now: reference ``datetime``; defaults to ``datetime.now()``.

    Returns:
        The ISO-style date, followed by a space and the time text when a time
        part is present. A day that does not exist in the month (e.g. 31 fev)
        yields the sentinel date ``2000-01-01``.

    Raises:
        KeyError: the month abbreviation is not a known Portuguese month.
        ValueError / IndexError: the date part is not ``"<day> <month>"``.
    """
    MONTHS = {'jan': 1, 'fev': 2, 'mar': 3, 'abr': 4,  'mai': 5,  'jun': 6,
              'jul': 7, 'ago': 8, 'set': 9, 'out': 10, 'nov': 11, 'dez': 12}
    # One reference instant for the whole conversion, so the result cannot
    # straddle midnight / new year between separate clock reads.
    reference = now if now is not None else datetime.now()

    data, _, hora = texto_data.lower().partition(',')
    data = data.strip()
    hora = hora.strip()

    if data == 'ontem':
        data_padrao = (reference - timedelta(days=1)).strftime('%Y-%m-%d')
    elif data == 'hoje':
        data_padrao = reference.strftime('%Y-%m-%d')
    else:
        dataP = data.split()
        dia_mes = int(dataP[0][0:2])
        mes = MONTHS[dataP[1]]
        ano = reference.year if reference.month >= mes else reference.year - 1
        try:
            data_padrao = datetime(year=ano, month=mes, day=dia_mes).strftime('%Y-%m-%d')
        except ValueError:
            # Impossible calendar date: keep the row but mark it with a sentinel.
            data_padrao = datetime(year=2000, month=1, day=1).strftime('%Y-%m-%d')

    return f"{data_padrao} {hora}" if hora else data_padrao

In [5]:
# Browser-like request headers sent with every OLX request
# (copied from the browser dev tools: Network -> Request Headers).
headers_olx = {
    'accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
    'origin': 'https://olx.com.br/',
    'referer': 'https://olx.com.br/',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/96.0.4664.45 Safari/537.36'
    ),
}

In [6]:
def get_retroactive_links_in_div(div_class,url,verbose=True,only_endpoints=False):
    """Recursively follow every link found inside the ``div`` with class ``div_class``.

    Starting at ``url``, each page is fetched, the first ``div`` with the given
    class is located and every ``<a href>`` inside it that has not been seen
    yet is visited in turn (depth first).

    Parameters:
        div_class: CSS class of the menu ``div`` whose links are followed.
        url: page where the crawl starts.
        verbose: when truthy, progress is printed for every page and redirect.
        only_endpoints: when ``False`` every visited URL is returned; otherwise
            only the URLs from which no new link was followed.

    Returns:
        A list of URLs in visiting order.

    A page that fails is retried up to 6 times (10 s pause and a fresh HTTP
    session between attempts); after that it is reported and skipped.
    """
    session = requests.Session()
    all_url_visited = []       # ordered result
    visited_lookup = set()     # same content as all_url_visited, for O(1) membership tests
    end_links = []             # ordered result
    end_lookup = set()         # same content as end_links, for O(1) membership tests
    error_links = []           # pages that failed every attempt (reported, not returned)
    iprint = print_verbose_function(verbose)

    def get_links_in_current_div(url_base,suffix='',qt_try=1):
        nonlocal session
        if url_base not in visited_lookup:
            visited_lookup.add(url_base)
            all_url_visited.append(url_base)
        iprint("*****url_base: "+ url_base)

        try:
            resp = session.get(url_base+suffix, headers=headers_olx, timeout=15)
            bsObj = BeautifulSoup(resp.text, "html.parser")
            items = bsObj.find("div", {"class":div_class})
            qt_reg_adds = 0
            if items:
                links_in_div = [anchor['href'] for anchor in items.find_all("a", href=True)]

                for link in links_in_div:
                    if link not in visited_lookup:
                        iprint('+Redirecionamento:'+url_base+'->' + link)
                        get_links_in_current_div(link,'/')
                        qt_reg_adds+=1

            else:
                print('warning-no-class:' + url_base)

            # No new link was followed from this page: it is an end point.
            if qt_reg_adds == 0 and url_base not in end_lookup:
                iprint('----end_link:'+url_base)
                end_lookup.add(url_base)
                end_links.append(url_base)

        except Exception as e:
            print('!!! Warning, qt attempts:'+str(qt_try)+' in:'+url_base)
            print(str(e))
            print('!!! Restart session and Wait 10 seconds\n\n')
            time.sleep(10)
            session.close()  # release the old connection pool before replacing it
            session = requests.Session()
            if qt_try < 6:
                get_links_in_current_div(url_base,suffix,qt_try+1)
            else:
                print('!!! Error, qt attempts:'+str(qt_try)+' in:'+url_base)
                error_links.append(url_base)

    try:
        get_links_in_current_div(url)
    finally:
        session.close()

    return all_url_visited if only_endpoints == False else end_links

In [7]:
# _test_endlink = get_retroactive_links_in_div('sc-1ncgzjx-0','https://mg.olx.com.br/belo-horizonte-e-regiao/imoveis/venda',True,True)

In [8]:
def get_unique_category_urls():
    """Return only the most specific category links of the OLX category menu.

    General categories are not returned: their advertisements also appear in
    the specific categories below them, and (per the original author's note)
    general categories are limited to 5k advertisements per location.

    A link counts as "general" when some other visited link starts with it.
    """
    url_base = "https://www.olx.com.br/brasil"
    class_category_menu = "jx24x3-2"

    # Every visited URL is needed (not only end points) because the categories
    # menu also contains sibling links.
    all_url_visited = get_retroactive_links_in_div(class_category_menu,url_base)

    # Literal prefix comparison: the URLs are plain text, not regex patterns.
    # NOTE(review): this is a plain string prefix, so ".../autos" would also be
    # treated as a parent of ".../autos-e-pecas" - confirm no such siblings exist.
    most_specific_category = [
        link_ref
        for link_ref in all_url_visited
        if not any(
            link_comp != link_ref and link_comp.startswith(link_ref)
            for link_comp in all_url_visited
        )
    ]

    # The crawl entry point is not a category; it may already have been
    # filtered out above, so only remove it when it is still present.
    if url_base in most_specific_category:
        most_specific_category.remove(url_base)

    return most_specific_category

In [9]:
def get_start_urls_scraping_threads(urls_base,qt_threads=10,verbose=False):
    """Return the list of URLs to be scraped (inside a specific region).

    Each URL in ``urls_base`` is crawled through its location menu, keeping
    only the end points. The crawls run in batches of ``qt_threads`` threads
    because the site has many categories and crawling them one by one is slow.

    Parameters:
        urls_base: sequence of category URLs to expand.
        qt_threads: number of URLs crawled concurrently per batch.
        verbose: forwarded to ``get_retroactive_links_in_div``.
    """
    def _get_retroactive_links_in_div(div_class,url,results,index):
        # Thread target: a Thread cannot return a value, so the result is
        # stored in the slot reserved for this worker.
        results[index] = get_retroactive_links_in_div(div_class,url,verbose,True)

    class_location_menu = "sc-1ncgzjx-0"
    start_urls = []
    qt_urls_base = len(urls_base)
    print('Start get_start_urls_scraping_threads - This Function take many time to finish')

    for index in range(0,qt_urls_base,qt_threads):
        batch = urls_base[index:index+qt_threads]
        results = [None] * len(batch)
        threads = []
        for i, url in enumerate(batch):
            worker = Thread(target=_get_retroactive_links_in_div, args=(class_location_menu,url,results,i))
            worker.start()
            threads.append(worker)

        for i, worker in enumerate(threads):
            worker.join()
            print(f"Category {batch[i]} ended: {index+i+1} / {qt_urls_base}")
            if results[i] is None:
                # The worker died before storing a result; skip it instead of
                # failing the whole run on extend(None).
                print(f"!!! Warning: no result for {batch[i]}")
                continue
            start_urls.extend(results[i])

    return [found_url for found_url in start_urls if found_url is not None]

In [10]:
# Test = get_start_urls_scraping_threads(['https://ac.olx.com.br/hobbies-e-colecoes'])

In [11]:
def get_start_urls_scraping(urls_base,vebose=False):
    """Return the list of URLs to be scraped (inside a specific region).

    Sequential counterpart of the threaded version: each URL in ``urls_base``
    is crawled through its location menu and the end points are collected.

    Parameters:
        urls_base: category URLs to expand.
        vebose: forwarded as the ``verbose`` flag of
            ``get_retroactive_links_in_div`` (parameter name kept as is for
            compatibility with existing callers).
    """
    class_location_menu = "sc-1ncgzjx-0"
    all_url_visited = []
    qt_url_base = len(urls_base)
    for qt_done, url_base in enumerate(urls_base):
        # Progress is measured in base URLs already processed, not in the
        # number of links discovered so far.
        percetil_end = qt_done/qt_url_base*100
        print(f'Percentual Concluido:{percetil_end:.2f}% - {qt_done} de {qt_url_base}')
        all_url_visited.extend(get_retroactive_links_in_div(class_location_menu,url_base,vebose,True))
    return all_url_visited

In [12]:
# Crawling the category menu is slow, so its result is cached on disk
# (pickle file written and read only by this notebook).
unique_category_urls = []
if Path('./unique_category_urls.pkl').is_file():
    # Cache hit: reuse the list saved by a previous run.
    with open('./unique_category_urls.pkl', 'rb') as file:
        unique_category_urls = pickle.load(file)
else:
    # Cache miss: crawl now, then persist the result for the next run.
    unique_category_urls = get_unique_category_urls()
    with open('./unique_category_urls.pkl', 'wb') as file:
        pickle.dump(unique_category_urls, file)

In [13]:
# Expanding every category into its per-location start URLs is slow, so the
# result is cached on disk (pickle file written and read only by this notebook).
start_urls_scraping = []
if Path('./start_urls_scraping.pkl').is_file():
    # Cache hit: reuse the list saved by a previous run.
    with open('./start_urls_scraping.pkl', 'rb') as file:
        start_urls_scraping = pickle.load(file)
else:
    # Cache miss: crawl with 12 threads, then persist the result.
    start_urls_scraping = get_start_urls_scraping_threads(unique_category_urls,12)
    with open('./start_urls_scraping.pkl', 'wb') as file:
        pickle.dump(start_urls_scraping, file)

In [14]:
# Open the scraping database and register every start URL it does not know yet.
banco_olx = BackupApiResp("./banco_scraping_olx.db")
all_urls = banco_olx.get_urls_base_in_db()

# A set gives O(1) membership tests; adding each accepted URL to it also keeps
# a URL repeated inside start_urls_scraping from being registered twice.
known_urls = set(all_urls)
urls_to_add_db = []
for url in start_urls_scraping:
    if url not in known_urls:
        known_urls.add(url)
        urls_to_add_db.append(url)

# add new urls to db for scraping
for url in urls_to_add_db:
    banco_olx.add_url_scraping_control(url)

In [15]:
def get_data_from_olx_page(bsObj,list_url,page_number):
    """Extract one dict per advertisement from a parsed OLX listing page.

    Parameters:
        bsObj: parsed listing page (BeautifulSoup object).
        list_url: URL of the listing, stored as ``url_lista`` in every row.
        page_number: page index, stored (as text) as ``numero_pagina_lista``.

    Returns:
        A list of dicts keyed by the ``anuncios_resumo`` column names. All
        values are strings; numeric fields are either a valid number written
        as text or a safe fallback (``'NULL'`` for the price, ``'0'`` for the
        image count), so they never reach the database as empty/invalid text.
        List items without a link are skipped.
    """
    def _normalize_price(price_text):
        # Keep digits and the decimal comma, then switch to a decimal point.
        # Text with no usable number (e.g. no digits at all) becomes 'NULL'.
        digits = re.sub('[^0-9,]','',price_text).replace(',','.')
        try:
            float(digits)
        except ValueError:
            return 'NULL'
        return digits

    def _normalize_count(count_text):
        # Keep only the digits of the image counter; fall back to '0'.
        return re.sub('[^0-9]','',count_text) or '0'

    div_category = bsObj.find("div", {"class": "otEye"})
    list_category = div_category.find_all(text=True) if div_category else []
    current_category = list_category[-1] if len(list_category) > 0 else 'Error'
    # The first text node of the category block is left out of the full path.
    complete_category = ';'.join(list_category[1:]) if len(list_category) > 0 else 'Error'
    div_location = bsObj.find("div", {"class": "sc-gPWkxV UqLlM"})
    complete_location = div_location.text.replace('>',';') if div_location else ''
    ad_list = bsObj.find_all("li", {"class": "sc-1fcmfeb-2"})

    rows_to_save_db = []
    for ad_resume in ad_list:
        adv_url_tag = ad_resume.find("a",href=True)
        if not adv_url_tag:
            continue  # list item without a link is not an advertisement

        thumb_img_tag = adv_url_tag.find("img",src=True)
        qt_img_tag = ad_resume.find("span",{"class":"bYaEay"})
        desc_tag = ad_resume.find("h2",text=True)
        price_tag = ad_resume.find("span",{"class":"kHeyHD"})
        detail_tag = ad_resume.find("div",{"class":"jEDFNq"})
        address_detail_tag = ad_resume.find("span",{"class":"iDvjkv"})
        type_seller_tag = ad_resume.find("span",{"class":"jGYopB"})
        date_tag = ad_resume.find("span",{"class":"javKJU"})
        price = price_tag.text if price_tag else ''

        rows_to_save_db.append({
            'url_lista': list_url,
            'numero_pagina_lista': str(page_number),
            'url_anuncio': adv_url_tag['href'],
            'categoria_completa': complete_category,
            'categoria_atual': current_category,
            'titulo_anuncio': desc_tag.text if desc_tag else '',
            'detalhes_anuncio': detail_tag.getText() if detail_tag else '',
            'preco_anuncio': _normalize_price(price),
            'url_img_principal': thumb_img_tag['src'] if thumb_img_tag else '',
            'qt_img_anuncio': _normalize_count(qt_img_tag.text) if qt_img_tag else '0',
            'localizacao_completa': complete_location,
            'localizacao_complemento': address_detail_tag.text if address_detail_tag else '',
            # The seller-type badge is only present on professional ads.
            'anuncio_profissional':'1' if type_seller_tag else '0',
            # Without a date tag the collection time is used instead.
            'data_publicacao_anuncio': convert_date_olx_to_datetime_str(date_tag.text) if date_tag else datetime.now().strftime("%Y-%m-%d %H:%M")
        })

    return rows_to_save_db


In [16]:
def scrape_and_insert_data_olx(url_end_point,urls_already_access=None,page_number=1,session=None,qt_try=1,force_search=False):
    """Walk the pages of an OLX listing and count the ads not seen before.

    Parameters:
        url_end_point: URL of the first page to read.
        urls_already_access: ad URLs already stored; ads whose URL is in this
            collection are not counted. Defaults to no known ads.
        page_number: index of the first page read. Only page 1 is requested
            with ``sf=1`` (order by publication date).
        session: optional ``requests.Session`` to reuse; when omitted a
            session is created here and closed before returning.
        qt_try: accepted for compatibility; not used by this function.
        force_search: when False, paging stops at the first page that contains
            an already known ad (results are ordered by publication date).

    Returns:
        Number of new ads found over all pages read.

    Raises:
        Exception: the server answered with its "Ops!!" error page.

    NOTE(review): despite the name nothing is written to the database here -
    the insert call was already disabled in the original code. Confirm whether
    callers expect the rows to be stored.
    """
    known_urls = set(urls_already_access) if urls_already_access else set()
    owns_session = not session
    _session = session if session else requests.Session()

    qt_new_rows = 0
    current_url = url_end_point
    try:
        # Pages are followed in a loop, so long listings do not grow the call stack.
        while current_url:
            # get data in order of publication (makes it possible to stop as
            # soon as an already scraped ad shows up)
            query_Parameters = {'sf':1} if page_number == 1 else {}
            resp = _session.get(current_url, params=query_Parameters, headers=headers_olx, timeout=15)
            bsObj = BeautifulSoup(resp.text, "html.parser")
            if bsObj.find('title',text="Ops!!"):
                raise Exception(f"Problem with Server in {current_url} page {page_number}")

            _url_end_point = current_url.split('?')[0]
            all_row_scraped = get_data_from_olx_page(bsObj,_url_end_point,page_number)
            # keep only new ads
            rows_to_save_db = [row for row in all_row_scraped if row['url_anuncio'] not in known_urls]
            qt_new_rows += len(rows_to_save_db)

            # some ad on this page was already scraped: older pages are known too
            if len(rows_to_save_db) != len(all_row_scraped) and not force_search:
                break

            next_page_tag = bsObj.find('a', {'data-lurker-detail':'next_page'})
            current_url = next_page_tag['href'] if next_page_tag else None
            page_number += 1
    finally:
        if owns_session:
            _session.close()

    return qt_new_rows

In [17]:
def scrape_all_ad_olx():
    """Scrape, one after the other, every listing URL registered in the database.

    URLs not finished yet are processed first; when there are none, all
    registered URLs are processed again. Each URL is claimed with a random
    session id before scraping and released (with the number of new ads)
    afterwards; a URL claimed by another session is skipped.
    """
    print('Scraping all ad data OLX')
    start_urls_to_scraping = banco_olx.get_urls_base_in_db(only_not_scraped=True)
    if len(start_urls_to_scraping) == 0:
        # everything was already scraped once: refresh all listings
        start_urls_to_scraping = banco_olx.get_urls_base_in_db()

    session_alphabet = string.ascii_uppercase + string.digits
    rand_string = ''.join(random.choices(session_alphabet, k=15))

    for url in start_urls_to_scraping:
        print(f"Starting Scraping {url}")
        banco_olx.init_scraping_url_base(url,rand_string)

        # The claim only sticks when no other session holds this URL
        # (useful when several processes share one database).
        if not banco_olx.verify_id_session(url,rand_string):
            print(f"End Scraping: Other process/computer is scraping {url}")
            continue

        current_reg_in_url = banco_olx.get_all_urls_ads(url)
        qt_reg_add = scrape_and_insert_data_olx(url,current_reg_in_url)
        banco_olx.end_scraping_url_base(url,qt_reg_add)
        print(f"End Scraping {url} - {qt_reg_add} new ads")

In [18]:
def scrape_data_olx_page(url_end_point,urls_already_access,force_search=False):
    """Walk the pages of an OLX listing and return the rows of ads not seen before.

    Parameters:
        url_end_point: URL of the first page of the listing.
        urls_already_access: ad URLs already stored; ads whose URL is in this
            collection are left out of the result.
        force_search: when False, paging stops at the first page that contains
            an already known ad (page 1 is requested ordered by publication
            date, so older pages are assumed to be known as well).

    Returns:
        List of row dicts (as produced by ``get_data_from_olx_page``) for the
        new ads. When a page keeps failing, the rows collected up to that
        point are returned.

    A failing page is retried with a fresh HTTP session after a growing pause
    (15 * attempt**3 seconds). At most 6 attempts are made; as in the original
    code the attempt counter is shared by all pages of the listing.
    """
    max_attempts = 6
    known_urls = set(urls_already_access)  # O(1) membership test per ad
    list_result_data = []
    session = requests.Session()

    current_url = url_end_point
    page_number = 1
    qt_try = 1
    try:
        while current_url:
            # get data in order of publication (makes it possible to stop as
            # soon as an already scraped ad shows up)
            query_Parameters = {'sf':1} if page_number == 1 else {}
            try:
                resp = session.get(current_url, params=query_Parameters, headers=headers_olx, timeout=15)
                bsObj = BeautifulSoup(resp.text, "html.parser")
                if bsObj.find('title',text="Ops!!"):
                    raise Exception(f"Problem with Server in {current_url} page {page_number}")

                _url_end_point = current_url.split('?')[0]
                all_row_scraped = get_data_from_olx_page(bsObj,_url_end_point,page_number)
                next_page_tag = bsObj.find('a', {'data-lurker-detail':'next_page'})
                next_page_url = next_page_tag['href'] if next_page_tag else None
            except Exception as e:
                print('!!! Warning, qt attempts:'+str(qt_try)+' in:'+ current_url)
                print(str(e))
                if qt_try >= max_attempts:
                    # Out of attempts: give up right away instead of sleeping
                    # through one more (and longest) back-off first.
                    print(f'!!! Error, qt attempts:{str(qt_try)} in:{current_url} page {page_number}')
                    break
                print(f'!!! Restart session and Wait {15*(qt_try**3)} seconds\n\n')
                time.sleep(15*(qt_try**3))
                session.close()
                session = requests.Session()
                qt_try += 1
                continue  # same page again

            # Rows are only recorded once the page was processed completely,
            # so a retry can never record the same page twice.
            rows_to_save_db = [row for row in all_row_scraped if row['url_anuncio'] not in known_urls]
            list_result_data.extend(rows_to_save_db)

            # some ad on this page was already scraped: stop here
            if len(rows_to_save_db) != len(all_row_scraped) and not force_search:
                break

            current_url = next_page_url
            page_number += 1
    finally:
        session.close()

    return list_result_data


In [19]:
# resultl = [[],[]]
# erro = scrape_and_insert_data_olx('https://pi.olx.com.br/regiao-de-teresina-e-parnaiba/teresina/imoveis/venda/casas?o=93&sf=1')
# jp2 = scrape_data_olx_page('https://ba.olx.com.br/grande-salvador/salvador/imoveis/venda/casas?o=100')


In [20]:
def scrape_all_ad_olx_threads(qt_threads=10,verbose=False):
    """Scrape every registered listing URL, ``qt_threads`` listings at a time.

    For each batch, every URL is claimed with a random session id; claimed
    URLs are scraped in worker threads (network only). After the workers
    finish, the calling thread stores the new rows and marks the URL as done,
    so the SQLite connection is only ever used from one thread.

    Parameters:
        qt_threads: number of listings scraped concurrently per batch.
        verbose: accepted for compatibility; not used by this function.
    """
    print('Scraping all ad data OLX')
    # NOTE(review): with only_not_scraped=False this already returns every URL,
    # so the "all urls already scraped" branch below only runs when the table
    # is empty - confirm whether only_not_scraped=True was intended.
    start_urls_to_scraping  = banco_olx.get_urls_base_in_db(only_not_scraped=False)

    if len(start_urls_to_scraping) == 0:
        print('All urls already scraped, updating all data')
        start_urls_to_scraping  = banco_olx.get_urls_base_in_db() # if all urls already scraped, update all data and scrape all data again in order of publication
    else:
        random.shuffle(start_urls_to_scraping) # randomize the order of scraping (AVOID COLLISIONS for first scraping base)

    rand_string = ''.join(random.choices(string.ascii_uppercase + string.digits, k=15))

    def _scrape_data_olx_page(url,results,index,current_reg_in_url):
        # Thread target: a Thread cannot return a value, so the result is
        # stored in the slot reserved for this worker.
        results[index] = scrape_data_olx_page(url,current_reg_in_url)

    qt_urls_base = len(start_urls_to_scraping)

    print('Start Scraping all ad data OLX')
    for index in range(0,qt_urls_base,qt_threads):
        batch = start_urls_to_scraping[index:index+qt_threads]
        threads = [None] * len(batch)   # stays None for URLs that were not claimed
        results = [None] * len(batch)

        for i, url in enumerate(batch):
            print(f"Starting Scraping {url}")
            banco_olx.init_scraping_url_base(url,rand_string)
            if banco_olx.verify_id_session(url,rand_string): # verify if other process/computer is scraping this url
                current_reg_in_url = banco_olx.get_all_urls_ads(url)
                threads[i] = Thread(target= _scrape_data_olx_page, args=(url,results,i,current_reg_in_url))
                threads[i].start()
            else:
                print(f"End Scraping: Other process/computer is scraping {url}")

        for i, url in enumerate(batch):
            if threads[i] is None:
                continue  # URL belongs to another session; nothing to wait for
            threads[i].join()
            if results[i] is None:
                # The worker died before storing a result; the URL stays claimed
                # by this session until the claim expires.
                print(f"!!! Warning: no result for {url}")
                continue
            qt_new_ads = len(results[i])
            if qt_new_ads>0 :
                banco_olx.add_rows_anuncios_resumo(results[i]) # rows are added from this thread to avoid concurrency problems
            banco_olx.end_scraping_url_base(url,qt_new_ads)
            print(f"End Scraping {url} - {qt_new_ads} new ads")
 

In [None]:
# Run the full scrape, 30 listings at a time.
scrape_all_ad_olx_threads(qt_threads=30)

In [None]:
# scrape_all_ad_olx()

In [None]:
# scrape_and_insert_data_olx('https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas',urls_already_access=banco_olx.get_all_urls_ads('https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas'))