In [None]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
import multiprocessing as mp
import os
import csv
from math import ceil
from itertools import cycle

# Proxy pool: one HTTP endpoint per host address 92.63.77.130 .. 92.63.77.229,
# all on port 3139 (100 entries in ascending order).
proxies = list(map('http://92.63.77.{}:3139'.format, range(130, 230)))

def fetch_vacancies_with_session(session, url, headers, params, proxy_dict):
    max_retries = 10  
    retry_count = 0

    while retry_count < max_retries:
        print(f"[{os.getpid()}] Using proxy: {proxy_dict['http']} | Params: {params}")
        try:
            response = session.get(url, headers=headers, params=params, proxies=proxy_dict, timeout=10)

            if response.status_code == 200:
                return response.json()
            elif response.status_code == 429:
                retry_after = int(response.headers.get("Retry-After", 5))
                print(f"[{os.getpid()}] Rate limit (429). Retrying after {retry_after} seconds...")
                time.sleep(retry_after)
            else:
                return None
        except Exception as e:
            print(f"[{os.getpid()}] Error with proxy {proxy_dict['http']}: {e}")
            retry_count += 1
        time.sleep(1)
    return None

        
def process_chunk(chunk_tasks, access_token, url, headers, chunk_index):
    """Download every page for each task in a chunk and dump the result to CSV.

    Args:
        chunk_tasks: Iterable of ``(location, business_area, proxy)`` tuples;
            ``location`` and ``business_area`` are dicts with ``"id"`` and
            ``"name"`` keys, ``proxy`` is a proxy URL string.
        access_token: Accepted for interface compatibility; not read here
            (authentication travels in ``headers``).
        url: Endpoint passed through to ``fetch_vacancies_with_session``.
        headers: HTTP headers passed through to the fetch helper.
        chunk_index: Number used in the output file name
            ``vacancies_chunk_<chunk_index>.csv``.

    Returns:
        List with every vacancy collected for this chunk (possibly empty).
    """
    collected = []

    with requests.Session() as http:
        for location, business_area, proxy in chunk_tasks:
            # The same proxy serves both schemes for this task.
            routing = dict(http=proxy, https=proxy)
            page = 1
            while True:
                query = {
                    "locations": location["id"],
                    "business_area": business_area["id"],
                    "per_page": 100,
                    "page": page,
                }
                try:
                    payload = fetch_vacancies_with_session(http, url, headers, query, routing)
                    if not payload:
                        # Nothing usable came back: move on to the next task.
                        break

                    collected.extend(payload.get("vacancies", []))

                    last_page = payload.get("meta", {}).get("pages", 1)
                    if page >= last_page:
                        break
                    page += 1
                except Exception as e:
                    print(f"Error fetching vacancies for location {location['name']} and business area {business_area['name']}: {e}")
                    break

    filename = f"vacancies_chunk_{chunk_index}.csv"
    if not collected:
        print(f"No vacancies to write for chunk {chunk_index}.")
    else:
        frame = pd.DataFrame(collected)
        print(f"Writing {len(frame)} vacancies to {filename}")
        frame.to_csv(filename, index=False, quoting=csv.QUOTE_NONNUMERIC)

    return collected

def process_vacancies_multiprocessing(access_token, locations, business_areas, url, headers, processes=10):
    """Fetch vacancies for every (location, business area) pair in parallel.

    The pairs are combined with proxies taken round-robin from the
    module-level ``proxies`` list, split into at most ``processes``
    contiguous chunks, and each chunk is handed to ``process_chunk`` in a
    worker process.

    Args:
        access_token: Forwarded unchanged to ``process_chunk``.
        locations: Sequence of location dicts.
        business_areas: Sequence of business-area dicts.
        url: Endpoint forwarded to ``process_chunk``.
        headers: HTTP headers forwarded to ``process_chunk``.
        processes: Upper bound on the number of worker processes / chunks.

    Returns:
        Flat list of all vacancies returned by the workers; ``[]`` when there
        is nothing to process.

    Raises:
        ValueError: If ``processes`` is smaller than 1.
    """
    if processes < 1:
        raise ValueError(f"processes must be at least 1, got {processes}")

    pairs = [(loc, ba) for loc in locations for ba in business_areas]
    if not pairs:
        # Without this guard chunk_size would be 0 and range() would raise.
        print("Total tasks: 0. Nothing to process.")
        return []

    tasks = [
        (location, business_area, proxy)
        for (location, business_area), proxy in zip(pairs, cycle(proxies))
    ]
    if not tasks:
        # zip() produced nothing, which only happens with an empty proxy pool.
        print("No proxies available. Nothing to process.")
        return []

    total_tasks = len(tasks)
    chunk_size = ceil(total_tasks / processes)
    chunks = [tasks[i:i + chunk_size] for i in range(0, total_tasks, chunk_size)]

    print(f"Total tasks: {total_tasks}, Chunk size: {chunk_size}, Number of chunks: {len(chunks)}")

    # There is one job per chunk, so extra workers would only sit idle.
    worker_count = min(processes, len(chunks))
    jobs = [(chunk, access_token, url, headers, idx) for idx, chunk in enumerate(chunks, start=1)]
    with mp.Pool(processes=worker_count) as pool:
        results = pool.starmap(process_chunk, jobs)

    aggregated_results = [vacancy for chunk_result in results for vacancy in chunk_result]

    print(f"Total aggregated vacancies fetched: {len(aggregated_results)}")
    return aggregated_results


def get_access_token():
    """Request an access token from the Avito token endpoint.

    Sends a ``client_credentials`` grant request. The client id and secret
    are read from the ``AVITO_CLIENT_ID`` / ``AVITO_CLIENT_SECRET``
    environment variables when set, otherwise the values embedded below are
    used.

    Returns:
        The ``access_token`` field of the JSON response, or ``None`` when the
        request fails, the status is not 200, or the body is not valid JSON.
    """
    url = "https://api.avito.ru/token"
    # NOTE(review): these fallback credentials are committed in plain text.
    # They should be rotated and supplied only through the environment.
    params = {
        "client_id": os.environ.get("AVITO_CLIENT_ID", "oNwJeKq7XxKdbMisWAw7"),
        "client_secret": os.environ.get(
            "AVITO_CLIENT_SECRET", "wsFicRL8q2lmfPnYMcevaVyf9kwnAV7QNdU-Jjtd"
        ),
        "grant_type": "client_credentials",
    }

    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.post(url, params=params, timeout=10)
        if response.status_code != 200:
            print(f"Failed to get access token. Status Code: {response.status_code}")
            return None
        return response.json().get("access_token")
    except requests.exceptions.RequestException as e:
        print(f"Error while getting access token: {e}")
        return None
    except ValueError as e:
        # Raised by response.json() when a 200 body is not JSON.
        print(f"Error while getting access token: invalid JSON response ({e})")
        return None

def load_locations_from_xml(xml_file):
    """Read the location catalogue and flatten it into a list of dicts.

    For each ``Region`` carrying both ``Id`` and ``Name`` the result gets, in
    this order: the region itself, its ``DirectionRoad`` elements, its
    ``Subway`` elements, and its ``City`` elements, each city being followed
    directly by its ``District`` elements. Elements missing ``Id`` or
    ``Name`` are skipped.

    Args:
        xml_file: Path of the UTF-8 XML file to parse.

    Returns:
        List of ``{"id", "name", "type"}`` dicts, or ``[]`` if anything goes
        wrong while opening or parsing the file.
    """
    found = []

    def named_children(parent, tag_name):
        # Yield (node, id, name) for descendants that have both attributes.
        for node in parent.find_all(tag_name):
            node_id, node_name = node.get('Id'), node.get('Name')
            if node_id and node_name:
                yield node, node_id, node_name

    try:
        with open(xml_file, 'r', encoding='utf-8') as handle:
            document = BeautifulSoup(handle, 'xml')

            for region, region_id, region_name in named_children(document, 'Region'):
                found.append({"id": region_id, "name": region_name, "type": "Region"})

                # Roads and subway stations are labelled with their region.
                for tag_name in ('DirectionRoad', 'Subway'):
                    for _, child_id, child_name in named_children(region, tag_name):
                        found.append({"id": child_id, "name": f"{child_name} ({region_name})", "type": tag_name})

                for city, city_id, city_name in named_children(region, 'City'):
                    found.append({"id": city_id, "name": f"{city_name} ({region_name})", "type": "City"})

                    # Districts are labelled with their city, not the region.
                    for _, district_id, district_name in named_children(city, 'District'):
                        found.append({"id": district_id, "name": f"{district_name} ({city_name})", "type": "District"})

        print(f"Loaded {len(found)} unique locations from XML.")
        return found

    except Exception as e:
        print(f"An error occurred while loading locations from XML: {e}")
        return []

def load_business_areas_from_xml(xml_file):
    """Read the business-area catalogue into a list of ``{"id", "name"}`` dicts.

    Each ``BusinessArea`` element contributes one entry, built from the text
    of its ``id`` and ``name`` child elements. Entries where either child is
    absent or empty are skipped.

    Args:
        xml_file: Path of the UTF-8 XML file to parse.

    Returns:
        List of business-area dicts, or ``[]`` if opening or parsing fails.
    """
    areas = []
    try:
        with open(xml_file, 'r', encoding='utf-8') as handle:
            document = BeautifulSoup(handle, 'xml')
            for node in document.find_all('BusinessArea'):
                id_tag = node.find('id')
                name_tag = node.find('name')
                area_id = id_tag.text if id_tag else None
                area_name = name_tag.text if name_tag else None
                if not (area_id and area_name):
                    continue
                areas.append({"id": area_id, "name": area_name})

        print(f"Loaded {len(areas)} business areas from XML.")
        return areas
    except Exception as e:
        print(f"An error occurred while loading business areas from XML: {e}")
        return []
        
def init_process(proxy_list):
    """Bind the module-level ``process_proxy_cycle`` to an endless proxy iterator.

    Args:
        proxy_list: Iterable of proxies to repeat indefinitely.
    """
    global process_proxy_cycle
    rotation = cycle(proxy_list)
    process_proxy_cycle = rotation
    
# Script entry point: obtain a token, load both catalogues, fetch all
# vacancies in parallel and report the elapsed time and the total count.
if __name__ == "__main__":
    access_token = get_access_token()
    start_time = time.time()  # timer starts after the token request, so that call is not measured

    # Everything below is skipped when get_access_token() returned a falsy value.
    if access_token:
        # Catalogue files are resolved relative to the current working directory.
        locations = load_locations_from_xml("catalog-location.xml")
        business_areas = load_business_areas_from_xml("catalog-business-area.xml")
        
        url = "https://api.avito.ru/job/v2/vacancies"
        headers = {
            "Authorization": f"Bearer {access_token}"
        }
        
        # One task is created per (location, business area) pair.
        all_vacancies = process_vacancies_multiprocessing(
            access_token,
            locations=locations,
            business_areas=business_areas,
            url=url,
            headers=headers,
            processes=30  # overrides the function's default of 10
        )
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Total execution time: {elapsed_time:.2f} seconds")
        # The same total is also printed inside process_vacancies_multiprocessing.
        print(f"Total aggregated vacancies fetched: {len(all_vacancies)}")

Loaded 4552 unique locations from XML.
Loaded 54 business areas from XML.
Total tasks: 245808, Chunk size: 8194, Number of chunks: 30
[349767] Using proxy: http://92.63.77.130:3139 | Params: {'locations': '637640', 'business_area': '3278315', 'per_page': 100, 'page': 1}[349768] Using proxy: http://92.63.77.224:3139 | Params: {'locations': '72', 'business_area': '3278987', 'per_page': 100, 'page': 1}[349787] Using proxy: http://92.63.77.200:3139 | Params: {'locations': '622070', 'business_area': '3278359', 'per_page': 100, 'page': 1}

In [None]:
pip install lxml

In [None]:
pip install --upgrade pip