In [1]:
# packages
import os
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Web Scraping from [car.gr](https://www.car.gr/)

### Construct the URLs to scrape

In [2]:
def create_save_urls(save_path, txt_name, num_pages):
    """Generate car.gr listing-page URLs and store them in a text file.

    The file is written to ``save_path + txt_name`` (plain string
    concatenation, so ``save_path`` must end with a path separator) with one
    URL per line and no trailing newline.

    Parameters
    ----------
    save_path : str
        Folder prefix of the output file.
    txt_name : str
        Name of the output text file.
    num_pages : int
        Exclusive upper bound of the page numbers: URLs are produced for
        pages ``1 .. num_pages - 1`` (e.g. 2001 yields pages 1-2000).
    """
    page_urls = [
        f'https://www.car.gr/classifieds/cars/?fromfeed=1&pg={page_number}'
        for page_number in range(1, num_pages)
    ]

    with open(save_path + txt_name, 'w') as out_file:
        out_file.write('\n'.join(page_urls))
    
    
def read_urls(save_path, txt_name):
    """Load the list of URLs stored in ``save_path + txt_name``.

    The path is built by plain string concatenation, so ``save_path`` must
    end with a path separator.

    Parameters
    ----------
    save_path : str
        Folder prefix of the file to read.
    txt_name : str
        Name of the text file holding one URL per line.

    Returns
    -------
    list of str
        The URLs with surrounding whitespace removed. Blank lines are
        skipped so that a stray empty line in the file does not turn into an
        empty URL for the scraper.
    """
    with open(save_path + txt_name, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return [url for url in stripped_lines if url]

In [3]:
# Folder holding the URL list; the trailing slash matters because the helpers
# build the file path as `save_path + txt_name`.
path = r'./data/'
# Presumably a one-off step that generated urls.txt (pages 1-2000); left
# disabled so the saved list is read back instead.
# create_save_urls(path, 'urls.txt', 2001)
urls = read_urls(path, 'urls.txt')

# Extract other information

In [8]:
def gather_save_data(urls, save_path):
    """Scrape car ads from the given listing pages and save them as a CSV.

    Every URL is downloaded and parsed; for each ad container found, the
    title, price and info line are extracted. Ads that cannot be parsed are
    kept as placeholder rows (Name ``'error'``, Price/Info ``None``) so the
    failure rate can be reported. The table is written to
    ``<save_path>/data_<YYYY-MM-DD_HH-MM>.csv`` and a short summary is
    printed.

    Parameters
    ----------
    urls : iterable of str
        Listing-page URLs to download.
    save_path : str
        Directory in which the CSV file is created. It is not created by
        this function.

    Returns
    -------
    None
    """
    # CSS classes of the <div> wrapping a single ad on a listing page.
    ad_container_class = 'tw-relative tw-w-full tw-pl-3 tw-pr-0 sm:tw-pr-1 space-between-column tw-col-span-7 lg:tw-col-span-9 sm:tw-col-span-8 md:tw-col-span-9'
    names, prices, infos = [], [], []

    for url in urls:
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            page = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            # Skip the page rather than lose everything scraped so far.
            print(f'Skipping {url}: {exc}')
            continue

        containers = BeautifulSoup(page.content, 'html').find_all('div', class_=ad_container_class)
        for container in containers:
            try:
                name = container.find('h2', class_='title mr-0').string.replace('\n', '').strip()
                # '.' is stripped before conversion (presumably a thousands separator).
                price = float(container.find('span').span.string.replace('.', ''))
                info = container.find('div', class_='tw-text-grey-600').string.replace('\n', '').strip()
            except (AttributeError, TypeError, ValueError):
                # Missing tag or text (AttributeError/TypeError) or a
                # non-numeric price (ValueError): record a placeholder row.
                names.append('error')
                prices.append(None)
                infos.append(None)
            else:
                names.append(name)
                prices.append(price)
                infos.append(info)

    data = pd.DataFrame({'Name': names, 'Price': prices, 'Info': infos})
    file_name = f'data_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.csv'
    # os.path.join uses the platform's separator, so the file lands inside
    # save_path whether or not it ends with a slash.
    data.to_csv(os.path.join(save_path, file_name), sep=',', index=False)

    print('Save Succesfull!')
    n_total = data.shape[0]
    n_errors = int((data['Name'] == 'error').sum())
    # Guard against an empty result (no URLs or no ads found).
    perc = n_errors / n_total if n_total else 0.0
    print(f'{n_errors} car ads of {n_total} does not scraped. Percentage: {round(perc*100,2)}%')

In [9]:
# Output folder for the scraped CSV. Note: this rebinds `path`, which the
# earlier cell set to './data/'.
path = r'./data/raw'
# Downloads every page in `urls` and writes a timestamped CSV; with the full
# URL list this is a long-running, network-bound cell.
gather_save_data(urls, path)

Save Succesfull!
6843 car ads of 25895 does not scraped. Percentage: 26.43%


# Extract the images  
[*github link*](https://github.com/jhnwr/image-downloader/blob/main/imagedownloader.py#L20)  
*WIP*