In [1]:
# Create the output root. `-p` makes the cell safe to re-run when ./data already exists.
!mkdir -p ./data

In [18]:
# Use %pip so the package lands in the kernel's own environment; pin the version for reproducibility.
%pip install beautifulsoup4==4.9.3

Defaulting to user installation because normal site-packages is not writeable
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 230 kB/s eta 0:00:01
[?25hCollecting soupsieve>1.2; python_version >= "3.0"
  Downloading soupsieve-2.1-py3-none-any.whl (32 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.9.3 soupsieve-2.1
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [37]:
import json
import os
import re
from abc import ABC,abstractmethod

import bs4
import pandas as pd
import requests
class Crawler(ABC):
    """Template for a two-phase crawler: collect item paths, then crawl each one.

    Subclasses provide the site-specific steps (`get_childs`,
    `get_page_childs`, `crawl_data`). This base class drives the crawl loop,
    keeps two error logs and periodically checkpoints everything as JSON
    into ``data_dir``.

    Attributes:
        crawling_paths: paths queued for `crawl_data`.
        base_url: root URL handed to the subclass.
        data: payloads returned by `crawl_data` so far.
        data_dir: directory that receives ``data.json`` and ``errors.json``.
        crawling_paths_error: log of failures while collecting paths; each
            entry is expected to carry a ``'URL'`` key (it is read back by
            `run_failed_attempts`).
        paths_error: log of failures while crawling individual paths.
        saving_interval: checkpoint every this many processed paths; a
            falsy value (0/None) disables intermediate checkpoints.
    """

    def __init__(self,data_dir,base_url,saving_interval=100):
        self.crawling_paths = []
        self.base_url = base_url
        self.data = []
        self.data_dir = data_dir
        self.crawling_paths_error = []
        self.paths_error = []
        self.saving_interval = saving_interval

    @abstractmethod
    def get_childs(self,items_count=10):
        """Queue up to ``items_count`` paths in ``self.crawling_paths``.

        `run` calls this once and then iterates ``self.crawling_paths``.
        """
        pass

    @abstractmethod
    def crawl_data(self,path):
        """Crawl a single path and return its payload.

        Implementations may return ``None`` to signal that nothing could be
        extracted; such results are not stored in ``self.data``.
        """
        payload={}
        return payload

    @abstractmethod
    def get_page_childs(self,path):
        """Queue the paths found on one listing page (used when retrying)."""
        pass

    def error_occured(self):
        """Return True if any path-collection failure has been logged.

        Only ``crawling_paths_error`` is inspected; failures logged in
        ``paths_error`` are not considered here.
        """
        return len(self.crawling_paths_error) > 0

    def save(self):
        """Write ``errors.json`` and ``data.json`` into ``data_dir``.

        The directory is created on demand. Values that ``json`` cannot
        encode natively (e.g. exception objects stored in the error logs)
        are written as their ``str()`` form instead of aborting the dump
        after the files have already been truncated.
        """
        target_dir = self.data_dir or '.'
        os.makedirs(target_dir, exist_ok=True)
        errors = {
            'crawling_paths_error':self.crawling_paths_error,
            'paths_error':self.paths_error
        }
        with open(os.path.join(target_dir,'errors.json'),'w') as errors_file:
            json.dump(errors,errors_file,indent=2,default=str)
        with open(os.path.join(target_dir,'data.json'),'w') as data_file:
            json.dump(self.data,data_file,indent=2,default=str)

    @staticmethod
    def _entry_url(entry):
        """Return the URL of an error-log entry (dict with 'URL') or the entry itself."""
        return entry['URL'] if isinstance(entry, dict) else entry

    def _crawl_queued_paths(self):
        """Crawl everything in ``self.crawling_paths``, checkpointing on the way."""
        processed = 0
        for i,path in enumerate(self.crawling_paths):
            # Guard against saving_interval == 0 (would be a modulo-by-zero).
            if self.saving_interval and i%self.saving_interval==0:
                self.save()
                print(f'{i} datapoints saved')
            payload = self.crawl_data(path)
            if payload is not None:
                self.data.append(payload)
            processed = i+1
        # Final checkpoint; `processed` stays 0 when the queue was empty.
        self.save()
        print(f'{processed} datapoints saved')

    def run(self,items_count=10):
        """Collect up to ``items_count`` paths and crawl each of them."""
        self.get_childs(items_count)
        self._crawl_queued_paths()

    def run_failed_attempts(self):
        """Retry everything recorded in the two error logs.

        Paths that failed in `crawl_data` are re-queued directly; listing
        pages that failed are re-expanded through `get_page_childs`. Both
        logs are cleared first so that they only reflect this retry round.
        """
        failed_listings = self.crawling_paths_error.copy()
        # Error-log entries are dicts; the queue must hold the URLs themselves.
        self.crawling_paths = [self._entry_url(entry) for entry in self.paths_error]
        self.crawling_paths_error = []
        self.paths_error = []
        for item in failed_listings:
            self.get_page_childs(self._entry_url(item))
        self._crawl_queued_paths()

In [40]:
class HouzzCrawler(Crawler):
    """Crawler for Houzz product listing pages and product detail pages.

    Listing pages are scanned for ``a.hz-product-card__link`` anchors; each
    linked product page is then parsed for its title, keyword tags and
    image URLs. Every failed request is appended to the matching error log
    as a JSON-friendly dict (``'URL'`` plus ``'error'`` or ``'status_code'``).
    """
    items_per_page = 36
    # Seconds to wait on a single HTTP request; without a timeout
    # requests.get can block indefinitely on a stalled connection.
    request_timeout = 30

    def _fetch(self,url,error_log):
        """GET ``url``; on failure log it in ``error_log`` and return None."""
        try:
            response = requests.get(url,timeout=self.request_timeout)
        except requests.RequestException as e:
            error_log.append({
                'URL': url,
                # Stored as text so the log stays JSON-serialisable.
                'error': str(e)
            })
            return None
        if response.status_code !=200:
            error_log.append({
                'URL': url,
                'status_code': response.status_code
            })
            return None
        return response

    @staticmethod
    def _product_links(response):
        """Return the href of every product-card anchor in a listing page."""
        soup = bs4.BeautifulSoup(response.content,'html.parser')
        # href=True already restricts the match to anchors that have an href.
        a_tags = soup.find_all('a', class_='hz-product-card__link',href=True)
        return [a_tag['href'] for a_tag in a_tags]

    def get_childs(self,items_count=10):
        """Queue up to ``items_count`` product links from successive listing pages.

        Page 0 is ``base_url`` itself; later pages use
        ``{base_url}/p/{page}?oq=``. Paging stops early when a page cannot
        be fetched (the failure is logged in ``crawling_paths_error``) or
        when a page contains no product links, so the loop always ends.
        """
        remaining_items = items_count
        page = 0
        while remaining_items > 0:
            URL = f'{self.base_url}/p/{page}?oq=' if page > 0 else self.base_url
            # Advance by page, not by items collected, so a short page is
            # never requested twice.
            page += 1
            response = self._fetch(URL,self.crawling_paths_error)
            if response is None:
                break
            links = self._product_links(response)
            if not links:
                # Nothing more to collect; avoid spinning forever.
                break
            taken = links[:remaining_items]
            self.crawling_paths.extend(taken)
            remaining_items -= len(taken)

    def get_page_childs(self,path):
        """Queue every product link found on the listing page at ``path``.

        A failed fetch is logged in ``crawling_paths_error`` and queues nothing.
        """
        response = self._fetch(path,self.crawling_paths_error)
        if response is None:
            return
        self.crawling_paths.extend(self._product_links(response))

    def crawl_data(self,path):
        """Scrape one product page.

        Returns:
            A dict with ``'title'``, ``'This Product Has Been Described As'``
            (keyword texts), ``'thumbnails'`` and ``'images'``; or ``None``
            when the page could not be fetched or lacks the expected title /
            main image, in which case the failure is logged in ``paths_error``.
        """
        response = self._fetch(path,self.paths_error)
        if response is None:
            return None

        soup = bs4.BeautifulSoup(response.content,'html.parser')
        title_tag = soup.find('span',class_='view-product-title')
        sample_img = soup.find('img',class_='view-product-image-print')
        sample_img_url = sample_img.get('src') if sample_img is not None else None
        if title_tag is None or not sample_img_url:
            self.paths_error.append({
                'URL': path,
                'error': 'product title or main image not found in page'
            })
            return None

        description_tags = [tag.get_text() for tag in soup.find_all('li', class_='product-keywords__word')]
        title = title_tag.get_text()

        thumbnails = []
        for thumb_div in soup.find_all('div',class_='alt-images__thumb'):
            img = thumb_div.find('img')
            src = img.get('src') if img is not None else None
            if src:
                thumbnails.append(src)

        # Suffix of the main image URL from its first underscore onwards.
        # NOTE(review): presumably this encodes the image size variant - confirm.
        underscore_at = sample_img_url.find('_')
        replacement = sample_img_url[underscore_at:] if underscore_at >= 0 else None

        images = []
        for thumbnail in thumbnails:
            image_path = thumbnail.replace('fimgs','simgs',1)
            if replacement is not None:
                # Function replacement keeps the suffix literal, so backslashes
                # in it are not read as regex template escapes.
                image_path = re.sub(r'_.*',lambda _m: replacement,image_path)
            images.append(image_path)
        return {
            'title':title,
            'This Product Has Been Described As': description_tags,
            'thumbnails': thumbnails,
            'images': images
        }

In [41]:
# Crawl 10 items from the beds-and-headboards listing into ./data/bed, checkpointing every 5 items.
hc = HouzzCrawler(
    data_dir='./data/bed',
    base_url='https://www.houzz.com/products/beds-and-headboards',
    saving_interval=5,
)
hc.run(items_count=10)

0 datapoints saved
5 datapoints saved
10 datapoints saved


In [42]:
# Crawl 20 items from the chairs listing into ./data/chair, checkpointing every 10 items.
hc = HouzzCrawler(
    data_dir='./data/chair',
    base_url='https://www.houzz.com/products/chairs',
    saving_interval=10,
)
hc.run(items_count=20)

0 datapoints saved
10 datapoints saved
20 datapoints saved


In [43]:
# Crawl 15 items from the sofas-and-sectionals listing into ./data/sofa.
# saving_interval (20) exceeds the item count, so only the initial and final saves run.
hc = HouzzCrawler(
    data_dir='./data/sofa',
    base_url='https://www.houzz.com/products/sofas-and-sectionals',
    saving_interval=20,
)
hc.run(items_count=15)

0 datapoints saved
15 datapoints saved


In [44]:
# Crawl 15 items from the desks listing into ./data/desk, checkpointing every 10 items.
hc = HouzzCrawler(
    data_dir='./data/desk',
    base_url='https://www.houzz.com/products/desks',
    saving_interval=10,
)
hc.run(items_count=15)

0 datapoints saved
10 datapoints saved
15 datapoints saved
