# Module: Crawling flight information from ticket websites

This module is part of the project **Flight price prediction model**. It crawls flight information from specific ticket websites on a predefined schedule. The information is then preprocessed and stored in a data warehouse for reference purposes.

In [None]:
from selenium import webdriver
import urllib
import pandas as pd
import logging
import commonparam as food
import platform
import selenium.webdriver.chrome.service as service

**Class: WebDriver**

This class serves as the context (base class) for the class `FlyingSpider`.

In [None]:
class WebDriver:
    """Selenium browser session opened on the flight-selection page.

    On construction the search parameters are copied from ``commonparam``
    (imported as ``food``), an empty column store ``flight_info`` is created,
    a Chrome driver is started and the search URL for ``food.FROM_DATE`` is
    loaded.
    """

    def __init__(self):
        self.current = food.CURRENT_TIME
        self.journey_type = food.JOURNEY_TYPE
        self.locale = food.LOCALE
        self.origin = food.DEPARTURE
        self.adt = food.PASSENGER
        self.destination = food.DESTINATION
        # Never wait less than 5 seconds for elements to appear.
        self.wait_time = food.WAITING_TIME if food.WAITING_TIME >= 5 else 5
        self.from_date = food.FROM_DATE
        # Column store: one list per output column, one entry per flight.
        self.flight_info = {
            'journey_type': [],
            'departure': [],
            'destination': [],
            'search_date': [],
            'departure_datetime': [],
            'arrival_datetime': [],
            'stops': [],
            'passenger': [],
            'prices': [],
            'flight_number': [],
        }

        # NOTE: ``_setup`` is a property with side effects; reading it starts
        # a new browser, so it is read exactly once here.
        self.driver = self._setup
        self.driver.implicitly_wait(self.wait_time)
        self.driver.get(self._generateURL(self.from_date))

    @property
    def _setup(self):
        """Start and return a Chrome driver for the current platform.

        On Windows a local ``webdriver.Chrome`` is created from
        ``food.WINDOWS_CHROME_DRIVER``. On any other platform a chromedriver
        service is started from ``/usr/local/bin/chromedriver`` (kept in
        ``self.aservice``) and a ``webdriver.Remote`` connected to it is
        returned.
        """
        if platform.system() == 'Windows':
            return webdriver.Chrome(food.WINDOWS_CHROME_DRIVER)

        self.aservice = service.Service('/usr/local/bin/chromedriver')
        self.aservice.start()

        capabilities = {
            'chrome.binary': '/usr/bin/google-chrome-stable',
            "chromeOptions": {"args": ['--no-sandbox']},
        }

        return webdriver.Remote(self.aservice.service_url, capabilities)

    def _generateURL(self, search_date):
        """Return the flight-selection URL for ``search_date``.

        The query string is built from the journey type, locale, origin,
        destination and adult count stored on the instance; the ``CHD`` and
        ``INT`` parameters are always sent as 0.
        """
        # ``import urllib`` alone does not guarantee that the ``parse``
        # submodule is loaded, so import the function explicitly.
        from urllib.parse import urlencode

        domain = 'https://fly.vietnamairlines.com/dx/VNDX/#/flight-selection?'
        variances = {
            'journey_type': self.journey_type,
            'locale': self.locale,
            'origin': self.origin,
            'destination': self.destination,
            'ADT': self.adt,  # Adult numbers
            'CHD': 0,
            'INT': 0,
            'date': search_date,
        }
        return domain + urlencode(variances)

    def _clickNext(self):
        """Click the day button that follows the currently selected one.

        The buttons are looked up inside the element with class ``days``.
        The selected day is the button whose ``aria-pressed`` attribute is
        ``'true'``. Nothing is clicked when no button is selected or when the
        selected button is the last one.
        """
        day_bar = self.driver.find_element_by_class_name('days')
        buttons = iter(day_bar.find_elements_by_tag_name('button'))

        for button in buttons:
            if button.get_attribute('aria-pressed') == 'true':
                # The iterator is now positioned just after the selected day.
                following = next(buttons, None)
                if following is not None:
                    following.click()
                return

**Class: FlyingSpider**

This class is the blueprint for spiders.
A spider has two public methods: `crawl()` and `save()`.

<figure>
    <img src="http://4.bp.blogspot.com/-0gjqZ5kGerE/VgdLPDRGg6I/AAAAAAAAK50/aBW64pAtZbA/w1200-h630-p-k-no-nu/waiting_spider_web.jpg">
</figure>

In [None]:
class FlyingSpider(WebDriver):
    """Spider that scrapes the flights listed for consecutive days.

    ``crawl()`` collects one row per flight into ``self.flight_info`` and
    ``save()`` writes those rows to a CSV file named after ``self.current``.
    """

    def __init__(self):
        super().__init__()

    def __del__(self):
        """Release the browser session and, if one was started, the service."""
        # __init__ may have failed before either attribute was assigned.
        driver = getattr(self, 'driver', None)
        chrome_service = getattr(self, 'aservice', None)
        try:
            if driver is not None:
                # quit() ends the whole session, not just the current window.
                driver.quit()
        finally:
            if chrome_service is not None:
                chrome_service.stop()

    def _feed(self, col_name, value):
        """Append ``value`` to the column ``col_name`` of ``flight_info``."""
        self.flight_info[col_name].append(value)

    def _feedInfo(self):
        """Append the per-search constants shared by every flight row."""
        self._feed('journey_type', food.JOURNEY_TYPE)
        self._feed('departure', food.DEPARTURE)
        self._feed('destination', food.DESTINATION)
        self._feed('search_date', self.current)
        self._feed('passenger', food.PASSENGER)

    def _parseFlight(self, flight):
        """Return the scraped columns of one flight element as a dict.

        Nothing is stored here, so a lookup failure cannot leave the columns
        of ``flight_info`` with different lengths.
        """
        depart_arrive_time = flight.find_elements_by_class_name('dxp-time')
        return {
            'departure_datetime': depart_arrive_time[0].get_attribute('datetime'),
            'arrival_datetime': depart_arrive_time[1].get_attribute('datetime'),
            # The leading '.' makes the XPath relative to this flight; a bare
            # '//' would search the whole document and always return the
            # first flight's cell.
            'stops': flight.find_element_by_xpath(
                ".//td[@class='column flight-stops']").text,
            'flight_number': flight.find_element_by_class_name("flight-number").text,
            'prices': [
                price.text
                for price in flight.find_elements_by_class_name("price-container")
            ],
        }

    def crawl(self):
        """Scrape ``food.REQUEST_DAYS`` result pages, one per day.

        For each page every flight is parsed, the rows are appended to
        ``flight_info`` and the next day is requested. When an element is
        missing a warning is logged and the loop moves on to the next
        attempt without advancing the day.
        """
        from selenium.common.exceptions import NoSuchElementException

        request_count = 0
        while request_count < food.REQUEST_DAYS:
            request_count += 1
            try:
                print('Crawling page : ', request_count)
                dashboard = self.driver.find_element_by_class_name('flights-table')
                flights = dashboard.find_elements_by_class_name('dxp-flight')

                # Parse the whole page first; store only complete rows.
                rows = [self._parseFlight(flight) for flight in flights]
                for row in rows:
                    self._feedInfo()
                    for col_name, value in row.items():
                        self._feed(col_name, value)

                self._clickNext()
            except (NoSuchElementException, IndexError, ValueError):
                # Selenium signals a missing element with
                # NoSuchElementException; IndexError covers a flight with
                # fewer than two 'dxp-time' elements.
                logging.basicConfig(format='%(asctime)s %(message)s')
                logging.warning('Elements are not found.')

        print('Crawling was done. Check file ', self._generateFileName())

    def _generateFileName(self):
        """Return the CSV file name built from ``self.current``.

        The format is ``YYYYMMDD_HHMM.csv``. Every field is zero-padded so
        that two different timestamps cannot produce the same name.
        """
        # Assumes year/month/day/hour/minute are integers, as on a datetime.
        stamp = self.current
        return '{:04d}{:02d}{:02d}_{:02d}{:02d}.csv'.format(
            stamp.year, stamp.month, stamp.day, stamp.hour, stamp.minute)

    def save(self):
        """Write ``flight_info`` to the CSV file, without the index column."""
        pd.DataFrame(self.flight_info).to_csv(self._generateFileName(), index=False)

### Now let's create a spider and watch it build the web!

In [None]:
def __main__():
    """Create a spider, run ``crawl()`` and then ``save()`` the result."""
    spider = FlyingSpider()
    spider.crawl()
    spider.save()


# Run the crawl only when executed as a script or notebook cell, not when
# this module is imported.
if __name__ == '__main__':
    __main__()