# Module: Crawling flight information from ticket websites

This module is part of the project **Flight price prediction model**. It crawls flight information from specific ticket websites on a pre-defined schedule. The collected information is then preprocessed and stored in a data warehouse for later reference.

In [60]:
import logging
import urllib
import urllib.parse

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

import commond as food

**Class: FlightSpider**

This class is the blueprint for spiders.
A spider has two main public methods: `crawl()` and `save()`.

In [61]:
class FlightSpider:
    """Crawl flight listings from the Vietnam Airlines booking site.

    Search parameters (route, passenger count, date range, ...) are read
    from the ``food`` settings module when the spider is created. One search
    page is requested per day in the configured date range; the scraped
    values are accumulated column-wise in ``self.flightInfo`` and written to
    a CSV file by ``save()``.
    """

    def __init__(self):
        # Snapshot the settings so the URL builder and the stored rows
        # always agree with each other.
        self.current = food.CURRENT_TIME  # recorded as 'search_date' and used for the file name
        self.journey_type = food.JOURNEY_TYPE
        self.locale = food.LOCALE
        self.origin = food.DEPARTURE
        self.adt = food.PASSENGER
        self.destination = food.DESTINATION
        self.from_date = food.FROM_DATE
        self.to_date = food.TO_DATE
        # Column-oriented store: every list must keep the same length so the
        # dict can be turned into a DataFrame in save().
        self.flightInfo = {
            'journeyType': [],
            'departure': [],
            'destination': [],
            'search_date': [],
            'departure_datetime': [],
            'arrival_datetime': [],
            'stops': [],
            'passenger': [],
            'prices': [],
            'flight_number': [],
        }

    def _feed(self, col_name, value):
        """Append ``value`` to the column ``col_name`` of ``flightInfo``."""
        self.flightInfo[col_name].append(value)

    def _feedInfo(self):
        """Append the search-level columns (same for every flight) for one row."""
        # Use the instance attributes rather than the settings module so the
        # stored values always match the parameters used to build the URL.
        self._feed('journeyType', self.journey_type)
        self._feed('departure', self.origin)
        self._feed('destination', self.destination)
        self._feed('search_date', self.current)
        self._feed('passenger', self.adt)

    def _generateDate(self, from_date, to_date):
        """Return every day from ``from_date`` to ``to_date`` as 'MM-DD-YYYY' strings."""
        return [day.strftime('%m-%d-%Y') for day in pd.date_range(from_date, to_date)]

    def _generateURL(self, search_date):
        """Build the flight-selection URL for a single ``search_date``."""
        domain = 'https://fly.vietnamairlines.com/dx/VNDX/#/flight-selection?'
        variances = {
            'journeyType': self.journey_type,
            'locale': self.locale,
            'origin': self.origin,
            'destination': self.destination,
            'ADT': self.adt,  # Adult numbers
            'CHD': 0,  # presumably children -- TODO confirm
            'INT': 0,  # presumably infants -- TODO confirm
            'date': search_date,
        }
        return domain + urllib.parse.urlencode(variances)

    def generateURLList(self):
        """Return one search URL per day in the configured date range."""
        # Every URL has the same parameters except the date.
        return [
            self._generateURL(search_date)
            for search_date in self._generateDate(self.from_date, self.to_date)
        ]

    def _parseFlight(self, flight):
        """Extract the per-flight columns from one flight element.

        All lookups happen before anything is stored, so a missing element
        raises without leaving a half-written row in ``flightInfo``.
        """
        times = flight.find_elements_by_class_name('dxp-time')
        # The leading '.' makes the XPath relative to this flight; a bare
        # '//' would search the whole page and always hit the first flight.
        stops = flight.find_element_by_xpath(".//td[@class='column flight-stops']").text
        flight_number = flight.find_element_by_class_name("flight-number").text
        prices = [price.text for price in flight.find_elements_by_class_name("price-container")]
        return {
            'departure_datetime': times[0].get_attribute('datetime'),
            'arrival_datetime': times[1].get_attribute('datetime'),
            'stops': stops,
            'flight_number': flight_number,
            'prices': prices,
        }

    def crawl(self):
        """Visit every search URL and append one row per listed flight.

        A page whose expected elements are missing is logged and skipped
        from that point on; rows already collected are kept. A fresh browser
        is started for each URL and is always shut down, even on errors.
        """
        for url in self.generateURLList():
            driver = webdriver.Chrome(food.CHROME_DRIVER)
            try:
                driver.implicitly_wait(food.WAITING_TIME)
                driver.get(url)

                dashboard = driver.find_element_by_class_name('flights-table')
                for flight in dashboard.find_elements_by_class_name('dxp-flight'):
                    # Parse first, store second: keeps all columns aligned.
                    row = self._parseFlight(flight)
                    self._feedInfo()
                    for col_name, value in row.items():
                        self._feed(col_name, value)
            except (NoSuchElementException, IndexError):
                # NoSuchElementException: a required element is absent.
                # IndexError: fewer than two 'dxp-time' elements were found.
                logging.basicConfig(format='%(asctime)s %(message)s')
                logging.warning('Elements are not found.')
            finally:
                # Always release the browser, otherwise each failed URL
                # leaves a Chrome process behind.
                driver.quit()

    def _generateFileName(self):
        """Return the CSV file name 'YYYYMMDD_HHMM.csv' derived from ``self.current``."""
        # Zero-padded fields keep names unambiguous and sortable
        # (unpadded, Jan 11 and Nov 1 would both yield '...111').
        now = self.current
        return (
            f"{now.year:04d}{now.month:02d}{now.day:02d}"
            f"_{now.hour:02d}{now.minute:02d}.csv"
        )

    def save(self):
        """Write all collected rows to a CSV file in the working directory."""
        pd.DataFrame(self.flightInfo).to_csv(self._generateFileName(), index=False)


### Now let's create a spider and watch it build the web!!!

In [62]:
# Create a spider, crawl every search page, then write the results to CSV.
spider = FlightSpider()
try:
    spider.crawl()
finally:
    # Save whatever was collected even if the crawl is interrupted or fails
    # part-way, so a long-running crawl is not lost entirely.
    spider.save()


KeyboardInterrupt: 

<figure><img src="http://4.bp.blogspot.com/-0gjqZ5kGerE/VgdLPDRGg6I/AAAAAAAAK50/aBW64pAtZbA/w1200-h630-p-k-no-nu/waiting_spider_web.jpg"></figure>