In [1]:
import time
import datetime
import re
from itertools import product
from collections import OrderedDict
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymongo
from pymongo import MongoClient
from pprint import pprint

In [2]:
def writeLog(log_text, function_name):
    with open('web_parcer.log', 'a') as file:
        file.write(str(datetime.datetime.now()) + ":\t" + function_name + ': \t' + log_text + '\n')


def goToPage(query, driver):
    start_loading = datetime.datetime.now()
    try:
        driver.set_page_load_timeout(120)
        driver.get("https://www.aviasales.ru/search/{}".format(query))
    except WebDriverException:
        writeLog('Page loading timeout :' + query, 'goToPage')
    
    try:
        time.sleep(3)    
        wait_seconds = 600
        wait = WebDriverWait(driver, wait_seconds)
        wait.until(EC.invisibility_of_element((By.CLASS_NAME, "countdown")))
        loading_duration = (datetime.datetime.now() - start_loading).seconds
        printDelimiter('*')
        print("$$$$$   page loaded for {} seconds   $$$$$".format(loading_duration))
        time.sleep(1)
    except:
        print("\x1b[31m\"CAN'T WAIT THE PAGE LOAD ({} seconds)\"\x1b[0m".format(wait_seconds))
    

def closeExtraTab(driver): # В предположении что драйвер находится на рекламном окне после его открытия
    num_of_tabs = len(driver.window_handles)
    if num_of_tabs == 2:
        main_window_handle = None
        if driver.current_window_handle == driver.window_handles[0]:
            driver.switch_to_window(driver.window_handles[1])
            main_window_handle = driver.window_handles[0]
        elif driver.current_window_handle == driver.window_handles[1]:
            driver.switch_to_window(driver.window_handles[0])
            main_window_handle = driver.window_handles[1]
        title_of_closed_tab = driver.title
        driver.close()
        driver.switch_to_window(main_window_handle)
        writeLog("Extra tab *{}* has been closed".format(title_of_closed_tab), 'closeExtraTab')
    
    
def printDelimiter(char):
    delimiter = ''
    for i in range(118):
        delimiter += char
    print(delimiter)
    
    
def getCollection():
    client = MongoClient()
    db = client.aviasales_database # getting a database
    return db.ticket_collection_v2

In [3]:
#hardcoded_year = None################

def processPage(driver, collection):
    tickets = driver.find_elements_by_class_name("ticket")
    tickets = removeAdvertisingTickets(tickets)
    tickets = removeAbsentTickets(tickets)
    print(driver.title[-9:], '\n')
    for ticket in tickets:
        collection.insert_one(processTicket(ticket))
        closeExtraTab(driver)
        
        
def removeAdvertisingTickets(tickets):
    for ticket in tickets:
        ticket_opener_text = ticket.find_element_by_class_name("ticket__expand-button").text
        if ticket_opener_text == 'Реклама':
            tickets.remove(ticket)
    return tickets


def removeAbsentTickets(tickets):
    for ticket in tickets:
        ticket_price = (ticket.find_element_by_class_name("buy-button__price").text).replace('\u2009', '')
        if ticket_price == '':
            tickets.remove(ticket)
    return tickets
        
        
def processTicket(ticket):
    ticket_price = int((ticket.find_element_by_class_name("buy-button__price-num").text).replace('\u2009', ''))
    
    ticket_opener = ticket.find_element_by_class_name("ticket__expand-button")
    ticket_opener.click()
    
    ticket_content = ticket.find_element_by_class_name("ticket__content")
    
    flights = ticket_content.find_elements_by_class_name("ticket-segment")
    
    print("crawl_date_time: ", datetime.datetime.now())                    # debug
    print("price: ", ticket_price)                                         # debug
    
    post = dict()
    post['crawl_date_time'] = datetime.datetime.now()
    post['price'] = ticket_price
    post['flights'] = list()
    
    for flight in flights:
        post['flights'].append(processFlight(post, flight))
    
    return post

        
def processFlight(post, flight):
    flight_info = dict()
    flight_info['details'] = list()
    
    general_flight_info = getGeneralFlightInfo(flight)
    flight_info['city'] = general_flight_info[0]
    flight_info['date_time'] = general_flight_info[1]
    flight_info['total_time'] = general_flight_info[2]
    
    #nonlocal hardcoded_year
    #hardcoded_year = flight_general_info['date_time']['from'].year#############################
    
    
    detailed_flight_info = getDetailedFlightInfo(flight)
    for info in detailed_flight_info:
        flight_info['details'].append(info)
        
    flight_info_order = ['total_time', 'date_time', 'city']        # debug
    for info in flight_info_order:                                 # debug
        print('\t' + info + ": ", flight_info[info])               # debug
        
    print('\n')

    return flight_info
    
    
def getGeneralFlightInfo(flight):
# Общая информация
    flight_from = getGeneralPointInfo(flight, "segment-route__endpoint.origin")

    flight_info_path = getGeneralPathInfo(flight)
    
    flight_to = getGeneralPointInfo(flight, "segment-route__endpoint.destination")
    
    flight_info_city = {'from' : flight_from[0],
                        'to' : flight_to[0]}
    flight_info_date_time = {'from' : flight_from[1],
                             'to' : flight_to[1]}
    
    return flight_info_city, flight_info_date_time, flight_info_path


def getGeneralPathInfo(flight):
    flight_path = flight.find_element_by_class_name("segment-route__route_wrap")
    flight_total_time = re.split(' ', flight_path.find_element_by_class_name("segment-route__duration").text)
    flight_total_time_info = getHoursAndMinutes(flight_total_time)
    return flight_total_time_info
    
    
def getHoursAndMinutes(time_string):
    hours = 0
    minutes = 0
    days = 0
    for word in time_string:
        if word[-1] == 'ч':
            hours = word[:-1]
        elif word[-1] == 'м':
            minutes = word[:-1]
        elif word[-1] == 'д':
            days = word[:-1]
    return {'days' : days, 'hours' : hours, 'minutes' : minutes}
    
    
def getGeneralPointInfo(flight, segment_route):
    flight_point = flight.find_element_by_class_name(segment_route)
    flight_city = flight_point.find_element_by_class_name("segment-route__city").text
    time_point = re.split(':', flight_point.find_element_by_class_name("segment-route__time").text)
    date_point = re.split(' ', flight_point.find_element_by_class_name("segment-route__date").text)
    flight_date_time_point = datetime.datetime(int(date_point[2][:-1]), 
                                               translateDate(date_point[1]), 
                                               int(date_point[0]), 
                                               int(time_point[0]), 
                                               int(time_point[1]))
    return flight_city, flight_date_time_point

    
def translateDate(month):
    date_translator = {'янв' : 1, 'фев' : 2, 'мар' : 3, 'апр' : 4 ,'май' : 5 ,'июн' : 6, 'июл' : 7, 'авг' : 8,
                       'сен' : 9, 'окт' : 10, 'ноя' : 11, 'дек' : 12}
    return date_translator[month]
    
    
def getDetailedFlightInfo(flight):
    flight_info = list()
    try:
    # Детали перелета
        time.sleep(1)
    
        flight_itinerary = flight.find_element_by_class_name("ticket-segment__details")

        # Flight legs
        flight_legs = flight_itinerary.find_elements_by_class_name("ticket-segment__flight")
        flight_legs_info = list()

        for flight_leg in flight_legs:
            flight_legs_info.append(getDetailedFlightLegInfo(flight_leg))


        # Flight stops    
        flight_stops = flight_itinerary.find_elements_by_class_name("ticket-segment__stop")
        flight_stops_info = list()

        for flight_stop in flight_stops:
            flight_stops_info.append(getDetailedFlightStopInfo(flight_stop))


        # Миксую через один инфу по остановкам и инфу по пересадкам (для восстановления хронологии в выводе)
        flight_info = mixFlightInfo(flight_legs_info, flight_stops_info)

        #for info in flight_info:                                               # debug
        #    if info['type'] == 'leg':                                          # debug
        #        print("\n\tflight date time: ", info['date_time'],             # debug
        #              "\n\t\tairport: ", info['airport'],                      # debug
        #              "\n\t\tairline: ", info['airline'],                      # debug
        #              "\n\t\tflight number: ", info['flight_number'])          # debug
        #    elif info['type'] == 'stop':                                       # debug
        #        print("\n\tstop time: ", info['time'],                         # debug
        #              "\n\tstop place: ", info['place'])                       # debug

        #print("\n")                                                        # debug
    except NoSuchElementException as exception:
        #print("\x1b[31m\"red\"\x1b[0m")
        print("\x1b[31m\"Element not found (getDetailedFlightInfo): ", exception, "\"\x1b[0m")
    return flight_info 
    
                                 
def mixFlightInfo(flight_legs_info, flight_stops_info):
    flight_info = [flight_stops_info[i//2] 
                   if i%2 else flight_legs_info[i//2] 
                   for i in range(len(flight_stops_info)*2+1)]
    return flight_info
                                 
    
def getDetailedFlightStopInfo(flight_stop):
    flight_attrs_tmp = dict()
    flight_attrs_tmp['place'] = flight_stop.find_element_by_class_name("ticket-stop__place").text
    flight_stop_time = re.split(' ', flight_stop.find_element_by_class_name("ticket-stop__time").text)
    flight_attrs_tmp['time'] = getHoursAndMinutes(flight_stop_time)
    flight_attrs_tmp['type'] = 'stop'
    return flight_attrs_tmp
    

def getDetailedFlightLegInfo(flight_leg):
    try:
        flight_attrs_tmp = dict()
        
        flight_attrs_tmp['type'] = 'leg'

        flight_from_info = flight_leg.find_elements_by_class_name("ticket-flight__route-info")[0]
        flight_to_info = flight_leg.find_elements_by_class_name("ticket-flight__route-info")[1]

        flight_date_time_from = getDetailedFlightLegDatetimeInfo(flight_from_info)
        flight_date_time_to = getDetailedFlightLegDatetimeInfo(flight_to_info)
        
        flight_attrs_tmp['date_time'] = {'from' : flight_date_time_from,
                                         'to' : flight_date_time_to}

        flight_attrs_tmp['airport'] = getDetailedFlightLegAirportInfo(flight_from_info, flight_to_info)

        flight_airline = flight_leg.find_element_by_class_name("ticket-carrier")
        flight_attrs_tmp['airline'] = flight_airline.find_element_by_css_selector('img').get_attribute('alt')
        
        flight_number = flight_leg.find_element_by_class_name("ticket-flight__title")
        flight_attrs_tmp['flight_number'] = flight_number.text[5:]
    
    except NoSuchElementException as exception:
        print("\x1b[31m\"Element not found (getDetailedFlightLegInfo): ", exception, "\"\x1b[0m")
    return flight_attrs_tmp


def getDetailedFlightLegAirportInfo(flight_from_info, flight_to_info):
    flight_airport_name_from = flight_from_info.find_element_by_class_name("ticket-flight__name").text
    flight_airport_iata_from = flight_from_info.find_element_by_class_name("ticket-flight__iata").text
    flight_airport_name_to = flight_to_info.find_element_by_class_name("ticket-flight__name").text
    flight_airport_iata_to = flight_to_info.find_element_by_class_name("ticket-flight__iata").text
    flight_airport_info = {'from' : 
                                   {'name' : flight_airport_name_from,
                                    'iata' : flight_airport_iata_from},
                           'to' : 
                                   {'name' : flight_airport_name_to,
                                    'iata' : flight_airport_iata_to}
                          }
    return flight_airport_info


def getDetailedFlightLegDatetimeInfo(flight_info):
    
    flight_time = re.split(':', flight_info.find_element_by_class_name("ticket-flight__departure-time").text)
    flight_date = re.split(' ', flight_info.find_element_by_class_name("ticket-flight__departure-date").text)
    
    #ATTENTION: создаю багу того что год в дате проставляется как год даты начала поездки
    flight_date_time= datetime.datetime(2019, #######hardcode######################################
                                        translateDate(flight_date[1][:-1]),
                                        int(flight_date[0]),
                                        int(flight_time[0]),
                                        int(flight_time[1])) 
    return flight_date_time


In [None]:
driver = webdriver.Firefox()
driver.get('https://www.aviasales.ru/search/MOW1307STO1')

In [5]:
def crawlByList(dates_destinations_pairs, driver): 
    collection = getCollection()
    query = 'MOW'
    while len(dates_destinations_pairs) > 0:
        pair = dates_destinations_pairs[0]
        query = 'MOW' + pair[0] + pair[1] + '1'
        goToPage(query, driver)
        processPage(driver, collection)
        dates_destinations_pairs.remove(pair)

#При остановке паука можно продолжить поиск с той пары на которой все сломалось
def makeDateDestinationPairs(dates, destinations): # dates - что то в духе ['1304','1404', ...]
    return list(product(dates, destinations))

#При перезапуске кернела, можно обрезать начало массива пар, которое уже пройдено краулером
def skipPairsBefore(pair_array, last_seen_pair):
    idx = pair_array.index(last_seen_pair)
    return pair_array[idx:]

In [9]:
# НЕ ЗАПУСКАТЬ ПРИ ПОЛОМКЕ (создаст заново все пары городов и дат)
city_time_pairs = makeDateDestinationPairs(time_list_september[15:], city_list) 

In [10]:
print(city_time_pairs[:50])

[('1609', 'ALC'), ('1609', 'AMS'), ('1609', 'ANR'), ('1609', 'BCN'), ('1609', 'BFS'), ('1609', 'BER'), ('1609', 'BRN'), ('1609', 'BLQ'), ('1609', 'BRE'), ('1609', 'BRQ'), ('1609', 'OST'), ('1609', 'BRU'), ('1609', 'BUD'), ('1609', 'VLC'), ('1609', 'VIE'), ('1609', 'VCE'), ('1609', 'VRN'), ('1609', 'HAM'), ('1609', 'HAJ'), ('1609', 'GLA'), ('1609', 'GOT'), ('1609', 'DUB'), ('1609', 'DUS'), ('1609', 'GVA'), ('1609', 'SZG'), ('1609', 'IBZ'), ('1609', 'INN'), ('1609', 'KLV'), ('1609', 'CBG'), ('1609', 'ORC'), ('1609', 'CGN'), ('1609', 'LCA'), ('1609', 'LEJ'), ('1609', 'LYS'), ('1609', 'LON'), ('1609', 'MAD'), ('1609', 'PMI'), ('1609', 'AGP'), ('1609', 'MAN'), ('1609', 'MIL'), ('1609', 'MPL'), ('1609', 'MUC'), ('1609', 'NAP'), ('1609', 'ECN'), ('1609', 'NCE'), ('1609', 'NUE'), ('1609', 'PMO'), ('1609', 'PAR'), ('1609', 'PFO'), ('1609', 'PEG')]


In [11]:
# ЗАПУСКАТЬ ТОЛЬКО при перезапуске кернела
city_time_pairs = skipPairsBefore(city_time_pairs, ('2209', 'GVA'))
print(city_time_pairs[:50])

[('2209', 'GVA'), ('2209', 'SZG'), ('2209', 'IBZ'), ('2209', 'INN'), ('2209', 'KLV'), ('2209', 'CBG'), ('2209', 'ORC'), ('2209', 'CGN'), ('2209', 'LCA'), ('2209', 'LEJ'), ('2209', 'LYS'), ('2209', 'LON'), ('2209', 'MAD'), ('2209', 'PMI'), ('2209', 'AGP'), ('2209', 'MAN'), ('2209', 'MIL'), ('2209', 'MPL'), ('2209', 'MUC'), ('2209', 'NAP'), ('2209', 'ECN'), ('2209', 'NCE'), ('2209', 'NUE'), ('2209', 'PMO'), ('2209', 'PAR'), ('2209', 'PFO'), ('2209', 'PEG'), ('2209', 'PRG'), ('2209', 'ROM'), ('2209', 'RTM'), ('2209', 'SVQ'), ('2209', 'STO'), ('2209', 'SXB'), ('2209', 'QCN'), ('2209', 'TRN'), ('2209', 'FLR'), ('2209', 'ZRH'), ('2209', 'EDI'), ('2209', 'EIN'), ('2309', 'ALC'), ('2309', 'AMS'), ('2309', 'ANR'), ('2309', 'BCN'), ('2309', 'BFS'), ('2309', 'BER'), ('2309', 'BRN'), ('2309', 'BLQ'), ('2309', 'BRE'), ('2309', 'BRQ'), ('2309', 'OST')]


In [12]:
crawlByList(city_time_pairs, driver) # main function

[31m"CAN'T WAIT THE PAGE LOAD (600 seconds)"[0m


MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=43749): Max retries exceeded with url: /session/59d4e198-8288-4404-90f0-34f4f1014e11/elements (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f33620a69b0>: Failed to establish a new connection: [Errno 111] Connection refused',))

In [8]:
#март
time_list_march = ['0103','0203','0303','0403','0503','0603','0703','0803','0903','1003',
                   '1103','1203','1303','1403','1503','1603','1703','1803','1903','2003',
                   '2103','2203','2303','2403','2503','2603','2703','2803','2903','3003','3103']

#апрель
time_list_april = ['0104','0204','0304','0404','0504','0604','0704','0804','0904','1004',
                   '1104','1204','1304','1404','1504','1604','1704','1804','1904','2004',
                   '2104','2204','2304','2404','2504','2604','2704','2804','2904','3004']

#май
time_list_may = ['0105','0205','0305','0405','0505','0605','0705','0805','0905','1005',
                 '1105','1205','1305','1405','1505','1605','1705','1805','1905','2005',
                 '2105','2205','2305','2405','2505','2605','2705','2805','2905','3005', '3105']

#июнь
time_list_june = ['0106','0206','0306','0406','0506','0606','0706','0806','0906','1006',
                  '1106','1206','1306','1406','1506','1606','1706','1806','1906','2006',
                  '2106','2206','2306','2406','2506','2606','2706','2806','2906','3006']

#июль
time_list_july = ['0107','0207','0307','0407','0507','0607','0707','0807','0907','1007',
                  '1107','1207','1307','1407','1507','1607','1707','1807','1907','2007',
                  '2107','2207','2307','2407','2507','2607','2707','2807','2907','3007', '3107']

#август
time_list_august = ['0108','0208','0308','0408','0508','0608','0708','0808','0908','1008',
                    '1108','1208','1308','1408','1508','1608','1708','1808','1908','2008',
                    '2108','2208','2308','2408','2508','2608','2708','2808','2908','3008', '3108']

#сентябрь
time_list_september = ['0109','0209','0309','0409','0509','0609','0709','0809','0909','1009',
                       '1109','1209','1309','1409','1509','1609','1709','1809','1909','2009',
                       '2109','2209','2309','2409','2509','2609','2709','2809','2909','3009']


city_list = {'BER':'Берлин', 
             'HAM':'Гамбург', 
             'BRE':'Бремен', 
             'HAJ':'Ганновер', 
             'LEJ':'Лейпциг', 
             'DUS':'Дюссельдорф', 
             'CGN':'Кёльн', 
             'NUE':'Нюрнберг', 
             'MUC':'Мюнхен', 
             'PRG':'Прага', 
             'BRQ':'Брно', 
             'KLV':'Карловы Вары', 
             'VIE':'Вена', 
             'SZG':'Зальцбург', 
             'INN':'Инсбрук', 
             'BUD':'Будапешт', 
             'BRU':'Брюссель', 
             'ANR':'Антверпен', 
             'OST':'Брюгге', 
             'NCE':'Ницца', 
             'LYS':'Лион', 
             'PAR':'Париж', 
             'MPL':'Монпелье', 
             'SXB':'Страсбург', 
             'GVA':'Женева', 
             'ZRH':'Цюрих', 
             'BRN':'Берн',
             'MIL':'Милан',
             'ROM':'Рим',
             'VCE':'Венеция',
             'VRN':'Верона',
             'NAP':'Неаполь',
             'TRN':'Турин',
             'FLR':'Флоренция',
             'PMO':'Палермо',
             'BLQ':'Болонья',
             'PEG':'Перуджа',
             'AGP':'Малага',
             'BCN':'Барселона',
             'PMI':'Майорка',
             'MAD':'Мадрид',
             'ALC':'Аликанте',
             'SVQ':'Севилья',
             'VLC':'Валенсия',
             'QCN':'Таррагона',
             'IBZ':'Ибица',
             'LON':'Лондон',
             'MAN':'Манчестер',
             'EDI':'Эдинбург',
             'GLA':'Глазго',
             'CBG':'Кембридж',
             'BFS':'Белфаст',
             'DUB':'Дублин',
             'ORC':'Корк',
             'AMS':'Амстердам',
             'RTM':'Роттердам',
             'EIN':'Эйндховен',
             'LCA':'Ларнака',
             'ECN':'Никосия',
             'PFO':'Пафос',
             'STO':'Стокгольм',
             'GOT':'Гётерберг'
             }

city_list = OrderedDict(sorted(city_list.items(), key=lambda kv: kv[1]))

In [417]:
def printBeautifulTicket(post):
    print('_id: ', post['_id'])
    print('crawl_date_time: ', post['crawl_date_time'])
    print('flights: ')
    for flight in post['flights']:
        print('\tcity: ')
        print('\t\tfrom\t: ', flight['city']['from'])
        print('\t\tto\t: ', flight['city']['to'])
        print('\tdate_time: ')
        print('\t\tfrom\t: ', flight['date_time']['from'])
        print('\t\tto\t: ', flight['date_time']['to'])
        print('\tdetailes: ')
        for period in flight['details']:
            if period['type'] == 'leg':
                print('\t\tairline\t: ', period['airline'])
                print('\t\tairport: ')
                print('\t\t\tfrom: ')
                print('\t\t\t\tiata\t: ', period['airport']['from']['iata'])
                print('\t\t\t\tname\t: ', period['airport']['from']['name'])
                print('\t\t\tto: ')
                print('\t\t\t\tiata\t: ', period['airport']['to']['iata'])
                print('\t\t\t\tname\t: ', period['airport']['to']['name'])
                print('\t\tdate_time: ')
                print('\t\t\tfrom\t: ', period['date_time']['from'])
                print('\t\t\tto\t: ', period['date_time']['to'])
                print('\t\tflight_number\t: ', period['flight_number'])
                print('\t\ttype\t: ', period['type'])
            if period['type'] == 'stop':
                print('\t\tplace\t: ', period['place'])
                print('\t\ttime: ')
                print('\t\t\thours\t: ', period['time']['hours'])
                print('\t\t\tminutes\t: '), period['time']['minutes']
                
    print('\n\n')

In [None]:
for post in collection.find({'price':61765}):
    printBeautifulTicket(post)

In [None]:
for post in collection.find({'price':61765}):
    pprint(post)