In [1]:
# Root of the British Airways review listing; the spider appends
# "/page/<n>/?..." to this when building its start URLs.
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
# Value sent as the "pagesize" query parameter of each listing request.
page_size = 100
# Number of listing pages to request (pages 1 through `pages`, inclusive).
pages = 38


In [2]:
import scrapy
from bs4 import BeautifulSoup

class ReviewsSpider(scrapy.Spider):
    """Spider that extracts individual reviews from paginated listing pages.

    One start URL is generated per listing page (1 through ``pages``), each
    requesting ``page_size`` reviews sorted by post date, newest first.
    Every review found is yielded as a plain dict with the keys
    ``datePublished``, ``ratingValue``, ``bestRating``, ``header``,
    ``author``, ``reviewBody``, ``recommended`` and ``stats``.
    """

    name = 'reviews'
    start_urls = [f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}" for i in range(1, pages + 1)]

    def parse(self, response):
        """Parse one listing page and yield a dict per review.

        Args:
            response: The downloaded listing page; its ``text`` is parsed as
                HTML and its ``url`` is used in the warning message.

        Yields:
            dict: One record per ``<article itemprop="review">`` element.
            Fields whose element is missing from the markup are ``None``
            instead of aborting the whole page.
        """
        soup = BeautifulSoup(response.text, 'html.parser')
        container = soup.find("article", class_="comp comp_reviews-airline querylist position-content")
        if container is None:
            # A page without the review container (e.g. a layout change or a
            # page past the last one) used to raise IndexError here.
            self.logger.warning("No review container found on %s", response.url)
            return

        for article in container.find_all("article", itemprop="review"):
            yield self._parse_review(article)

    @staticmethod
    def _text(node, strip=False):
        """Return ``node.text`` (stripped if *strip*), or None when *node* is None."""
        if node is None:
            return None
        return node.text.strip() if strip else node.text

    def _parse_review(self, article):
        """Build the record dict for a single review ``<article>`` element."""
        date_meta = article.find("meta", itemprop="datePublished")
        review = {
            "datePublished": date_meta.get("content") if date_meta is not None else None,
            "ratingValue": self._text(article.find("span", itemprop="ratingValue")),
            "bestRating": self._text(article.find("span", itemprop="bestRating")),
            "header": self._text(article.find("h2", class_="text_header")),
            "author": self._text(article.find("span", itemprop="name")),
            "reviewBody": self._text(article.find("div", itemprop="reviewBody"), strip=True),
            # Only a cell whose class attribute is exactly
            # "review-value rating-yes" is captured; anything else gives None.
            # NOTE(review): a negative recommendation therefore also ends up
            # as None here — confirm that is intended.
            "recommended": self._text(article.find("td", class_="review-value rating-yes")),
        }
        review["stats"] = self._parse_stats(article)
        return review

    @staticmethod
    def _parse_stats(article):
        """Collect the per-category rows of a review's table into a dict.

        Rows that have a ``review-value`` cell contribute its stripped text;
        other rows contribute the number of ``<span class="star fill">``
        elements they contain. Rows without a header cell are skipped.
        """
        stats = {}
        for row in article.find_all("tr"):
            header = row.find("td", class_="review-rating-header")
            if header is None:
                continue
            key = header.text.strip()
            value = row.find("td", class_="review-value")
            if value is not None:
                stats[key] = value.text.strip()
            else:
                stats[key] = len(row.find_all("span", class_="star fill"))
        return stats

In [None]:
# Run the spider

if __name__ == "__main__":
    # Imported here so that merely importing this module does not pull in
    # the crawler machinery.
    from scrapy.crawler import CrawlerProcess

    # Settings handed to the crawler process: a browser-like User-Agent
    # string plus the feed-export format and destination for scraped items.
    crawl_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'FEED_FORMAT': 'json',
        'FEED_URI': 'output.json',
    }

    process = CrawlerProcess(settings=crawl_settings)
    # Schedule the spider, then start the crawl.
    process.crawl(ReviewsSpider)
    process.start()
