In [1]:
%pip install pandas matplotlib openpyxl

Collecting pandas
  Downloading pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting matplotlib
  Downloading matplotlib-3.9.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)
Collecting cycler>=0.10 (from matplotli

## Web Scrapping

### Import necessary libraries

In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup

### Define a dictionary to hold the review data

In [3]:

review_table = {
    'Name': [],
    'Date Published': [],
    'review': [],
    'overall rating': [],
    'Verified customer': [],
    'type of travel': [],
    'Route': [],
    'Date of Travel': [],
    'Seat Type': [],
    'Seat Comfort': [],
    'Cabin Staff Service': [],
    'Food & Beverages': [],
    'Inflight Entertainment': [],
    'Ground Service': [],
    'wifi connectivity': [],
    'Value For Money': []
}

### Function to update the review data

In [4]:

def update_data(reviews):
    for review in reviews:
        reviewr_name = review.find("span", itemprop="name").text.strip() if review.find("span", itemprop="name") else "No Name"
        date_published = review.find("time", itemprop="datePublished").text.strip() if review.find("time", itemprop="datePublished") else ""
        review_head = review.find('h2', class_="text_header").text.strip() if review.find('h2', class_="text_header") else "No Header"
        ratings = review.find('span', itemprop="ratingValue").text.strip() if review.find('span', itemprop="ratingValue") else "Nan"
        content_div = review.find('div', itemprop='reviewBody')
        verifiedOrNot = 'Verified' if content_div and content_div.find('em', text='Trip Verified') else 'Not Verified'
        
        rating_table = review.find('table', class_="review-ratings")
        if rating_table:
            rows = rating_table.find_all('tr')
            flag = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            for row in rows:
                cells = row.find_all("td")
                for cell in cells:
                    if "Type Of Traveller" in cell.text and flag[6] == 0:
                        typeOtravel = row.find('td', class_='review-value').text.strip()
                        flag[6] = 1
                        break
                    elif "Route" in cell.text and flag[7] == 0:
                        route_travelled = row.find('td', class_='review-value').text.strip()
                        flag[7] = 1
                        break
                    elif "Date Flown" in cell.text and flag[8] == 0:
                        Date_flown = row.find('td', class_='review-value').text.strip()
                        flag[8] = 1
                        break
                    elif "Seat Type" in cell.text and flag[9] == 0:
                        Seat_type = row.find('td', class_='review-value').text.strip()
                        flag[9] = 1
                        break
                    elif "Seat Comfort" in cell.text and flag[0] == 0:
                        stars = row.find_all('span', class_="star fill")
                        seat_comfort_stars = len(stars) if stars else None
                        flag[0] = 1
                        break
                    elif "Food & Beverages" in cell.text and flag[1] == 0:
                        stars = row.find_all('span', class_="star fill")
                        food_beverages_stars = len(stars) if stars else None
                        flag[1] = 1
                        break
                    elif "Inflight Entertainment" in cell.text and flag[2] == 0:
                        stars = row.find_all('span', class_="star fill")
                        inflight_entertainment_stars = len(stars) if stars else None
                        flag[2] = 1
                        break
                    elif "Ground Service" in cell.text and flag[3] == 0:
                        stars = row.find_all('span', class_="star fill")
                        ground_service_stars = len(stars) if stars else None
                        flag[3] = 1
                        break
                    if "Wifi & Connectivity" in cell.text and flag[4] == 0:
                        stars = row.find_all('span', class_="star fill")
                        wifi_connectivity_stars = len(stars) if stars else None
                        flag[4] = 1
                        break
                    if "Cabin Staff Service" in cell.text and flag[5] == 0:
                        stars = row.find_all('span', class_="star fill")
                        Cabin_Staff_Service_stars = len(stars)
                        flag[5] = 1
                        break
                    elif "Value For Money" in cell.text:
                        stars = row.find_all('span', class_="star fill")
                        value_for_money_stars = len(stars) if stars else None
                        break
                    else:
                        if flag[4] == 0: 
                            wifi_connectivity_stars = None
                        if flag[2] == 0:
                            inflight_entertainment_stars = None
                        if flag[1] == 0:
                            food_beverages_stars = None
                        if flag[5] == 0:
                            Cabin_Staff_Service_stars = None
                        if flag[3] == 0:
                            ground_service_stars = None
                        if flag[0] == 0:
                            seat_comfort_stars = None
                        if flag[6] == 0:
                            typeOtravel = None
                        if flag[7] == 0:
                            route_travelled = None
                        if flag[8] == 0:
                            Date_flown = None
                        if flag[9] == 0:
                            Seat_type = None
                        break
        else:
            print("i was unable to scrape this webpage")
        
        review_table["Name"].append(reviewr_name)
        review_table["Value For Money"].append(value_for_money_stars)
        review_table["wifi connectivity"].append(wifi_connectivity_stars)
        review_table["Cabin Staff Service"].append(Cabin_Staff_Service_stars)
        review_table["Ground Service"].append(ground_service_stars)
        review_table["Inflight Entertainment"].append(inflight_entertainment_stars)
        review_table["Food & Beverages"].append(food_beverages_stars)
        review_table["Seat Comfort"].append(Cabin_Staff_Service_stars)
        review_table["Seat Type"].append(Seat_type)
        review_table["Date of Travel"].append(Date_flown)
        review_table["Route"].append(route_travelled)
        review_table["type of travel"].append(typeOtravel)
        review_table["Verified customer"].append(verifiedOrNot)
        review_table["Date Published"].append(date_published)
        review_table["review"].append(review_head)
        review_table["overall rating"].append(ratings)


### Main scraping loop

In [5]:

webpage = "https://www.airlinequality.com/airline-reviews/british-airways"

for i in range(1, 381):
    d_webpage = f"https://www.airlinequality.com/airline-reviews/british-airways/page/{i}/"
    
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"} 
    response = requests.get(url=d_webpage, headers=headers)
    print("page number:", i)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        reviews_section = soup.find('section', class_='layout-section layout-2 closer-top')
        reviews = reviews_section.find_all("article", itemprop="review")
        update_data(reviews)

page number: 1


  verifiedOrNot = 'Verified' if content_div and content_div.find('em', text='Trip Verified') else 'Not Verified'


page number: 2
page number: 3
page number: 4
page number: 5
page number: 6
page number: 7
page number: 8
page number: 9
page number: 10
page number: 11
page number: 12
page number: 13
page number: 14
page number: 15
page number: 16
page number: 17
page number: 18
page number: 19
page number: 20
page number: 21
page number: 22
page number: 23
page number: 24
page number: 25
page number: 26
page number: 27
page number: 28
page number: 29
page number: 30
page number: 31
page number: 32
page number: 33
page number: 34
page number: 35
page number: 36
page number: 37
page number: 38
page number: 39
page number: 40
page number: 41
page number: 42
page number: 43
page number: 44
page number: 45
page number: 46
page number: 47
page number: 48
page number: 49
page number: 50
page number: 51
page number: 52
page number: 53
page number: 54
page number: 55
page number: 56
page number: 57
page number: 58
page number: 59
page number: 60
page number: 61
page number: 62
page number: 63
page number: 64


### Create a DataFrame from the review table and save it to an Excel file

In [6]:

data = pd.DataFrame(review_table)
print(data.head())
data.to_excel("British_Airlines_review1.xlsx", index=False)

                Name  Date Published                                review  \
0  Daniel Chinellato   9th July 2024      "left me extremely disappointed"   
1     Flora Dogneton   5th July 2024          “customer service was awful”   
2        Alex Martyn   1st July 2024             "over 2 weeks to respond"   
3          P Jackson  30th June 2024  “wholly inadequate customer service”   
4         R Thornton  26th June 2024           “the cabin crew were great”   

  overall rating Verified customer  type of travel  \
0              1      Not Verified        Business   
1              1      Not Verified  Couple Leisure   
2              1      Not Verified    Solo Leisure   
3              1          Verified  Couple Leisure   
4              6          Verified        Business   

                            Route  Date of Travel       Seat Type  \
0  Berlin to São Paulo via London      March 2024   Economy Class   
1                 London to Corfu  September 2023   Economy Class   

## Data Cleaning