# COLLECTING ALL THE AIRLINE DATA FROM THE SITE

### I scraped more data from the site than was needed but decided to keep the data to discover new insights outside the internship objectives.

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import re

# One accumulator list per scraped field. Every list is a distinct object and
# is appended to once per review by the scraping loop.

# Review text, overall rating and reviewer country.
reviews, stars, country = [], [], []

# Flight details taken from the ratings table.
aircraft, type_of_traveller, seat_type, route, date_flown = [], [], [], [], []

# Star sub-ratings taken from the ratings table.
seat_comfort, cabin_staff_services, food_and_beverages = [], [], []
ground_service, value_for_money = [], []

# Whether the reviewer recommends the airline.
recommended = []

# Scrape every review page and append one entry per review to each field list.
# A page whose request fails is reported and skipped, so the parsing code below
# never runs against a missing or stale (previous page's) document.


def _rating_text(body, field):
    """Return the text of the ratings-table cell next to the ``field`` header.

    ``body`` is one review's ``div.body`` element and ``field`` is the CSS class
    suffix of the header cell (e.g. ``"route"``). Returns ``"N/A"`` when the
    ratings table, the header cell or its sibling cell is missing.
    """
    try:
        return (
            body.find("table", class_="review-ratings")
            .find("td", class_=f"review-rating-header {field}")
            .find_next_sibling("td")
            .text
        )
    except AttributeError:
        return "N/A"


def _rating_stars(body, field):
    """Return the number of filled stars next to the ``field`` header.

    Counts the ``span.star.fill`` elements in the cell that follows the header
    cell. Returns ``"N/A"`` when the ratings table, the header cell or its
    sibling cell is missing.
    """
    try:
        filled = (
            body.find("table", class_="review-ratings")
            .find("td", class_=f"review-rating-header {field}")
            .find_next_sibling("td")
            .find_all("span", class_="star fill")
        )
    except AttributeError:
        return "N/A"
    return len(filled)


# Header-cell class suffix -> list that collects the cell's text.
text_fields = {
    "aircraft": aircraft,
    "type_of_traveller": type_of_traveller,
    "cabin_flown": seat_type,
    "route": route,
    "date_flown": date_flown,
}

# Header-cell class suffix -> list that collects the filled-star count.
star_fields = {
    "seat_comfort": seat_comfort,
    "cabin_staff_service": cabin_staff_services,
    "food_and_beverages": food_and_beverages,
    "ground_service": ground_service,
    "value_for_money": value_for_money,
}

for i in range(1, 36):
    try:
        # The timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status() turns HTTP error responses into a RequestException
        # so an error page is never parsed as if it held reviews.
        page = requests.get(
            f"https://www.airlinequality.com/airline-reviews/british-airways/page/{i}/?sortby=post_date%3ADesc&pagesize=100",
            timeout=30,
        )
        page.raise_for_status()
    except requests.exceptions.RequestException as e:
        # handle errors that occur during the request
        print(f"An error occurred while scraping page {i}: {e}")
        continue

    bsobj = bs(page.content, "html5")

    # reviews
    for item in bsobj.find_all("div", class_="text_content"):
        reviews.append(item.text)

    # stars (overall rating)
    for item in bsobj.find_all("article", itemprop="review"):
        try:
            stars.append(
                item.find("div", itemprop="reviewRating")
                .find("span", itemprop="ratingValue")
                .text
            )
        except AttributeError:
            stars.append("N/A")

    # country: the text between parentheses in the reviewer header line
    for item in bsobj.find_all("h3", class_="text_sub_header userStatusWrapper"):
        country_match = re.search(r'\(([^)]+)\)', item.text)
        country.append(country_match.group(1).strip() if country_match else "N/A")

    # ratings table: flight details and star sub-ratings, one pass per review body
    for body in bsobj.find_all("div", class_="body"):
        for field, values in text_fields.items():
            values.append(_rating_text(body, field))
        for field, values in star_fields.items():
            values.append(_rating_stars(body, field))

    # recommended
    # Unlike the other fields there is no "N/A" placeholder here, so a review
    # without this row leaves `recommended` shorter than the other lists.
    for item in bsobj.find_all("td", class_="review-rating-header recommended"):
        if "Recommended" in item.text:
            recommended.append(item.find_next_sibling("td").text)

CROSS-CHECKING THE LENGTH OF EACH SCRAPED LIST (ALL MUST MATCH TO BUILD THE DATAFRAME)

In [2]:
# Report how many entries each field list holds, one line per list.
length_report = [
    ("Number of reviews:", reviews),
    ("Number of stars:", stars),
    ("Number of aircraft:", aircraft),
    ("Number of countries:", country),
    ("Number of types of traveller:", type_of_traveller),
    ("Number of seat types:", seat_type),
    ("Number of routes:", route),
    ("Number of dates flown:", date_flown),
    ("Number of seat comfort ratings:", seat_comfort),
    ("Number of cabin staff services ratings:", cabin_staff_services),
    ("Number of food and beverages ratings:", food_and_beverages),
    ("Number of ground service ratings:", ground_service),
    ("Number of value for money ratings:", value_for_money),
    ("Number of recommended ratings:", recommended),
]
for label, values in length_report:
    print(label, len(values))

Number of reviews: 3485
Number of stars: 3485
Number of aircraft: 3485
Number of countries: 3485
Number of types of traveller: 3485
Number of seat types: 3485
Number of routes: 3485
Number of dates flown: 3485
Number of seat comfort ratings: 3485
Number of cabin staff services ratings: 3485
Number of food and beverages ratings: 3485
Number of ground service ratings: 3485
Number of value for money ratings: 3485
Number of recommended ratings: 3485


In [3]:
# Assemble the field lists into one table; each key becomes a column, in this order.
df = pd.DataFrame(
    {
        "Reviews": reviews,
        "Ratings": stars,
        "Aircraft Type": aircraft,
        "Traveller": type_of_traveller,
        "Countries": country,
        "Flight Ticket": seat_type,
        "Route": route,
        "Date Flown": date_flown,
        "Seat Comfort /5": seat_comfort,
        "Cabin Staff Services /5": cabin_staff_services,
        "Food and beverages /5": food_and_beverages,
        "Ground service /5": ground_service,
        "Value for money /5": value_for_money,
        "Recommended": recommended,
    }
)

In [12]:
# Preview the first five rows of the assembled table.
df.head(n=5)

In [4]:
# Write the scraped reviews to BA.csv in the current working directory.
import os

cwd = os.getcwd()
# os.path.join uses the platform's path separator instead of a hard-coded "/".
# to_csv keeps its default index column, so the file layout is unchanged.
df.to_csv(os.path.join(cwd, "BA.csv"))