<h1>Importing Required Libraries</h1>

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

<h1>Defining Variables</h1>

In [2]:
n = 3586 #Number of reviews required(MAX:3586)
URL = f"https://www.airlinequality.com/airline-reviews/british-airways/page/1/?sortby=post_date%3ADesc&pagesize={n}"

<h1>Function for Data Scraping</h1>

In [3]:
def fetch_data(url):
    df = pd.DataFrame(columns=['rev_id','review', 'Name', 'Country', 'Date', 'long_review', "aircraft", "type_of_traveller", "cabin_flown", "route", "date_flown", "seat_comfort", 
                               "cabin_staff_service", "food_and_beverages", "ground_service", "value_for_money", "recommended"])
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    i=0
    for article in soup.find_all("article", {"class":"comp_media-review-rated"}):
        try:
            name = article.h3.get_text().split("(")[0].strip()
        except:
            name = None
        try:
            country = article.h3.get_text().split("(")[1].split(")")[0].strip()
        except:
            country = None
        try:
            date = article.h3.get_text().split("(")[1].split(")")[1].strip()
        except:
            date = None
        df = df.append({
            'rev_id': article['class'][-1].replace("review-", ""),
            'review': article.h2.get_text(),
            'Name': name,
            'Country': country,
            'Date': date,
            'long_review': article.find("div", {"class": "text_content"}).get_text(),
        }, ignore_index = True)

        for table in article.find_all('table', {"class":"review-ratings"}):
            for row in (table.find_all('tr')):
                if row.find_all("td")[1].get_text() != '12345':
                    df.loc[i].at[row.find("td")['class'][-1]] = row.find_all("td")[1].get_text()
                else:
                    df.loc[i].at[row.find("td")['class'][-1]] = len(row.find_all("span", {"class":"fill"}))
        i+=1
    return df

<h1>Basic Data Preprocessing</h1>

In [4]:
df = fetch_data(URL)

In [5]:
df.head()

Unnamed: 0,rev_id,review,Name,Country,Date,long_review,aircraft,type_of_traveller,cabin_flown,route,date_flown,seat_comfort,cabin_staff_service,food_and_beverages,ground_service,value_for_money,recommended
0,877598,"""I would still recommend BA""",Michael Gardiner,United Kingdom,23rd October 2023,Not Verified | I flew London to Cairo and ret...,A321 neo,Business,Economy Class,London Heathrow to Cairo,October 2023,3,5,3.0,5,2,yes
1,877522,"""the worst experience ever""",Terry Anderson,United States,22nd October 2023,Not Verified | Absolutely the worst experienc...,,Couple Leisure,Economy Class,Seattle to Porto via Heathrow,October 2023,2,3,3.0,1,1,no
2,877484,"""an exceptional crew member""",Philip Thompson,United Kingdom,22nd October 2023,Not Verified | Flew back from Malta after sc...,A320,Couple Leisure,Economy Class,Malta to London Gatwick,October 2023,4,5,3.0,4,4,yes
3,877425,"""even Ryanair have more space""",M Meijs,Netherlands,21st October 2023,Not Verified | Cabin luggage had to go to carg...,,Solo Leisure,Economy Class,London to Amsterdam,October 2023,1,3,,2,1,no
4,877380,"""in-flight service is usually weak""",Mehmet Sahiner,United Kingdom,21st October 2023,✅ Trip Verified | I have been using BA for a ...,A380-800,Solo Leisure,Economy Class,San Francisco to London,October 2023,3,2,2.0,2,3,no


In [6]:
df.shape

(3586, 17)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3586 entries, 0 to 3585
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   rev_id               3586 non-null   object
 1   review               3586 non-null   object
 2   Name                 3586 non-null   object
 3   Country              3584 non-null   object
 4   Date                 3584 non-null   object
 5   long_review          3586 non-null   object
 6   aircraft             1914 non-null   object
 7   type_of_traveller    2916 non-null   object
 8   cabin_flown          3584 non-null   object
 9   route                2912 non-null   object
 10  date_flown           2909 non-null   object
 11  seat_comfort         3471 non-null   object
 12  cabin_staff_service  3460 non-null   object
 13  food_and_beverages   3205 non-null   object
 14  ground_service       2841 non-null   object
 15  value_for_money      3586 non-null   object
 16  recomm

In [8]:
import numpy as np
df = df.fillna(np.nan)
df = df.replace('N/A', np.nan)

In [9]:
int_dtype = ["rev_id", "seat_comfort", "cabin_staff_service","food_and_beverages","ground_service", "value_for_money"]
df[int_dtype] = df[int_dtype].apply(pd.to_numeric, errors='coerce', axis=1)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3586 entries, 0 to 3585
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   rev_id               3586 non-null   float64
 1   review               3586 non-null   object 
 2   Name                 3586 non-null   object 
 3   Country              3584 non-null   object 
 4   Date                 3584 non-null   object 
 5   long_review          3586 non-null   object 
 6   aircraft             1914 non-null   object 
 7   type_of_traveller    2916 non-null   object 
 8   cabin_flown          3584 non-null   object 
 9   route                2912 non-null   object 
 10  date_flown           2909 non-null   object 
 11  seat_comfort         3471 non-null   float64
 12  cabin_staff_service  3460 non-null   float64
 13  food_and_beverages   3204 non-null   float64
 14  ground_service       2841 non-null   float64
 15  value_for_money      3586 non-null   f

<h1>Data Export</h1>

In [11]:
df.to_csv("AirlineReviews.csv", index=False)