## GETTING THE DATA

In [3]:
from bs4 import BeautifulSoup
import requests as requests
import pandas as pd
import re # for regular expressions 

In [None]:
ratingValue = []
reviewTitle = []
location = []
reviewArticle = []

for i in range(1,201): # Scrape 200 pages
    try:
        URL = f"https://www.airlinequality.com/airline-reviews/british-airways/page/{i}/"
        page = requests.get(URL)
        page = page.content
        soup = BeautifulSoup(page, "html.parser")

        all_reviews = soup.find_all("article", itemprop="review")

        
        for review_element in all_reviews:
            ratingValues = review_element.find("span", itemprop="ratingValue").text.strip()
            ratingValue.append(ratingValues)

            reviewTitles = review_element.find("h2", class_="text_header").text.strip()
            reviewTitle.append(reviewTitles)

            locations = review_element.find("h3", class_="text_sub_header").text.strip()
            location.append(locations)

            reviewArticles = review_element.find("div", class_="text_content").text.strip()
            reviewArticle.append(reviewArticles)

        print("Page ", i, " complete") # To know number of pages scraped 
        print(len(ratingValue), "reviews scraped") # To know number of reviews scraped
    except AttributeError:
        continue

In [None]:
df = pd.DataFrame({"ratingValue":ratingValue, "reviewTitle":reviewTitle, "location":location, "reviewArticle":reviewArticle})
df.sample(3)

## INSPECTING AND CLEANING THE DATA

In [2]:
df.info() # Get to know the data

NameError: name 'df' is not defined

In [None]:
sum(df.duplicated()) # Check for duplicated items

In [4]:
# convert dataframe into csv file
# df.to_csv('BA.csv', index=False)

# Read csv
df = pd.read_csv('BA.csv')
df

Unnamed: 0,ratingValue,reviewTitle,location,reviewArticle
0,8,"""sufficient leg and arm room""",Graeme Boothman (United Kingdom) 8th November ...,✅ Trip Verified | Booked online months ago an...
1,7,“crew were polite”,R Vines (United Kingdom) 7th November 2023,✅ Trip Verified | The flight was on time. The...
2,2,"""Angry, disappointed, and unsatisfied""",Massimo Tricca (Italy) 5th November 2023,"Not Verified | Angry, disappointed, and unsat..."
3,3,"""BA now stands for Basic Airways""",J Kaye (United Kingdom) 5th November 2023,"✅ Trip Verified | As an infrequent flyer, Bri..."
4,8,"""A totally unremarkable flight""",M Collie (Ireland) 4th November 2023,"Not Verified | A totally unremarkable flight,..."
...,...,...,...,...
1995,3,"""another underwhelming experience""",Clive Drake (United Kingdom) 13th November 2016,✅ Verified Review | The flight started in the...
1996,3,"""underwhelming due to bean counters""",Clive Drake (United Kingdom) 12th November 2016,✅ Verified Review | The flight started badly ...
1997,4,"""bit amateur for business class """,R Gordon (United Kingdom) 11th November 2016,Gatwick to Alicante. Crew friendly but a bit a...
1998,3,"""was one bad trip too many""",P Cleary (United Kingdom) 10th November 2016,✅ Verified Review | London Heathrow to Bangko...


#### Things to clean
1. Convert ratingValue to int
2. Remove quotation symbols from reviewTitle and convert to string
3. Get text inside bracket in location and convert to string
4. Split reviewArticle into verification and actual article columns
5. Remove check symbol from verification column

In [5]:
#1
df['ratingValue'] = df['ratingValue'].astype(int) # Convert ratingValue to int

In [6]:
#3
df['location'] = df['location'].astype(str)   # Convert to string
df['year'] = df.location.str[-4:]
df['location'] = df['location'].str.extract(r'\((.*?)\)')    # Get text inside bracket
df['location'] = df['location'].str.lower()
df['location'] = df['location'].str.strip()

In [7]:
# I only want data from the past 5 years i.e 2018-2023 so I'd be dropping rows where year isn't >= 2018 
df['year'] = df['year'].astype(int) # Convert ratingValue to float
df = df[df.year >= 2018]

In [8]:
df = df.dropna()

In [9]:
df.info()
df

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1332 entries, 0 to 1331
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ratingValue    1332 non-null   int32 
 1   reviewTitle    1332 non-null   object
 2   location       1332 non-null   object
 3   reviewArticle  1332 non-null   object
 4   year           1332 non-null   int32 
dtypes: int32(2), object(3)
memory usage: 52.0+ KB


Unnamed: 0,ratingValue,reviewTitle,location,reviewArticle,year
0,8,"""sufficient leg and arm room""",united kingdom,✅ Trip Verified | Booked online months ago an...,2023
1,7,“crew were polite”,united kingdom,✅ Trip Verified | The flight was on time. The...,2023
2,2,"""Angry, disappointed, and unsatisfied""",italy,"Not Verified | Angry, disappointed, and unsat...",2023
3,3,"""BA now stands for Basic Airways""",united kingdom,"✅ Trip Verified | As an infrequent flyer, Bri...",2023
4,8,"""A totally unremarkable flight""",ireland,"Not Verified | A totally unremarkable flight,...",2023
...,...,...,...,...,...
1327,1,"""go with a budget airline""",united kingdom,✅ Trip Verified | Amsterdam to London. Servic...,2018
1328,8,"""I was pleasantly surprised""",united kingdom,✅ Trip Verified | London to Johannesburg. I w...,2018
1329,1,"""BA is now as bad as Ryanair""",united kingdom,✅ Trip Verified | Will never fly with BA agai...,2018
1330,1,"""waiting on my luggage for five days""",united states,✅ Trip Verified | London to Tel Aviv. I have...,2018


In [10]:
#2
df['reviewTitle'] = df['reviewTitle'].str.lower()
df['reviewTitle'] = df['reviewTitle'].str.replace('“','\"') 
df['reviewTitle'] = df['reviewTitle'].str.replace('”','\"')
df['reviewTitle'] = df['reviewTitle'].str.extract(r'\"(.*?)\"')  # Get text in quotation symbols
df['reviewTitle'] = df['reviewTitle'].str.strip()

def cleanTitle(reviewTitle):
    reviewTitle = re.sub(':','',str(reviewTitle))  # Removing the : symbols
    reviewTitle = re.sub('\"','',str(reviewTitle)) # Removing the " symbols
    reviewTitle = re.sub('\”','',str(reviewTitle)) # Removing the ” symbols
    reviewTitle = re.sub('\“','',str(reviewTitle)) # Removing the “ symbols
    reviewTitle = re.sub('\’','',str(reviewTitle)) # Removing the ’ symbols
    reviewTitle = re.sub('\/','',str(reviewTitle)) # Removing the / symbols
    reviewTitle = re.sub('\|','',str(reviewTitle)) # Removing the | symbols
    reviewTitle = re.sub('\?','',str(reviewTitle)) # Removing the ? symbols
    reviewTitle = re.sub('\.','',str(reviewTitle)) # Removing the . symbols
    reviewTitle = re.sub('\,','',str(reviewTitle)) # Removing the , symbols
    reviewTitle = re.sub('\!','',str(reviewTitle)) # Removing the ! symbols
    reviewTitle = re.sub('-','',str(reviewTitle)) # Removing the - symbols
    reviewTitle = re.sub('\'','',str(reviewTitle)) # Removing the ' symbols
    reviewTitle = re.sub('\*','',str(reviewTitle)) # Removing the * symbols
    reviewTitle = re.sub('[0-9]','',str(reviewTitle)) # Removing digits
    return reviewTitle


df['cleanTitle'] = df['reviewTitle'].apply(cleanTitle)

In [11]:
#4
df['reviewArticle'] = df['reviewArticle'].astype(str)   # Convert to string
df['reviewArticle'] = df['reviewArticle'].str.lower()
df['reviewArticle'] = df['reviewArticle'].str.strip()
df[['Verification','reviewArticle']] = df['reviewArticle'].str.split('|',expand=True)


def cleanArticle(reviewArticle):
    reviewArticle = re.sub(':','',str(reviewArticle))  # Removing the : symbols
    reviewArticle = re.sub('\"','',str(reviewArticle)) # Removing the " symbols
    reviewArticle = re.sub('\”','',str(reviewArticle)) # Removing the ” symbols
    reviewArticle = re.sub('\“','',str(reviewArticle)) # Removing the “ symbols
    reviewArticle = re.sub('\’','',str(reviewArticle)) # Removing the ’ symbols
    reviewArticle = re.sub('\/','',str(reviewArticle)) # Removing the / symbols
    reviewArticle = re.sub('\|','',str(reviewArticle)) # Removing the | symbols
    reviewArticle = re.sub('\?','',str(reviewArticle)) # Removing the ? symbols
    reviewArticle = re.sub('\.','',str(reviewArticle)) # Removing the . symbols
    reviewArticle = re.sub('\,','',str(reviewArticle)) # Removing the , symbols
    reviewArticle = re.sub('\!','',str(reviewArticle)) # Removing the ! symbols
    reviewArticle = re.sub('-','',str(reviewArticle)) # Removing the - symbols
    reviewArticle = re.sub('\'','',str(reviewArticle)) # Removing the ' symbols
    reviewArticle = re.sub('\*','',str(reviewArticle)) # Removing the * symbols
    reviewArticle = re.sub('[0-9]','',str(reviewArticle)) # Removing digits
    return reviewArticle

df['cleanArticle'] = df['reviewArticle'].apply(cleanArticle)

In [12]:
#5
def cleanVerification(Verification):

    Verification = re.sub('✅ trip verified','yes',Verification)
    Verification = re.sub('not verified','no',Verification)

    return Verification

df['confirmedVerification'] = df['Verification'].apply(cleanVerification)

df.drop(columns=['Verification','reviewArticle', 'reviewTitle'], inplace=True)

## PRE-PROCESSING

 - Tokenization: This is breaking the raw text into small chunks(words, sentences) called tokens. These tokens help in understanding the context or developing the model for NLP. Tokenization helps in interpreting the meaning of the text by analyzing the sequence of the words.

 - Stop words Removal: Stop words are commonly used words which are generally filtered out before processing a natural language. These are actually the most common words in any language (like articles, prepositions, pronouns, conjunctions, etc) and do not add much information to the text

  - Lemmatization entails reducing a word to its dictionary form

In [17]:
import nltk  # text analysis
nltk.download('stopwords')

stop = stopwords.words('english')
stop2 = ["ive", "im", "youre", "youve", "youll", "youd", "shes", "thatll", "dont", 
                "shouldve", "arent", "couldnt", "didnt", "doesnt", "hadnt", "hasnt", "havent",
                 "isnt", "mightnt", "mustnt", "neednt", "shant", "shouldnt", "wasnt", "werent", "wont", 'wouldnt',
                'theres', "there'is", "ba", "british airways", "airways", "british"]

stop.extend(stop2)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\23490\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


NameError: name 'stopwords' is not defined

In [16]:
from nltk.tokenize import word_tokenize # to create word tokens
from nltk.stem import WordNetLemmatizer # to reduce words to orginal form

def preprocessTitle(cleanTitle):

    title_tokens = word_tokenize(cleanTitle)  # convert string to tokens
    filtered_title = [x for x in title_tokens if x not in stop] # Remove stopwords
    filtered_title = ' '.join(filtered_title)

    return "".join(filtered_title)  # join words with a space in between them

def processedTitle(processedTitle):

    title_tokens = word_tokenize(processedTitle)  # convert string to tokens
    lemmatizer = WordNetLemmatizer() # instatiate an object WordNetLemmatizer Class
    lemma_words = [lemmatizer.lemmatize(x) for x in title_tokens]
    return " ".join(lemma_words)


df['processedTitle'] = df['cleanTitle'].apply(preprocessTitle)
df['processedTitle'] = df['processedTitle'].apply(processedTitle)


NameError: name 'stop' is not defined

In [40]:
from nltk.tokenize import word_tokenize # to create word tokens
from nltk.stem import WordNetLemmatizer # to reduce words to orginal form

def preprocessArticle(cleanArticle):

    Article_tokens = word_tokenize(cleanArticle)  # convert string to tokens
    filtered_Article = [x for x in Article_tokens if x not in stop] # Remove stopwords
    filtered_Article = ' '.join(filtered_Article)

    return "".join(filtered_title)  # join words with a space in between them

def processedArticle(processedArticle):

    Article_tokens = word_tokenize(processedArticle)  # convert string to tokens
    lemmatizer = WordNetLemmatizer() # instatiate an object WordNetLemmatizer Class
    lemma_words = [lemmatizer.lemmatize(x) for x in Article_tokens]
    return " ".join(lemma_words)


df['processedArticle'] = df['cleanArticle'].apply(preprocessArticle)
df['processedArticle'] = df['processedArticle'].apply(processedArticle)


Unnamed: 0,ratingValue,location,year,cleanTitle,cleanArticle,confirmedVerification,processedTitle
0,8,united kingdom,2023,sufficient leg and arm room,booked online months ago and the only hitch ...,yes,sufficient leg arm room
1,7,united kingdom,2023,crew were polite,the flight was on time the crew were polite ...,yes,crew polite
2,2,italy,2023,angry disappointed and unsatisfied,angry disappointed and unsatisfied my route ...,no,angry disappointed unsatisfied
3,3,united kingdom,2023,ba now stands for basic airways,as an infrequent flyer british airways was a...,yes,stand basic
4,8,ireland,2023,a totally unremarkable flight,a totally unremarkable flight on time as com...,no,totally unremarkable flight


In [1]:
df1 = df.copy()

NameError: name 'df' is not defined

In [None]:
from nltk.tokenize import word_tokenize # to create word tokens
from nltk.stem import WordNetLemmatizer # to reduce words to orginal form

def preprocessArticle(cleanArticle):

    Article_tokens = word_tokenize(cleanArticle)  # convert string to tokens

    filtered_Article = [x for x in Article_tokens if x not in stop] # Remove stopwords

    lemma_words = [lemmatizer.lemmatize(x) for x in filtered_Article]
    
    lemmatizer = WordNetLemmatizer() # instatiate an object WordNetLemmatizer Class
    
    return " ".join(lemma_words)


df1['processedArticle'] = df1['cleanArticle'].apply(preprocessArticle)
df1['processedArticle'] = df1['processedArticle'].apply(processedArticle)
