# Data Preparation

In [17]:
import nltk
# Fetch every NLTK resource used further down in this notebook:
#   - 'omw-1.4' and 'wordnet' back WordNetLemmatizer
#   - 'stopwords' backs stopwords.words("english"); without it that call
#     raises LookupError on a machine that has never downloaded the corpus.
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vatsa\AppData\Roaming\nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vatsa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
import pandas as pd
import numpy as np
import requests

In [4]:
# Load the raw scraped reviews into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
raw_csv_path = "C:/Users/vatsa/Downloads/British-Airways-reviews-analysis/Data/BA_Data.csv"
Unclean_Data = pd.read_csv(raw_csv_path)

In [6]:
# Preview the first five rows of the raw data.
Unclean_Data.head(5)

Unnamed: 0.1,Unnamed: 0,reviews,stars,date,country
0,0,✅ Trip Verified | This is the first time I ha...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,2nd April 2024,United Kingdom
1,1,✅ Trip Verified | Flew business class from Do...,3,2nd April 2024,United Kingdom
2,2,✅ Trip Verified | Starting off at Heathrow Te...,4,28th March 2024,United Kingdom
3,3,Not Verified | We have flown this route with ...,8,28th March 2024,United Kingdom
4,4,✅ Trip Verified | A last minute business trip ...,1,26th March 2024,United Kingdom


In [8]:
# Flag each review whose text contains the "Trip Verified" marker.
is_verified = Unclean_Data['reviews'].str.contains("Trip Verified")
Unclean_Data['verified'] = is_verified

Unclean_Data['verified']

0        True
1        True
2        True
3       False
4        True
        ...  
3595    False
3596    False
3597    False
3598    False
3599    False
Name: verified, Length: 3600, dtype: bool

# For Reviews column

In [21]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

lemma = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Matches the leading banner (e.g. "✅ Trip Verified |") up to and including
# the FIRST "|" on the first line. The previous greedy pattern r'^.*\|' ran to
# the LAST "|" on that line, silently discarding review text that itself
# contained a pipe character.
_BANNER_RE = re.compile(r'^[^|\n]*\|')
# Anything that is not an ASCII letter is replaced by a space.
_NON_ALPHA_RE = re.compile('[^a-zA-Z]')


def clean_review(text):
    """Return a normalised, lemmatized version of a single review string.

    Steps, in order:
      1. drop everything up to and including the first "|" on the first line;
      2. lowercase the text;
      3. replace every non-alphabetic character with a space;
      4. split on whitespace and discard English stop words;
      5. lemmatize the remaining tokens and re-join them with single spaces.

    Parameters
    ----------
    text : str
        Raw review text, optionally prefixed with a "... |" banner.

    Returns
    -------
    str
        The cleaned review; may be empty if no tokens survive.
    """
    body = _BANNER_RE.sub('', text, count=1)
    body = _NON_ALPHA_RE.sub(' ', body.lower())
    tokens = [lemma.lemmatize(word) for word in body.split() if word not in stop_words]
    return " ".join(tokens)


# Series of raw review strings taken from the loaded DataFrame.
reviews_data = Unclean_Data.reviews

# Cleaned corpus, one entry per review, in the same order as reviews_data.
corpus = [clean_review(rev) for rev in reviews_data]


In [22]:
# Attach the cleaned text as a new column, row-for-row with the reviews.
Unclean_Data["corpus"] = pd.Series(corpus, index=Unclean_Data.index)

In [23]:
# Preview the first five rows, now including the 'verified' and 'corpus' columns.
Unclean_Data.head(5)

Unnamed: 0.1,Unnamed: 0,reviews,stars,date,country,verified,corpus
0,0,✅ Trip Verified | This is the first time I ha...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,2nd April 2024,United Kingdom,True,first time seen new club world suite seat comf...
1,1,✅ Trip Verified | Flew business class from Do...,3,2nd April 2024,United Kingdom,True,flew business class doha london st march servi...
2,2,✅ Trip Verified | Starting off at Heathrow Te...,4,28th March 2024,United Kingdom,True,starting heathrow terminal check fairly easy f...
3,3,Not Verified | We have flown this route with ...,8,28th March 2024,United Kingdom,False,flown route easyjet regularly twenty year age ...
4,4,✅ Trip Verified | A last minute business trip ...,1,26th March 2024,United Kingdom,True,last minute business trip hnd route regularly ...


# For Dates

In [24]:
# Inspect the dtype of every column before converting the date column.
Unclean_Data.dtypes

Unnamed: 0     int64
reviews       object
stars         object
date          object
country       object
verified        bool
corpus        object
dtype: object

In [25]:
# Parse the textual dates into datetime values and store them back
# in the same column.
parsed_dates = pd.to_datetime(Unclean_Data["date"])
Unclean_Data["date"] = parsed_dates

In [26]:
# Confirm the conversion by looking at the first few parsed dates.
Unclean_Data["date"].head()

0   2024-04-02
1   2024-04-02
2   2024-03-28
3   2024-03-28
4   2024-03-26
Name: date, dtype: datetime64[ns]

# For Stars

In [28]:
# List the distinct raw rating values to spot malformed entries.
Unclean_Data["stars"].unique()

array(['\n\t\t\t\t\t\t\t\t\t\t\t\t\t5', '3', '4', '8', '1', '5', '10',
       '9', '7', '2', '6', 'None'], dtype=object)

In [31]:
# Trim leading/trailing newlines and tabs from the ratings.
# str.strip treats its argument as a set of characters, so "\n\t" is enough.
Unclean_Data["stars"] = Unclean_Data["stars"].str.strip("\n\t")

Unclean_Data["stars"].value_counts()

1       867
2       411
3       407
8       355
10      298
9       292
7       288
5       253
4       243
6       182
None      4
Name: stars, dtype: int64

In [32]:
# Remove every row whose rating is the literal string "None".
none_star_rows = Unclean_Data.loc[Unclean_Data["stars"] == "None"].index
Unclean_Data = Unclean_Data.drop(index=none_star_rows)

In [34]:
# Re-check the distinct rating values after cleaning.
Unclean_Data["stars"].unique()

array(['5', '3', '4', '8', '1', '10', '9', '7', '2', '6'], dtype=object)

# For Null Values

In [35]:
# Count rows by their pattern of missing values across all columns.
Unclean_Data.isna().value_counts()

Unnamed: 0  reviews  stars  date   country  verified  corpus
False       False    False  False  False    False     False     3594
                                   True     False     False        2
dtype: int64

In [36]:
# Remove the rows that have no country recorded.
missing_country_rows = Unclean_Data.loc[Unclean_Data["country"].isna()].index
Unclean_Data = Unclean_Data.drop(index=missing_country_rows)

In [38]:
# Reset the index so it is contiguous again after the row drops above.
# reset_index returns a NEW frame; the result must be assigned back, otherwise
# the frame keeps its gapped index and that is what gets exported.
Unclean_Data = Unclean_Data.reset_index(drop=True)
Unclean_Data

Unnamed: 0.1,Unnamed: 0,reviews,stars,date,country,verified,corpus
0,0,✅ Trip Verified | This is the first time I ha...,5,2024-04-02,United Kingdom,True,first time seen new club world suite seat comf...
1,1,✅ Trip Verified | Flew business class from Do...,3,2024-04-02,United Kingdom,True,flew business class doha london st march servi...
2,2,✅ Trip Verified | Starting off at Heathrow Te...,4,2024-03-28,United Kingdom,True,starting heathrow terminal check fairly easy f...
3,3,Not Verified | We have flown this route with ...,8,2024-03-28,United Kingdom,False,flown route easyjet regularly twenty year age ...
4,4,✅ Trip Verified | A last minute business trip ...,1,2024-03-26,United Kingdom,True,last minute business trip hnd route regularly ...
...,...,...,...,...,...,...,...
3589,3595,Flight BA283 LHR-LAX on 31/5/2014 onboard G-XL...,7,2014-07-29,Netherlands,False,flight ba lhr lax onboard g xlee flight punctu...
3590,3596,Round-trip with family from JFK to HYD via LHR...,8,2014-07-29,United States,False,round trip family jfk hyd via lhr onward fligh...
3591,3597,LHR-BSL-LHR 22nd/23rd July BA748 and BA755 A31...,5,2014-07-29,United Kingdom,False,lhr bsl lhr nd rd july ba ba way bag drop seem...
3592,3598,We flew to Rome from Newcastle via Heathrow th...,3,2014-07-29,United Kingdom,False,flew rome newcastle via heathrow ground staff ...


In [39]:
# Write the cleaned DataFrame to disk as CSV (the index is written as the
# first column, which is the to_csv default).
cleaned_csv_path = "C:/Users/vatsa/Downloads/British-Airways-reviews-analysis/Data/cleaned-BA-reviews.csv"
Unclean_Data.to_csv(cleaned_csv_path)