# Importing and cleaning data

In [16]:
#imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#regex
import re

#create a dataframe from csv file
cwd = os.getcwd()

df = pd.read_csv(cwd+"/BA.csv", index_col=0)
df.head()

Unnamed: 0,Reviews,Ratings,Aircraft Type,Traveller,Countries,Flight Ticket,Route,Date Flown,Seat Comfort /5,Cabin Staff Services /5,Food and beverages /5,Ground service /5,Value for money /5,Recommended
0,✅ Trip Verified | At 7.54 am on the day of tr...,2.0,,Solo Leisure,United Kingdom,Economy Class,London to Los Angeles,March 2023,1.0,2.0,1.0,3.0,1,no
1,✅ Trip Verified | Would happily fly them agai...,10.0,Boeing 777 / A320,Solo Leisure,United States,Economy Class,New York to Istanbul via London,March 2023,5.0,5.0,5.0,5.0,5,yes
2,"Not Verified | Flew premium, only worth the e...",4.0,,Couple Leisure,United Kingdom,Premium Economy,London Heathrow to Las Vegas,March 2023,3.0,2.0,1.0,2.0,3,no
3,✅ Trip Verified | First our morning flight wa...,1.0,A321neo,Business,Canada,Business Class,London to Cairo,January 2023,1.0,2.0,1.0,3.0,1,no
4,✅ Trip Verified | Although it was a bit uncom...,8.0,Boeing 787,Solo Leisure,United Kingdom,Economy Class,London Singapore,February 2023,3.0,5.0,5.0,5.0,4,yes


In [17]:
# creating 'vertified' column from "reviews"
df['verified'] = df.Reviews.str.contains("Trip Verified")
df['verified']

0        True
1        True
2       False
3        True
4        True
        ...  
3480    False
3481    False
3482    False
3483    False
3484    False
Name: verified, Length: 3485, dtype: bool

REVIEW CLEANING

In [18]:
#for lemmatization of words we will use nltk library
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()


reviews_data = df.Reviews.str.strip("✅ Trip Verified |")

#create an empty list to collect cleaned data corpus
corpus =[]

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

# add the corpus to the original dataframe
df['corpus'] = corpus

In [19]:
# date format
df.dtypes
# convert the "Date Flown" column to datetime format
df['Date Flown'] = pd.to_datetime(df['Date Flown'])
df['Date Flown'].head()

0   2023-03-01
1   2023-03-01
2   2023-03-01
3   2023-01-01
4   2023-02-01
Name: Date Flown, dtype: datetime64[ns]

In [20]:
df['Date Flown']

0      2023-03-01
1      2023-03-01
2      2023-03-01
3      2023-01-01
4      2023-02-01
          ...    
3480          NaT
3481          NaT
3482          NaT
3483          NaT
3484          NaT
Name: Date Flown, Length: 3485, dtype: datetime64[ns]

In [21]:
# Check the structure of the dataset
print(df.head())
print(df.info())

                                             Reviews  Ratings  \
0  ✅ Trip Verified |  At 7.54 am on the day of tr...      2.0   
1  ✅ Trip Verified |  Would happily fly them agai...     10.0   
2  Not Verified |  Flew premium, only worth the e...      4.0   
3  ✅ Trip Verified |  First our morning flight wa...      1.0   
4  ✅ Trip Verified |  Although it was a bit uncom...      8.0   

       Aircraft Type       Traveller       Countries    Flight Ticket  \
0                NaN    Solo Leisure  United Kingdom    Economy Class   
1  Boeing 777 / A320    Solo Leisure   United States    Economy Class   
2                NaN  Couple Leisure  United Kingdom  Premium Economy   
3            A321neo        Business          Canada   Business Class   
4         Boeing 787    Solo Leisure  United Kingdom    Economy Class   

                             Route Date Flown  Seat Comfort /5  \
0            London to Los Angeles 2023-03-01              1.0   
1  New York to Istanbul via London 202

In [22]:
# Explore the variables and their distributions
print(df.describe())

# Check for missing values
print(df.isnull().sum())


           Ratings  Seat Comfort /5  Cabin Staff Services /5  \
count  3480.000000      3385.000000              3377.000000   
mean      4.843103         2.902511                 3.279538   
std       3.163383         1.361171                 1.487320   
min       1.000000         1.000000                 1.000000   
25%       2.000000         2.000000                 2.000000   
50%       4.000000         3.000000                 4.000000   
75%       8.000000         4.000000                 5.000000   
max      10.000000         5.000000                 5.000000   

       Food and beverages /5  Ground service /5  Value for money /5  
count            3147.000000        2647.000000         3485.000000  
mean                2.736257           2.845485            2.738020  
std                 1.442597           1.446685            1.468167  
min                 0.000000           1.000000            0.000000  
25%                 1.000000           1.000000            1.000000  
50%

### saving checkpoint

In [9]:
import os

cwd = os.getcwd()
df.to_csv(cwd+ "/BA_amended.csv")

In [10]:
df = pd.read_csv(cwd+"/BA_amended.csv", index_col=0)

In [12]:
# removing empty columns
cols_to_check = ['Reviews', 'Ratings', 'Aircraft Type', 'Countries',
                 'Flight Ticket', 'Route', 'Date Flown',
                'Seat Comfort /5', 'Cabin Staff Services /5',
                 'Food and beverages /5', 'Ground service /5',
                'Value for money /5', 'Recommended', 'verified',
                'corpus'] # replace with the names of the columns you want to check for missing values

# select only the rows where at least one value is missing in any of the columns
missing_rows = df[df[cols_to_check].isnull().any(axis=1)]

# display the number of rows with missing values
print(f"{len(missing_rows)} rows with missing values.")

# drop the rows with missing values and update the DataFrame in place
df.drop(missing_rows.index, inplace=True)

# display the number of remaining rows in the DataFrame
print(f"{len(df)} rows remaining after dropping missing values.")

1780 rows with missing values.
1705 rows remaining after dropping missing values.


In [13]:
df.head(40)

Unnamed: 0,Reviews,Ratings,Aircraft Type,Traveller,Countries,Flight Ticket,Route,Date Flown,Seat Comfort /5,Cabin Staff Services /5,Food and beverages /5,Ground service /5,Value for money /5,Recommended,verified,corpus
1,✅ Trip Verified | Would happily fly them agai...,10.0,Boeing 777 / A320,Solo Leisure,United States,Economy Class,New York to Istanbul via London,March 2023,5.0,5.0,5.0,5.0,5,yes,True,would happily fly personal emergency allowed r...
3,✅ Trip Verified | First our morning flight wa...,1.0,A321neo,Business,Canada,Business Class,London to Cairo,January 2023,1.0,2.0,1.0,3.0,1,no,True,first morning flight cancelled moved afternoon...
4,✅ Trip Verified | Although it was a bit uncom...,8.0,Boeing 787,Solo Leisure,United Kingdom,Economy Class,London Singapore,February 2023,3.0,5.0,5.0,5.0,4,yes,True,although bit uncomfortable flight economy flig...
5,✅ Trip Verified | Boarding was decently organ...,6.0,A380,Business,Belgium,Business Class,London to Dubai,February 2023,2.0,3.0,2.0,4.0,3,yes,True,boarding decently organised still rather stran...
6,✅ Trip Verified | Boarding on time and departu...,7.0,A320,Business,Belgium,Business Class,Brussels to London,February 2023,2.0,4.0,4.0,4.0,4,yes,True,boarding time departure time flight london hea...
9,✅ Trip Verified | Very competent check in sta...,4.0,A320,Couple Leisure,United Arab Emirates,Economy Class,Faro to Gatwick,February 2023,2.0,2.0,1.0,5.0,2,no,True,competent check staff saw problem left arm ins...
10,"✅ Trip Verified | Check in was so slow, no se...",5.0,A320,Family Leisure,United Kingdom,Economy Class,Oslo to London,February 2023,3.0,3.0,1.0,2.0,3,no,True,check slow self check bag drop boarding ok fli...
11,✅ Trip Verified | My review relates to the ap...,1.0,A320,Solo Leisure,United Kingdom,Economy Class,London to Madrid,February 2023,2.0,1.0,1.0,1.0,2,no,True,review relates appalling experience british ai...
14,✅ Trip Verified | BA 242 on the 6/2/23. Boardi...,9.0,Boeing 787-9,Couple Leisure,United Kingdom,Business Class,Mexico City to London,February 2023,5.0,5.0,4.0,5.0,5,yes,True,ba boarding delayed due late arrival incoming ...
15,✅ Trip Verified | Not only my first flight in...,10.0,A320,Solo Leisure,Spain,Economy Class,Palma to Gatwick,January 2023,5.0,5.0,5.0,5.0,5,yes,True,first flight year also first time back england...


In [14]:
df.reset_index(drop=True)

Unnamed: 0,Reviews,Ratings,Aircraft Type,Traveller,Countries,Flight Ticket,Route,Date Flown,Seat Comfort /5,Cabin Staff Services /5,Food and beverages /5,Ground service /5,Value for money /5,Recommended,verified,corpus
0,✅ Trip Verified | Would happily fly them agai...,10.0,Boeing 777 / A320,Solo Leisure,United States,Economy Class,New York to Istanbul via London,March 2023,5.0,5.0,5.0,5.0,5,yes,True,would happily fly personal emergency allowed r...
1,✅ Trip Verified | First our morning flight wa...,1.0,A321neo,Business,Canada,Business Class,London to Cairo,January 2023,1.0,2.0,1.0,3.0,1,no,True,first morning flight cancelled moved afternoon...
2,✅ Trip Verified | Although it was a bit uncom...,8.0,Boeing 787,Solo Leisure,United Kingdom,Economy Class,London Singapore,February 2023,3.0,5.0,5.0,5.0,4,yes,True,although bit uncomfortable flight economy flig...
3,✅ Trip Verified | Boarding was decently organ...,6.0,A380,Business,Belgium,Business Class,London to Dubai,February 2023,2.0,3.0,2.0,4.0,3,yes,True,boarding decently organised still rather stran...
4,✅ Trip Verified | Boarding on time and departu...,7.0,A320,Business,Belgium,Business Class,Brussels to London,February 2023,2.0,4.0,4.0,4.0,4,yes,True,boarding time departure time flight london hea...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1700,"BA0567 15/6/15. There was a delay, which I und...",8.0,A320,Solo Leisure,United Kingdom,Economy Class,LHR to Milan Malpensa,June 2015,4.0,5.0,4.0,4.0,4,yes,False,ba delay understand accept crew helpful went w...
1701,"I flew from Seoul to London, business class an...",6.0,boeing 787,Solo Leisure,United Kingdom,Business Class,Seoul to London,June 2015,4.0,3.0,3.0,3.0,3,yes,False,flew seoul london business class surprised muc...
1702,"Ended up on a 25-year-old 747, the second-olde...",6.0,B747-400,Solo Leisure,Canada,Economy Class,LHR to YYZ,May 2015,3.0,4.0,4.0,4.0,4,yes,False,ended year old second oldest ba fleet boy show...
1703,We were boarded quickly but suffered a weather...,9.0,A320,Couple Leisure,United Kingdom,Economy Class,LGW to VCE,June 2015,3.0,5.0,3.0,4.0,4,yes,False,boarded quickly suffered weather delay hour lo...


In [15]:
# export the cleaned data
df.to_csv(cwd + "/cleaned-BA-reviews.csv")