In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#regex
import re

In [2]:
#create a dataframe from csv file

cwd = os.getcwd()

df = pd.read_csv(cwd+"/BA_reviews.csv", index_col=0)

In [3]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,✅ Trip Verified | Easy check in a T5. Galleri...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,13th August 2023,United Kingdom
1,"Not Verified | Flight delayed by an hour, it ...",8,12th August 2023,United States
2,Not Verified | The staff are very rude and not...,2,11th August 2023,United Kingdom
3,✅ Trip Verified | Good domestic flight operat...,2,8th August 2023,United Kingdom
4,Not Verified | Failed at all basic travel fund...,10,8th August 2023,Canada


In [4]:
df.shape

(3500, 4)

create a column which mentions if the user is verified or not.

In [5]:
df['verified'] = df.reviews.str.contains("Trip Verified")
df['verified']

0        True
1       False
2       False
3        True
4       False
        ...  
3495    False
3496    False
3497    False
3498    False
3499    False
Name: verified, Length: 3500, dtype: bool

# Cleaning Reviews

 extract the column of reviews into a separate dataframe and clean it for semantic analysis

In [19]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...


True

In [20]:
#for lemmatization of words we will use nltk library
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()

reviews_data = df.reviews.str.strip("✅ Trip Verified |")

# #create an empty list to collect cleaned data corpus
corpus =[]

# #loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

In [21]:
# add the corpus to the original dataframe

df['corpus'] = corpus
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | Easy check in a T5. Galleri...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,13th August 2023,United Kingdom,True,easy check gallery south north lounge packed a...
1,"Not Verified | Flight delayed by an hour, it ...",8,12th August 2023,United States,False,verified flight delayed hour happens biggie u ...
2,Not Verified | The staff are very rude and not...,2,11th August 2023,United Kingdom,False,verified staff rude trained properly exception...
3,✅ Trip Verified | Good domestic flight operat...,2,8th August 2023,United Kingdom,True,good domestic flight operated ba cityflyer gro...
4,Not Verified | Failed at all basic travel fund...,10,8th August 2023,Canada,False,verified failed basic travel fundamental fligh...


# Cleaning/Fromat date

In [22]:
df.dtypes

reviews     object
stars       object
date        object
country     object
verified      bool
corpus      object
dtype: object

In [24]:
# convert the date to datetime format

df.date = pd.to_datetime(df.date)
df.date.head()

0   2023-08-13
1   2023-08-12
2   2023-08-11
3   2023-08-08
4   2023-08-08
Name: date, dtype: datetime64[ns]

# Cleaning ratings with stars

In [25]:
#check for unique values
df.stars.unique()

array(['\n\t\t\t\t\t\t\t\t\t\t\t\t\t5', '8', '2', '10', '1', '9', '6',
       '4', '3', '7', '5', 'None'], dtype=object)

In [26]:
# remove the \t and \n from the ratings
df.stars = df.stars.str.strip("\n\t\t\t\t\t\t\t\t\t\t\t\t\t")

In [27]:
df.stars.value_counts()

1       810
2       402
3       396
8       347
10      296
9       291
7       291
5       249
4       239
6       173
None      6
Name: stars, dtype: int64

There are 6 rows having values "None" in the ratings. We will drop all these 6 rows.

In [28]:
# drop the rows where the value of ratings is None
df.drop(df[df.stars == "None"].index, axis=0, inplace=True)

In [29]:
#check the unique values again
df.stars.unique()

array(['5', '8', '2', '10', '1', '9', '6', '4', '3', '7'], dtype=object)

# Check for null Values

In [31]:
df.isnull().sum()

reviews     0
stars       0
date        0
country     2
verified    0
corpus      0
dtype: int64

We have two missing values for country. For this we can just remove those two reviews (rows) from the dataframe

In [32]:
#drop the rows using index where the country value is null
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [33]:
df.shape

(3492, 6)

In [34]:
#resetting the index
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | Easy check in a T5. Galleri...,5,2023-08-13,United Kingdom,True,easy check gallery south north lounge packed a...
1,"Not Verified | Flight delayed by an hour, it ...",8,2023-08-12,United States,False,verified flight delayed hour happens biggie u ...
2,Not Verified | The staff are very rude and not...,2,2023-08-11,United Kingdom,False,verified staff rude trained properly exception...
3,✅ Trip Verified | Good domestic flight operat...,2,2023-08-08,United Kingdom,True,good domestic flight operated ba cityflyer gro...
4,Not Verified | Failed at all basic travel fund...,10,2023-08-08,Canada,False,verified failed basic travel fundamental fligh...
...,...,...,...,...,...,...
3487,Having taken several domestic flights to Londo...,3,2014-06-25,United Kingdom,False,taken several domestic flight london heathrow ...
3488,Have flown with BA several times over past few...,8,2014-06-25,United Kingdom,False,flown ba several time past year mostly economy...
3489,LHR to VIE BA 704 23 June 2014. The meal servi...,9,2014-06-25,United Kingdom,False,lhr vie ba june meal service euro traveller ap...
3490,Flew from LHR to Hong Kong April 13th 2014 BA ...,6,2014-06-25,United Kingdom,False,flew lhr hong kong april th ba flight took hou...


Now our data is all cleaned and ready for data visualization and data analysis.

In [35]:
# export the cleaned data

df.to_csv(cwd + "/cleaned-BA-reviews.csv")