In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('BA_reviews.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,reviews,stars,date,country
0,0,Not Verified | Flown with BA four times. As an...,5.0,25th August 2024,United Kingdom
1,1,✅ Trip Verified | You may never see your ref...,10.0,22nd August 2024,United States
2,2,Not Verified | Cargo service: that's how I fel...,1.0,21st August 2024,United States
3,3,✅ Trip Verified | Inefficient and slow airport...,5.0,21st August 2024,United Kingdom
4,4,Not Verified | The customer service is one of...,4.0,18th August 2024,United States


In [5]:
df=df.drop('Unnamed: 0', axis=1)

In [6]:
df

Unnamed: 0,reviews,stars,date,country
0,Not Verified | Flown with BA four times. As an...,5.0,25th August 2024,United Kingdom
1,✅ Trip Verified | You may never see your ref...,10.0,22nd August 2024,United States
2,Not Verified | Cargo service: that's how I fel...,1.0,21st August 2024,United States
3,✅ Trip Verified | Inefficient and slow airport...,5.0,21st August 2024,United Kingdom
4,Not Verified | The customer service is one of...,4.0,18th August 2024,United States
...,...,...,...,...
3495,Heathrow to Bergen Business no priority boardi...,4.0,22nd October 2014,United Kingdom
3496,BA176 JFK to Heathrow 13 October 2014. The out...,8.0,22nd October 2014,United Kingdom
3497,Very impressed with Premium Economy on BA Sydn...,3.0,22nd October 2014,Australia
3498,A319 Barcelona to Heathrow Business no announc...,7.0,22nd October 2014,United Kingdom


In [7]:
df['verified'] = df.reviews.str.contains("Trip Verified")

In [8]:
df['verified']

0       False
1        True
2       False
3        True
4       False
        ...  
3495    False
3496    False
3497    False
3498    False
3499    False
Name: verified, Length: 3500, dtype: bool

In [20]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # Optional but recommended for WordNet Lemmatizer

# Initialize the lemmatizer
lemma = WordNetLemmatizer()

# Clean the review data by removing the "✅ Trip Verified |" prefix
reviews_data = df['reviews'].str.replace("✅ Trip Verified |", "", regex=False)

# Create an empty list to collect cleaned data corpus
corpus = []

# Loop through each review, remove punctuations, convert to lowercase, lemmatize, and remove stopwords
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]', ' ', rev)  # Use re.sub to replace non-alphabetical characters with a space
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\madna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\madna\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\madna\AppData\Roaming\nltk_data...


In [21]:
df['corpus'] = corpus

In [22]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | Flown with BA four times. As an...,5.0,25th August 2024,United Kingdom,False,verified flown ba four time anxious traveller ...
1,✅ Trip Verified | You may never see your ref...,10.0,22nd August 2024,United States,True,may never see refund cancelled return flight s...
2,Not Verified | Cargo service: that's how I fel...,1.0,21st August 2024,United States,False,verified cargo service felt upper deck suppose...
3,✅ Trip Verified | Inefficient and slow airport...,5.0,21st August 2024,United Kingdom,True,inefficient slow airport operation resting lau...
4,Not Verified | The customer service is one of...,4.0,18th August 2024,United States,False,verified customer service one worst ever seen ...


In [23]:
df.dtypes

reviews      object
stars       float64
date         object
country      object
verified       bool
corpus       object
dtype: object

In [26]:
import pandas as pd
import re

# Function to remove ordinal suffixes from day in date string
def remove_suffix(date_str):
    return re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)

# Apply the function to the date column to clean it
df['date'] = df['date'].apply(remove_suffix)

# Convert the cleaned date strings to datetime objects
df['date'] = pd.to_datetime(df['date'], format='%d %B %Y')

# Display the DataFrame to verify the conversion
print(df.head())


                                             reviews  stars       date  \
0  Not Verified | Flown with BA four times. As an...    5.0 2024-08-25   
1  ✅ Trip Verified |   You may never see your ref...   10.0 2024-08-22   
2  Not Verified | Cargo service: that's how I fel...    1.0 2024-08-21   
3  ✅ Trip Verified | Inefficient and slow airport...    5.0 2024-08-21   
4  Not Verified |  The customer service is one of...    4.0 2024-08-18   

          country  verified                                             corpus  
0  United Kingdom     False  verified flown ba four time anxious traveller ...  
1   United States      True  may never see refund cancelled return flight s...  
2   United States     False  verified cargo service felt upper deck suppose...  
3  United Kingdom      True  inefficient slow airport operation resting lau...  
4   United States     False  verified customer service one worst ever seen ...  


In [27]:
df.stars.unique()

array([ 5., 10.,  1.,  4.,  2.,  8.,  3.,  6.,  9.,  7., nan])

In [28]:
df.dropna(axis=0)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | Flown with BA four times. As an...,5.0,2024-08-25,United Kingdom,False,verified flown ba four time anxious traveller ...
1,✅ Trip Verified | You may never see your ref...,10.0,2024-08-22,United States,True,may never see refund cancelled return flight s...
2,Not Verified | Cargo service: that's how I fel...,1.0,2024-08-21,United States,False,verified cargo service felt upper deck suppose...
3,✅ Trip Verified | Inefficient and slow airport...,5.0,2024-08-21,United Kingdom,True,inefficient slow airport operation resting lau...
4,Not Verified | The customer service is one of...,4.0,2024-08-18,United States,False,verified customer service one worst ever seen ...
...,...,...,...,...,...,...
3495,Heathrow to Bergen Business no priority boardi...,4.0,2014-10-22,United Kingdom,False,heathrow bergen business priority boarding bus...
3496,BA176 JFK to Heathrow 13 October 2014. The out...,8.0,2014-10-22,United Kingdom,False,ba jfk heathrow october outward flight awful r...
3497,Very impressed with Premium Economy on BA Sydn...,3.0,2014-10-22,Australia,False,impressed premium economy ba sydney heathrow r...
3498,A319 Barcelona to Heathrow Business no announc...,7.0,2014-10-22,United Kingdom,False,barcelona heathrow business announcement board...


In [29]:
df.stars.unique()

array([ 5., 10.,  1.,  4.,  2.,  8.,  3.,  6.,  9.,  7., nan])

In [30]:
df.stars.value_counts()

stars
1.0     863
2.0     406
3.0     402
8.0     339
10.0    284
9.0     273
7.0     273
5.0     249
4.0     235
6.0     173
Name: count, dtype: int64

In [31]:
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     3496
         True   False  False    False     False        3
         False  False  True     False     False        1
Name: count, dtype: int64

In [32]:
df.country.isnull().value_counts()

country
False    3499
True        1
Name: count, dtype: int64

In [33]:
df.reviews.isnull().value_counts()

reviews
False    3500
Name: count, dtype: int64

In [34]:
df.date.isnull().value_counts()

date
False    3500
Name: count, dtype: int64

In [35]:
df.stars.isnull().value_counts()#

stars
False    3497
True        3
Name: count, dtype: int64

In [36]:
df = df.dropna(subset=['stars', 'country'])

In [37]:
df.stars.isnull().value_counts()

stars
False    3496
Name: count, dtype: int64

In [38]:
df.shape

(3496, 6)

In [39]:
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | Flown with BA four times. As an...,5.0,2024-08-25,United Kingdom,False,verified flown ba four time anxious traveller ...
1,✅ Trip Verified | You may never see your ref...,10.0,2024-08-22,United States,True,may never see refund cancelled return flight s...
2,Not Verified | Cargo service: that's how I fel...,1.0,2024-08-21,United States,False,verified cargo service felt upper deck suppose...
3,✅ Trip Verified | Inefficient and slow airport...,5.0,2024-08-21,United Kingdom,True,inefficient slow airport operation resting lau...
4,Not Verified | The customer service is one of...,4.0,2024-08-18,United States,False,verified customer service one worst ever seen ...
...,...,...,...,...,...,...
3491,Heathrow to Bergen Business no priority boardi...,4.0,2014-10-22,United Kingdom,False,heathrow bergen business priority boarding bus...
3492,BA176 JFK to Heathrow 13 October 2014. The out...,8.0,2014-10-22,United Kingdom,False,ba jfk heathrow october outward flight awful r...
3493,Very impressed with Premium Economy on BA Sydn...,3.0,2014-10-22,Australia,False,impressed premium economy ba sydney heathrow r...
3494,A319 Barcelona to Heathrow Business no announc...,7.0,2014-10-22,United Kingdom,False,barcelona heathrow business announcement board...


In [40]:
df.to_csv("cleaned_data.csv")