## Data Cleaning
Once you have your dataset, you need to prepare it. The raw data is very messy and consists purely of text, so it must be cleaned before it can be analysed.

In [1]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
# Download the NLTK stop-word corpus that stopwords.words("english") reads in
# the stop-word removal cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Majid.Mehmood.ST
[nltk_data]     YLERSPLUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the scraped reviews, using the first CSV column as the row index.
# A forward slash keeps the path portable across operating systems and avoids
# the invalid "\B" escape sequence that a backslash-separated literal contains.
df = pd.read_csv("data/BA_reviews.csv", index_col=0)
df.head()

Unnamed: 0,reviews
0,✅ Trip Verified | Online check in worked fine...
1,✅ Trip Verified |. The BA first lounge at Term...
2,Not Verified | Paid a quick visit to Nice yest...
3,✅ Trip Verified | Words fail to describe this...
4,✅ Trip Verified | Absolutely terrible experie...


In [3]:
# Drop the "... Verified |" prefix from each review.
# Splitting only once (n=1) and keeping the last piece preserves any later "|"
# that belongs to the review text, and leaves a review without a prefix
# unchanged instead of turning it into a missing value (which would make the
# regex-based cleaning cells raise a TypeError).
df["reviews"] = df["reviews"].str.split("|", n=1).str[-1]
df.head()

Unnamed: 0,reviews
0,Online check in worked fine. Quick security ...
1,. The BA first lounge at Terminal 5 was a zoo...
2,Paid a quick visit to Nice yesterday from Hea...
3,Words fail to describe this last awful fligh...
4,Absolutely terrible experience. The app woul...


In [4]:
# Trim whitespace from both ends of every review.
trimmed_reviews = df["reviews"].str.strip()
df["reviews"] = trimmed_reviews
df.head()

Unnamed: 0,reviews
0,Online check in worked fine. Quick security ch...
1,. The BA first lounge at Terminal 5 was a zoo...
2,Paid a quick visit to Nice yesterday from Heat...
3,Words fail to describe this last awful flight ...
4,Absolutely terrible experience. The app would ...


In [5]:
# Keep only ASCII letters and whitespace, then lowercase the result.
# Every other character (digits, punctuation, symbols, accented letters) is
# deleted outright rather than replaced by a space, e.g. "check-in" -> "checkin".
non_letter = re.compile(r"[^a-zA-Z\s]")
df["reviews"] = df["reviews"].apply(lambda text: non_letter.sub("", text).lower())
df.head()

Unnamed: 0,reviews
0,online check in worked fine quick security che...
1,the ba first lounge at terminal was a zoo a...
2,paid a quick visit to nice yesterday from heat...
3,words fail to describe this last awful flight ...
4,absolutely terrible experience the app would n...


In [6]:
# Delete every run of digits from each review.
digit_run = re.compile(r"\d+")
df["reviews"] = df["reviews"].apply(lambda text: digit_run.sub("", text))
df.head()

Unnamed: 0,reviews
0,online check in worked fine quick security che...
1,the ba first lounge at terminal was a zoo a...
2,paid a quick visit to nice yesterday from heat...
3,words fail to describe this last awful flight ...
4,absolutely terrible experience the app would n...


In [7]:
# Delete every character listed in string.punctuation.
# The translation table is built once and reused for all rows.
punctuation_table = str.maketrans("", "", string.punctuation)
df["reviews"] = df["reviews"].apply(lambda text: text.translate(punctuation_table))
df.head()

Unnamed: 0,reviews
0,online check in worked fine quick security che...
1,the ba first lounge at terminal was a zoo a...
2,paid a quick visit to nice yesterday from heat...
3,words fail to describe this last awful flight ...
4,absolutely terrible experience the app would n...


In [8]:
# Collapse each run of whitespace into a single space.
whitespace_run = re.compile(r"\s+")
df["reviews"] = df["reviews"].apply(lambda text: whitespace_run.sub(" ", text))
df.head()

Unnamed: 0,reviews
0,online check in worked fine quick security che...
1,the ba first lounge at terminal was a zoo at ...
2,paid a quick visit to nice yesterday from heat...
3,words fail to describe this last awful flight ...
4,absolutely terrible experience the app would n...


In [9]:
# Drop English stop words (NLTK list). Each review is split on whitespace and
# the surviving tokens are re-joined with single spaces.
stop_words = set(stopwords.words("english"))
df["reviews"] = df["reviews"].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_words)
)

df.head()



Unnamed: 0,reviews
0,online check worked fine quick security check ...
1,ba first lounge terminal zoo pm dirty tables u...
2,paid quick visit nice yesterday heathrow decid...
3,words fail describe last awful flight baby acr...
4,absolutely terrible experience app would let c...


In [10]:
# The reviews contain many standalone "ba" tokens, so remove them as well.
# Matching on word boundaries removes only the whole word "ba": a plain
# substring replace of "ba " would also clip words that merely end in "ba"
# (e.g. "cuba " -> "cu") and would miss a "ba" at the very end of a review.
df["reviews"] = (
    df["reviews"]
    .str.replace(r"\bba\b\s*", "", regex=True)
    .str.strip()  # a removed trailing "ba" leaves one space behind
)
df.head()

Unnamed: 0,reviews
0,online check worked fine quick security check ...
1,first lounge terminal zoo pm dirty tables used...
2,paid quick visit nice yesterday heathrow decid...
3,words fail describe last awful flight baby acr...
4,absolutely terrible experience app would let c...


In [11]:

# Write the cleaned reviews to a new CSV file.
cleaned_csv_path = "data/cleaned_BA_reviews.csv"
df.to_csv(cleaned_csv_path)
