# Data Cleaning process of our Collected data (part-2)

Now that we have created a DataFrame from the scraped data, we need to clean it before analysing it further.

In [1]:
#importing the required libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re #regex

In [2]:
# Load the scraped reviews from the CSV file stored in the current working directory.
# index_col=0 makes the first CSV column the DataFrame index.
cwd = os.getcwd()
reviews_csv_path = cwd + '/BristishAirlineReviews.csv'

df = pd.read_csv(reviews_csv_path, index_col=0)

In [3]:
# Preview the freshly loaded DataFrame (reviews, stars, date, country)
df

Unnamed: 0,reviews,stars,date,country
0,"Not Verified | The worst service ever, my bag...",5,9th September 2023,(Kuwait)
1,✅ Trip Verified | 4/4 flights we booked this ...,1,6th September 2023,(Germany)
2,✅ Trip Verified | British Airways has a total...,1,4th September 2023,(United Kingdom)
3,"✅ Trip Verified | London Heathrow to Keflavik,...",1,4th September 2023,(Iceland)
4,✅ Trip Verified | Mumbai to London Heathrow in...,8,4th September 2023,(Iceland)
...,...,...,...,...
3645,✅ Trip Verified | Care and support shocking. ...,8,4th September 2023,(United Kingdom)
3646,✅ Trip Verified | Flying A380 business class ...,8,2nd September 2023,(Australia)
3647,✅ Trip Verified | British Airways absolutely ...,2,1st September 2023,(United Kingdom)
3648,✅ Trip Verified | My recent experience with B...,1,1st September 2023,(United States)


We will also create a column that indicates whether each review is marked as "Trip Verified".

In [4]:
# Flag every review whose text contains the "Trip Verified" marker
is_trip_verified = df['reviews'].str.contains('Trip Verified')
df['verified'] = is_trip_verified

In [5]:
# Inspect the new boolean column
df['verified']

0       False
1        True
2        True
3        True
4        True
        ...  
3645     True
3646     True
3647     True
3648     True
3649     True
Name: verified, Length: 3650, dtype: bool

In [6]:
# Count how many reviews are flagged as verified versus not verified
df['verified'].value_counts()

True     3285
False     365
Name: verified, dtype: int64

In [7]:
# Strip the surrounding parentheses from the country column, e.g. "(Kuwait)" -> "Kuwait".
# regex=True is passed explicitly because the pattern is a regular expression:
# pandas >= 2.0 defaults to regex=False, which would match the pattern literally
# and leave the brackets in place (older versions emit a FutureWarning instead).
df['country'] = df['country'].str.replace(r'\(|\)', '', regex=True)

  df['country'] = df['country'].str.replace(r'\(|\)','')


In [8]:
# Check the result: the verified column is present and the country brackets are removed
df

Unnamed: 0,reviews,stars,date,country,verified
0,"Not Verified | The worst service ever, my bag...",5,9th September 2023,Kuwait,False
1,✅ Trip Verified | 4/4 flights we booked this ...,1,6th September 2023,Germany,True
2,✅ Trip Verified | British Airways has a total...,1,4th September 2023,United Kingdom,True
3,"✅ Trip Verified | London Heathrow to Keflavik,...",1,4th September 2023,Iceland,True
4,✅ Trip Verified | Mumbai to London Heathrow in...,8,4th September 2023,Iceland,True
...,...,...,...,...,...
3645,✅ Trip Verified | Care and support shocking. ...,8,4th September 2023,United Kingdom,True
3646,✅ Trip Verified | Flying A380 business class ...,8,2nd September 2023,Australia,True
3647,✅ Trip Verified | British Airways absolutely ...,2,1st September 2023,United Kingdom,True
3648,✅ Trip Verified | My recent experience with B...,1,1st September 2023,United States,True


## Cleaning the reviews

We will lemmatize the words of each review using the NLTK library to prepare the text for semantic analysis.

In [10]:
#importing nltk library
import nltk

In [13]:
# Download the NLTK stop-word lists; stopwords.words("english") in the cleaning loop reads them
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NIKHIL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Download the WordNet corpus that NLTK's WordNetLemmatizer looks words up in
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\NIKHIL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
# Download the 'omw-1.4' package (Open Multilingual Wordnet); presumably required by the
# WordNet lemmatizer in this NLTK version — TODO confirm
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\NIKHIL\AppData\Roaming\nltk_data...


True

In [19]:
#we will import WordNetLemmatizer and stopwords from nltk library
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

lemma = WordNetLemmatizer()

# Build the English stop-word set once, up front. It is the same for every word,
# so rebuilding it inside the loop would repeat identical work for every token.
english_stopwords = set(stopwords.words("english"))

# Creating an empty list to collect cleaned data corpus
corpus = []

reviews_data = df.reviews

# NOTE(review): the "Trip Verified" / "Not Verified" prefix is still part of each review
# here, so its words ("trip", "verified") end up in the cleaned corpus.

# Loop through each review, remove punctuations, lowercase it, lemmatize, join, and add it to the corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]', ' ', rev)  # Replacing non-alphabetical characters with space
    rev = rev.lower()
    rev = rev.split()  # tokenise on whitespace
    # Drop stop words and reduce each remaining word to its lemma
    rev = [lemma.lemmatize(word) for word in rev if word not in english_stopwords]
    rev = " ".join(rev)
    corpus.append(rev)


In [20]:
# Attach the cleaned text as a new column; corpus holds one entry per review, in row order
df['corpus'] = corpus

In [21]:
# Compare the raw reviews with the cleaned corpus column
df

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,"Not Verified | The worst service ever, my bag...",5,9th September 2023,Kuwait,False,verified worst service ever baggage arrive tim...
1,✅ Trip Verified | 4/4 flights we booked this ...,1,6th September 2023,Germany,True,trip verified flight booked holiday delayed ho...
2,✅ Trip Verified | British Airways has a total...,1,4th September 2023,United Kingdom,True,trip verified british airway total lack respec...
3,"✅ Trip Verified | London Heathrow to Keflavik,...",1,4th September 2023,Iceland,True,trip verified london heathrow keflavik iceland...
4,✅ Trip Verified | Mumbai to London Heathrow in...,8,4th September 2023,Iceland,True,trip verified mumbai london heathrow business ...
...,...,...,...,...,...,...
3645,✅ Trip Verified | Care and support shocking. ...,8,4th September 2023,United Kingdom,True,trip verified care support shocking written pr...
3646,✅ Trip Verified | Flying A380 business class ...,8,2nd September 2023,Australia,True,trip verified flying business class pleasure b...
3647,✅ Trip Verified | British Airways absolutely ...,2,1st September 2023,United Kingdom,True,trip verified british airway absolutely care r...
3648,✅ Trip Verified | My recent experience with B...,1,1st September 2023,United States,True,trip verified recent experience british airway...


In [22]:
# Inspect column dtypes and non-null counts before converting the date column
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3650 entries, 0 to 3649
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   reviews   3650 non-null   object
 1   stars     3650 non-null   int64 
 2   date      3650 non-null   object
 3   country   3650 non-null   object
 4   verified  3650 non-null   bool  
 5   corpus    3650 non-null   object
dtypes: bool(1), int64(1), object(4)
memory usage: 174.7+ KB


In [23]:
# Parse the textual review dates (e.g. "9th September 2023") into datetime values
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [24]:
# Confirm that the date column now has a datetime dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3650 entries, 0 to 3649
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   reviews   3650 non-null   object        
 1   stars     3650 non-null   int64         
 2   date      3650 non-null   datetime64[ns]
 3   country   3650 non-null   object        
 4   verified  3650 non-null   bool          
 5   corpus    3650 non-null   object        
dtypes: bool(1), datetime64[ns](1), int64(1), object(3)
memory usage: 174.7+ KB


In [25]:
# Preview the DataFrame with the converted date column
df

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,"Not Verified | The worst service ever, my bag...",5,2023-09-09,Kuwait,False,verified worst service ever baggage arrive tim...
1,✅ Trip Verified | 4/4 flights we booked this ...,1,2023-09-06,Germany,True,trip verified flight booked holiday delayed ho...
2,✅ Trip Verified | British Airways has a total...,1,2023-09-04,United Kingdom,True,trip verified british airway total lack respec...
3,"✅ Trip Verified | London Heathrow to Keflavik,...",1,2023-09-04,Iceland,True,trip verified london heathrow keflavik iceland...
4,✅ Trip Verified | Mumbai to London Heathrow in...,8,2023-09-04,Iceland,True,trip verified mumbai london heathrow business ...
...,...,...,...,...,...,...
3645,✅ Trip Verified | Care and support shocking. ...,8,2023-09-04,United Kingdom,True,trip verified care support shocking written pr...
3646,✅ Trip Verified | Flying A380 business class ...,8,2023-09-02,Australia,True,trip verified flying business class pleasure b...
3647,✅ Trip Verified | British Airways absolutely ...,2,2023-09-01,United Kingdom,True,trip verified british airway absolutely care r...
3648,✅ Trip Verified | My recent experience with B...,1,2023-09-01,United States,True,trip verified recent experience british airway...


In [27]:
# Checking for anomalies in the stars column by counting how often each rating occurs
df.stars.value_counts()

1    2322
8     664
5     332
2     332
Name: stars, dtype: int64

In [29]:
# List the distinct rating values present in the stars column
df.stars.unique()

array([5, 1, 8, 2], dtype=int64)

In [30]:
# Checking for null values: count the missing entries in each column
df.isnull().sum()

reviews     0
stars       0
date        0
country     0
verified    0
corpus      0
dtype: int64

There are no null values in our dataset.

In [31]:
# Number of rows and columns in the cleaned DataFrame
df.shape

(3650, 6)

In [32]:
#now resetting the index before exporting the data for data visualisation and further analysis
# reset_index returns a new DataFrame rather than modifying df in place, so the result
# must be assigned back; otherwise the reset is discarded and the export uses the old index.
df = df.reset_index(drop=True)
df

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,"Not Verified | The worst service ever, my bag...",5,2023-09-09,Kuwait,False,verified worst service ever baggage arrive tim...
1,✅ Trip Verified | 4/4 flights we booked this ...,1,2023-09-06,Germany,True,trip verified flight booked holiday delayed ho...
2,✅ Trip Verified | British Airways has a total...,1,2023-09-04,United Kingdom,True,trip verified british airway total lack respec...
3,"✅ Trip Verified | London Heathrow to Keflavik,...",1,2023-09-04,Iceland,True,trip verified london heathrow keflavik iceland...
4,✅ Trip Verified | Mumbai to London Heathrow in...,8,2023-09-04,Iceland,True,trip verified mumbai london heathrow business ...
...,...,...,...,...,...,...
3645,✅ Trip Verified | Care and support shocking. ...,8,2023-09-04,United Kingdom,True,trip verified care support shocking written pr...
3646,✅ Trip Verified | Flying A380 business class ...,8,2023-09-02,Australia,True,trip verified flying business class pleasure b...
3647,✅ Trip Verified | British Airways absolutely ...,2,2023-09-01,United Kingdom,True,trip verified british airway absolutely care r...
3648,✅ Trip Verified | My recent experience with B...,1,2023-09-01,United States,True,trip verified recent experience british airway...


In [33]:
#exporting the cleaned dataset
# Build the path with os.path.join instead of a hard-coded '\' separator: the backslash
# only works as a separator on Windows, and '\C' is an invalid string escape sequence.
# The output file name is kept unchanged so existing consumers still find it.

df.to_csv(os.path.join(cwd, 'Cleaned_BA_reviews'))