In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


## Data CLeaning

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [3]:
df = pd.read_csv('/content/drive/MyDrive/British Airways/reviews.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,reviews,stars,month,year,country,seat_type
0,0,"✅ Trip Verified | Very poor service, very fru...",\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,March,2023,United Kingdom,Economy Class
1,1,Not Verified | Generally poor. Sent to gate o...,1,March,2023,United Kingdom,Economy Class
2,2,Not Verified | BA changed our prepaid seats a...,3,March,2023,United Kingdom,Economy Class
3,3,Not Verified | Flew from London Heathrow to M...,1,March,2023,United Kingdom,Economy Class
4,4,Not Verified | I was meant to fly in January t...,1,March,2023,United Kingdom,Economy Class


In [5]:
df.shape

(3498, 7)

In [4]:
df['verified'] = df.reviews.str.contains("Trip Verified")

In [5]:
df["verified"]

0       False
1        True
2        True
3        True
4        True
        ...  
3493    False
3494    False
3495    False
3496    False
3497    False
Name: verified, Length: 3498, dtype: bool

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,reviews,stars,month,year,country,seat_type,verified
0,0,Not Verified | I was meant to fly in January t...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,March,2023,United Kingdom,Economy Class,False
1,1,✅ Trip Verified | We have flown repeatedly wi...,1,March,2023,Austria,Economy Class,True
2,2,✅ Trip Verified | I was horrified by the extr...,1,March,2023,United States,Economy Class,True
3,3,✅ Trip Verified | \nThe worst cabin experienc...,3,March,2023,France,Economy Class,True
4,4,✅ Trip Verified | First time flying with Briti...,2,March,2023,India,Premium Economy,True


In [6]:
df.drop(columns = df.columns[0],axis=1, inplace=True)
df.head()

Unnamed: 0,reviews,stars,month,year,country,seat_type
0,"✅ Trip Verified | Very poor service, very fru...",\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,March,2023,United Kingdom,Economy Class
1,Not Verified | Generally poor. Sent to gate o...,1,March,2023,United Kingdom,Economy Class
2,Not Verified | BA changed our prepaid seats a...,3,March,2023,United Kingdom,Economy Class
3,Not Verified | Flew from London Heathrow to M...,1,March,2023,United Kingdom,Economy Class
4,Not Verified | I was meant to fly in January t...,1,March,2023,United Kingdom,Economy Class


## Review Formatting

In [7]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [8]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemma = WordNetLemmatizer()

df.reviews = df.reviews.str.strip("✅ Trip Verified | ")
df.reviews = df.reviews.str.strip("Not Verified | ")

corpus = []

for s in df.reviews:
  s = re.sub('[^a-zA-Z]',' ',s)
  s = s.lower()
  s = s.split()
  s = [lemma.lemmatize(word) for word in s if word not in set(stopwords.words('english'))]
  s = " ".join(s)
  corpus.append(s)

In [9]:
df['corpus'] = corpus

In [10]:
df.head()

Unnamed: 0,reviews,stars,month,year,country,seat_type,corpus
0,"y poor service, very frustrating. Firstly my f...",\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,March,2023,United Kingdom,Economy Class,poor service frustrating firstly flight lhr ca...
1,"Generally poor. Sent to gate on time, sat mayb...",1,March,2023,United Kingdom,Economy Class,generally poor sent gate time sat maybe minute...
2,BA changed our prepaid seats at the last minut...,3,March,2023,United Kingdom,Economy Class,ba changed prepaid seat last minute charged u ...
3,Flew from London Heathrow to Marrakech. BA cha...,1,March,2023,United Kingdom,Economy Class,flew london heathrow marrakech ba changed pre ...
4,I was meant to fly in January to Algeria. I pa...,1,March,2023,United Kingdom,Economy Class,meant fly january algeria paid ticket day mean...


## Formatting Stars

In [11]:
df.stars.unique()

array(['\n\t\t\t\t\t\t\t\t\t\t\t\t\t5', '1', '3', '2', '8', '7', '10',
       '4', '6', '5', '9', 'None'], dtype=object)

In [14]:
df.stars.value_counts()

1       766
2       393
3       385
8       352
10      315
7       305
9       299
5       261
4       235
6       182
None      5
Name: stars, dtype: int64

In [13]:
df.stars = df.stars.str.strip("\n\t\t\t\t\t\t\t\t\t\t\t\t\t")

In [15]:
df.drop(df[df.stars == 'None'].index, axis=0,inplace = True)

# Checking for Null values

In [16]:
df.isnull().value_counts()

reviews  stars  month  year   country  seat_type  corpus
False    False  False  False  False    False      False     3491
                              True     False      False        2
dtype: int64

In [17]:
df.country.isnull().value_counts()

False    3491
True        2
Name: country, dtype: int64

In [18]:
df.drop(df[df.country.isnull() == True].index, axis = 0, inplace = True)

In [19]:
df.head()

Unnamed: 0,reviews,stars,month,year,country,seat_type,corpus
0,"y poor service, very frustrating. Firstly my f...",5,March,2023,United Kingdom,Economy Class,poor service frustrating firstly flight lhr ca...
1,"Generally poor. Sent to gate on time, sat mayb...",1,March,2023,United Kingdom,Economy Class,generally poor sent gate time sat maybe minute...
2,BA changed our prepaid seats at the last minut...,3,March,2023,United Kingdom,Economy Class,ba changed prepaid seat last minute charged u ...
3,Flew from London Heathrow to Marrakech. BA cha...,1,March,2023,United Kingdom,Economy Class,flew london heathrow marrakech ba changed pre ...
4,I was meant to fly in January to Algeria. I pa...,1,March,2023,United Kingdom,Economy Class,meant fly january algeria paid ticket day mean...


In [20]:
df.to_csv("/content/drive/MyDrive/British Airways/cleaned-reviews.csv")

In [25]:
df

Unnamed: 0,reviews,stars,month,year,country,seat_type,verified,corpus
0,I was meant to fly in January to Algeria. I pa...,5,March,2023,United Kingdom,Economy Class,False,meant fly january algeria paid ticket day mean...
1,We have flown repeatedly with British Airways ...,1,March,2023,Austria,Economy Class,True,flown repeatedly british airway one world alli...
2,I was horrified by the extremely small seats a...,1,March,2023,United States,Economy Class,True,horrified extremely small seat poor training c...
3,\nThe worst cabin experience ever: cramped sea...,3,March,2023,France,Economy Class,True,worst cabin experience ever cramped seat low c...
4,First time flying with British Airways and fir...,2,March,2023,India,Premium Economy,True,first time flying british airway first time fl...
...,...,...,...,...,...,...,...,...
3493,Business LHR to BKK. 747-400. First try back w...,2,August,2012,United Kingdom,Economy Class,False,business lhr bkk first try back ba year flown ...
3494,his was a bmi Regional operated flight on a RJ...,6,August,2012,United Kingdom,Economy Class,False,bmi regional operated flight rj manchester hea...
3495,LHR-HKG on Boeing 747 - 23/08/12. Much has bee...,7,August,2012,United Kingdom,Economy Class,False,lhr hkg boeing much written tired old fleet go...
3496,Just got back from Bridgetown Barbados flying ...,10,August,2012,United Kingdom,Business Class,False,got back bridgetown barbados flying british ai...
