In [1]:
import pandas as pd
import csv
from collections import defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 

# References used:
# NLTK basics: https://benalexkeen.com/basic-language-processing-with-nltk/
# NLTK parts of speech (POS): https://cs.nyu.edu/~grishman/jet/guide/PennPOS.html
# NLTK tokenization, POS tagging: https://keremkargin.medium.com/nlp-tokenization-stemming-lemmatization-and-part-of-speech-tagging-9088ac068768

df = pd.read_csv('ba_reviews_clean.csv')
df.head(3)

Unnamed: 0,header,author,date,place,content,aircraft,traveller_type,seat_type,route,date_flown,recommended,trip_verified,rating,seat_comfort,cabin_staff_service,food_beverages,ground_service,value_for_money,entertainment
0,service was mediocre at best,Gary Storer,03/10/2023,United Kingdom,"Just returned from Chicago, flew out 10 days ...",A380,Couple Leisure,Economy Class,Chicago to Manchester via Heathrow,01/10/2023,no,Not Verified,1.0,2.0,3.0,1.0,2.0,2,
1,BA standards continue to decline,A Jensen,02/10/2023,United Kingdom,BA standards continue to decline every time ...,A320,Business,Business Class,London Heathrow to Munich,01/09/2023,no,Verified,1.0,2.0,1.0,2.0,1.0,1,
2,"won the race to the bottom""",John Rockett,02/10/2023,United Kingdom,Awful. Business class check in queue just as...,A320,Couple Leisure,Business Class,Heathrow to Istanbul,01/09/2023,no,Not Verified,1.0,2.0,3.0,2.0,1.0,1,


In [4]:
example = df.loc[:,'content'][0]

# stopwords: a set that holds common "filler" words like 'a', 'the', etc to filter out
stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example)
filtered_sentence = [w for w in word_tokens if not w in stop_words]
filtered_sentence

['Just', 'returned', 'Chicago', ',', 'flew', '10', 'days', 'ago', 'American', 'Airlines', 'absolutely', 'superb', 'every', 'way', ',', 'high', 'expectations', 'return', 'flight', 'BA', '.', 'What', 'disappointment', '.', 'The', 'Airbus', 'A380', 'may', 'nice', 'pilots', 'perspective', 'passenger', 'awful', '.', 'Very', 'uncomfortable', 'seats', ',', 'inflight', 'entertainment', 'flight', 'tracker', 'failed', 'work', 'throughout', 'flight', ',', 'inflight', 'meal', 'inedible', 'service', 'mediocre', 'best', '.', 'Our', 'short', 'flight', 'Heathrow', 'Manchester', 'much', 'improved', ',', 'welcoming', 'attentive', 'flight', 'staff', 'flight', 'even', 'arrived', 'early', '.', 'In', 'future', 'travel', 'one', 'American', 'carriers', '.']


In [5]:
# pos: part of speech. E.g, NN = noun
tagged = nltk.pos_tag(word_tokens)
tagged

[('Just', 'RB'),
 ('returned', 'VBN'),
 ('from', 'IN'),
 ('Chicago', 'NNP'),
 (',', ','),
 ('flew', 'VBD'),
 ('out', 'IN'),
 ('10', 'CD'),
 ('days', 'NNS'),
 ('ago', 'RB'),
 ('on', 'IN'),
 ('American', 'NNP'),
 ('Airlines', 'NNP'),
 ('absolutely', 'RB'),
 ('superb', 'VBP'),
 ('in', 'IN'),
 ('every', 'DT'),
 ('way', 'NN'),
 (',', ','),
 ('had', 'VBD'),
 ('high', 'JJ'),
 ('expectations', 'NNS'),
 ('on', 'IN'),
 ('return', 'NN'),
 ('flight', 'NN'),
 ('with', 'IN'),
 ('BA', 'NNP'),
 ('.', '.'),
 ('What', 'WP'),
 ('a', 'DT'),
 ('disappointment', 'NN'),
 ('.', '.'),
 ('The', 'DT'),
 ('Airbus', 'NNP'),
 ('A380', 'NNP'),
 ('may', 'MD'),
 ('be', 'VB'),
 ('nice', 'JJ'),
 ('from', 'IN'),
 ('a', 'DT'),
 ('pilots', 'NNS'),
 ('perspective', 'NN'),
 ('but', 'CC'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('passenger', 'NN'),
 ('it', 'PRP'),
 ('was', 'VBD'),
 ('awful', 'JJ'),
 ('.', '.'),
 ('Very', 'RB'),
 ('uncomfortable', 'JJ'),
 ('seats', 'NNS'),
 (',', ','),
 ('the', 'DT'),
 ('inflight', 'JJ'),
 ('entertainm

In [6]:
# takes and turns all reviews into a giant string for tokenization 
all_reviews = df.loc[:,'content']
all_reviews = ''.join(all_reviews)

## 
make custom arr of stop words for this specific british airways reviews dataset - flight, gate, airport, etc
use stemming or lemmas to group plurals and singular versions of words into one word
    then re-run word-tokenization
    then regression/ML stuff to check which ratings correlate most closely to the overall rating

In [7]:
custom_stop_words = {'airplane', 'flight', 'gate', 'airport', 'plane', ',', '.'}
custom_stop_words.update(set(stopwords.words('english')))

In [8]:
cleaned_tokenized = nltk.word_tokenize(all_reviews)
clean_filtered = [w.lower() for w in cleaned_tokenized if not w.lower() in custom_stop_words]

In [9]:
# get the top 25 mentioned words
def getFrequentWords(tokenized):
    freqs = defaultdict(int)
    nouns = [word[0].lower() for word in tokenized if word[1] == 'NN']

    for noun in nouns:
        if noun in freqs:
            freqs[noun] += 1
        else:
            freqs[noun] = 1

    import operator
    sorted_by_most_common = sorted(freqs.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_by_most_common[:25]

In [10]:
clean_tokens = nltk.pos_tag(clean_filtered)
print(getFrequentWords(clean_tokens))

[('service', 1123), ('ba', 1037), ('food', 950), ('seat', 863), ('class', 821), ('time', 820), ('business', 684), ('economy', 616), ('cabin', 590), ('staff', 567), ('crew', 546), ('heathrow', 447), ('london', 435), ('airline', 385), ('lounge', 367), ('aircraft', 365), ('experience', 342), ('club', 306), ('hour', 280), ('meal', 262), ('return', 258), ('world', 244), ('board', 241), ('choice', 222), ('way', 218)]


In [12]:
ps = PorterStemmer()
w1 = ("airport")
w2 = ("airplane")
print( ps.stem(w1), ps.stem(w2) )

airport airplan


In [13]:
lemmatizer = WordNetLemmatizer()

# Lemmatize single word
print(lemmatizer.lemmatize("airplanes"))
print(lemmatizer.lemmatize("airports"))

airplane
airport


In [14]:
clean_lemma = []
for word in clean_filtered:
    clean_lemma.append(lemmatizer.lemmatize(word))

all_lemma = ''.join(clean_lemma)
pos_lemma = nltk.pos_tag(clean_filtered)
end_result = getFrequentWords(pos_lemma)
end_result

[('service', 1123),
 ('ba', 1037),
 ('food', 950),
 ('seat', 863),
 ('class', 821),
 ('time', 820),
 ('business', 684),
 ('economy', 616),
 ('cabin', 590),
 ('staff', 567),
 ('crew', 546),
 ('heathrow', 447),
 ('london', 435),
 ('airline', 385),
 ('lounge', 367),
 ('aircraft', 365),
 ('experience', 342),
 ('club', 306),
 ('hour', 280),
 ('meal', 262),
 ('return', 258),
 ('world', 244),
 ('board', 241),
 ('choice', 222),
 ('way', 218)]

Lemmatization didn't do much... at all... at least on the part about nouns. BUT doing this whole word tokenization in the first place showed that the most mentioned attributes of service were:
    service
    food
    seat
    class (business, economy)
    staff
    crew
    experience

which highlights the metrics of FOOD/BEVERAGES, ground service?, SEAT CLASS
and things like ENTERTAINMENT were less thought about

In [15]:
csv_file_path = 'nltk_top_frequency.csv'
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)

    for row in end_result:
        writer.writerow(row)

In [16]:
df = pd.DataFrame(end_result)

csv_file_path = 'nltk_top_frequency_pd.csv'
df.to_csv(csv_file_path, index=False, header=False)

Correlation Coefficients

In [17]:
df = pd.read_csv('ba_reviews_clean.csv')

In [18]:
ratings = df[["rating",
              "seat_comfort",
              "cabin_staff_service",
              "food_beverages",
              "ground_service",
              "value_for_money",
              "entertainment"]]
ratings.head()

In [23]:
corr_coeff = ratings.corr(method='pearson')
corr_coeff.to_csv('rating_correlations.csv')
corr_coeff