In [46]:
import pandas as pd
import gensim #the library for Topic modelling
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim_models #LDA visualization library

from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer

import warnings
warnings.simplefilter('ignore')
from itertools import chain

Data cleaning


In [52]:
df = pd.read_csv('data/BA_reviews_cleaned.csv',index_col=0)
print(df.shape)
df

(1100, 1)


Unnamed: 0,reviews
0,"Booked a BA holiday to Marrakech, after postin..."
1,Extremely sub-par service. Highlights: No onli...
2,I virtually gave up on British Airways about t...
3,I was pleasantly surprised that the airline co...
4,"British Airways is late, their website is atro..."
...,...
1095,Manchester to London. We were flying business ...
1096,London to Athens. On the date of the flight I ...
1097,Very disappointing. I was scheduled on a Briti...
1098,This was really a no frills business class. Th...


In [53]:
#clean the data
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(text):
    stop_free = ' '.join([word for word in text.lower().split() if word not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ' '.join([lemma.lemmatize(word) for word in punc_free.split()])
    return normalized.split()

In [54]:
df['reviews_clean']=df['reviews'].apply(clean)

In [55]:
df

Unnamed: 0,reviews,reviews_clean
0,"Booked a BA holiday to Marrakech, after postin...","[booked, ba, holiday, marrakech, posting, nega..."
1,Extremely sub-par service. Highlights: No onli...,"[extremely, subpar, service, highlight, online..."
2,I virtually gave up on British Airways about t...,"[virtually, gave, british, airway, three, year..."
3,I was pleasantly surprised that the airline co...,"[pleasantly, surprised, airline, could, mainta..."
4,"British Airways is late, their website is atro...","[british, airway, late, website, atrocious, le..."
...,...,...
1095,Manchester to London. We were flying business ...,"[manchester, london, flying, business, class, ..."
1096,London to Athens. On the date of the flight I ...,"[london, athens, date, flight, received, text,..."
1097,Very disappointing. I was scheduled on a Briti...,"[disappointing, scheduled, british, airway, fl..."
1098,This was really a no frills business class. Th...,"[really, frill, business, class, flight, part,..."


In [57]:
#create dictionary
dictionary = corpora.Dictionary(df['reviews_clean'])
#Total number of non-zeroes in the BOW matrix (sum of the number of unique words per document over the entire corpus).
print(dictionary.num_nnz)

71294


In [59]:
#create document term matrix
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df['reviews_clean'] ]
print(len(doc_term_matrix))

1100


In [60]:
#lda = gensim.models.ldamodel.LdaModel

In [67]:
lda = gensim.models.ldamulticore.LdaMulticore

In [83]:
num_topics=5
%time ldamodel = models.LdaMulticore(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0,workers=3)

Wall time: 1min 15s


In [84]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.023*"seat" + 0.021*"flight" + 0.014*"ba" + 0.012*"food" + 0.011*"class" + 0.011*"crew" + 0.011*"good" + 0.010*"cabin" + 0.010*"service" + 0.009*"business"'),
 (1,
  '0.013*"suitcase" + 0.009*"london" + 0.006*"airway" + 0.006*"british" + 0.006*"bag" + 0.004*"number" + 0.004*"via" + 0.003*"experience" + 0.003*"back" + 0.003*"day"'),
 (2,
  '0.023*"flight" + 0.015*"ba" + 0.015*"refund" + 0.012*"british" + 0.012*"airway" + 0.011*"airline" + 0.011*"customer" + 0.009*"booked" + 0.009*"voucher" + 0.008*"travel"'),
 (3,
  '0.008*"ba" + 0.007*"passenger" + 0.006*"london" + 0.005*"airway" + 0.005*"lounge" + 0.005*"flight" + 0.005*"staff" + 0.004*"british" + 0.004*"airport" + 0.004*"service"'),
 (4,
  '0.033*"flight" + 0.015*"ba" + 0.011*"hour" + 0.010*"service" + 0.009*"london" + 0.008*"time" + 0.008*"staff" + 0.007*"u" + 0.006*"one" + 0.006*"check"')]

In [85]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

In [None]:
pyLDAvis.save_html(lda_display, 'lda.html')

In [86]:
# Assigns the topics to the documents in corpus
lda_corpus = ldamodel[doc_term_matrix]

In [87]:
[doc for doc in lda_corpus]

[[(0, 0.16296071),
  (1, 0.0040327706),
  (2, 0.0041213576),
  (3, 0.0041058837),
  (4, 0.8247793)],
 [(0, 0.35584992),
  (1, 0.0027609465),
  (2, 0.0027886464),
  (3, 0.26407132),
  (4, 0.37452912)],
 [(0, 0.5351287),
  (1, 0.004798634),
  (2, 0.45037225),
  (3, 0.0048201606),
  (4, 0.004880228)],
 [(0, 0.55230933),
  (1, 0.0034348844),
  (2, 0.09928021),
  (3, 0.0034256917),
  (4, 0.34154987)],
 [(0, 0.007055602),
  (1, 0.00693423),
  (2, 0.43336558),
  (3, 0.12311977),
  (4, 0.42952484)],
 [(0, 0.6910387),
  (1, 0.004016675),
  (2, 0.0041529816),
  (3, 0.0040941234),
  (4, 0.2966975)],
 [(0, 0.0017550869),
  (1, 0.0017664041),
  (2, 0.20675486),
  (3, 0.21580431),
  (4, 0.5739193)],
 [(0, 0.29224244),
  (1, 0.0013367431),
  (2, 0.0013478489),
  (3, 0.29596192),
  (4, 0.40911102)],
 [(0, 0.31185326),
  (1, 0.006587506),
  (2, 0.0066354433),
  (3, 0.00655003),
  (4, 0.66837376)],
 [(0, 0.79318935),
  (1, 0.002373434),
  (2, 0.0023726046),
  (3, 0.19962612),
  (4, 0.0024384898)],
 [(0,

In [88]:
scores = list(chain(*[[score for topic_id,score in topic] \
                      for topic in [doc for doc in lda_corpus]]))

threshold = sum(scores)/len(scores)
print(threshold)

0.19999999983809805


In [89]:
cluster1 = [j for i,j in zip(lda_corpus,df.index) if i[0][1] > threshold]
cluster2 = [j for i,j in zip(lda_corpus,df.index) if i[1][1] > threshold]
cluster3 = [j for i,j in zip(lda_corpus,df.index) if i[2][1] > threshold]
cluster4 = [j for i,j in zip(lda_corpus,df.index) if i[3][1] > threshold]
cluster5 = [j for i,j in zip(lda_corpus,df.index) if i[4][1] > threshold]

print(len(cluster1))
print(len(cluster2))
print(len(cluster3))
print(len(cluster4))
print(len(cluster5))

726
50
182
102
652


In [90]:
df.iloc[cluster1]

Unnamed: 0,reviews,reviews_clean
1,Extremely sub-par service. Highlights: No onli...,"[extremely, subpar, service, highlight, online..."
2,I virtually gave up on British Airways about t...,"[virtually, gave, british, airway, three, year..."
3,I was pleasantly surprised that the airline co...,"[pleasantly, surprised, airline, could, mainta..."
5,Flew from Amman to London on Nov. 14 2022. Not...,"[flew, amman, london, nov, 14, 2022, sure, typ..."
7,Flying LHR T5 to CPT November 2022: BA app an...,"[flying, lhr, t5, cpt, november, 2022, ba, app..."
...,...,...
1091,Flew London Heathrow to New York JFK. Have flo...,"[flew, london, heathrow, new, york, jfk, flown..."
1095,Manchester to London. We were flying business ...,"[manchester, london, flying, business, class, ..."
1097,Very disappointing. I was scheduled on a Briti...,"[disappointing, scheduled, british, airway, fl..."
1098,This was really a no frills business class. Th...,"[really, frill, business, class, flight, part,..."


In [91]:
df.iloc[cluster2]

Unnamed: 0,reviews,reviews_clean
11,On July 19th 2022 I had submitted a complaint ...,"[july, 19th, 2022, submitted, complaint, form,..."
15,A great flight. The suites on this aircraft re...,"[great, flight, suite, aircraft, really, provi..."
36,British Airways personnel was just magnificent...,"[british, airway, personnel, magnificent, flig..."
46,I sat in seat 25C. Our Cabin Crew lady was cal...,"[sat, seat, 25c, cabin, crew, lady, called, gi..."
55,"Holidays ruined, 2 suitcases lost in 2 flights...","[holiday, ruined, 2, suitcase, lost, 2, flight..."
80,Terrible experience with British Airways! I wa...,"[terrible, experience, british, airway, obliga..."
176,This past November/December I flew Washington-...,"[past, novemberdecember, flew, washingtonlondo..."
180,Unfortunately having just flown in the new clu...,"[unfortunately, flown, new, club, world, cabin..."
194,"Overall, a very lovely flight with BA to Edinb...","[overall, lovely, flight, ba, edinburgh, chris..."
249,Year after year the British Airways (BA) servi...,"[year, year, british, airway, ba, service, rou..."


In [92]:
df.iloc[cluster3]

Unnamed: 0,reviews,reviews_clean
2,I virtually gave up on British Airways about t...,"[virtually, gave, british, airway, three, year..."
4,"British Airways is late, their website is atro...","[british, airway, late, website, atrocious, le..."
6,This is the worst experience I have ever had w...,"[worst, experience, ever, airline, flew, briti..."
11,On July 19th 2022 I had submitted a complaint ...,"[july, 19th, 2022, submitted, complaint, form,..."
12,"I booked the flight on Oct 8, but have to canc...","[booked, flight, oct, 8, cancel, flight, day, ..."
...,...,...
1058,Amsterdam to London. Service Ryanair wouldn't ...,"[amsterdam, london, service, ryanair, tolerate..."
1060,Will never fly with BA again. Outbound flight ...,"[never, fly, ba, again, outbound, flight, econ..."
1067,Budapest to Philadelphia via London. They chan...,"[budapest, philadelphia, via, london, changed,..."
1090,London to Miami. British Airways have been so ...,"[london, miami, british, airway, disappointing..."


In [82]:
df.iloc[cluster4]

Unnamed: 0,reviews,reviews_clean
2,I virtually gave up on British Airways about t...,"[virtually, gave, british, airway, three, year..."
3,I was pleasantly surprised that the airline co...,"[pleasantly, surprised, airline, could, mainta..."
5,Flew from Amman to London on Nov. 14 2022. Not...,"[flew, amman, london, nov, 14, 2022, sure, typ..."
7,Flying LHR T5 to CPT November 2022: BA app an...,"[flying, lhr, t5, cpt, november, 2022, ba, app..."
9,"Check in was a shambles at BWI, just 3 counter...","[check, shamble, bwi, 3, counter, open, full, ..."
...,...,...
1091,Flew London Heathrow to New York JFK. Have flo...,"[flew, london, heathrow, new, york, jfk, flown..."
1093,Flew Funchal to London Gatwick. Just wanted to...,"[flew, funchal, london, gatwick, wanted, thank..."
1095,Manchester to London. We were flying business ...,"[manchester, london, flying, business, class, ..."
1098,This was really a no frills business class. Th...,"[really, frill, business, class, flight, part,..."
