In [170]:
import pandas as pd
import numpy as np
import re

# Load Data

In [171]:
Data = pd.read_csv('All Reviews1.csv', encoding = "ISO-8859-1" )

In [172]:
Data.head(5)

Unnamed: 0.1,Unnamed: 0,Review
0,0,Update: Now that the tech issues have been add...
1,1,Changing my rating since the problems are fixe...
2,2,App worked pretty much as expected. Could have...
3,3,Highly recommend when you travel with Air Cana...
4,4,The app went from very useful to absolutely us...


In [173]:
Data.shape

(1763, 2)

In [174]:
Data.duplicated().value_counts()

False    1763
dtype: int64

In [175]:
#pip install nltk

In [176]:
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re
import tqdm
import unicodedata

# Train and Test Data Split

In [177]:
reviews = Data['Review'].values

In [178]:
#creating the classification model using the classification algo
from sklearn.model_selection import train_test_split
train_reviews, test_reviews = train_test_split(reviews, test_size=0.2, random_state=42)


In [179]:
print("Train Reviews shape :", train_reviews.shape)
print("Test Reviews shape :", test_reviews.shape)

Train Reviews shape : (1410,)
Test Reviews shape : (353,)


# Text Wrangling and Normalization

In [180]:
#function defined to remove html tags from data

def strip_html_tags(text):
  soup = BeautifulSoup(text, "html.parser")
  [s.extract() for s in soup(['iframe', 'script'])]
  stripped_text = soup.get_text()
  stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
  return stripped_text

In [181]:
#function defined to remove unicode data

def remove_accented_chars(text):
  text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  return text

In [182]:
# lower case and remove special characters\whitespaces

def pre_process_corpus(docs):
  norm_docs = []
  for doc in tqdm.tqdm(docs):
    doc = strip_html_tags(doc)
    doc = doc.translate(doc.maketrans("\n\t\r", "   "))
    doc = doc.lower()
    doc = remove_accented_chars(doc)
    #doc = contractions.fix(doc)
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = re.sub(' +', ' ', doc)
    doc = doc.strip()  
    norm_docs.append(doc)
  
  return norm_docs

In [183]:
norm_train_reviews = pre_process_corpus(train_reviews)


100%|██████████| 1410/1410 [00:03<00:00, 413.51it/s]


In [184]:
#Count Vecotorizer 

from sklearn.feature_extraction.text import CountVectorizer

In [214]:
# build BOW features on train reviews
cv = CountVectorizer(min_df=2, max_df=0.95,stop_words='english', ngram_range=(2,2))
cv_train_features = cv.fit_transform(norm_train_reviews)


In [215]:
cv_train_features.shape

(1410, 2340)

In [199]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [211]:
# build TFIDF features on train reviews
tv = TfidfVectorizer(use_idf=True, min_df=2, max_df=0.95, ngram_range=(2,2),
                     sublinear_tf=True, stop_words='english')
tv_train_features = tv.fit_transform(norm_train_reviews)

In [212]:
tv_train_features.shape

(1410, 2340)

In [213]:
norm_test_reviews = pre_process_corpus(test_reviews)

100%|██████████| 353/353 [00:00<00:00, 3404.30it/s]


In [216]:
cv_test_features = cv.fit_transform(norm_test_reviews)
tv_test_features = tv.fit_transform(norm_test_reviews)

print ('CV Test Features shape :', cv_test_features.shape)
print ('TF-IDF Test Features shape :', tv_test_features.shape)

CV Test Features shape : (353, 384)
TF-IDF Test Features shape : (353, 384)


# LDA - Latent Drichhlet Allocation

In [217]:
from sklearn.decomposition import LatentDirichletAllocation

In [218]:
LDA_cv = LatentDirichletAllocation(n_components=6,random_state=30)

In [219]:
LDA_cv.fit(cv_train_features)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=6, n_jobs=None,
                          perp_tol=0.1, random_state=30, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [221]:
cv.get_feature_names()

['20 min',
 '24 hours',
 '30 minutes',
 'able checkin',
 'absolutely ridiculous',
 'ac app',
 'ac website',
 'account app',
 'accurate flight',
 'add flight',
 'add google',
 'aeroplan account',
 'aeroplan number',
 'air canada',
 'airline app',
 'airplane mode',
 'altitude status',
 'android app',
 'anymore consider',
 'app acknowledge',
 'app air',
 'app app',
 'app available',
 'app awful',
 'app bad',
 'app boarding',
 'app book',
 'app check',
 'app come',
 'app completely',
 'app convenient',
 'app crashed',
 'app crashes',
 'app current',
 'app did',
 'app does',
 'app doesnt',
 'app dont',
 'app download',
 'app far',
 'app flight',
 'app going',
 'app great',
 'app horrible',
 'app ive',
 'app just',
 'app keeps',
 'app kept',
 'app login',
 'app mess',
 'app months',
 'app multiple',
 'app needs',
 'app negligible',
 'app opens',
 'app previous',
 'app really',
 'app says',
 'app sucks',
 'app takes',
 'app terrible',
 'app time',
 'app track',
 'app trash',
 'app typical',
 

In [223]:
for index,topic in enumerate(LDA_cv.components_):
    print(f'THE TOP 20 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

THE TOP 20 WORDS FOR TOPIC #0


IndexError: list index out of range

In [224]:
LDA_tv = LatentDirichletAllocation(n_components=6,random_state=42)

In [225]:
LDA_tv.fit(tv_train_features)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=6, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [226]:
for index,topic in enumerate(LDA_tv.components_):
    print(f'THE TOP 20 WORDS FOR TOPIC #{index}')
    print([tv.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

THE TOP 20 WORDS FOR TOPIC #0


IndexError: list index out of range

In [227]:
norm_reviews = pre_process_corpus(reviews)

100%|██████████| 1763/1763 [00:00<00:00, 2137.40it/s]


In [256]:
# build BOW features on train reviews
cov = CountVectorizer(min_df=1, max_df=0.5,stop_words='english', ngram_range=(3,3))
cov_features = cov.fit_transform(norm_reviews)

In [229]:
LDA_cov = LatentDirichletAllocation(n_components=6,random_state=30)

In [230]:
LDA_cov.fit(cov_features)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=6, n_jobs=None,
                          perp_tol=0.1, random_state=30, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [231]:
cov.get_feature_names()

['10 90',
 '10 accompany',
 '10 characters',
 '10 different',
 '10 doesnt',
 '10 fix',
 '10 hour',
 '10 load',
 '10 minute',
 '10 minutes',
 '10 new',
 '10 seconds',
 '10 time',
 '10 times',
 '10 tries',
 '10 uninstall',
 '100 buck',
 '100 million',
 '100 useless',
 '1000 percent',
 '1000 rent',
 '1000s flights',
 '100k option',
 '100k super',
 '100k travelers',
 '1022 search',
 '109 met',
 '10am boards',
 '10k build',
 '10kb mobile',
 '10th password',
 '10th update',
 '11 hour',
 '11 hours',
 '11 people',
 '11 taken',
 '11 years',
 '1100 roughly',
 '1122 improve',
 '1126am ft',
 '113019 update',
 '12 fees',
 '12 hour',
 '12 hours',
 '12 time',
 '12 wheelchairs',
 '120 time',
 '1200 agent',
 '121619 update',
 '1230pm easy',
 '13 reviews',
 '1309 departing',
 '14 hours',
 '14 time',
 '15 hours',
 '15 minutes',
 '15 release',
 '15 steps',
 '15 times',
 '150 people',
 '150 vouchers',
 '15th decided',
 '16 digit',
 '160 expensive',
 '160 million',
 '17 nov',
 '17th finally',
 '18 rented',


In [232]:
for index,topic in enumerate(LDA_cov.components_):
    print(f'THE TOP 20 WORDS FOR TOPIC #{index}')
    print([cov.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

THE TOP 20 WORDS FOR TOPIC #0
['app just', 'canada app', 'previous app', 'airline apps', 'open app', 'app wont', 'airplane mode', 'old app', 'customer service', 'app works', 'using app', 'airline app', 'flight info', 'new app', 'app doesnt', 'boarding passes', 'confirmation number', 'doesnt work', 'air canada', 'boarding pass']


THE TOP 20 WORDS FOR TOPIC #1
['connected wifi', 'pretty good', 'app needs', 'download app', 'doesnt work', 'old app', 'app sucks', 'check app', 'use app', 'check flight', 'new update', 'app does', 'customer service', 'new app', 'easy use', 'flight status', 'does work', 'boarding pass', 'boarding passes', 'air canada']


THE TOP 20 WORDS FOR TOPIC #2
['upcoming flights', 'app worked', 'ive used', 'aeroplan number', 'airline app', 'new version', 'half time', 'worst app', 'app does', 'completely useless', 'user friendly', 'new app', 'customer service', 'let check', 'wont let', 'old app', 'useless app', 'boarding passes', 'air canada', 'boarding pass']


THE TOP 

In [262]:
cov_features

<1763x26187 sparse matrix of type '<class 'numpy.int64'>'
	with 28269 stored elements in Compressed Sparse Row format>

In [264]:
top_cov = LDA_cov.fit(cov_features)

In [265]:
for index,topic in enumerate(LDA_cov.components_):
    print(f'THE TOP 20 WORDS FOR TOPIC #{index}')
    print([cov.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

THE TOP 20 WORDS FOR TOPIC #0
['boarding passes phone', 'display boarding pass', 'happened app does', 'air canada app', 'boarding pass line', 'load boarding pass', 'app boarding pass', 'doesnt work wont', 'able check app', 'just like airline', 'app does provide', 'boarding pass frustrating', 'wont let add', 'says network available', 'app worked great', 'way eboarding passes', 'ive used app', 'retrieve boarding pass', 'print boarding pass', 'app doesnt work']


THE TOP 20 WORDS FOR TOPIC #1
['app completely useless', 'save boarding passes', 'app having technical', 'use app time', 'load boarding pass', 'wont let log', 'app ive used', 'access boarding pass', 'times doesnt work', 'completely useless app', 'enter aeroplan number', 'doesnt let check', 'new air canada', 'boarding pass gate', 'retrieve boarding pass', 'boarding pass flights', 'worst app ive', 'just doesnt work', 'air canada app', 'turn airplane mode']


THE TOP 20 WORDS FOR TOPIC #2
['fly delta app', 'air canada rouge', 'googl

In [235]:
top_cov.shape

(1763, 6)

In [236]:
Data['LDA_topic_cv']=top_cov.argmax(axis=1)

In [237]:
Data

Unnamed: 0.1,Unnamed: 0,Review,LDA_topic_cv
0,0,Update: Now that the tech issues have been add...,2
1,1,Changing my rating since the problems are fixe...,1
2,2,App worked pretty much as expected. Could have...,3
3,3,Highly recommend when you travel with Air Cana...,3
4,4,The app went from very useful to absolutely us...,2
...,...,...,...
1758,3602,"Terrible to find the flight, information onli...",5
1759,3604,Worst App again.... Automatically shuts down. ...,4
1760,3609,My flight information just dissapears during l...,4
1761,3613,"This app just randomly loses your data, and cr...",4


# LSA Model

In [239]:
import os
import smart_open
import gensim

In [240]:
from gensim import models, corpora

In [255]:
from nltk import word_tokenize
from nltk.corpus import stopwords
 
NUM_TOPICS = 10
STOPWORDS = stopwords.words('english')
 
#def clean_text(text):
#    tokenized_text = word_tokenize(text.lower())
#    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
#    return cleaned_text
 
# For gensim we need to tokenize the data and filter out stopwords
tokenized_data = []
for text in norm_reviews:
    tokenized_data.append(cov_features)
 
 
# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)
 
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

TypeError: decoding to str: need a bytes-like object, csr_matrix found

In [None]:
# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [246]:
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))
 
print("=" * 20)

Topic #0: 0.047*"this app is completely useless prior to the redesign it had some flaws but all around worked ok and easy to use booking were automatically loaded if you booked through flight pass checkin was easy all boarding passes were displayed and your main screen was a countdown to your next flight now there not even a point of taking space up on your phone with this usless app" + 0.047*"update now that the tech issues have been addressed changing to 4 stars from 1 star earlier the checkin functionality is great and also like to call out ability to view incoming flights often 34 flights down need to have altitude status and progression integrated into the app" + 0.047*"what a horrible update trading updated color schemes for greatly decreased functionality makes absolutely no sense the app is now just a collection of links to websites where you used to be able to change seats bid for upgrades get your boarding pass etc these functions now just redirect you to a web browser even a

In [257]:
from sklearn.cluster import KMeans

NUM_CLUSTERS = 6
km = KMeans(n_clusters=NUM_CLUSTERS, max_iter=10000, n_init=100, random_state=42).fit(cov_features)
km

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=10000,
       n_clusters=6, n_init=100, n_jobs=None, precompute_distances='auto',
       random_state=42, tol=0.0001, verbose=0)

In [260]:
feature_names = cov.get_feature_names()
topn_features = 10
ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]

In [261]:
for cluster_num in range(NUM_CLUSTERS):
    key_features = [feature_names[index] 
                        for index in ordered_centroids[cluster_num, :topn_features]]
    #modules = cat_cluster[cat_cluster['kmeans_cluster'] == cluster_num]['Module'].values.tolist()
    print('CLUSTER #'+str(cluster_num+1))
    print('Key Features:', key_features)
    #print('Impacted Module', modules)
    print('-'*80)

CLUSTER #1
Key Features: ['way eboarding passes', 'like need connected', 'airport useless international', 'check airport useless', 'cell phone connected', 'didnt allow worked', 'passes security app', 'time cell phone', 'network wifi opened', 'flight arrived destination']
--------------------------------------------------------------------------------
CLUSTER #2
Key Features: ['air canada app', 'print boarding pass', 'app doesnt work', 'air canada rouge', 'boarding pass app', 'turn airplane mode', 'access boarding pass', 'save boarding pass', 'app worked great', 'retrieve boarding pass']
--------------------------------------------------------------------------------
CLUSTER #3
Key Features: ['carry bookings make', 'extremely delayed refreshing', 'based aeroplan number', 'make sure info', 'bookings make sure', 'new updated version', 'number youll manually', 'wont carry bookings', 'manually app extremely', 'sure info ahead']
---------------------------------------------------------------