In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

In [2]:
# Load the review export; the bytes are decoded as ISO-8859-1 (Latin-1).
Data = pd.read_csv(
    'ModuleReviews.csv',
    encoding="ISO-8859-1",
)

In [3]:
# Show the loaded frame for a quick visual check of its columns and rows.
Data

Unnamed: 0,REVIEW,SENTIMENT,Module
0,Update: Now that the tech issues have been add...,Positive,Update
1,Changing my rating since the problems are fixe...,Positive,Update
2,App worked pretty much as expected. Could have...,Positive,Maps
3,Highly recommend when you travel with Air Cana...,Positive,Application Crashes
4,The app went from very useful to absolutely us...,Positive,Application Crashes
...,...,...,...
825,This app works OK generally but I have been ha...,Negative,My trips page
826,Whenever I get a notification about a flight f...,Negative,Application Crashes
827,I travel a lot on a number of airlines. Delta ...,Positive,My trips page
828,Had to keep adding my flight info so I could u...,Negative,Error Messages


In [4]:
# Count rows with a missing (True) vs. present (False) Module label.
Data['Module'].isna().value_counts()

False    828
True       2
Name: Module, dtype: int64

In [5]:
# Impute missing Module labels with the most frequent label
# (first entry of the mode Series when several labels tie).
Data['Module'] = Data['Module'].fillna(
    value=Data['Module'].mode().iloc[0]
)

In [6]:
# Re-run the missing-value count to confirm the imputation left no NaN labels.
Data['Module'].isna().value_counts()

False    830
Name: Module, dtype: int64

In [7]:
# Label distribution: number of reviews per Module, most frequent first.
Data['Module'].value_counts()

User Experience        314
My trips page          110
Application Crashes     65
Update                  64
Flight Status           50
Booking Page            49
Login Issues            45
Check-in                30
Boarding                30
Baggage page            16
Error Messages          14
Payment Page            13
User Data issue         11
Maps                     6
Navigation               4
Seat selection           4
Calender                 2
Tracking                 2
Search page              1
Name: Module, dtype: int64

In [8]:
# Merge 'Tracking' into 'Flight Status' and 'Search page' into 'Booking Page'.
# The result is assigned back rather than using replace(inplace=True) on
# `Data.Module`: mutating a column object selected from the frame is chained
# assignment, which does not update `Data` when pandas Copy-on-Write is active.
# Assigning also makes the cell idempotent on re-run.
# NOTE(review): the unique-label listing printed further down shows
# 'Navigation ' with a trailing space - consider stripping label whitespace.
Data['Module'] = Data['Module'].replace(
    {"Tracking": "Flight Status", "Search page": "Booking Page"}
)

In [9]:
# Label distribution again, to verify the two label merges above.
Data['Module'].value_counts()

User Experience        314
My trips page          110
Application Crashes     65
Update                  64
Flight Status           52
Booking Page            50
Login Issues            45
Check-in                30
Boarding                30
Baggage page            16
Error Messages          14
Payment Page            13
User Data issue         11
Maps                     6
Navigation               4
Seat selection           4
Calender                 2
Name: Module, dtype: int64

In [10]:
# Report how many distinct Module labels remain and list them
# (nunique() ignores NaN; unique() keeps first-seen order).
print ("Number of Unique values :", Data['Module'].nunique())
print ("List of Unique values :", Data['Module'].unique())

Number of Unique values : 17
List of Unique values : ['Update' 'Maps' 'Application Crashes' 'Payment Page' 'Flight Status'
 'User Experience' 'Booking Page' 'My trips page' 'Boarding' 'Check-in'
 'Login Issues' 'User Data issue' 'Baggage page' 'Error Messages'
 'Calender' 'Seat selection' 'Navigation ']


In [11]:
import seaborn as sns

# Bar chart of the number of reviews per Module label.
sns.countplot(data=Data, x='Module')

<matplotlib.axes._subplots.AxesSubplot at 0x1ecc84ab648>

In [12]:
# Pull the raw review texts and their Module labels out of the frame as arrays.
reviews, Categories = Data['REVIEW'].values, Data['Module'].values

# Text Wrangling and Normalization

In [13]:
from bs4 import BeautifulSoup
import re
import tqdm
import unicodedata

In [14]:
#function defined to remove html tags from data

def strip_html_tags(text):
  """Return ``text`` with HTML markup removed.

  ``<iframe>`` and ``<script>`` elements are dropped together with their
  contents, the remaining text is extracted with ``get_text()``, and every
  run of carriage returns / line feeds is collapsed into a single newline.
  """
  soup = BeautifulSoup(text, "html.parser")
  # Plain loop: extract() is called for its side effect only, so there is no
  # reason to build a throw-away list.
  for tag in soup(['iframe', 'script']):
    tag.extract()
  stripped_text = soup.get_text()
  # Inside a character class '|' is a literal, not alternation. The previous
  # pattern [\r|\n|\r\n] therefore also turned every '|' into a newline.
  stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
  return stripped_text

In [15]:
# function defined to strip accents and drop any remaining non-ASCII characters

def remove_accented_chars(text):
  """Return an ASCII-only version of ``text``.

  The text is NFKD-normalized so accented letters split into a base letter
  plus combining marks; encoding to ASCII with errors ignored then discards
  the marks and every other non-ASCII character.
  """
  decomposed = unicodedata.normalize('NFKD', text)
  ascii_bytes = decomposed.encode('ascii', 'ignore')
  return ascii_bytes.decode('utf-8', 'ignore')

In [16]:
# lower case and remove special characters\whitespaces

def pre_process_corpus(docs):
  """Normalize an iterable of raw text documents.

  For each document, in order: strip HTML, turn newlines/tabs/carriage
  returns into spaces, lower-case, remove accents, delete every character
  that is not an ASCII letter, digit or whitespace, collapse runs of spaces
  and trim surrounding whitespace.

  Returns a list of normalized strings in the same order as ``docs``.
  A tqdm progress bar is shown while iterating.
  """
  norm_docs = []
  for doc in tqdm.tqdm(docs):
    doc = strip_html_tags(doc)
    doc = doc.translate(str.maketrans("\n\t\r", "   "))
    doc = doc.lower()
    doc = remove_accented_chars(doc)
    # Flags must be passed by keyword: the 4th positional argument of re.sub
    # is `count`, so the old call capped the substitution at 258 matches
    # (re.I | re.A == 258) and applied no flags at all.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I | re.A)
    doc = re.sub(' +', ' ', doc)
    doc = doc.strip()
    norm_docs.append(doc)

  return norm_docs

In [17]:
# Normalize every raw review; the result is a list of cleaned strings
# in the same order as `reviews`.
norm_reviews = pre_process_corpus(reviews)

100%|██████████| 830/830 [00:00<00:00, 2582.18it/s]


In [18]:
# Count Vectorizer

from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# Build bag-of-words count features (unigrams + bigrams) over all normalized
# reviews; English stop words are removed, and terms outside the
# min_df / max_df document-frequency bounds are dropped.
cv = CountVectorizer(
    min_df=5,
    max_df=0.95,
    stop_words='english',
    ngram_range=(1, 2),
)
cv_features = cv.fit_transform(norm_reviews)

In [20]:
# Inspect the shape and sparsity of the count matrix (documents x vocabulary terms).
cv_features

<830x862 sparse matrix of type '<class 'numpy.int64'>'
	with 13912 stored elements in Compressed Sparse Row format>

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# Build TF-IDF features (unigrams + bigrams, sublinear term frequency) over
# all normalized reviews; English stop words are removed and terms appearing
# in fewer than 5 documents are dropped.
tv = TfidfVectorizer(
    use_idf=True,
    min_df=5,
    max_df=1.0,
    ngram_range=(1, 2),
    sublinear_tf=True,
    stop_words='english',
)
tv_features = tv.fit_transform(norm_reviews)

# LDA - Latent Dirichlet Allocation

In [23]:
from sklearn.decomposition import LatentDirichletAllocation

In [24]:
# Topic model with 10 topics; random_state is fixed so repeated fits match.
LDA = LatentDirichletAllocation(
    n_components=10,
    random_state=40,
)

In [25]:
# Fit the topic model on the bag-of-words count matrix.
LDA.fit(cv_features)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=40, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [26]:
# Vocabulary size of the count vectorizer.
len(cv.get_feature_names())

862

In [27]:
# The vocabulary is fixed once the vectorizer is fitted, so fetch it a single
# time instead of rebuilding the full list for every printed term.
cv_feature_names = cv.get_feature_names()
for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 20 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last 20 indices are the 20 highest-weighted
    # terms of the topic, with the strongest term printed last.
    print([cv_feature_names[i] for i in topic.argsort()[-20:]])
    print('\n')

THE TOP 20 WORDS FOR TOPIC #0
['having', 'number', 'delta app', 'crashes', 'work', 'trips', 'fly', 'phone', 'trip', 'just', 'boarding passes', 'use', 'passes', 'time', 'flight', 'boarding pass', 'pass', 'delta', 'boarding', 'app']


THE TOP 20 WORDS FOR TOPIC #1
['time', 'know', 'working', 'great app', 'dont', 'easy', 'updates', 'works', 'delta', 'airline', 'fly', 'needs', 'boarding', 'like', 'flight', 'use', 'just', 'flights', 'great', 'app']


THE TOP 20 WORDS FOR TOPIC #2
['good', 'day', 'flights', 'pass', 'flying', 'log', 'flight', 'easy', 'easy use', 'time', 'booking', 'account', 'password', 'use', 'better', 'works', 'air canada', 'air', 'canada', 'app']


THE TOP 20 WORDS FOR TOPIC #3
['tried', 'does', 'information', 'phone', 'ive', 'number', 'security', 'check', 'times', 'use', 'gate', 'boarding passes', 'just', 'flight', 'passes', 'time', 'boarding pass', 'pass', 'boarding', 'app']


THE TOP 20 WORDS FOR TOPIC #4
['just', 'trips', 'right', 'flight', 'like', 'work', 'fixed', 'ne

In [28]:
# Print the Module labels again, for side-by-side comparison with the topics above.
print ("List of Unique values :", Data['Module'].unique())

List of Unique values : ['Update' 'Maps' 'Application Crashes' 'Payment Page' 'Flight Status'
 'User Experience' 'Booking Page' 'My trips page' 'Boarding' 'Check-in'
 'Login Issues' 'User Data issue' 'Baggage page' 'Error Messages'
 'Calender' 'Seat selection' 'Navigation ']


In [29]:
# Fit the topic model on the TF-IDF matrix.
# NOTE(review): this refits the same `LDA` object that was fitted on
# `cv_features` earlier, so the count-based components are overwritten; any
# earlier cell re-run after this point will show the TF-IDF topics instead.
LDA.fit(tv_features)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=40, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [30]:
# Vocabulary size of the TF-IDF vectorizer.
len(tv.get_feature_names())

862

In [31]:
# LDA was last fitted on `tv_features`, so its component columns index the
# TF-IDF vectorizer's vocabulary: look terms up in `tv`, not `cv`.
# The vocabulary is fetched once, outside the loop.
tv_feature_names = tv.get_feature_names()
for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 20 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last 20 indices are the 20 highest-weighted
    # terms of the topic, with the strongest term printed last.
    print([tv_feature_names[i] for i in topic.argsort()[-20:]])
    print('\n')

THE TOP 20 WORDS FOR TOPIC #0
['app works', 'tried', 'experience', 'times', 'ac', 'just', 'flight', 'better', 'use', 'time', 'boarding pass', 'work', 'boarding', 'works', 'pass', 'airlines', 'delta', 'new', 'log', 'app']


THE TOP 20 WORDS FOR TOPIC #1
['boarding', 'issue', 'reviews', 'dont', 'makes', 'just', 'easy', 'apps', 'checkin', 'functionality', 'needs', 'flights', 'updates', 'like', 'friendly', 'user friendly', 'great app', 'user', 'great', 'app']


THE TOP 20 WORDS FOR TOPIC #2
['using', 'login', 'booked', 'version', 'boarding pass', 'pass', 'account', 'flight', 'boarding', 'reset', 'password', 'flights', 'info', 'time', 'excellent', 'crashes', 'app', 'use', 'easy use', 'easy']


THE TOP 20 WORDS FOR TOPIC #3
['doesnt', 'useless', 'flights', 'confirmation', 'works', 'great', 'open', 'flight', 'information', 'delta', 'working', 'boarding pass', 'just', 'use', 'time', 'pass', 'boarding', 'number', 'having', 'app']


THE TOP 20 WORDS FOR TOPIC #4
['fields', 'fast', 'info', 'love'

# Word 2 Vec in LDA

In [33]:
import os
import smart_open
import gensim


In [38]:
from gensim.test.utils import datapath
from gensim import utils

class MyCorpus(object):
    """An iterator that yields sentences (lists of str).

    Parameters
    ----------
    documents : iterable of str, optional
        Raw text documents to tokenize. When omitted, the ``REVIEW`` column
        of the notebook-level ``Data`` frame is used.

    The previous implementation passed the ``Data`` DataFrame to ``open()``,
    which raises ``TypeError`` as soon as the corpus is iterated (and never
    closed the file handle it tried to create).
    """

    def __init__(self, documents=None):
        self.documents = documents

    def __iter__(self):
        # Resolve the default lazily so the class can be defined before Data exists.
        documents = Data['REVIEW'] if self.documents is None else self.documents
        for document in documents:
            # One document per item; simple_preprocess lower-cases and tokenizes it.
            yield utils.simple_preprocess(document)

In [40]:
import gensim.models

# Word2Vec expects an iterable of token lists. `norm_reviews` holds one plain
# string per review, and iterating a string yields single characters, so the
# model would learn character vectors. Split each normalized review on
# whitespace first.
sentences = [review.split() for review in norm_reviews]
model = gensim.models.Word2Vec(sentences=sentences)