## 3.2 Course Review Topic Modeling

**Credit to the following Analytics Vidhya articles for the ideas and code:**  
https://www.analyticsvidhya.com/blog/2018/10/mining-online-reviews-topic-modeling-lda/  
https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import gensim
from gensim import corpora
import nltk
from nltk import FreqDist

%matplotlib inline
pd.options.display.max_colwidth = 350



---
### Course Name: Reviews of 'Machine Learning by Stanford University, Andrew Ng' on Coursera
### Data Import

In [2]:
# Load the course reviews from a CSV file; the path is relative to the
# notebook's working directory.
path = './data/reviews_machine_learning.csv'
reviews = pd.read_csv(path)

In [3]:
# Quick pre-processing: drop the leftover 'Unnamed: 0' column and remove every
# row that contains a null value.
# errors='ignore' keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising KeyError.
reviews.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')
reviews.dropna(inplace=True)

In [4]:
# Binary review label: ratings 1-3 are bad reviews (1), ratings 4-5 are good (0).
reviews['review_is_bad'] = reviews['rating'].lt(4).astype('int64')

### Data Pre-processing 
#### `tokenize`, `stopwords`, `lemmatize`, `pos_tag`

In [5]:
from nltk.corpus import wordnet

In [6]:
def get_wordnet_pos(pos_tag):
    """Return the wordnet POS constant that corresponds to a POS tag string.

    The choice is made on the tag's prefix: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # unknown tags are treated as nouns
    return wordnet.NOUN

In [7]:
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

In [8]:
def clean_text(text):
    """Normalize a raw review string into a space-separated string of lemmas.

    Steps, in order:
      1. lower-case the text and split it on any run of whitespace
         (spaces, tabs, newlines);
      2. strip leading/trailing punctuation from every token;
      3. drop tokens that contain a digit, English stop words and empty tokens;
      4. POS-tag the remaining tokens and lemmatize each one using the wordnet
         POS returned by ``get_wordnet_pos``;
      5. drop lemmas that are only one character long.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' when nothing survives).
    """
    # A set gives O(1) membership tests; the list returned by stopwords.words
    # would be scanned once per token.
    stop = set(stopwords.words('english'))
    # One lemmatizer per call instead of one per token.
    lemmatizer = WordNetLemmatizer()

    # split() with no argument splits on every whitespace run, so words
    # separated by newlines or tabs are not glued into a single token.
    tokens = [word.strip(string.punctuation) for word in text.lower().split()]

    # Stripping punctuation can leave empty tokens (e.g. a lone '-'), so the
    # emptiness check is still required here.
    tokens = [
        tok for tok in tokens
        if tok and tok not in stop and not any(c.isdigit() for c in tok)
    ]

    # pos tag, then lemmatize with the matching wordnet POS
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

    # remove words with only one letter
    lemmas = [lemma for lemma in lemmas if len(lemma) > 1]
    return " ".join(lemmas)

In [9]:
# Run the text cleaner over every review and keep the result in a new column.
reviews["review_clean"] = reviews["review"].apply(clean_text)

In [10]:
# Display the first row to compare the raw review with its cleaned version.
reviews.head(1)

Unnamed: 0,review,date_of_review,rating,course_href,review_is_bad,review_clean
0,"This is an extremely basic course. Machine learning is built on mathematics, yet this course treats mathematics as a mysterious monster to be avoided at all costs, which unfortunately left this student feeling frustrated and patronized. So much time is wasted in the videos with arduous explanations of trivialities, and so little taken up with t...","Mar 18, 2017",1,/learn/machine-learning,1,extremely basic course machine learn build mathematics yet course treat mathematics mysterious monster avoid cost unfortunately leave student feeling frustrate patronized much time waste videos arduous explanation triviality little take impart meaningful knowledge end abandon video altogether quizes basic largely base recall rather application ...


## LDA Modeling

In [11]:
from nltk import word_tokenize

In [12]:
def nouns_adj(text):
    """Keep only the nouns and adjectives of ``text``.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
    Tokens whose tag starts with 'NN' or 'JJ' are kept, in their original
    order, and returned as one space-separated string.
    """
    wanted_prefixes = ('NN', 'JJ')
    tagged_tokens = pos_tag(word_tokenize(text))
    kept = [token for token, tag in tagged_tokens if tag.startswith(wanted_prefixes)]
    return ' '.join(kept)

In [13]:
# Reduce each cleaned review to its nouns and adjectives for topic modeling.
reviews["review_lda"] = reviews["review_clean"].apply(nouns_adj)

In [14]:
# Series of noun/adjective-only review strings, used as input for the steps below.
lda_reviews = reviews["review_lda"]

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# Settings for the document-term matrix
min_df = 4            # a term must appear in at least this many reviews (document frequency)
max_features = 10000  # keep at most this many terms (unigrams and bigrams combined)

# ngram_range=(1, 2) counts single words and two-word phrases
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=min_df, max_features=max_features)

In [17]:
# Learn the vocabulary from the reviews and build the document-term count matrix.
vec_lda_reviews = vectorizer.fit_transform(lda_reviews.values)

In [18]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense document-term table: one row per review (same index as lda_reviews),
# one column per vocabulary term.
vec_lda_reviews_df = pd.DataFrame(
    vec_lda_reviews.toarray(),
    columns=feature_names,
    index=lda_reviews.index,
)
vec_lda_reviews_df.head()

Unnamed: 0,ability,ability complex,ability explain,able,able apply,able build,able complete,able course,able explain,able finish,...,zero knowledge,área,ótimo,за,курс,на,не,очень,по,что
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Split every review string on whitespace into a list of tokens.
tokenized_reviews = lda_reviews.apply(str.split)

In [20]:
# Build the gensim dictionary (token <-> integer id) from the tokenized reviews.
dictionary = corpora.Dictionary(tokenized_reviews)

In [21]:
# Convert each tokenized review into its bag-of-words representation.
doc_term_matrix = [dictionary.doc2bow(rev) for rev in tokenized_reviews]

In [22]:
# Short alias for gensim's LdaModel class (this binds the class itself; no model is built yet)
LDA = gensim.models.ldamodel.LdaModel

In [23]:
# Build LDA model: 6 topics, 50 passes over the bag-of-words corpus with a
# chunk size of 1000; random_state is fixed so repeated runs give the same topics.
lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=6, random_state=42,
                chunksize=1000, passes=50)

In [24]:
# Show the highest-weighted terms of each topic together with their weights.
lda_model.print_topics()

[(0,
  '0.022*"que" + 0.021*"curso" + 0.013*"un" + 0.013*"la" + 0.013*"muy" + 0.012*"para" + 0.011*"los" + 0.010*"el" + 0.009*"con" + 0.007*"excelente"'),
 (1,
  '0.066*"course" + 0.022*"great" + 0.019*"assignment" + 0.015*"program" + 0.014*"time" + 0.014*"ng" + 0.014*"exercise" + 0.014*"lecture" + 0.012*"easy" + 0.012*"lot"'),
 (2,
  '0.062*"ng" + 0.062*"course" + 0.058*"thank" + 0.031*"coursera" + 0.030*"thanks" + 0.025*"andrew" + 0.022*"much" + 0.020*"professor" + 0.018*"i" + 0.018*"best"'),
 (3,
  '0.055*"science" + 0.044*"data" + 0.030*"computer" + 0.020*"python" + 0.010*"benefit" + 0.009*"scientist" + 0.009*"thought" + 0.008*"update" + 0.007*"evaluate" + 0.007*"language"'),
 (4,
  '0.057*"course" + 0.041*"machine" + 0.029*"learn" + 0.016*"ml" + 0.014*"great" + 0.014*"good" + 0.013*"use" + 0.012*"algorithm" + 0.011*"work" + 0.011*"concept"'),
 (5,
  '0.128*"course" + 0.125*"machine" + 0.080*"learn" + 0.040*"learning" + 0.021*"good" + 0.021*"great" + 0.021*"start" + 0.020*"best" + 

In [25]:
import pyLDAvis
import pyLDAvis.gensim

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [26]:
# Visualize the topics
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.gensim_models - confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, doc_term_matrix, dictionary)
# the bare expression makes the notebook display the prepared visualization
vis

  and should_run_async(code)
