In [1]:
# Six short course reviews used as the toy corpus for every cell below.
corpus = [
    'Great course. Love the professor.',
    'Great content. Textbook was great',
    'This course has very hard assignments. Great content.',
    'Love the professor.',
    'Hard assignments though',
    'Hard to understand.',
]

In [2]:
import pandas as pd
# One row per review, held in a single text column named 'reviews'.
df = pd.DataFrame({'reviews': corpus})

In [12]:
from textblob import TextBlob
# Score each review once: `.sentiment` carries both polarity and subjectivity,
# so the text does not have to be wrapped and analysed separately per column.
review_sentiments = [TextBlob(text).sentiment for text in df['reviews']]
df['polarity'] = [s.polarity for s in review_sentiments]
df['subjective'] = [s.subjectivity for s in review_sentiments]
print(df)

                                             reviews  polarity  subjective
0                  Great course. Love the professor.  0.650000    0.675000
1                  Great content. Textbook was great  0.800000    0.750000
2  This course has very hard assignments. Great c...  0.210417    0.727083
3                                Love the professor.  0.500000    0.600000
4                            Hard assignments though -0.291667    0.541667
5                                Hard to understand. -0.291667    0.541667


In [13]:
# sentiment analysis - bigram/trigram/ngram

from nltk.corpus import stopwords
# NLTK's English stop words, extended with 'though' for this corpus.
stoplist = [*stopwords.words('english'), 'though']

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
# Count every bigram and trigram that remains after stop-word removal.
c_vec = CountVectorizer(ngram_range=(2, 3), stop_words=stoplist)
ngrams = c_vec.fit_transform(df['reviews'])
# Summing down the rows gives each n-gram's total count over all reviews.
count_values = ngrams.toarray().sum(axis=0)
# vocabulary_ maps n-gram text -> its column index in `ngrams`.
vocab = c_vec.vocabulary_
# Rank (count, n-gram) pairs in descending order; ties fall back to the
# n-gram text, also descending.
df_ngram = pd.DataFrame(
    sorted(((count_values[col], term) for term, col in vocab.items()),
           reverse=True)
).rename(columns={0: 'frequency', 1: 'bigram/trigram'})
print(df_ngram)

    frequency             bigram/trigram
0           2             love professor
1           2           hard assignments
2           2              great content
3           1             textbook great
4           1            hard understand
5           1     hard assignments great
6           1          great course love
7           1               great course
8           1     great content textbook
9           1      course love professor
10          1                course love
11          1    course hard assignments
12          1                course hard
13          1     content textbook great
14          1           content textbook
15          1  assignments great content
16          1          assignments great


In [16]:
# Score each n-gram once: `.sentiment` carries both polarity and subjectivity,
# so the text does not have to be wrapped and analysed separately per column.
ngram_sentiments = [TextBlob(term).sentiment for term in df_ngram['bigram/trigram']]
df_ngram['polarity'] = [s.polarity for s in ngram_sentiments]
df_ngram['subjective'] = [s.subjectivity for s in ngram_sentiments]
print(df_ngram)

    frequency             bigram/trigram  polarity  subjective
0           2             love professor  0.500000    0.600000
1           2           hard assignments -0.291667    0.541667
2           2              great content  0.800000    0.750000
3           1             textbook great  0.800000    0.750000
4           1            hard understand -0.291667    0.541667
5           1     hard assignments great  0.254167    0.645833
6           1          great course love  0.650000    0.675000
7           1               great course  0.800000    0.750000
8           1     great content textbook  0.800000    0.750000
9           1      course love professor  0.500000    0.600000
10          1                course love  0.500000    0.600000
11          1    course hard assignments -0.291667    0.541667
12          1                course hard -0.291667    0.541667
13          1     content textbook great  0.800000    0.750000
14          1           content textbook  0.000000    0.000000

In [17]:
# Topic Modelling

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import make_pipeline
# TF-IDF weights over bigrams/trigrams feed a 3-component NMF factorisation.
tfidf_vectorizer = TfidfVectorizer(stop_words=stoplist, ngram_range=(2,3))
# Pin the seed so a Restart & Run All reproduces the same topics.
nmf = NMF(n_components=3, random_state=0)
pipe = make_pipeline(tfidf_vectorizer, nmf)
pipe.fit(df['reviews'])
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``; each row is read as one
            topic's weights and must support ``argsort()``.
        feature_names: Sequence indexed by the positions ``argsort()``
            returns, giving the term that belongs to each weight.
        n_top_words: How many terms to list per topic.

    Prints one ``Topic #<i>: term, term, ...`` line per topic, terms in
    descending weight order, followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print("Topic #%d: " % topic_idx + ", ".join(top_terms))
    print()
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
print_top_words(nmf, tfidf_vectorizer.get_feature_names_out(), n_top_words=3)

Topic #0: love professor, great course love, great course
Topic #1: hard assignments, assignments great, course hard assignments
Topic #2: textbook great, great content textbook, content textbook



In [18]:
# LDA Model

from sklearn.decomposition import LatentDirichletAllocation
# Same bigram/trigram TF-IDF features as the NMF cell, refitted for LDA.
# NOTE(review): LDA is usually fitted on raw term counts (CountVectorizer);
# the TF-IDF input is kept as in the original -- confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(stop_words=stoplist, ngram_range=(2,3))
# LDA is stochastic: without a fixed seed the topics change on every run.
lda = LatentDirichletAllocation(n_components=3, random_state=0)
pipe = make_pipeline(tfidf_vectorizer, lda)
pipe.fit(df['reviews'])
# NOTE(review): this repeats the print_top_words definition from the NMF cell
# above; consider keeping a single copy.
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``; each row is read as one
            topic's weights and must support ``argsort()``.
        feature_names: Sequence indexed by the positions ``argsort()``
            returns, giving the term that belongs to each weight.
        n_top_words: How many terms to list per topic.

    Prints one ``Topic #<i>: term, term, ...`` line per topic, terms in
    descending weight order, followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print("Topic #%d: " % topic_idx + ", ".join(top_terms))
    print()
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
print_top_words(lda, tfidf_vectorizer.get_feature_names_out(), n_top_words=3)

Topic #0: hard understand, love professor, assignments great
Topic #1: great course love, great course, course love professor
Topic #2: hard assignments, great content textbook, content textbook great

