# Topic Analysis - LDA

In [20]:
#Class for reading and displaying topics and word clouds
from AdvancedAnalytics import TextAnalytics
import pandas as pd
# Classes for Text Preprocessing
import string
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
# sklearn methods for Preparing the Term-Doc Matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# sklearn methods for Extracting Topics using the Term-Doc Matrix
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

In [21]:
# Load the review workbook and keep only the columns used by this notebook.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running the notebook on another machine.
keep_columns = ['Review', 'description', 'points', 'price', 'Region']
data = pd.read_excel('D:/isen-656/CaliforniaCabernet.xlsx')[keep_columns]

In [22]:
# Corpus: the free-text review column and the number of reviews in it
reviews = data['description']
n_reviews = len(reviews)

# Text settings (not referenced by the cells visible in this section)
s_words = 'english'
ngram = (1, 2)
tf_matrix = 'tfidf'

# CountVectorizer settings (passed as max_features / max_df)
m_features = None
max_df = 0.5

# LatentDirichletAllocation settings
# (passed as n_components / max_iter / learning_offset / learning_method)
n_topics = 9
max_iter = 10
learning_offset = 10.0
learning_method = 'online'

In [23]:
# Build the term-count matrix using the TextAnalytics analyzer for tokenisation.
ta = TextAnalytics()
cv = CountVectorizer(max_df=max_df, min_df=2, max_features=m_features,
                     analyzer=ta.my_analyzer)
tf = cv.fit_transform(reviews)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
# list() keeps `terms` a plain Python list of str in both cases.
if hasattr(cv, "get_feature_names_out"):
    terms = list(cv.get_feature_names_out())
else:
    terms = cv.get_feature_names()
print('{:.<22s}{:>6d}'.format("Number of Reviews", len(reviews)))
print('{:.<22s}{:>6d}'.format("Number of Terms", len(terms)))


Number of Reviews..... 13135
Number of Terms.......  5595


In [24]:
# Column sums of tf: one total per term (a 1 x n_terms result, hence the [0, col] index)
term_sums = tf.sum(axis=0)
# Pair every term with its total as [term, total]
term_counts = [[term, term_sums[0, col]] for col, term in enumerate(terms)]
def sortSecond(e):
    """Sort key: return the element at index 1 of a [term, value] pair."""
    second = e[1]
    return second
# Rank terms by their total, largest first
term_counts.sort(key=sortSecond, reverse=True)
print("\nTerms with Highest Frequency:")
# Slice instead of indexing 0..9 so a vocabulary with fewer than ten terms
# prints what is available rather than raising IndexError.
for term, count in term_counts[:10]:
    print('{:<15s}{:>5d}'.format(term, count))


Terms with Highest Frequency:
wine            7439
tannin          5134
cherry          5123
cabernet        4968
oak             4670
black           4596
currant         4404
dry             4146
fruit           3543
rich            2947


In [25]:
## Construct the TF-IDF matrix from the term-frequency matrix
print("\nConstructing Term/Frequency Matrix using TF-IDF")
# norm defaults to 'l2'; norm=None suppresses the normalisation
tfidf_vect = TfidfTransformer(norm=None, use_idf=True)
# tf matrix is (n_reviews) x (m_terms)
# NOTE(review): tf is overwritten with its TF-IDF weighting, so running this
# cell a second time re-weights already-weighted values. Re-run the cell that
# builds tf with CountVectorizer before re-running this one.
tf = tfidf_vect.fit_transform(tf)
# Total TF-IDF weight of each term, summed over all reviews
term_idf_sums = tf.sum(axis=0)
term_idf_scores = [[term, term_idf_sums[0, col]]
                   for col, term in enumerate(terms)]
print("The Term/Frequency matrix has", tf.shape[0], " rows, and",
      tf.shape[1], " columns.")
print("The Term list has", len(terms), " terms.")
# Display the terms with the largest summed TF-IDF value
term_idf_scores.sort(key=sortSecond, reverse=True)
print("\nTerms with Highest TF-IDF Scores:")
# Slice instead of indexing 0..9 so fewer than ten terms cannot raise IndexError
for term, score in term_idf_scores[:10]:
    print('{:<15s}{:>8.2f}'.format(term, score))


Constructing Term/Frequency Matrix using TF-IDF
The Term/Frequency matrix has 13135  rows, and 5595  columns.
The Term list has 5595  terms.

Terms with Highest TF-IDF Scores:
wine           12619.14
cabernet       10151.85
tannin         10083.79
cherry         10070.38
black           9932.89
oak             9651.13
currant         9241.95
dry             9122.76
fruit           8436.35
rich            7429.52


In [26]:
# Fit the LDA model on tf; random_state is fixed so repeated runs use the same seed.
uv = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=max_iter,
    learning_method=learning_method,
    learning_offset=learning_offset,
    random_state=12345,
)
# U holds one row per review and one column per topic (indexed as U[i][j] below)
U = uv.fit_transform(tf)
# Store topic selection for each doc in topics[]: for every row of U, the
# index of the column with the largest absolute score (the lowest index wins
# ties). The vectorised argmax avoids binding a running maximum to the name
# `max`, which would shadow the built-in max() for every later cell.
topics = abs(U).argmax(axis=1).tolist()

In [27]:
# Topic scores: one record per review -> [assigned topic, score for each topic]
rev_scores = [[topics[i], *U[i]] for i in range(n_reviews)]

# Column names: "topic" followed by T1..Tn (1-based topic labels)
cols = ["topic"] + ["T" + str(k + 1) for k in range(n_topics)]
df_rev = pd.DataFrame.from_records(rev_scores, columns=cols)
# Drop topic columns left behind by an earlier run of this cell before joining;
# otherwise join() raises on the overlapping column names when the cell is re-run.
data = data.drop(columns=cols, errors="ignore").join(df_rev)

In [28]:
# Average points and price per assigned topic ("cluster")

Table = data[['points', 'price', 'topic']]
Table_aggregate = Table.groupby('topic', as_index=False)[['points', 'price']].mean()
# Topic ids are 0-based; shift them to 1-based labels. Deriving the labels from
# the ids (instead of a fixed 1..9 list) keeps them correct when n_topics
# changes or when some topic has no reviews assigned to it.
Table_aggregate['topic'] = Table_aggregate['topic'] + 1
Table_aggregate = Table_aggregate.rename(columns={'topic': 'cluster'})


In [29]:
# Display the mean points and mean price of each cluster (one row per cluster)

Table_aggregate

Unnamed: 0,cluster,points,price
0,1,88.777064,51.688419
1,2,89.24478,51.176056
2,3,89.42236,66.510903
3,4,87.408734,54.037752
4,5,89.281855,60.55452
5,6,89.003111,51.037406
6,7,89.463496,63.09268
7,8,89.459365,57.049152
8,9,86.087905,42.237874


In [30]:
# Display the top 15 terms (n_terms=15) for each topic of the fitted LDA model uv
# NOTE(review): display_topics is called on the TextAnalytics class rather than
# on the `ta` instance created earlier — presumably a static method; confirm.
TextAnalytics.display_topics(uv.components_, terms, n_terms=15, mask=None)

Topic #1: 
+valley        +dry           +napa          +vineyard      +everyday      
+cabernet      +nicely        +wine          +thin          +grape         
+source        +drink         +pleasant      +tannin        +supple        

Topic #2: 
+wrap          +mark          +show          +mint          +bordeaux      
+keep          +wine          +tannin        +spice         +especially    
+cherry        +napa          +know          +valley        +acidity       

Topic #3: 
+go            +frame         +bottle        +soften        +produce       
+case          +may           +wine          +sour          +production    
+mountain      +time          +vintage       +graphite      +develop       

Topic #4: 
+acid          +fruit         +finish        +wine          +note          
+somewhat      +tannin        +dry           +wood          +hot           
+black         +cherry        +raisin        +violet        +ageability    

Topic #5: 
+body          +full         