In [43]:
import pandas as pd
import pprint
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string,lxml,bs4
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [44]:
# Read the training reviews into a DataFrame and report how many rows were loaded.
print('Loading reviews...', end='')
df = pd.read_csv('train.csv')
print('[ok]')
review_count = len(df)
print('Loaded {:,} reviews'.format(review_count))

Loading reviews...[ok]
Loaded 426,340 reviews


In [45]:
df.columns # List the available columns to decide which ones are needed; the rest are dropped for now

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [46]:
# A relatively small number of NaN values were found in the Summary and Text
# columns. Replace them with '' so the string concatenation done later works.
# The filled columns are assigned back to `df` rather than calling
# fillna(inplace=True) on `df.Summary` / `df.Text`: that form is chained
# assignment on an intermediate Series and is not guaranteed to update `df`
# under pandas Copy-on-Write.
df[['Summary', 'Text']] = df[['Summary', 'Text']].fillna('')

# Drop the columns that are not needed for now.
df = df.drop(columns=['Id', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
                      'HelpfulnessDenominator', 'Time', 'ProductId'])

In [47]:
# Check which columns remain after dropping the unneeded ones.
df.columns

Index(['Score', 'Summary', 'Text'], dtype='object')

In [48]:
# Concatenate Summary and Text into a single `text` column (following the Dr.'s
# example); this is acceptable because no obvious correlation was found between them.
df['text'] = df['Summary'].add(' ').add(df['Text'])
df.drop(columns=['Summary', 'Text'], inplace=True)
# Character count of the combined text, measured before any HTML/punctuation cleaning.
df['length'] = df['text'].str.len()

In [49]:
# Check the columns after merging Summary/Text into `text` and adding `length`.
df.columns

Index(['Score', 'text', 'length'], dtype='object')

In [50]:
string.punctuation # the ASCII punctuation characters that tokenizer() removes from the review text

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [51]:
# Translation table that deletes every ASCII punctuation character.
# Built once here so it is not recreated for each review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def tokenizer(text):
    """Return ``text`` with HTML markup and ASCII punctuation removed.

    Despite its name this function does not split the text into tokens; it
    returns one cleaned string.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    str
        The text content with every character of ``string.punctuation``
        deleted. Characters are deleted, not replaced by a space, so
        "don't" becomes "dont".
    """
    # HTML must be stripped first: removing '<' and '>' beforehand would leave
    # the tag names behind as ordinary words.
    no_html = bs4.BeautifulSoup(text, 'lxml').get_text()
    return no_html.translate(_PUNCTUATION_TABLE)

In [52]:
# Keep an independent snapshot of the raw (uncleaned) text in case it is needed later.
df_original = df.copy(deep=True)

In [53]:
# Clean from the untouched backup rather than from df['text'] itself, so that
# re-running this cell never re-processes text that has already been cleaned.
df['text'] = df_original['text'].apply(tokenizer)

In [54]:

# Keep only the extreme ratings: 5-star reviews in Topdf, 1-star reviews in Botdf.
# Rows are selected directly with a boolean mask instead of copying the whole
# frame twice and dropping the unwanted rows by index. Equality on the score
# also keeps rows with a missing or out-of-range Score out of both frames.
# The original row index is preserved; .copy() makes each frame independent of df.
Topdf = df[df['Score'] == 5].copy()
Botdf = df[df['Score'] == 1].copy()


In [55]:
# Preview the first rows of the 5-star subset.
Topdf.head()

Unnamed: 0,Score,text,length
0,5,Very Good I received this product early from t...,207
1,5,Organic Kosher Tasty Assortment of Premium Tea...,1998
2,5,excellent glutenfree spaghetti great taste gre...,1146
3,5,Lindt is Lindt Buying this multipack I was mis...,201
4,5,YUM These bars are so good I loved them warmed...,164


In [56]:
#Topdf.to_csv('top.csv', sep=',', encoding='utf-8')

In [57]:
# Preview the first rows of the 1-star subset.
Botdf.head()

Unnamed: 0,Score,text,length
18,1,Plastic taste This is the first coffee I tried...,355
31,1,Dangerous for doggies Dont buy these I have a...,302
49,1,Ugh Fake Food I wanted a sugar splurge but I c...,264
55,1,worst products ever As a Chinese myself I neve...,241
56,1,Enormous ripoff This is nearly twice as expens...,156


In [58]:
# Vocabulary size cap used by both the TF-IDF and the term-count vectorizers.
no_features = 1000

#Topdf = Topdf.sample(frac=0.0025)  # optional: subsample for quick experiments
print('Loaded {:,} reviews' .format(len(Topdf)))

# TF-IDF weighted document-term matrix for the 5-star reviews (NMF is able to use tf-idf).
tfidf_vectorizer = TfidfVectorizer(max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(Topdf['text'])
# get_feature_names() no longer exists in newer scikit-learn releases; prefer
# get_feature_names_out() when available and fall back for older installs.
# list() gives the same container type on either path.
tfidf_feature_names = list(
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, 'get_feature_names_out')
    else tfidf_vectorizer.get_feature_names()
)


Loaded 272,492 reviews


In [59]:
# Raw term counts for the 5-star reviews. LDA needs raw counts (not tf-idf)
# because it is a probabilistic graphical model.
tf_vectorizer = CountVectorizer(stop_words='english', max_features=no_features)
tf = tf_vectorizer.fit_transform(Topdf['text'])
# get_feature_names() no longer exists in newer scikit-learn releases; prefer
# get_feature_names_out() when available and fall back for older installs.
# list() gives the same container type on either path.
tf_feature_names = list(
    tf_vectorizer.get_feature_names_out()
    if hasattr(tf_vectorizer, 'get_feature_names_out')
    else tf_vectorizer.get_feature_names()
)

In [80]:
def display_topics(model, feature_names, no_top_words):
    """Print each topic of ``model`` followed by its highest-weighted terms.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterated row by row; each row holds one topic's per-feature weights.
    feature_names : sequence of str
        Vocabulary, indexed by feature position.
    no_top_words : int
        Number of terms printed per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest_first = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[term_id] for term_id in strongest_first]
        print(" ".join(top_terms))
        
import mglearn as mglearn

def display_topics_mg(model, feature_names, no_top_words):
    """Print the top terms of every topic in ``model`` using mglearn's table layout.

    Parameters
    ----------
    model : object with a 2-D ``components_`` attribute
        One row of per-feature weights for each topic.
    feature_names : sequence of str
        Vocabulary, indexed by feature position.
    no_top_words : int
        Number of terms printed per topic.
    """
    # Per-topic feature indices ordered from highest to lowest weight.
    sorting = np.argsort(model.components_, axis=1)[:, ::-1]
    feature_names_arr = np.array(feature_names)
    # Use the model's real topic count and the caller's word count instead of
    # a fixed 20 for both, so models of any size print correctly.
    n_topics = sorting.shape[0]
    mglearn.tools.print_topics(topics=range(n_topics), feature_names=feature_names_arr,
                               sorting=sorting, topics_per_chunk=5, n_words=no_top_words)
        
        

In [81]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

# Number of topics extracted by each decomposition.
no_topics = 20

# Fit NMF on the raw term-count matrix `tf`.
# NOTE(review): newer scikit-learn releases appear to have replaced `alpha` with
# alpha_W / alpha_H - TODO confirm against the installed version.
nmfModel = NMF(
    n_components=no_topics,
    init='nndsvd',
    alpha=.1,
    l1_ratio=.5,
    random_state=1,
)
nmf = nmfModel.fit(tf)

In [82]:
# Number of terms listed under each topic in the printed tables.
no_top_words = 20
print("NMF")
display_topics_mg(model=nmf, feature_names=tf_feature_names, no_top_words=no_top_words)

NMF
topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
like          tea           coffee        food          great         
really        green         cup           dry           price         
dont          teas          strong        foods         snack         
tastes        drink         roast         dog           tastes        
try           black         keurig        eat           tasting       
im            bags          bold          quality       loves         
doesnt        cup           blend         diet          value         
think         iced          kcup          wellness      deal          
tried         organic       starbucks     canned        really        
stuff         drinking      kcups         feed          easy          
ive           bag           dark          healthy       makes         
didnt         stash         morning       health        recommend     
kn

In [83]:
# LSI: truncated SVD of the raw term-count matrix `tf`.
# random_state is fixed (same seed as the NMF runs) because TruncatedSVD's
# default solver is randomized; without a seed the topics change between runs.
lsiModel = TruncatedSVD(n_components=no_topics, random_state=1)
lsi = lsiModel.fit(tf)

In [84]:
# Print the LSI topics using the term-count vocabulary.
print("LSI")
display_topics_mg(model=lsi, feature_names=tf_feature_names, no_top_words=no_top_words)

LSI
topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
like          tea           coffee        coffee        great         
great         green         cup           food          product       
tea           teas          taste         tea           price         
good          drink         great         dog           dog           
coffee        coffee        flavor        cat           love          
just          cup           roast         cats          dogs          
food          black         strong        cup           coffee        
taste         flavor        keurig        dogs          tea           
love          drinking      bold          dry           amazon        
flavor        bags          dark          green         loves         
product       iced          chocolate     wellness      treats        
best          stash         like          canned        food          
re

In [85]:
# Fit NMF again, this time on the TF-IDF weighted matrix `tfidf`.
# NOTE(review): newer scikit-learn releases appear to have replaced `alpha` with
# alpha_W / alpha_H - TODO confirm against the installed version.
nmfidfModel = NMF(
    n_components=no_topics,
    init='nndsvd',
    alpha=.1,
    l1_ratio=.5,
    random_state=1,
)
nmfidf = nmfidfModel.fit(tfidf)

In [86]:
# Print the topics of the TF-IDF based NMF model using the TF-IDF vocabulary.
print("NMF-idf")
display_topics_mg(model=nmfidf, feature_names=tfidf_feature_names, no_top_words=no_top_words)

NMF-idf
topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
price         tea           coffee        food          great         
amazon        green         cup           dry           taste         
store         teas          strong        foods         snack         
buy           drink         bold          dogs          tasting       
local         iced          keurig        wellness      price         
stores        black         kcups         baby          value         
grocery       bags          roast         eat           tastes        
order         cup           smooth        quality       deal          
shipping      stash         kcup          canned        easy          
cheaper       chai          blend         feed          gift          
buying        drinking      morning       healthy       stuff         
happy         earl          coffees       diet          healthy      

In [67]:
# Run LDA
#ldaModel = LatentDirichletAllocation(n_components=1000, max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
#lda=ldaModel.fit(tf)


In [68]:
#print("LDA")
#display_topics(lda, tf_feature_names, no_top_words)


In [69]:
# Log Likelihood: the higher the better
#print("Log Likelihood: ", ldaModel.score(lda))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
#print("Perplexity: ", ldaModel.perplexity(lda))