In [1]:
import pandas as pd
import pprint
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string,lxml,bs4
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [3]:
# Read the review corpus into a DataFrame, reporting progress on a single console line.
print('Loading reviews...', end='')
df = pd.read_csv('train.csv')
print('[ok]')
print('Loaded {:,} reviews'.format(df.shape[0]))

Loading reviews...[ok]
Loaded 426,340 reviews


In [4]:
df.columns  # Inspect the available columns to decide which ones to keep; the rest are dropped for now.

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [5]:
""" 
We found a relatively small number of NaN objects in the dataset, in both the Text and Summary features.
We will replace them with '' and drop the unneeded columns for now.
"""
# Fill missing Summary/Text values with '' on the frame itself. Calling
# fillna(..., inplace=True) on a column pulled out of the frame (df.Summary) is a
# chained in-place update: pandas warns about it, and with Copy-on-Write enabled
# it modifies a temporary and leaves `df` unchanged.
df = df.fillna({'Summary': '', 'Text': ''})

# Keep only the columns used by the analysis (Score, Summary, Text).
df = df.drop(columns=['Id', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
                      'HelpfulnessDenominator', 'Time', 'ProductId'])

In [6]:
# Check which columns remain after the drop.
df.columns

Index(['Score', 'Summary', 'Text'], dtype='object')

In [7]:
# Combine the Summary and Text features into one free-text column, following the
# Dr.'s example (acceptable here, since no obvious correlation was found), then
# record the character length of the combined text.
df = (
    df.assign(text=df['Summary'] + ' ' + df['Text'])
      .drop(columns=['Summary', 'Text'])
      .assign(length=lambda frame: frame['text'].str.len())
)

In [8]:
# Check the column layout after merging Summary and Text.
df.columns

Index(['Score', 'text', 'length'], dtype='object')

In [9]:
string.punctuation  # The set of characters that the cleaning function removes from the text.

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
# Text cleaner: removes HTML tags first, then punctuation.
def tokenizer(text):
    """Return `text` with HTML markup and punctuation removed.

    The input is parsed with BeautifulSoup (lxml parser) and only its text
    content is kept. Every character found in `string.punctuation` is then
    deleted (not replaced by a space). Despite the name, the result is one
    cleaned string, not a list of tokens.
    """
    visible_text = bs4.BeautifulSoup(text, 'lxml').get_text()
    # A deletion-only translation table drops each punctuation character.
    strip_table = str.maketrans('', '', string.punctuation)
    return visible_text.translate(strip_table)

In [11]:
df_original = df.copy()  # Backup taken before the text is cleaned, in case the raw input is needed later.

In [12]:
# Clean every review with `tokenizer`, overwriting the text column.
# The `length` column was computed before this step, so it reflects the uncleaned text.
df['text'] = df['text'].apply(tokenizer)

In [37]:

# Split out the two ends of the rating scale:
#   Topdf - reviews with Score >= 5 (the 5-star reviews)
#   Botdf - reviews with Score <= 1 (the 1-star reviews)
# Boolean masks select the rows directly instead of copying the whole frame twice
# and dropping the unwanted rows in place. The original index labels are kept,
# and .copy() makes each subset independent of `df`.
# Rows with a missing Score, if any, end up in neither subset.
Topdf = df[df['Score'] >= 5].copy()
Botdf = df[df['Score'] <= 1].copy()


In [38]:
# Preview the top-score subset.
Topdf.head()

Unnamed: 0,Score,text,length
0,5,Very Good I received this product early from t...,207
1,5,Organic Kosher Tasty Assortment of Premium Tea...,1998
2,5,excellent glutenfree spaghetti great taste gre...,1146
3,5,Lindt is Lindt Buying this multipack I was mis...,201
4,5,YUM These bars are so good I loved them warmed...,164


In [39]:
#Topdf.to_csv('top.csv', sep=',', encoding='utf-8')

In [40]:
# Preview the bottom-score subset.
Botdf.head()

Unnamed: 0,Score,text,length
18,1,Plastic taste This is the first coffee I tried...,355
31,1,Dangerous for doggies Dont buy these I have a...,302
49,1,Ugh Fake Food I wanted a sugar splurge but I c...,264
55,1,worst products ever As a Chinese myself I neve...,241
56,1,Enormous ripoff This is nearly twice as expens...,156


In [52]:
no_features = 1000  # Vocabulary size cap, passed to the vectorizer as max_features.

# Optional down-sampling for quick experiments (disabled).
#Topdf = Topdf.sample(frac=0.0025)
print('Loaded {:,} reviews' .format(len(Topdf)))  # Row count of the Topdf subset.

# NMF is able to use tf-idf
# The tf-idf path is currently disabled; the lines are kept for reference only.
# NOTE(review): in the first variant, an integer max_df=1 together with min_df=2
# looks contradictory (at most 1 document vs. at least 2) - confirm before re-enabling.
#tfidf_vectorizer = TfidfVectorizer(max_df=1, min_df=2, max_features=no_features, stop_words='english')
#tfidf_vectorizer = TfidfVectorizer(stop_words='english')
#tfidf = tfidf_vectorizer.fit_transform(Topdf['text'])
#tfidf_feature_names = tfidf_vectorizer.get_feature_names()


Loaded 272,492 reviews


In [53]:
# LDA is a probabilistic graphical model over word counts, so it needs raw term
# counts rather than tf-idf weights.
#tf_vectorizer = CountVectorizer(max_df=1, min_df=2, max_features=no_features, stop_words='english')
tf_vectorizer = CountVectorizer(stop_words='english', max_features=no_features)
tf = tf_vectorizer.fit_transform(Topdf['text'])

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old method only on versions that predate it.
# list(...) keeps the result a plain list, as before.
tf_feature_names = list(
    tf_vectorizer.get_feature_names_out()
    if hasattr(tf_vectorizer, 'get_feature_names_out')
    else tf_vectorizer.get_feature_names()
)

In [54]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

no_topics = 20  # Number of topics (components) to extract.

# Run NMF
# The model is fitted on the raw term-count matrix `tf`.
# NOTE(review): the `alpha` keyword is not accepted by recent scikit-learn releases
# (replaced by alpha_W / alpha_H, which scale the penalty differently) - confirm the
# installed version before re-running, and re-tune the value if the argument is migrated.
nmfModel = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd')
nmf= nmfModel.fit(tf)

In [56]:
# Run LDA
# Use the shared `no_topics` setting, like the NMF and LSI models. The previous
# hard-coded n_components=1000 asked for 1000 topics, which made the fit far more
# expensive than the other models (the recorded run was interrupted).
ldaModel = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
lda = ldaModel.fit(tf)

KeyboardInterrupt: 

In [None]:
# Log-likelihood: the higher the better (score is evaluated on the document-term matrix)
#print("Log Likelihood: ", ldaModel.score(tf))

In [None]:
# Perplexity: the lower the better. Perplexity = exp(-1. * log-likelihood per word)
#print("Perplexity: ", ldaModel.perplexity(tf))

In [None]:
# Run LSI (latent semantic indexing) as a truncated SVD of the term-count matrix.
# TruncatedSVD uses a randomized solver by default, so the seed is pinned to make
# the topics reproducible across runs, as is done for the NMF and LDA models.
lsiModel = TruncatedSVD(n_components=no_topics, random_state=0)
lsi = lsiModel.fit(tf)

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing `components_`, an iterable with one weight vector per topic.
    feature_names : sequence mapping a column index to its term.
    no_top_words : number of terms to print per topic.

    For each topic, prints a "Topic <n>:" header followed by one line holding the
    terms in descending order of weight, separated by spaces.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:no_top_words]
        top_terms = (feature_names[term_idx] for term_idx in strongest)
        print("Topic %d:" % (topic_idx))
        print(" ".join(top_terms))


In [None]:
no_top_words = 10  # Number of highest-weighted terms printed per topic.
print("NMF")
# Print the top terms of each NMF topic.
display_topics(nmf, tf_feature_names, no_top_words)


In [None]:
print("LDA")
# Print the top terms of each LDA topic; requires the LDA fit to have completed.
display_topics(lda, tf_feature_names, no_top_words)

In [None]:
print("LSI")
# Print the top terms of each LSI (TruncatedSVD) component.
display_topics(lsi, tf_feature_names, no_top_words)