In [2]:
import pandas as pd
import os
import numpy as np
from wordcloud import WordCloud
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

In [3]:
# Load the train and test splits. The CSV files have no header row, so the
# columns come in as 0 and 1 and are renamed to 'label' and 'text' below.
# NOTE(review): DATA_DIR is an absolute, machine-specific path; point it at
# your local copy of the data (ideally via a relative path or env variable).
DATA_DIR = '/Users/marikhomeriki/code/marikhomeriki/product_review_analysis/raw_data'
COLUMN_NAMES = {0: 'label', 1: 'text'}

df = pd.read_csv(os.path.join(DATA_DIR, 'train_data', 'train.csv'), header=None)
df_test = pd.read_csv(os.path.join(DATA_DIR, 'test_data', 'test.csv'), header=None)

# Each frame is renamed from itself. Previously df_test was assigned
# df.rename(...), which silently replaced the test set with the training set.
df = df.rename(COLUMN_NAMES, axis=1)
df_test = df_test.rename(COLUMN_NAMES, axis=1)

In [4]:
# Preview the first five training rows (label + review text).
df.head(n=5)

Unnamed: 0,label,text
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [5]:
from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 

# Resources shared by every clean() call. They are built lazily on first use:
# clean() is applied row by row, and re-reading the stopword corpus and
# re-creating the lemmatizer for each review is pure repeated work.
_CLEAN_RESOURCES = {}


def _get_clean_resources():
    """Return the cached (punctuation_table, stop_words, lemmatizer) triple."""
    if not _CLEAN_RESOURCES:
        # Build everything first so a failure (e.g. a missing NLTK corpus)
        # leaves the cache empty and the next call simply retries.
        resources = {
            # Maps every punctuation character to a single space.
            'punctuation_table': str.maketrans(dict.fromkeys(string.punctuation, ' ')),
            'stop_words': frozenset(stopwords.words('english')),
            'lemmatizer': WordNetLemmatizer(),
        }
        _CLEAN_RESOURCES.update(resources)
    return (
        _CLEAN_RESOURCES['punctuation_table'],
        _CLEAN_RESOURCES['stop_words'],
        _CLEAN_RESOURCES['lemmatizer'],
    )


def clean(text):
    """Normalise one review into a list of lemmatized word tokens.

    Steps, in order: replace punctuation with spaces, lowercase, tokenize,
    keep purely alphabetic tokens, drop English stopwords, lemmatize.

    Args:
        text: The raw review string.

    Returns:
        A list of lemmatized tokens (possibly empty).
    """
    punctuation_table, stop_words, lemmatizer = _get_clean_resources()

    # One translate pass replaces the per-character replace() loop; the result
    # is the same because every punctuation mark becomes a space.
    lowercased = text.translate(punctuation_table).lower()
    tokenized = word_tokenize(lowercased)

    # isalpha() discards numbers and any mixed alphanumeric tokens.
    return [
        lemmatizer.lemmatize(word)
        for word in tokenized
        if word.isalpha() and word not in stop_words
    ]

In [6]:
# Keep only the first 10% of the training reviews for the analysis below.
train_fraction = 10 / 100
len_train = int(train_fraction * len(df["text"]))
train_sentences = df['text'].head(len_train)

In [7]:
# Wrap the sampled review texts in a one-column frame and preview it.
X = pd.DataFrame(data=train_sentences)
X.head(n=5)

Unnamed: 0,text
0,"Unfortunately, the frustration of being Dr. Go..."
1,Been going to Dr. Goldberg for over 10 years. ...
2,I don't know what Dr. Goldberg was like before...
3,I'm writing this review to give you a heads up...
4,All the food is great here. But the best thing...


In [8]:
# Clean every review and store the tokens as one space-separated string.
# The previous astype('str') cast stored the Python list repr
# ("['food', 'good']") rather than text. Joining gives real text in a single
# assignment, so re-running the cell is idempotent.
# NOTE(review): CountVectorizer's default token pattern should extract the same
# tokens from either form, so the vocabulary is expected to be unchanged.
X['clean_text'] = X['text'].apply(clean).map(' '.join)

In [12]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Topic-model configuration.
N_TOPICS = 8   # number of latent topics to extract
SEED = 42      # fixes the LDA initialisation so re-runs give the same topics

# Bag-of-words counts for the cleaned reviews.
vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(X['clean_text'])

# LDA is stochastic; without random_state every Restart-and-Run-All yields
# different topics, and the topic indices discussed below stop matching.
lda_model = LatentDirichletAllocation(n_components=N_TOPICS, random_state=SEED)

# Fit the model and get one row of topic weights per review.
lda_vectors = lda_model.fit_transform(data_vectorized)

In [13]:
def print_topics(model, vectorizer, n_top_words=10):
    """Print the highest-weighted words of every topic in a fitted model.

    For each row of ``model.components_`` this prints a ``Topic <idx>:`` line
    followed by a list of ``(word, weight)`` tuples, ordered from the largest
    weight to the smallest.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (each must support ``argsort`` and indexing).
        vectorizer: Object exposing ``get_feature_names_out()``, whose result
            maps a column index to its word.
        n_top_words: How many words to print per topic (default 10, the
            previously hard-coded value). Topics with fewer words print all
            of them.

    Raises:
        ValueError: If ``n_top_words`` is negative.
    """
    if n_top_words < 0:
        raise ValueError("n_top_words must be non-negative")

    # Look the vocabulary up once; it was previously re-fetched for every
    # single printed word.
    feature_names = vectorizer.get_feature_names_out()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so walk backwards from the end to take the
        # n_top_words largest weights in descending order.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

In [14]:
# Show the top words of each LDA topic.
print_topics(model=lda_model, vectorizer=vectorizer)

Topic 0:
[('food', 10783.024328373127), ('good', 10605.945704919364), ('place', 8154.214951823843), ('like', 7141.572862050609), ('chicken', 6863.074785821025), ('ordered', 5945.549126061695), ('restaurant', 5793.2713564248015), ('sauce', 5328.974158484404), ('salad', 5103.8994140836), ('one', 4496.946664591148)]
Topic 1:
[('room', 6058.257138200451), ('one', 3765.445847945894), ('like', 3724.8712833345435), ('place', 3290.165640710726), ('table', 2914.9035745839046), ('would', 2838.5277936559887), ('hotel', 2762.094935183864), ('time', 2485.9198648883926), ('night', 2481.5496341769954), ('back', 2454.270759895313)]
Topic 2:
[('food', 7869.403975386748), ('good', 7672.397086179339), ('place', 7095.723510417427), ('like', 4664.03065800661), ('get', 3769.5460558355862), ('always', 3538.5093496381105), ('great', 3298.823994222144), ('go', 3253.464811390565), ('love', 3146.2609462008095), ('one', 3109.006891868654)]
Topic 3:
[('food', 9940.100868407202), ('time', 8614.315990370196), ('serv