In [3]:
import re
import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", 200)

# NLTK
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk import FreqDist
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.stem.porter import *

# Gensim
import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Spacy for lemmatization
import spacy

# Plotting
# libraries for visualization
import pyLDAvis
import pyLDAvis.gensim
import seaborn as sns
import matplotlib.pyplot as plt
from numpy.random import normal
%matplotlib inline
from wordcloud import WordCloud

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

AttributeError: type object 'thinc.neural.ops.array' has no attribute '__reduce_cython__'

In [None]:
# Load the review dataset from disk
data = pd.read_csv(filepath_or_buffer='Data/top_50.csv')

# Summarise the columns / dtypes, then preview the first five rows
data.info()
data.head(n=5)

In [None]:
# function to plot most frequent terms
def freq_words(x, terms = 30):
    """Draw a bar chart of the most frequent whitespace-separated words.

    Parameters
    ----------
    x : iterable of str
        Documents whose words are counted. Entries that are not strings
        (for example NaN coming from a column with missing values) are
        skipped instead of making the join fail.
    terms : int, default 30
        How many of the most frequent words to show.

    Returns
    -------
    None
        The chart is rendered with matplotlib; nothing is returned. When
        there are no words at all a short message is printed instead.
    """
    # Skip non-string entries so a single missing review does not raise in join().
    all_words = ' '.join(text for text in x if isinstance(text, str)).split()

    fdist = FreqDist(all_words)
    if not fdist:
        # Nothing to rank or draw (e.g. the selected listing has no reviews).
        print("No words to plot.")
        return

    words_df = pd.DataFrame({'word':list(fdist.keys()), 'count':list(fdist.values())})

    # selecting the `terms` most frequent words
    d = words_df.nlargest(columns="count", n = terms)
    plt.figure(figsize=(20,5))
    ax = sns.barplot(data=d, x= "word", y = "count")
    plt.xticks(rotation='vertical')
    ax.set(ylabel = 'Count')
    plt.show()

In [None]:
# Parse the id as an integer first: float() rounds integers above 2**53, so a
# long id could end up matching a different listing. Fall back to float() so
# input such as "123.0" keeps working as before.
# NOTE(review): presumably listing_id holds integer ids -- confirm against the data.
_listing_raw = input("What listing are you looking at?\n")
try:
    listing = int(_listing_raw)
except ValueError:
    listing = float(_listing_raw)

In [None]:
# Inspect the container type of the review column
type(data.review)

In [None]:
# Keep only the review text that belongs to the chosen listing
valid_data = data.loc[data.listing_id == listing, 'review']

# Chart the raw word frequencies before any cleaning
freq_words(valid_data)

In [None]:
# Remove unwanted characters, numbers and symbols.
# The pattern is a regular expression, so regex=True must be passed explicitly:
# from pandas 2.0 on the default is a literal match, which would leave the
# text unchanged.
valid_data = valid_data.str.replace("[^a-zA-Z#]", " ", regex=True)

In [None]:
# Function to remove stopwords
def remove_stopwords(rev, stop_list=None):
    """Join the words of ``rev`` that are not stopwords into one string.

    Parameters
    ----------
    rev : iterable of str
        The words of a single review.
    stop_list : iterable of str, optional
        Words to drop. Defaults to the module-level ``stop_words`` list.

    Returns
    -------
    str
        The remaining words, in their original casing and order, separated
        by single spaces.
    """
    if stop_list is None:
        stop_list = stop_words
    # Compare case-insensitively: this function is called before the text is
    # lowercased, so "The" / "This" would otherwise survive the filter.
    blocked = {word.lower() for word in stop_list}
    return " ".join(word for word in rev if word.lower() not in blocked)

# Drop words shorter than three characters from every review
valid_data = valid_data.apply(
    lambda text: ' '.join(filter(lambda word: len(word) > 2, text.split()))
)

# Strip the stopwords from each review, then lowercase what is left
reviews = [remove_stopwords(text.split()).lower() for text in valid_data]

In [None]:
# Chart the 35 most frequent words of the cleaned reviews
freq_words(reviews, terms=35)

In [None]:
# conda install -c conda-forge spacy
# python -m spacy download en # one time run

In [None]:
# Load the small English pipeline by its package name: the 'en' shortcut link
# was removed in spaCy 3, while the full package name resolves on 2.x and 3.x.
# The parser and NER components are disabled here.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(texts, tags=('NOUN', 'ADJ')):
    """Lemmatize tokenised texts, keeping only selected parts of speech.

    Parameters
    ----------
    texts : iterable of list of str
        One list of words per document; the words are joined with spaces and
        passed through the module-level ``nlp`` pipeline.
    tags : collection of str, default ('NOUN', 'ADJ')
        Coarse-grained part-of-speech labels, compared against
        ``token.pos_``. The previous default used 'NN', which is a
        fine-grained ``token.tag_`` value and therefore never matched, so
        nouns were silently dropped.

    Returns
    -------
    list of list of str
        For every input document, the lemmas of the tokens whose ``pos_`` is
        in ``tags``.
    """
    # filter noun and adjective
    output = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        output.append([token.lemma_ for token in doc if token.pos_ in tags])

    return output

In [None]:
# Split every cleaned review into its list of words
tokenized_reviews = pd.Series(reviews).apply(str.split)

# Show the review stored under index label 1 as a quick check
print(tokenized_reviews[1])

In [None]:
# Lemmatize every tokenised review
reviews_2 = lemmatization(texts=tokenized_reviews)

# print lemmatized review (second entry)
print(reviews_2[1])

In [None]:
# Re-join each lemma list into a single space-separated string
reviews_3 = [' '.join(lemmas) for lemmas in reviews_2]

valid_data = reviews_3

# Chart the 35 most frequent lemmas
freq_words(valid_data, terms=35)

In [None]:
# Build the gensim token <-> id dictionary from the lemmatized reviews
dictionary = corpora.Dictionary(documents=reviews_2)
print(dictionary)

In [None]:
# Convert every lemmatized review into its bag-of-words vector
doc_term_matrix = list(map(dictionary.doc2bow, reviews_2))

In [None]:
# Short alias for the gensim LDA model class
LDA = gensim.models.ldamodel.LdaModel

# Fit a 5-topic model on the bag-of-words corpus; the fixed random_state keeps
# repeated runs comparable
lda_model = LDA(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=5,
    random_state=100,
    chunksize=1000,
    passes=50,
)

In [None]:
# Display the topics learned by the fitted LDA model
lda_model.print_topics()

In [None]:
# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()

# Prepare the interactive topic view from the model, its corpus and dictionary
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=doc_term_matrix,
    dictionary=dictionary,
)
vis

In [None]:
# Seed keywords for each review aspect (aspect name -> list of keywords).
aspect_keywords = {
    "Value": ["Price","Amount","Rate","Cheap","Worth","Low","Money","Economical","Reasonable","Fee","Expensive"],
    "Location": ["Railway","View","Station","Airport","Distance","Far","Close","Convenient","Train","Metro"],
    "Service": ["Desk","Check-in","Check-out","Reliable","Fast","Convenient"],
    "Meal": ["Drink","Breakfast","Spicy","Food","Tasty","Tea","Buffet","Bar","Restaurant","Dinner","Lunch","Brunch","Delicious"],
    "Facility": ["Pool","Spa","Wi-fi","Gymnasium","Gym","Internet","Ample","Parking","Wireless","Broken"],
    "Room": ["Bed","Dirty","Clean","Toilet","Bathroom","Shower","Dryer","Fridge","View"],
    # "Satusfactory" was a typo and could never match a real word
    "Quality": ["Satisfactory","Ample","Hygienic","Proper","Ambience","Odour","Smell"],
    "Staff": ["Good","Polite","Helpful","Friendly","Reliable","Quick"],
    "Surrounding": ["Landmark","Monument","Temple","Mosque","Church","Restaurant","Beach","Diner","Mall","Market"],
}

# NOTE(review): the mapping was originally bound to the name `dict`, which
# shadows the builtin for the rest of the kernel session. The alias is kept so
# any cell still reading `dict[...]` keeps working; prefer `aspect_keywords`
# and drop this line once nothing refers to the old name.
dict = aspect_keywords

In [None]:
# Convert all documents to TF Vectors (one bag-of-words list per review)
all_tf_vectors = list(map(dictionary.doc2bow, reviews_2))

In [None]:
# Turn each TF vector into a {token_id: 1} presence mapping; the term
# frequency itself is discarded.
all_data_as_dict = [
    {token_id: 1 for token_id, _tf in vector}
    for vector in all_tf_vectors
]

# Preview the first ten documents
print(all_data_as_dict[:10])

In [None]:
# Attach a label to every document: the first `num_crime_docs` entries get
# 'Crime', the rest 'Sports'.
# NOTE(review): `num_crime_docs` is not defined in any visible cell, and the
# 'Crime' / 'Sports' labels look carried over from another dataset -- confirm
# the intended split point and labels.
value_data = [(d, 'Crime') for d in all_data_as_dict[0:num_crime_docs]]
sports_data = [(d, 'Sports') for d in all_data_as_dict[num_crime_docs:]]

# The first list is named `value_data`; the original concatenation referenced
# an undefined `crime_data` and raised NameError.
all_labeled_data = value_data + sports_data