In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from gensim.models import Word2Vec
import sklearn
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from extract_from_es import getData
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/lokin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lokin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Cleaning text in reviews
def cleaning_reviews(hotel_id, reviews):
    clean_reviews = {}
    to_delete = set(string.punctuation) - {'.'}
    string_punc = "".join(to_delete)
    clean_reviews[hotel_id]= {}
#     print(reviews.keys())
    for key in reviews.keys():
        text = str(reviews[key]).split()
        table = str.maketrans('', '', string_punc)
        stripped_punctuation = [w.translate(table) for w in text]
        normalize_to_lower = [w.lower() for w in stripped_punctuation]
        clean_words = [x for x in normalize_to_lower if not re.match("[0-9]", x)]
        clean_sent = ' '.join(clean_words)
        clean_reviews[hotel_id][key] = clean_sent
    return clean_reviews

In [3]:
# Break review text into sentences by fullstop
def reviews_into_sentences(cleaned_reviews):
    for key in cleaned_reviews.keys():
            sentence_filter = []
            sentences = str(cleaned_reviews[key]).split(".")
            for sentence in sentences:
                if sentence!='':
                    sentence = sentence.strip()
                    sentence_filter.append(sentence)
                    cleaned_reviews[key] = sentence_filter
    return cleaned_reviews

In [4]:
def calculate_sentiment(cleaned_reviews):
    a1 = SentimentIntensityAnalyzer()
    sentiment_scores = {}
    for key in cleaned_reviews.keys():
        sentiment_scores[key] = {}
        for sent_index, sentence in enumerate(cleaned_reviews[key]):
            scores = a1.polarity_scores(sentence)
            sentiment_scores[key][sent_index] = scores
    return sentiment_scores

In [5]:
def find_features(cleaned_reviews, sentiment_scores):
    sentences_features = {}
    for key in cleaned_reviews.keys():
        sentences_features[key] = {}
        fet_lst = []
        for sentence_index, sentence in enumerate(cleaned_reviews[key]):
            feature_list = []
            word_tokens = nltk.word_tokenize(sentence)
            result = nltk.pos_tag(word_tokens)
            pos_tag_list, word_list, word_fin_list, pos_tag_fin_list = [],[],[],[]
            for i in range(len(result)):
                pos_tag_list.append(result[i][1])
                word_list.append(result[i][0])
            word_fin_list.append(word_list)
            pos_tag_fin_list.append(pos_tag_list)
            for w in range(len(word_fin_list)):
                for j in range(len(word_fin_list[w])):
                    if pos_tag_fin_list[w][j]=='NN':
                        for k in range(j+1,len(word_fin_list[w])):
                            if pos_tag_fin_list[w][k]=='CC':
                                break
                            if pos_tag_fin_list[w][k]=='JJ':
                                feature_list.append(word_fin_list[w][j])
                                break
                        sentences_features[key][sentence_index] = feature_list
                        fet_lst.append(feature_list)
        sentences_features[key]["feature_list"] = fet_lst
    return sentences_features

In [6]:
def get_unique_features(sentences_features):
    unique_features = {}
    for key in sentences_features.keys():
        unique_features[key] = {}
#         print(key)
        feat = sentences_features[key]["feature_list"]
        fet_lst = []
        fet_set = set()
        temp= []
        for each_ in feat:
            temp_str = "".join(each_)
            if temp_str not in fet_set:
                temp.append(each_)
                fet_set.add(temp_str)
        unique_features[key]["feature_list"] = temp
    return unique_features

In [7]:
def word_to_vec_hotel(hotel_id, unique_features):
    data_month = hotel_features = {}
    hotel_features[hotel_id] = {}
    for key in unique_features.keys():
        md = Word2Vec(unique_features[key]["feature_list"],min_count=1)
        words_ = list(md.wv.vocab)
        word2vec = {}
        for feat_name in unique_features[key]["feature_list"]:
            for f in feat_name:
                if f in md.wv.vocab:
                    word2vec[f] = md[f]
        data_month[key] = pd.DataFrame.from_dict(word2vec)
    hotel_features[hotel_id] = data_month
    return hotel_features

In [9]:
hotel_ids = ["611947","1418811","111428","84217"]
hotel_data = {}
for ids in hotel_ids:   
    reviews = getData(ids)
    cleaned_reviews = cleaning_reviews(ids,reviews)
    review_sentences = reviews_into_sentences(cleaned_reviews[ids])
    sentiment_scores_sentence = calculate_sentiment(review_sentences)
    sentences_features = find_features(review_sentences, sentiment_scores_sentence)
    unique_sentence_features = get_unique_features(sentences_features)
    hotel_features = word_to_vec_hotel(ids, unique_sentence_features)
    hotel_data[ids] = hotel_features

  # This is added back by InteractiveShellApp.init_path()


In [10]:
hotel_data.keys()

dict_keys(['611947', '1418811', '111428', '84217'])

In [14]:
hotel_data['84217']['February'].keys()

Index(['beverage', 'accent', 'david', 'recognition', 'program', 'hotel',
       'beach', 'staff', 'care', 'pool', 'resort', 'fee', 'bonus', 'marriott',
       'bonoy', 'energy', 'life', 'sheraton', 'sand', 'key', 'fun', 'sun',
       'please', 'thank', 'time', 'dose', 'happiness', 'part', 'sunshine',
       'beauty', 'side', 'park', 'mixture', 'playground', 'place', 'work',
       'environment', 'joy', 'stay', 'experience', 'cabanas', 'charge',
       'concession', 'day', 'usage', 'tranquil', 'island', 'location',
       'traffic', 'era', 'relaxation', 'therapy', 'interaction', 'view',
       'floor', 'fire', 'pit', 'vacation', 'bustle', 'property', 'person', 'i',
       'something', 'conference', 'parking', 'opt', 'room', '’', 's',
       'complaint', 'size', 'welcoming', 'transportation', 'information',
       'fitness', 'center', 'delight', 'tub'],
      dtype='object')

In [13]:
hotel_data['611947']['February'].keys()

Index(['york', 'hilton', 'director', 'hotel', 'feel', 'everyone', 'money',
       'resort', 'fee', 'majority',
       ...
       'breakfast', 'distance', 'radio', 'return', 'rappor', 'property', 'man',
       'ny', 'need', 'coffe'],
      dtype='object', length=153)