In [1]:
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

import sys
import os
import pandas as pd

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words


from items import *
from dataframe import *
from chronos import *

import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline
import numpy as np

import calendar
from ggplot import *

import re

In [2]:
# Read both scraped dumps from disk and hand the raw text to `as_dataframe`
# (presumably provided by the star-imported project `dataframe` module).

with open('data/review_items.jl', 'r') as review_file:
    raw = review_file.read()
reviews = as_dataframe(raw)

with open('data/hotel_items.jl','r') as hotel_file:
    raw = hotel_file.read()
hotels = as_dataframe(raw)

In [3]:
# Column names available on the hotel frame.
hotels.columns.tolist()

['batch_id',
 'city',
 'description',
 'item_id',
 'item_title',
 'num_rooms',
 'recommendation_list',
 'site_name',
 'url']

In [4]:
# Peek at the raw recommendation snippets stored for the first hotel.
hotels['recommendation_list'][0]

['<div class="propertyLink" onclick="(ta.prwidgets.getjs(this,\'handlers\')).click(\'/Hotel_Review-g186609-d498425-Reviews-Park_House_Hotel-Galway_County_Galway_Western_Ireland.html\', \'HotelName\', 498425, true);(new Event(event)).stopPropagation();">Park House Hotel</div>',
 '<div class="propertyLink" onclick="(ta.prwidgets.getjs(this,\'handlers\')).click(\'/Hotel_Review-g186609-d226394-Reviews-Radisson_Blu_Hotel_Spa_Galway-Galway_County_Galway_Western_Ireland.html\', \'HotelName\', 226394, true);(new Event(event)).stopPropagation();">Radisson Blu Hotel &amp; Spa, Galway</div>',
 '<div class="propertyLink" onclick="(ta.prwidgets.getjs(this,\'handlers\')).click(\'/Hotel_Review-g186609-d214568-Reviews-Galway_Bay_Hotel-Galway_County_Galway_Western_Ireland.html\', \'HotelName\', 214568, true);(new Event(event)).stopPropagation();">Galway Bay Hotel</div>']

In [5]:
# Sanity check for the hotel-id regex before it is applied to the whole frame.
import re

pat = r"(?P<id>Hotel[\w_-]+)\.html"
prog = re.compile(pat)
origin = hotels.recommendation_list[0][0]

# A plain search on the first snippet should expose the named `id` group.
m = re.search(pat, origin)
print(m.group('id'))

# The compiled pattern should find exactly one id in that same snippet.
res = prog.findall(origin)
print(len(res))

# Every snippet must yield exactly one id, and the total must equal three
# per hotel; anything else is reported as 'Error'.
count = 0
for lst in hotels.recommendation_list:
    for strs in lst:
        if len(prog.findall(strs)) == 1:
            count += 1
        else:
            print('Error')
if count != 3 * len(hotels.recommendation_list):
    print('Error')

# test complete

Hotel_Review-g186609-d498425-Reviews-Park_House_Hotel-Galway_County_Galway_Western_Ireland
1


In [6]:
# production
# Extract the hotel id from each of a hotel's three recommendation snippets,
# store the ids in three dedicated columns, and then drop the raw HTML column.
pat = r"(?P<id>Hotel[\w_-]+)\.html" # hotel id pattern

prog = re.compile(pat)

recommend_1 = []
recommend_2 = []
recommend_3 = []
for recommend_list in hotels['recommendation_list']:
    # findall returns the captured id group for every match; keep the first one.
    recommended = [prog.findall(recommend)[0] for recommend in recommend_list]
    recommend_1.append(recommended[0])
    recommend_2.append(recommended[1])
    recommend_3.append(recommended[2])

# Each column receives its own list (previously all three columns were filled
# with the first recommendation).
hotels['recommend_1'] = recommend_1
hotels['recommend_2'] = recommend_2
hotels['recommend_3'] = recommend_3

# test
print(list(hotels.columns.values))

# clean
cleaned_hotels = hotels.drop('recommendation_list', axis=1)

# test
list(cleaned_hotels.columns.values)

['batch_id', 'city', 'description', 'item_id', 'item_title', 'num_rooms', 'recommendation_list', 'site_name', 'url', 'recommend_1', 'recommend_2', 'recommend_3']


['batch_id',
 'city',
 'description',
 'item_id',
 'item_title',
 'num_rooms',
 'site_name',
 'url',
 'recommend_1',
 'recommend_2',
 'recommend_3']

In [7]:
# Re-index the reviews by their rating timestamp column.
reviewts = reviews.set_index(keys='timestamp_rating')

In [8]:
# Review metadata: every column except the free-text review body.
review_meta = reviewts.drop(labels='review_text', axis='columns')
review_meta.columns.tolist()

['batch_id',
 'item_id',
 'rating',
 'rating_percentage',
 'review_id',
 'review_title',
 'site_name',
 'url',
 'user_id']

In [9]:
# Plain arrays of the review bodies and their ids, in the same row order.
review_text_array = reviewts['review_text'].values
review_ids = reviewts['review_id'].values

In [10]:
# Strip newline characters so every review becomes a single line of text.
# A fresh object array is built instead of assigning into the array obtained
# from `.values`: that array may share memory with the DataFrame (in-place
# writes would silently alter `reviewts`) and it is read-only when pandas
# copy-on-write is active.
review_text_array = np.array(
    [re.sub(u"\n", "", document) for document in review_text_array],
    dtype=object,
)

In [11]:
# Spot-check one review text after newline removal.
review_text_array[3]

'Our fifth stay at The Twelve and it continues to remain a favourite destination for my wife and I. On arrival the reception is always warm and welcoming, room was clean, the bath filled in about three minutes, the breakfast is consistently good and every single staff member is superb. Unfortunately we did not receive the usual little card and cake to our room (a lovely little touch that we always tell our friends about) and the Internet was down for the duration of our stay (only a mild irritant and I am sure beyond the control of the hotel at the time). Overall, a relaxing and vibrant boutique hotel and we will of course be back annually.'

In [12]:
# Create the output directory tree (the exist_ok flag needs python>=3.2).
# A 'dataset/hotel_meta' directory is not created at present.
for directory in (
    'dataset/review',
    'dataset/review_meta',
    'dataset/hotel',
    'dataset/text/documents',
    'dataset/text/sentences',
):
    os.makedirs(directory, exist_ok=True)



In [13]:
# Empty the output folders of any files left over from a previous run.

import os, shutil
folders = ['dataset/review','dataset/review_meta','dataset/hotel','dataset/text/documents','dataset/text/sentences']
for folder in folders:
    for entry_name in os.listdir(folder):
        file_path = os.path.join(folder, entry_name)
        try:
            if not os.path.isfile(file_path):
                # Sub-directories are left in place (recursive removal is disabled).
                continue
            os.unlink(file_path)
        except Exception as e:
            # Best effort: report the problem and carry on with the next entry.
            print(e)

In [14]:
# Persist the hotel table (without the raw recommendation HTML) as CSV.
cleaned_hotels.to_csv(path_or_buf='dataset/hotel/hotels.csv')

In [15]:
# Persist the review metadata (review text excluded) as CSV.
review_meta.to_csv(path_or_buf='dataset/review_meta/review_meta.csv')

In [16]:
# Write every review text to its own file, named after the review id.
# The encoding is pinned to UTF-8 so the export does not depend on the
# platform default, which can raise UnicodeEncodeError on non-ASCII text.
for review_id, document in zip(review_ids, review_text_array):
    with open("dataset/text/documents/{0}.txt".format(review_id), "w", encoding="utf-8") as f:
        f.write(document)