In [1]:
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

import sys
import os
import pandas as pd

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words


from items import *
from dataframe import *
from chronos import *

import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline
import numpy as np

import calendar
from ggplot import *

import re

In [2]:
# Load the scraped review and hotel items and parse them into DataFrames.

# Placeholder rating values and the numeric stand-in they are replaced with.
normalisation_dict = {'unknown':-1}

with open('data/review_items.jl', 'r') as f:
    raw = f.read()

# Keep only the date part of every "YYYY-MM-DDTHH:MM:SS" timestamp so the
# column can be parsed below with a plain "%Y-%m-%d" format.
datetime_re = re.compile(r'([0-9]{4}-[0-9]{2}-[0-9]{2})T[0-9]{2}:[0-9]{2}:[0-9]{2}')
normalised_raw = "\n".join(datetime_re.sub(r'\1', line) for line in raw.splitlines())

reviews = as_dataframe(normalised_raw)
reviews['timestamp_rating'] = pd.to_datetime(
    reviews['timestamp_rating'], format="%Y-%m-%d", errors='coerce'
)
# Both rating columns get the same treatment: swap the placeholders, then
# coerce anything that is still non-numeric to NaN.
for rating_column in ('rating', 'rating_percentage'):
    reviews[rating_column] = pd.to_numeric(
        reviews[rating_column].replace(normalisation_dict), errors='coerce'
    )

with open('data/hotel_items.jl','r') as f:
    raw = f.read()

hotels = as_dataframe(raw)

In [3]:
# Column names available on the parsed hotel items.
hotels.columns.tolist()

['batch_id',
 'city',
 'description',
 'item_id',
 'item_title',
 'num_rooms',
 'recommendation_list',
 'site_name',
 'url']

In [4]:
# Raw recommendation HTML snippets stored for the hotel at index label 0.
hotels['recommendation_list'][0]

['<div class="propertyLink" onclick="(ta.prwidgets.getjs(this,\'handlers\')).click(\'/Hotel_Review-g186609-d498425-Reviews-Park_House_Hotel-Galway_County_Galway_Western_Ireland.html\', \'HotelName\', true);(new Event(event)).stopPropagation();">Park House Hotel</div>',
 '<div class="propertyLink" onclick="(ta.prwidgets.getjs(this,\'handlers\')).click(\'/Hotel_Review-g186609-d214568-Reviews-Galway_Bay_Hotel-Galway_County_Galway_Western_Ireland.html\', \'HotelName\', true);(new Event(event)).stopPropagation();">Galway Bay Hotel</div>',
 '<div class="propertyLink" onclick="(ta.prwidgets.getjs(this,\'handlers\')).click(\'/Hotel_Review-g186609-d226394-Reviews-Radisson_Blu_Hotel_Spa_Galway-Galway_County_Galway_Western_Ireland.html\', \'HotelName\', true);(new Event(event)).stopPropagation();">Radisson Blu Hotel &amp; Spa, Galway</div>']

In [5]:
import re

# Pattern capturing the hotel id that precedes ".html" in a recommendation snippet.
pat = r"(?P<id>Hotel[\w_-]+)\.html"
prog = re.compile(pat)

# Single-snippet check: show the captured id and how many matches were found.
origin = hotels.recommendation_list[0][0]

m = re.search(pat, origin)
print(m.group('id'))

res = prog.findall(origin)
print(len(res))

# Whole-column check: each snippet must yield exactly one id, and the number
# of good snippets must equal three per hotel. Problems are reported by
# printing 'Error' rather than by raising.
count = 0
for snippets in hotels.recommendation_list:
    for snippet in snippets:
        if len(prog.findall(snippet)) == 1:
            count += 1
        else:
            print('Error')

expected = 3 * len(hotels.recommendation_list)
if count != expected:
    print('Error')

# test complete

Hotel_Review-g186609-d498425-Reviews-Park_House_Hotel-Galway_County_Galway_Western_Ireland
1


In [6]:
#production
# Turn each hotel's list of recommendation HTML snippets into three id columns
# (recommend_1..3) and build `cleaned_hotels` without the raw HTML column.
pat = r"(?P<id>Hotel[\w_-]+)\.html" # hotel id pattern

prog = re.compile(pat)

recommend_1 = []
recommend_2 = []
recommend_3 = []
for recommend_list in hotels['recommendation_list']:
    # With a single capture group, findall returns the captured ids; keep the
    # first id of every snippet, in the order the snippets are listed.
    recommended = [prog.findall(recommend)[0] for recommend in recommend_list]
    # Exactly three recommendations per hotel are expected; a shorter list
    # raises IndexError here instead of silently misaligning the columns.
    recommend_1.append(recommended[0])
    recommend_2.append(recommended[1])
    recommend_3.append(recommended[2])

# Each column gets its own list (previously all three were filled from
# recommend_1, so the second and third recommendations were lost).
hotels['recommend_1'] = recommend_1
hotels['recommend_2'] = recommend_2
hotels['recommend_3'] = recommend_3

# test
print(list(hotels.columns.values))

# clean
cleaned_hotels = hotels.drop('recommendation_list', axis=1)

# test
list(cleaned_hotels.columns.values)

['batch_id', 'city', 'description', 'item_id', 'item_title', 'num_rooms', 'recommendation_list', 'site_name', 'url', 'recommend_1', 'recommend_2', 'recommend_3']


['batch_id',
 'city',
 'description',
 'item_id',
 'item_title',
 'num_rooms',
 'site_name',
 'url',
 'recommend_1',
 'recommend_2',
 'recommend_3']

In [7]:
# Re-index the reviews by their rating timestamp column.
reviewts = reviews.set_index(keys='timestamp_rating')

In [8]:
# Review metadata: every column of the time-indexed reviews except the text body.
review_meta = reviewts.drop(labels='review_text', axis=1)
review_meta.columns.tolist()

['batch_id',
 'item_id',
 'rating',
 'rating_percentage',
 'review_id',
 'review_title',
 'site_name',
 'url',
 'user_id']

In [9]:
# Pull the review bodies and their ids out as arrays. Both are taken from the
# same frame, so position i in one corresponds to position i in the other.
# NOTE(review): .values may hand back a view onto the frame's data rather than
# a copy, in which case in-place edits of these arrays would also change
# `reviewts` — confirm whether that matters.
review_text_array = reviewts[:]['review_text'].values
review_ids = reviewts[:]['review_id'].values

In [10]:
# Remove newline characters from every review, overwriting the array entries in place.
newline_re = re.compile(u"\n")
for position in range(len(review_text_array)):
    review_text_array[position] = newline_re.sub("", review_text_array[position])

In [11]:
# Spot-check the review text stored at position 3.
review_text_array[3]

'We stayed here one night but the staff were super helpful. Very Nice and great view of the Bay, 10 min drive to the center of town. The pub in the hotel was really nice and the staff very courteous and polite. Would definitely like to go back for a longer stay.'

In [12]:
# Create the output directory tree; exist_ok needs python>=3.2.
# Not created at the moment: 'dataset/review', 'dataset/hotel_meta',
# 'dataset/text/sentences'.
for output_dir in (
    'dataset/review_meta',
    'dataset/hotel',
    'dataset/text/documents/raw',
):
    os.makedirs(output_dir, exist_ok=True)



In [13]:
# clean existing dataset
# Deletes the regular files sitting directly inside each listed folder.
# Sub-directories are left alone, and any error is printed instead of raised.
# NOTE(review): 'dataset/text/documents/raw' is a sub-directory, so the .txt
# files inside it are not removed by this cell — confirm that is intended.

import os, shutil
folders = ['dataset/review_meta','dataset/hotel','dataset/text/documents']
for folder in folders:
    candidate_paths = [os.path.join(folder, name) for name in os.listdir(folder)]
    for file_path in candidate_paths:
        try:
            if not os.path.isfile(file_path):
                continue
            os.unlink(file_path)
        except Exception as e:
            print(e)

In [14]:
# Persist the hotel table (recommendation_list already dropped) as CSV.
# index=False is not passed, so the row index is written as the first column.
cleaned_hotels.to_csv('dataset/hotel/hotels.csv')

In [15]:
# Persist the review metadata (everything except review_text) as CSV.
# index=False is not passed, so the timestamp_rating index is written as the first column.
review_meta.to_csv('dataset/review_meta/review_meta.csv')

In [16]:
import tarfile

# Write every review to its own text file named after the review id, and add
# each file to a gzip-compressed tar archive as it is written.
compressed = "dataset/text/documents/compressed.tgz"

# The with-block releases the archive handle even when a write fails partway
# through (previously close() was only reached on success).
with tarfile.open(compressed, "w:gz") as tar:
    # review_ids and review_text_array come from the same frame, so they are
    # walked in lockstep.
    for review_id, document in zip(review_ids, review_text_array):
        filename = "dataset/text/documents/raw/{0}.txt".format(review_id)
        # Explicit UTF-8 so non-ASCII review text does not depend on (or fail
        # under) the platform's default encoding.
        with open(filename, "w", encoding="utf-8") as f:
            f.write(document)
        tar.add(filename)