In [6]:
import re
import pandas as pd
import json
import os 
import time

def _load_stop_words(stop_words_path):
    """Return the stop words listed one per line in stop_words_path as a set."""
    with open(stop_words_path, encoding="utf-8") as handle:
        stripped = (line.strip() for line in handle)
        # Blank lines carry no stop word and are ignored.
        return {word for word in stripped if word}


def review_cleaning(path, path_out, stop_words_path="english.txt"):
    """Clean the hotel review files found in ``path`` and save them in ``path_out``.

    Every regular file in ``path`` is parsed as a JSON document that provides
    ``['HotelInfo']['HotelID']`` and a ``['Reviews']`` list whose items have a
    ``'ReviewID'`` and a ``'Content'`` entry.  For each review, every
    character that is not an ASCII letter is replaced by a space, the text is
    lower-cased and split into words, and stop words are removed.  Reviews
    whose word list contains ``'showreview'`` are discarded.  The remaining
    reviews of a hotel are written to ``<path_out>/<HotelID>.csv`` with the
    columns ``ReviewID`` and ``Content``.

    Args:
        path: Folder containing the JSON files (a trailing separator is
            optional).
        path_out: Folder receiving the CSV files; it is created if missing.
        stop_words_path: Text file with one stop word per line.

    Returns:
        None.  The elapsed time is printed once all files are processed.

    Raises:
        OSError: If the stop-word file or the input folder cannot be read.
        KeyError: If a JSON document lacks one of the expected keys.
        ValueError: If a file does not contain valid JSON.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is its replacement.
    start = time.perf_counter()

    # Built once as a set so that each membership test below is O(1).
    stop_words = _load_stop_words(stop_words_path)
    non_letters = re.compile(r"[^a-zA-Z]")

    if path_out:
        os.makedirs(path_out, exist_ok=True)

    for file_name in os.listdir(path):
        source_file = os.path.join(path, file_name)
        # Sub-directories (e.g. notebook checkpoint folders) are not data files.
        if not os.path.isfile(source_file):
            continue

        with open(source_file, encoding="utf-8") as data:
            dt = json.load(data)

        hotel_id = dt['HotelInfo']['HotelID']

        # Map each review id to its cleaned word list; when an id occurs
        # several times the last review wins, as with dict(zip(...)).
        reviews = {}
        for review in dt['Reviews']:
            words = non_letters.sub(" ", review['Content']).lower().split()
            reviews[review['ReviewID']] = [
                word for word in words if word not in stop_words
            ]

        # We remove the bad reviews that contain the word showreview
        kept_reviews = {
            review_id: words
            for review_id, words in reviews.items()
            if 'showreview' not in words
        }

        # We save the data
        frame = pd.DataFrame({
            "ReviewID": list(kept_reviews),
            "Content": list(kept_reviews.values()),
        })
        frame.to_csv(os.path.join(path_out, str(hotel_id) + '.csv'), index=False)

    elapsed = time.perf_counter() - start
    print('Cleaning done in %.3f' % elapsed)

In [None]:
# Clean every review file found in 'json_original/' and write one CSV per hotel into 'cleaned_data/'.
review_cleaning('json_original/','cleaned_data/')