In [5]:
import csv
import json
from collections import defaultdict
import matplotlib.pyplot as plt
from nltk.tokenize import TreebankWordTokenizer

In [15]:
# Convert the raw reviews CSV into reviews.json: a single object whose
# "reviews" key holds one dict per data row (every value is kept as a string).
# newline="" hands newline handling to the csv module, which matters here
# because review text contains line breaks inside quoted fields.
with open("reviews.csv", newline="") as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # the first row is the column header, not a review
    reviews = []
    for line in reader:
        review = {
            "row_number": line[0],
            "state": line[1],
            "area_name": line[2],
            "reviewer_name": line[3],
            "review_date": line[4],
            "rating": line[5],
            "text": line[6],
        }
        reviews.append(review)

# The output file is opened only after the CSV has been read in full, so an
# error while parsing does not leave a truncated reviews.json behind.
data_json = {"reviews": reviews}
with open("reviews.json", "w") as f:
    json.dump(data_json, f)

# Echo the final review as a quick check that the whole file was consumed
# (None when the CSV had no data rows).
last_line = reviews[-1] if reviews else None
print(last_line)

{'row_number': '18261', 'state': 'new-hampshire', 'area_name': 'whaleback-mountain', 'reviewer_name': 'David  Cook', 'review_date': '4th February 2017', 'rating': '2', 'text': "My dad and I visited on a whim on our way back from Montreal, coming out of Vermont. Whaleback is itty-bitty but offers a fun day of skiing and has a few trails that are surprisingly challenging (the one called YOOYM seems borderline suicidal ... we passed on it).\n\nI'd recommend it."}


In [3]:
# Read reviews.json back into the notebook-level `dataset` dict that the
# functions below consult. It starts out empty so the name exists even if
# opening or parsing the file fails.
dataset = {}
with open("reviews.json", "r") as f:
    dataset = json.loads(f.read())

# Data Exploration for Milestone 2

In [14]:
def getStatesAreasSortedByNumReviews(reviews=None):
    """Count reviews per state and per ski area, most-reviewed first.

    Args:
        reviews: optional iterable of review dicts, each with 'state' and
            'area_name' keys. When omitted, the notebook-level
            ``dataset['reviews']`` is used, which is what the original
            zero-argument call did.

    Returns:
        A pair ``(sorted_states, sorted_areas)``. Each element is a list of
        ``(name, review_count)`` tuples ordered by descending count; names
        with equal counts keep the order in which they were first seen.
    """
    if reviews is None:
        reviews = dataset['reviews']

    # Only the totals are needed, so count directly instead of collecting
    # every row index and taking len() afterwards.
    state_counts = defaultdict(int)
    area_counts = defaultdict(int)
    for review in reviews:
        state_counts[review['state']] += 1
        area_counts[review['area_name']] += 1

    def by_count_desc(counts):
        # sorted() is stable, so ties stay in first-seen (insertion) order.
        return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

    return by_count_desc(state_counts), by_count_desc(area_counts)

In [21]:
def makePlot(sorted_lst, top, byState, n=10):
    """Draw a bar chart of the entries with the most or the fewest reviews.

    Draws on the current pyplot figure; nothing is returned.

    Args:
        sorted_lst: sequence of entries where index 0 is the label and index 1
            is the review count, expected to be ordered from most to fewest
            reviews (the order getStatesAreasSortedByNumReviews produces).
        top: if truthy, plot the first ``n`` entries; otherwise plot the last
            ``n`` entries, starting with the very last one.
        byState: if truthy the x axis is labelled "State", else "Ski Area".
        n: maximum number of bars to draw (non-negative, default 10). Fewer
            bars are drawn when ``sorted_lst`` is shorter than ``n``.
    """
    if top:
        shown = sorted_lst[:n]
    else:
        # Walk from the tail so the smallest count comes first. Slicing gives
        # the same number of bars as the "top" case and copes with short lists.
        shown = sorted_lst[::-1][:n]
    plt.bar([entry[0] for entry in shown], [entry[1] for entry in shown])
    plt.xticks(rotation="vertical")
    xlab = "State" if byState else "Ski Area"
    plt.xlabel(xlab)
    plt.ylabel("Number of Reviews")
    q = "largest" if top else "fewest"
    title = xlab + "s with " + q + " number of reviews"
    plt.title(title)

# Compute both rankings once at cell level; each is a list of
# (name, review_count) tuples ordered by descending count.
sorted_states, sorted_areas = getStatesAreasSortedByNumReviews()
# Example call, left disabled: plots the ski areas with the fewest reviews.
# makePlot(sorted_areas, top=False, byState=False)

# End milestone 2
# Begin tokenizing dataset

In [7]:
# Notebook-level NLTK Treebank tokenizer instance, created once here.
tokenizer = TreebankWordTokenizer()
def tokenize_and_write_dataset(dataset, tokenizer, file_name="dataset_tokenized.json"):
    """Tokenize every review's text and dump the result to a JSON file.

    Args:
        dataset: dict with a "reviews" list; each review must provide
            'row_number', 'state', 'area_name', 'reviewer_name',
            'review_date', 'rating' and 'text'.
        tokenizer: any object exposing ``tokenize(text)``.
        file_name: path of the JSON file to (over)write.

    The written object maps ``int(row_number)`` to the review's metadata plus
    a 'tokens' entry; the raw 'text' is not carried over. json.dump serialises
    the integer keys as strings. Nothing is returned.
    """
    copied_fields = ('state', 'area_name', 'reviewer_name', 'review_date', 'rating')
    tokenized_by_row = {}
    for review in dataset["reviews"]:
        entry = {field: review[field] for field in copied_fields}
        entry['tokens'] = tokenizer.tokenize(review['text'])
        # A repeated row number overwrites the earlier entry.
        tokenized_by_row[int(review['row_number'])] = entry
    with open(file_name, "w") as out_file:
        json.dump(tokenized_by_row, out_file)