In [1]:
import csv
import json
import math
import numpy as np
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
from nltk.tokenize import TreebankWordTokenizer

In [2]:
def convert_raw_csv_dataset_to_json(csv_path="reviews.csv", json_path="reviews.json"):
    """
    Convert the raw review CSV into a JSON file shaped like {"reviews": [...]}.

    The first CSV row is treated as a header and skipped. The first seven
    columns of every following row are stored, as strings, under the keys
    row_number, state, area_name, reviewer_name, review_date, rating, text.
    Completely blank rows are ignored.

    Parameters:
        csv_path:  path of the CSV file to read (default "reviews.csv").
        json_path: path of the JSON file to write (default "reviews.json").

    Raises:
        IndexError: if a non-blank row has fewer than seven columns.
    """
    field_names = ("row_number", "state", "area_name", "reviewer_name",
                   "review_date", "rating", "text")
    reviews = []
    # newline="" hands line-ending handling to the csv module, which is needed
    # for quoted fields that contain embedded newlines to be parsed correctly.
    with open(csv_path, newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header row (no error on an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to store.
                continue
            reviews.append({name: row[col] for col, name in enumerate(field_names)})
    # The output is only opened once parsing succeeded, so a malformed CSV
    # does not leave a truncated JSON file behind.
    with open(json_path, "w") as f:
        json.dump({"reviews": reviews}, f)
    return

def load_untokenized_dataset(file_name="reviews.json"):
    """
    Parse the JSON file at `file_name` and return the resulting object.
    """
    with open(file_name, "r") as json_file:
        return json.load(json_file)

# Data Exploration for Milestone 2

In [3]:
def getStatesAreasSortedByNumReviews(dataset):
    """
    Count reviews per state and per ski area.

    Reads review['state'] and review['area_name'] for every entry of
    dataset['reviews'] and returns two lists of (name, review_count) tuples,
    (states, areas), each ordered from most to fewest reviews. Names with the
    same count keep the order in which they first appear in the dataset.
    """
    reviews_per_state = Counter()
    reviews_per_area = Counter()
    for review in dataset['reviews']:
        reviews_per_state[review['state']] += 1
        reviews_per_area[review['area_name']] += 1

    def ranked(tally):
        # sorted() is stable, so ties stay in first-seen order.
        return sorted(tally.items(), key=lambda pair: pair[1], reverse=True)

    return ranked(reviews_per_state), ranked(reviews_per_area)

def makePlot(sorted_lst, top, byState, n=10):
    """
    Draw a bar chart of review counts for the first or last `n` entries.

    Parameters:
        sorted_lst: sequence of (name, review_count) entries, ordered from
                    most to fewest reviews.
        top:        if truthy, plot the first `n` entries; otherwise plot the
                    last `n` entries, starting from the very last one.
        byState:    if truthy, label the x axis "State", else "Ski Area".
        n:          maximum number of bars to draw (non-negative, default 10).
                    Fewer bars are drawn when the list is shorter than `n`.
    """
    if top:
        selected = sorted_lst[:n]
    else:
        # Reverse first so the entry with the fewest reviews is plotted first,
        # then take n of them (the previous index loop stopped at 9 entries).
        selected = sorted_lst[::-1][:n]
    # Slicing, unlike indexing with range(10), cannot run past a short list.
    plt.bar([entry[0] for entry in selected], [entry[1] for entry in selected])
    plt.xticks(rotation="vertical")
    xlab = "State" if byState else "Ski Area"
    plt.xlabel(xlab)
    plt.ylabel("Number of Reviews")
    q = "largest" if top else "fewest"
    title = xlab + "s with " + q + " number of reviews"
    plt.title(title)
    return

# Load the raw reviews from the default JSON file and rank states and ski areas
# by how many reviews each has (most-reviewed first).
dataset_untokenized = load_untokenized_dataset()
sorted_states, sorted_areas = getStatesAreasSortedByNumReviews(dataset=dataset_untokenized)

In [9]:
# makePlot(sorted_states, top=True, byState=True)
# makePlot(sorted_states, top=False, byState=True)
# makePlot(sorted_areas, top=True, byState=False)
# makePlot(sorted_areas, top=False, byState=False)

# End milestone 2
# Begin tokenizing dataset

In [4]:
def tokenize_and_write_dataset(dataset, tokenizer, file_name="dataset_tokenized.json"):
    """
    Tokenize every review's text and dump the result to `file_name` as JSON.

    The written object maps each review's 'row_number' to a dict holding the
    review's state, area_name, reviewer_name, review_date and rating, plus a
    'tokens' list produced by tokenizer.tokenize(review['text']). The text is
    passed to the tokenizer unchanged (no case folding is done here).
    """
    copied_fields = ('state', 'area_name', 'reviewer_name', 'review_date', 'rating')
    tokenized_by_row = {}
    for raw_review in dataset["reviews"]:
        entry = {field: raw_review[field] for field in copied_fields}
        entry['tokens'] = tokenizer.tokenize(raw_review['text'])
        tokenized_by_row[raw_review['row_number']] = entry
    with open(file_name, "w") as out_file:
        json.dump(tokenized_by_row, out_file)
    return

In [24]:
# Tokenize the raw reviews and write them to the default tokenized-dataset file.
# The raw reviews are bound to `dataset_untokenized` by the loading cell; no
# variable called `dataset` is defined in this notebook, so passing that name
# raises NameError on a fresh kernel.
tokenizer = TreebankWordTokenizer()
tokenize_and_write_dataset(dataset=dataset_untokenized, tokenizer=tokenizer)

In [2]:
def load_tokenized_dataset(file_name="dataset_tokenized.json"):
    """
    Parse the JSON file at `file_name` and return the resulting object.
    """
    with open(file_name, "r") as json_file:
        return json.load(json_file)

def printReview(dataset, n):
    """
    Print one review from the tokenized dataset.

    Looks up dataset[str(n)] and prints three lines: the area name and state
    separated by a space, then the rating, then the token list.
    """
    entry = dataset[str(n)]
    location = (entry['area_name'], entry['state'])
    print(*location)
    for field in ('rating', 'tokens'):
        print(entry[field])

In [3]:
# Load the tokenized reviews from the default file (dataset_tokenized.json).
dataset_tokenized = load_tokenized_dataset()

In our tokenized dataset, we have the structure:
{
  "0": {
    'state': "california",
    'area_name': "squaw-valley-usa",
    'reviewer_name': "john smith",
    'review_date': "31st December 2019",
    'rating': "4",
    'tokens': ["we", "went", "to", ... ]
  },
  "1": {...}
}

In [4]:
def build_inverted_index(dataset):
    """
    Build an inverted index from the tokenized dataset.

    Returns a defaultdict(list) of the form:
    {
        "word" : [(row_num, term_frequency), ...]
    }
    where row_num is the dataset key of a review containing the word and
    term_frequency is how many times the word occurs in that review's tokens.
    """
    postings = defaultdict(list)
    for doc_id, doc in dataset.items():
        term_counts = Counter(doc['tokens'])
        for term, count in term_counts.items():
            postings[term].append((doc_id, count))
    return postings

In [5]:
# Map every token to the list of (row_num, term_frequency) pairs of reviews containing it.
inv_idx = build_inverted_index(dataset_tokenized)

In [6]:
def build_idf(inv_idx, min_df, max_df_ratio, n_docs):
    """
    Returns dict[term] = log2(n_docs / (1 + df)), where df is the number of
    postings the term has in inv_idx.

    Only terms with df >= min_df and df / n_docs <= max_df_ratio get an entry;
    every other term is left out of the returned dict.
    """
    doc_freqs = ((term, len(postings)) for term, postings in inv_idx.items())
    return {
        term: math.log(n_docs/(1+df), 2)
        for term, df in doc_freqs
        if df >= min_df and (df/n_docs) <= max_df_ratio
    }

In [7]:
# Keep only terms that occur in at least 10 reviews and in at most 30% of all reviews.
idf = build_idf(inv_idx, min_df=10, max_df_ratio=0.3, n_docs=len(dataset_tokenized))

In [13]:
# x = [(term, len(lst)) for term, lst in inv_idx.items()]
# print(sorted(x, key=lambda l:l[1], reverse=True)[10:30])
# Spot-check the IDF weight of one term (raises KeyError if the term was filtered out of idf).
print(idf["mountain"])

1.867979416207268


In [8]:
def build_doc_norms(inv_idx, idf, n_docs):
    """
    Compute the tf-idf vector norm of every document.

    For each term that has an entry in idf, (tf * idf[term]) ** 2 is added to
    the slot int(row_num) of every posting (row_num, tf) of that term; terms
    without an idf entry are skipped. Returns a list of n_docs floats holding
    the square roots of those sums.
    """
    squared_norms = np.zeros(n_docs)
    for term, postings in inv_idx.items():
        try:
            weight = idf[term]
        except KeyError:
            # Term has no idf entry, so it contributes nothing.
            continue
        for doc_id, term_freq in postings:
            squared_norms[int(doc_id)] += (term_freq*weight)**2
    return list(map(math.sqrt, squared_norms))

In [9]:
# One tf-idf norm per review, indexed by int(row_num).
norms = build_doc_norms(inv_idx, idf, len(dataset_tokenized))

In [10]:
def search(query, inv_idx, idf, doc_norms, tokenizer=None):
    """
    Rank documents against `query` by tf-idf cosine similarity.

    Parameters:
        query:     free-text query; it is lower-cased before tokenizing.
        inv_idx:   dict mapping term -> list of (doc_num, tf) postings.
        idf:       dict mapping term -> idf weight; query terms without an
                   entry are ignored.
        doc_norms: sequence of document norms, indexed by int(doc_num).
        tokenizer: object with a tokenize(str) method. Defaults to a
                   TreebankWordTokenizer created on each call.

    Returns:
        List of (score, doc_index) tuples sorted by descending score.
        Documents whose norm is 0 are omitted. If the query has no weight at
        all (no query term is in idf, or every matched weight is 0), an empty
        list is returned.
    """
    if tokenizer is None:
        # Built lazily so that defining this function does not instantiate a
        # tokenizer as a definition-time default.
        tokenizer = TreebankWordTokenizer()
    tf_q = Counter(tokenizer.tokenize(query.lower()))
    numerators = [0] * len(doc_norms)  # dot product of query and document, per document
    q_norm_sq = 0
    for term, q_count in tf_q.items():
        if term not in idf:
            continue
        w_iq = q_count * idf[term]  # query-side weight; constant across postings
        q_norm_sq += w_iq ** 2
        for doc_num, tf in inv_idx.get(term, ()):
            w_ij = tf * idf[term]
            numerators[int(doc_num)] += (w_iq*w_ij)
    q_norm = math.sqrt(q_norm_sq)
    if q_norm == 0:
        # Nothing in the query can be scored; dividing by q_norm below would
        # raise ZeroDivisionError.
        return []
    result = [(numerators[i]/(q_norm*doc_norms[i]), i)
              for i in range(len(doc_norms)) if doc_norms[i] != 0]
    return sorted(result, key=lambda x: x[0], reverse=True)

In [11]:
# Example query; `results` is a list of (score, row index) pairs, best match first.
query = "The biggest mountain on the east coast"
results = search(query, inv_idx, idf, norms)

In [12]:
# Show the ten best-scoring reviews together with the ski area each one is about.
print("***********")
print(query)
for score, idx in results[:10]:
    # The tokenized dataset is keyed by the row number as a string.
    print(score, dataset_tokenized[str(idx)]["area_name"])

***********
The biggest mountain on the east coast
0.49430364841221247 giants-ridge-resort
0.49430364841221247 jay-peak
0.44832083893128866 mad-river-glen
0.44832083893128866 mad-river-glen
0.39426417828179083 jackson-hole
0.3370198379857671 mt-baker
0.33613188983556824 sunday-river
0.32230170141813014 sunday-river
0.30954305627785605 sunday-river
0.2889183871285024 elk-mountain-ski-resort
