# Data Exploration

In [6]:
import copy
import json
import numpy as np
from collections import Counter
from operator import itemgetter

# Path of the JSON dataset, resolved relative to the current working directory.
file_name_data = 'proj1_data.json'

# Name the encoding explicitly: without it open() falls back to the platform
# locale, which can fail or mis-decode non-ASCII text in the JSON file.
with open(file_name_data, encoding='utf-8') as fp:
    data = json.load(fp)

## Pre-processing Data

In [2]:
def process_data(data_list):
    """
    brief: Lower-cases each data point's text, splits it on single space
           characters, and encodes its is_root flag as an integer
    param data_list: list of dictionaries, each holding a 'text' string and an
                     'is_root' value (boolean True/False, or the string
                     'true'/'false')
    return: new list of dictionaries in which 'text' is a list of lower-case
            tokens and 'is_root' is 1 or 0; data_list itself is not modified
    """

    # Deep copy so that the caller's data is never overwritten.
    result = copy.deepcopy(data_list)
    for entry in result:
        # split(' ') (not split()): consecutive spaces yield empty-string
        # tokens, and tabs/newlines stay inside tokens.
        entry['text'] = entry['text'].lower().split(' ')

        # json.load turns JSON true/false into Python booleans, which never
        # compare equal to the string 'true'; accept both representations.
        is_root = entry['is_root']
        entry['is_root'] = int(is_root is True or is_root == 'true')

    return result

## Helper Functions

In [3]:
def concatenate_all_text(data_list):
    """
    brief: Flattens the 'text' entries of all data points into one list
    param data_list: list of dictionaries, each with an iterable 'text' entry
    return: list with every item of every 'text' entry, in input order
    """
    return [token for entry in data_list for token in entry['text']]

def get_top_words(data_list, n_top_words=160):
    """
    brief: Finds the most frequent words across the 'text' entries of all data points
    param data_list: list of dictionaries, each with a 'text' list of words
    param n_top_words: how many words to return (default 160)
    return: list of the n_top_words most frequent words, most frequent first;
            words with equal counts keep the order in which they first appear
    raises: AssertionError if the data holds fewer than n_top_words distinct words
    """

    # Count incrementally instead of first building one giant list of all words.
    counts = Counter()
    for entry in data_list:
        counts.update(entry['text'])

    # Raised explicitly (same exception type as the former assert statement)
    # so that the check is not stripped when Python runs with -O.
    if len(counts) < n_top_words:
        raise AssertionError(
            'Too many top words: requested %d, but only %d distinct words exist'
            % (n_top_words, len(counts))
        )

    return [word for word, _ in counts.most_common(n_top_words)]

def get_top_words_count(data_point, top_words):
    """
    brief: Counts how often each top word occurs in a single data point's text
    param data_point: dictionary whose 'text' entry supports .count(word)
    param top_words: sequence of words to count
    return: 1-D float numpy array; position i holds the count of top_words[i]
    """
    text = data_point['text']
    occurrences = [text.count(word) for word in top_words]

    # dtype=float matches the array np.zeros() would produce, including the empty case.
    return np.array(occurrences, dtype=float)

def add_top_words_count_to_data(data_list, top_words):
    """
    brief: Attaches a top-word count vector to every data point
    param data_list: list of dictionaries accepted by get_top_words_count
    param top_words: sequence of words whose occurrences are counted
    return: deep copy of data_list in which each dictionary has an extra
            'top_word_count' entry; data_list itself is not modified
    """
    enriched = copy.deepcopy(data_list)

    for entry in enriched:
        entry['top_word_count'] = get_top_words_count(entry, top_words)

    return enriched

## Split Data into Training, Validation, and Test Sets

In [4]:
# Pre-process the whole dataset once, then partition it by position:
# first 10000 points for training, next 1000 for validation, the rest for test.
X = process_data(data)
X_train, X_validation, X_test = X[:10000], X[10000:11000], X[11000:]

# Sanity-check the partition sizes before going any further.
assert len(X_train) == 10000, 'Expected 10000. Got %d' % len(X_train)
assert len(X_validation) == 1000, 'Expected 1000. Got %d' % len(X_validation)
assert len(X_test) == 1000, 'Expected 1000. Got %d' % len(X_test)

## Word Count Features

In [7]:
# Build the vocabulary from the training split only: its 160 most frequent words.
top_160_words = get_top_words(X_train, n_top_words=160)
assert len(top_160_words) == 160, 'Expected 160. Got %d' % len(top_160_words)

# Rebind X_train to a copy whose points carry a 'top_word_count' vector.
X_train = add_top_words_count_to_data(X_train, top_160_words)