# Data Exploration

In [1]:
import copy
import json
import numpy as np
from collections import Counter
from operator import itemgetter

file_name_data = 'proj1_data.json'

with open(file_name_data) as fp:
    data = json.load(fp)

# preprocessing data 

In [2]:
def process_data(data_list):
    """
    brief: Converts text data to lower case, splits it by whitespace, and encodes the is_root feature on 
    json data point provided by projmaterials1 param data: list of dictionary type data to perform text 
    lower case conversion, text splitting, and is_root encoding return: list of dictionary type data that's 
    been processed
    """
    
    # Deep copy data as to avoid overwriting which could cause unintented side effects
    result = copy.deepcopy(data_list)
    for index, value in enumerate(result):
        result[index]['is_root'] = int(value['is_root'] == True)
        del (result[index]['text'])
    return result

In [None]:
process_data(data)

# Split Data into Training, Validation, and Test Sets

In [3]:
X = process_data(data)
X_train = X[0:10000]
X_validation = X[10000:11000]
X_test = X[11000:]

#print(len(X_validation))

assert len(X_train) == 10000 , 'Expected 10000. Got %d' % len(X_train)
assert len(X_validation) == 1000 , 'Expected 1000. Got %d' % len(X_validation)
assert len(X_test) == 1000 , 'Expected 1000. Got %d' % len(X_test)

# Split input and output

In [4]:
Y_train = np.zeros(shape = (10000,1));
for index in range(0, len(X_train)):
    Y_train[index] = X_train[index]['popularity_score']
    del(X_train[index]['popularity_score'])

In [5]:
print(X_train)

[{'is_root': 0, 'controversiality': 0, 'children': 0}, {'is_root': 0, 'controversiality': 0, 'children': 0}, {'is_root': 1, 'controversiality': 0, 'children': 0}, {'is_root': 0, 'controversiality': 0, 'children': 0}, {'is_root': 1, 'controversiality': 0, 'children': 0}, {'is_root': 1, 'controversiality': 0, 'children': 0}, {'is_root': 1, 'controversiality': 0, 'children': 0}, {'is_root': 1, 'controversiality': 0, 'children': 1}, {'is_root': 0, 'controversiality': 1, 'children': 1}, {'is_root': 0, 'controversiality': 0, 'children': 1}, {'is_root': 1, 'controversiality': 0, 'children': 0}, {'is_root': 0, 'controversiality': 0, 'children': 0}, {'is_root': 0, 'controversiality': 0, 'children': 0}, {'is_root': 0, 'controversiality': 0, 'children': 0}, {'is_root': 0, 'controversiality': 0, 'children': 0}, {'is_root': 1, 'controversiality': 0, 'children': 0}, {'is_root': 0, 'controversiality': 0, 'children': 0}, {'is_root': 0, 'controversiality': 0, 'children': 1}, {'is_root': 1, 'controversi

# Rearrange input array, 10000 examples by 164 features

In [15]:
X_train_values = np.zeros(shape = (10000 , 4))
for index in range(0, len(X_train)):
    X_train_values[index][0] = X_train[index]["is_root"]
    X_train_values[index][1] = X_train[index]['controversiality']
    X_train_values[index][2] = X_train[index]['children']
    X_train_values[index][3] = 1

# Close form approach

In [9]:
X_trasnposed = X_train_values.transpose()
X_trasnposed_X = X_trasnposed.dot(X_train_values)
X_trasnposed_X_inverse = np.linalg.inv(X_trasnposed_X)

# Y_train is a list, transform it to an array and size is 10000*1
Y_train = np.array(Y_train)
Y_train = Y_train.reshape((10000, 1))
#print(Y_train.shape)

X_trasnposed_Y = X_trasnposed.dot(Y_train)
weight_coefficient = X_trasnposed_X_inverse.dot(X_trasnposed_Y)

weight_coefficient.shape

(4, 1)

# Apply Validation set (closed-form)


In [10]:
Y_validation = list(np.zeros(1000));
for index in range(0, len(X_validation)):
    Y_validation[index] = X_validation[index]['popularity_score']
    del(X_validation[index]['popularity_score'])

In [12]:
X_validation_values = np.zeros(shape = (1000 , 4))
for index in range(0, len(X_validation)):
    X_validation_values[index][0] = X_validation[index]["is_root"]
    X_validation_values[index][1] = X_validation[index]['controversiality']
    X_validation_values[index][2] = X_validation[index]['children']
    X_validation_values[index][3] = 1

In [13]:
Y_valid_pridiction = X_validation_values.dot(weight_coefficient)
Y_valid_pridiction.shape

(1000, 1)

# Display results (Validation set)

In [14]:
#transform list to np.array
Y_validation = np.array(Y_validation)
Y_validation = Y_validation.reshape((1000, 1))

comparation_vlidation = np.hstack ([Y_validation,Y_valid_pridiction])
print(comparation_vlidation)

[[ 0.84333697  0.59464838]
 [ 0.89400237  0.82092517]
 [ 3.42605184  1.57165323]
 ..., 
 [ 0.65148906  0.59464838]
 [ 1.01984666  0.82092517]
 [-0.74624472  2.09610448]]


In [16]:
# calculate the MES
MSE_validation = ((Y_validation - Y_valid_pridiction)**2).mean(axis=0)
print(MSE_validation)

[ 1.02032668]


# Display results (Training set)

In [17]:
Y_train_pridiction = X_train_values.dot(weight_coefficient)

In [18]:
Y_train = np.array(Y_train)
comparation_train = np.hstack ([Y_train_pridiction,Y_train])
print(comparation_train)

[[ 0.82092517  1.25469816]
 [ 0.82092517  0.50981271]
 [ 0.59464838  0.3708268 ]
 ..., 
 [ 0.59464838  0.15810991]
 [ 0.82092517  0.89307136]
 [ 0.59464838  0.14033016]]


In [19]:
MSE_train = ((Y_train - Y_train_pridiction)**2).mean(axis=0)
print(MSE_train)

[ 1.08468307]
