# Data exploration

In [1]:
import json # we need to use the JSON package to load the data, since the data is stored in JSON format

with open("proj1_data.json") as fp:
    data = json.load(fp)

# `data` is now a list of data points. Each data point is a dictionary with
# the following attributes:
#   popularity_score : a popularity score for this comment, based on the
#                      number of upvotes (type: float)
#   children         : the number of replies to this comment (type: int)
#   text             : the text of this comment (type: string)
#   controversiality : a score for how "controversial" this comment is
#                      (automatically computed by Reddit)
#   is_root          : True if this comment is a direct reply to a post,
#                      False if it is a direct reply to another comment

# Example: take the first data point of the dataset ...
data_point = data[0]

# ... and print each of its attributes as "name : value".
for info_name, info_value in data_point.items():
    print(" : ".join([info_name, str(info_value)]))

text : ITS RAINING SIDEWAYS
is_root : False
controversiality : 0
children : 0
popularity_score : 1.254698160267241


In [None]:
# Scratch check: show the Python type of the `is_root` field of the sample data point.
type(data_point['is_root'])

# Data Exploration

In [2]:
import copy
import json
import numpy as np
from collections import Counter
from operator import itemgetter

# Path of the JSON dataset, relative to the notebook's working directory.
file_name_data = 'proj1_data.json'

# Parse the whole file into `data`; the file is closed when the `with` block ends.
with open(file_name_data) as fp:
    data = json.load(fp)

# Preprocessing Data

In [3]:
def process_data(data_list):
    """
    Return a preprocessed deep copy of the raw data points.

    For every data point in the copy:
      * 'text' is lower-cased and split on single space characters (' '),
        so consecutive spaces produce empty-string tokens and other
        whitespace (tabs, newlines) is not split on;
      * 'is_root' is replaced by 1 if it compares equal to True, else 0.

    param data_list: list of dictionaries, each with a 'text' string and an
                     'is_root' entry
    return: a new list of processed dictionaries; data_list itself is left
            untouched
    """

    # Work on a deep copy so the caller's records are never modified.
    result = copy.deepcopy(data_list)
    for entry in result:
        entry['text'] = entry['text'].lower().split(' ')
        entry['is_root'] = int(entry['is_root'] == True)

    return result

In [None]:
# Scratch check: run the preprocessing on the full dataset and display the result.
process_data(data)

# Helper Functions

In [4]:
def concatenate_all_text(data_list):
    """
    Flatten the 'text' entries of all data points into one list.

    param data_list: iterable of dictionaries whose 'text' entry is iterable
                     (a token list after process_data)
    return: a single list with every item of every 'text', in input order
    """
    return [token for entry in data_list for token in entry['text']]

def get_top_words(data_list, n_top_words=160):
    """
    Return the n_top_words most frequent tokens over all data points.

    param data_list: data points whose 'text' entries are token lists
    param n_top_words: how many words to return (default 160)
    return: list of words ordered from most to least frequent; the sort is
            stable, so tied words keep the order in which the counter holds them
    raises AssertionError: if there are fewer distinct tokens than n_top_words
    """
    frequencies = Counter(concatenate_all_text(data_list))
    ranked = sorted(frequencies.items(), key=itemgetter(1), reverse=True)

    assert len(ranked) >= n_top_words, 'Too many top words'

    # Keep only the word from each (word, count) pair.
    return [ranked[position][0] for position in range(n_top_words)]
    
def get_top_words_count(data_point, top_words):
    """
    Count how often each top word occurs in one data point's text.

    param data_point: dictionary whose 'text' entry supports .count(word)
                      (a token list after process_data)
    param top_words: list of words to count
    return: 1-D float numpy array; element i is the number of occurrences of
            top_words[i] in data_point['text']
    """
    occurrences = [data_point['text'].count(word) for word in top_words]
    return np.array(occurrences, dtype=float)

def add_top_words_count_to_data(data_list, top_words):
    """
    Return a deep copy of data_list with a 'top_word_count' entry on each point.

    param data_list: preprocessed data points (each with a 'text' entry)
    param top_words: list of words whose occurrences are counted
    return: new list of data points; each gains 'top_word_count', the array
            produced by get_top_words_count for that point. data_list itself
            is not modified.
    """
    augmented = copy.deepcopy(data_list)
    for entry in augmented:
        entry['top_word_count'] = get_top_words_count(entry, top_words)

    return augmented

In [None]:
# Scratch check of the helpers: preprocess the full dataset, pick the most
# frequent words over all of it (get_top_words defaults to 160 words), then
# display the preprocessed records.
preprocessed_data = process_data (data)
top_words = get_top_words(preprocessed_data)
preprocessed_data

# Split Data into Training, Validation, and Test Sets

In [5]:
# Preprocess everything once, then cut the result into three consecutive
# slices: the first 10000 points for training, the next 1000 for validation
# and whatever remains for testing.
X = process_data(data)
X_train, X_validation, X_test = X[:10000], X[10000:11000], X[11000:]

# Guard against a dataset of unexpected size before going any further.
assert len(X_train) == 10000, 'Expected 10000. Got %d' % len(X_train)
assert len(X_validation) == 1000, 'Expected 1000. Got %d' % len(X_validation)
assert len(X_test) == 1000, 'Expected 1000. Got %d' % len(X_test)

# Word Count Features

In [6]:
# The vocabulary is taken from the training split only.
top_160_words = get_top_words(X_train, n_top_words=160)
assert len(top_160_words) == 160, 'Expected 160. Got %d' % len(top_160_words)

# Attach a 'top_word_count' vector to every training point.
X_train = add_top_words_count_to_data(X_train, top_160_words)

# Split input and output

In [7]:
# Target column vector of shape (len(X_train), 1): one popularity score per
# training point. The score is read without being deleted from the records,
# so re-running this cell does not raise a KeyError, and the length follows
# X_train instead of a hard-coded 10000.
Y_train = np.array(
    [point['popularity_score'] for point in X_train], dtype=float
).reshape(-1, 1)

# Rearrange input array, 10000 examples by 164 features

In [8]:
# Design matrix with one row per training point. Column layout:
#   0            is_root
#   1            controversiality
#   2            children
#   3 .. n-2     the point's top_word_count entries
#   n-1          constant 1 (bias / intercept term)
# The shape is derived from the data (10000 x 164 for 10000 points and 160
# top words) instead of being hard-coded.
X_train_values = np.column_stack([
    [point['is_root'] for point in X_train],
    [point['controversiality'] for point in X_train],
    [point['children'] for point in X_train],
    np.vstack([point['top_word_count'] for point in X_train]),
    np.ones(len(X_train)),
])

In [None]:
# Scratch check: print the training feature matrix.
print(X_train_values)

# Closed-form approach

In [9]:
# Closed-form least squares: the weights w satisfy the normal equations
#   (X^T X) w = X^T y
X_trasnposed = X_train_values.transpose()
X_trasnposed_X = X_trasnposed.dot(X_train_values)
X_trasnposed_Y = X_trasnposed.dot(Y_train)

# Solve the linear system directly rather than forming the explicit inverse
# of X^T X and multiplying by it: this is the numerically preferred way to
# obtain the same solution. Like the inverse, it raises
# numpy.linalg.LinAlgError when X^T X is singular.
weight_coefficient = np.linalg.solve(X_trasnposed_X, X_trasnposed_Y)

weight_coefficient.shape

(164, 1)

# Apply Validation set (closed-form)

In [10]:
# Target column vector of shape (len(X_validation), 1): one popularity score
# per validation point. The score is read without being deleted from the
# records, so re-running this cell does not raise a KeyError, and the length
# follows X_validation instead of a hard-coded 1000.
Y_validation = np.array(
    [point['popularity_score'] for point in X_validation], dtype=float
).reshape(-1, 1)

In [11]:
# Count the training vocabulary (top_160_words) in every validation point.
X_validation = add_top_words_count_to_data(X_validation, top_160_words)

# Design matrix with one row per validation point. Column layout:
#   0            is_root
#   1            controversiality
#   2            children
#   3 .. n-2     the point's top_word_count entries
#   n-1          constant 1 (bias / intercept term)
# The shape is derived from the data (1000 x 164 for 1000 points and 160
# top words) instead of being hard-coded.
X_validation_values = np.column_stack([
    [point['is_root'] for point in X_validation],
    [point['controversiality'] for point in X_validation],
    [point['children'] for point in X_validation],
    np.vstack([point['top_word_count'] for point in X_validation]),
    np.ones(len(X_validation)),
])

In [12]:
# Predicted popularity for the validation set: X_validation_values times the learned weights.
Y_valid_pridiction = np.dot(X_validation_values, weight_coefficient)
Y_valid_pridiction.shape

(1000, 1)

# Display results (Validation set)

In [13]:
# Side-by-side table: column 0 is the actual score, column 1 the prediction.
comparation_vlidation = np.concatenate([Y_validation, Y_valid_pridiction], axis=1)
print(comparation_vlidation)

[[ 0.84333697  0.61270368]
 [ 0.89400237  0.79462912]
 [ 3.42605184  1.52805469]
 ..., 
 [ 0.65148906  0.26863527]
 [ 1.01984666  0.78444281]
 [-0.74624472  2.06937147]]


In [14]:
# Mean squared error (MSE) of the validation predictions.
MSE_validation = np.mean(np.square(Y_validation - Y_valid_pridiction), axis=0)
print(MSE_validation)

[ 0.9895357]


# Display results (Training set)

In [15]:
# Predicted popularity for the training set: X_train_values times the learned weights.
Y_train_pridiction = np.dot(X_train_values, weight_coefficient)

In [16]:
# Rebind Y_train to a numpy array built from its current value.
Y_train = np.array(Y_train)

# Side-by-side table: column 0 is the prediction, column 1 the actual score.
comparation_train = np.concatenate([Y_train_pridiction, Y_train], axis=1)
print(comparation_train)

[[ 0.82884594  1.25469816]
 [ 0.75712377  0.50981271]
 [ 0.60203502  0.3708268 ]
 ..., 
 [ 0.42636679  0.15810991]
 [ 0.84591094  0.89307136]
 [ 0.59971335  0.14033016]]


In [17]:
# Mean squared error (MSE) of the training predictions.
MSE_train = np.mean(np.square(Y_train - Y_train_pridiction), axis=0)
print(MSE_train)

[ 1.04683291]
