## Final Project: _Comment Volume Prediction using Neural Networks and Decision Trees_


### Created by: _Nafiseh Asghari_


## PART 2 : Train Models 

In [1]:
# Common imports
import numpy as np
import pandas as pd

# import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Get the Data

In [2]:
colname = ["Page_Popularity", "Page_Checkins", "Page_talking_about", "Page_Category",
              "CC1_min", "CC1_max","CC1_avg","CC1_median","CC1_stdv",
               "CC2_min", "CC2_max","CC2_avg","CC2_median","CC2_stdv",
               "CC3_min", "CC3_max","CC3_avg","CC3_median","CC3_stdv",
               "CC4_min", "CC4_max","CC4_avg","CC4_median","CC4_stdv",
               "CC5_min", "CC5_max","CC5_avg","CC5_median","CC5_stdv",
               "CC1", "CC2","CC3","CC4","CC5", "Base_Time", "Post_Lenght",
               "Post_Share", "Post_promo_Status", "H", "Post_published_sun",
               "Post_published_mon","Post_published_tue","Post_published_wed",
               "Post_published_thu","Post_published_fri","Post_published_sat",
               "Base_DateTime_sun","Base_DateTime_mon","Base_DateTime_tue",
               "Base_DateTime_wed","Base_DateTime_thu","Base_DateTime_fri",
               "Base_DateTime_sat" , "Target"]

def read_file(variant_number): 
    
    dataset = pd.read_csv("C:/Users/Nafiseh/my ipynb/Features_Variant_{}.csv".format(variant_number), 
                          header = None, names = colname)
    return dataset

In [3]:
#import variant 1:
dataset_1= read_file(1)
dataset_1.shape

(40949, 54)

In [4]:
#import test set:
test = pd.read_csv("C:/Users/Nafiseh/Documents/UofT- ML/finalproject/Dataset/Testing/Features_TestSet.csv",
                   names = colname, header = None)

# Prepare the data for Machine Learning algorithms

In [5]:
# import external module
import ProjectModules

In [6]:
X_train , y_train = ProjectModules.split_X_y(dataset_1)
X_train.shape , y_train.shape

((40949, 53), (40949,))

In [7]:
X_test, y_test = ProjectModules.split_X_y(test)
X_test.shape , y_test.shape

((10044, 53), (10044,))

In [8]:
# Create a test case with random 100 instances
case1_test = ProjectModules.test_case(test, num_instances=100)
X_case1, y_case1 = ProjectModules.split_X_y(case1_test)
X_case1.shape , y_case1.shape

((100, 53), (100,))

# Define Evaluation Metrics

In [9]:
#hit@10

def hit10 (y_test, y_pred):
    # top 10 posts that had received largest number of comments in actual
    top10_real= y_test.argsort()[::-1][:10]
    # top 10 posts that were predicted to have largest number of comments
    top10_pred= y_pred.argsort()[::-1][:10]
    # count how many of these 2 sets have matched
    common_top10= np.array(list(set(top10_pred).intersection(top10_real)))
    return len(common_top10)

In [10]:
#AUC@10

from sklearn.metrics import roc_auc_score

def auc10 (y_test, y_pred):
    # top 10 posts that had received largest number of comments in actual
    top10_real= y_test.argsort()[::-1][:10]
    # define a new binary set which is 1 for top 10 and zero for the rest
    y_binary = [0]*len(y_test)
    for i, j in enumerate (y_test):
        if i in np.array(top10_real):
            y_binary[i] = 1
    # top 10 posts that were predicted to have largest number of comments
    top10_pred= y_pred.argsort()[::-1][:10]
    # define a new binary set which is 1 for top 10 and zero for the rest
    y_binary_p = [0]*len(y_pred)
    for i, j in enumerate (y_pred):
        if i in np.array(top10_pred):
            y_binary_p[i] = 1
    return roc_auc_score(y_binary,y_binary_p)

# Train models 

## _1. Random Forest_

### 1.1. Default Hyper-Parameters

In [11]:
# train RF with default hyper-parameters:
from sklearn.ensemble import RandomForestRegressor
import time

t1 = time.time()

rf_reg= RandomForestRegressor(random_state = 42)
rf_reg.fit(X_train,y_train)

t2 = time.time()
print("    {}: {:.1f} seconds".format(rf_reg.__class__.__name__, t2 - t1))
time_taken_rf = t2-t1

    RandomForestRegressor: 9.6 seconds


In [12]:
np.random.seed(365)
from sklearn.metrics import mean_absolute_error

def evaluation_metric (model):
    MAE= []
    HIT10 = []
    AUC10= []
    for case in range(10):

        case = ProjectModules.test_case(test, num_instances=100)
        X_case, y_case = ProjectModules.split_X_y(case)
        y_pred= model.predict(X_case)
        mae = mean_absolute_error(y_case, y_pred)
        MAE.append(mae)
        hit_10 = hit10(y_case ,y_pred)
        HIT10.append(hit_10)
        auc_10 = auc10(y_case ,y_pred)
        AUC10.append(auc_10)
    mae_test= round(mean_absolute_error(y_test, model.predict(X_test)),3)
    print ("MAE for the test set:", mae_test)
    print ("AVG MAE for 10 random test cases: " ,round(np.mean(MAE),3))
    print ("AVG HIT@10 for 10 random test cases: ", np.mean(HIT10))
    print ("AUC@10 for 10 random test cases: " , round(np.mean(AUC10),2))    

    return mae_test, MAE , HIT10, AUC10

In [13]:
mae_test_RF, MAE_RF, HIT10_RF,AUC10_RF = evaluation_metric (model = rf_reg)

MAE for the test set: 31.3
AVG MAE for 10 random test cases:  33.622
AVG HIT@10 for 10 random test cases:  6.3
AUC@10 for 10 random test cases:  0.79


----

### 1.2. Tuned Random Forest

In [14]:
# train RF with tuned hyper-parameters:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

t1 = time.time()


param_distribs = {
        "n_estimators": randint(low= 5, high = 100),
        "max_depth":randint(low=2, high=10),
        "min_samples_split":randint(low=20, high=2000)
    }

rnd_search_forest = RandomizedSearchCV(RandomForestRegressor(random_state = 42), param_distributions=param_distribs,
                                n_iter=20, cv=5, scoring="neg_median_absolute_error",n_jobs=-1)
rnd_search_forest.fit(X_train, np.ravel(y_train))


t2 = time.time()
print("{:.1f} seconds".format(t2 - t1))
time_taken_TRF = t2-t1

231.9 seconds


In [15]:
np.random.seed(365)

mae_test_TRF, MAE_TRF, HIT10_TRF,AUC10_TRF = evaluation_metric (model = rnd_search_forest)

MAE for the test set: 27.397
AVG MAE for 10 random test cases:  29.257
AVG HIT@10 for 10 random test cases:  5.2
AUC@10 for 10 random test cases:  0.73


## _2.MLP_

### 2.1.  ONE hidden Layer,  4 neurons

In [16]:
cols_to_norm = ["Page_Popularity", "Page_Checkins", "Page_talking_about", 
              "CC1_min", "CC1_max","CC1_avg","CC1_median","CC1_stdv",
               "CC2_min", "CC2_max","CC2_avg","CC2_median","CC2_stdv",
               "CC3_min", "CC3_max","CC3_avg","CC3_median","CC3_stdv",
               "CC4_min", "CC4_max","CC4_avg","CC4_median","CC4_stdv",
               "CC5_min", "CC5_max","CC5_avg","CC5_median","CC5_stdv",
               "CC1", "CC2","CC3","CC4","CC5", "Base_Time", "Post_Lenght",
               "Post_Share", "H"]

X_train[cols_to_norm] = X_train[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
X_test[cols_to_norm] = X_test[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [17]:
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [18]:
#Continuous Features:
    
num_features = ["Page_Popularity", "Page_Checkins", "Page_talking_about",
              "CC1_min", "CC1_max","CC1_avg","CC1_median","CC1_stdv",
               "CC2_min", "CC2_max","CC2_avg","CC2_median","CC2_stdv",
               "CC3_min", "CC3_max","CC3_avg","CC3_median","CC3_stdv",
               "CC4_min", "CC4_max","CC4_avg","CC4_median","CC4_stdv",
               "CC5_min", "CC5_max","CC5_avg","CC5_median","CC5_stdv",
               "CC1", "CC2","CC3","CC4","CC5", "Base_Time", "Post_Lenght",
               "Post_Share", "H"]

num_feature_cols = [tf.feature_column.numeric_column(k) for k in num_features]

In [19]:
#Categorical Features:
    # 1. binary Features:
cat_features = [
               "Post_promo_Status",  "Post_published_sun",
               "Post_published_mon","Post_published_tue","Post_published_wed",
               "Post_published_thu","Post_published_fri","Post_published_sat",
               "Base_DateTime_sun","Base_DateTime_mon","Base_DateTime_tue",
               "Base_DateTime_wed","Base_DateTime_thu","Base_DateTime_fri",
               "Base_DateTime_sat" ]

cat_feature_cols = [tf.feature_column.categorical_column_with_identity(k, num_buckets=2) for k in cat_features]
                    

    # 2. page category features:
Page_Category = tf.feature_column.categorical_column_with_identity("Page_Category", num_buckets=106, default_value=0 )

In [20]:
indicator_cat_cols= [tf.feature_column.indicator_column(k) for k in cat_feature_cols]
indicator_Page_Category = [tf.feature_column.indicator_column(Page_Category)]

In [21]:
feature_columns = num_feature_cols + indicator_cat_cols +indicator_Page_Category

In [22]:
np.set_printoptions(precision=2)
tf.logging.set_verbosity(tf.logging.ERROR)
import logging
logging.basicConfig(level=logging.INFO)
logging.info('Tensorflow %s' % tf.__version__) 


# Defining the Tensorflow input functions
# for training
def training_input_fn(batch_size=1):
    return tf.estimator.inputs.pandas_input_fn(
                x=X_train,
                y=y_train ,
                batch_size=batch_size,
                num_epochs=None,
                shuffle=True)
# for test
def test_input_fn():
    return tf.estimator.inputs.pandas_input_fn(
                x=X_test,
                y = y_test,
                batch_size=10,
                num_epochs=1,
                shuffle=False)


# Network Design
# --------------
feature_columns = num_feature_cols + indicator_cat_cols + indicator_Page_Category

STEPS_PER_EPOCH = 10
EPOCHS = 200
BATCH_SIZE = 50

hidden_layers = [4]
#dropout = 0.0


MODEL_PATH='./DNNRegressors_MLP1L/'
for hl in hidden_layers:
    MODEL_PATH += '%s_' % hl
#MODEL_PATH += 'D0%s' % (int(dropout*10))
logging.info('Saving to %s' % MODEL_PATH)


t1 = time.time()
# Building the Network -- high-level API TF.Learn
MLP_1Layer = tf.estimator.DNNRegressor(
                hidden_units=hidden_layers,
                feature_columns=feature_columns,
                model_dir=MODEL_PATH)

# Train it

logging.info('Train the DNN Regressor...\n')

for epoch in range(EPOCHS+1):

    # Fit the DNNRegressor 
    MLP_1Layer.train(input_fn=training_input_fn(batch_size=BATCH_SIZE),steps=STEPS_PER_EPOCH)


    # Evaluate the DNNRegressor every 10th epoch
    if epoch%10==0:
        eval_dict = MLP_1Layer.evaluate(input_fn=test_input_fn())

        print('Epoch %i: %.5f loss' % (epoch+1, eval_dict['loss']))


t2 = time.time()
print("{:.1f} seconds".format(t2 - t1))
time_taken_MLP1 = t2-t1

INFO:root:Tensorflow 1.6.0
INFO:root:Saving to ./DNNRegressors_MLP1L/4_
INFO:root:Train the DNN Regressor...



Epoch 1: 138167.45312 loss
Epoch 11: 133711.46875 loss
Epoch 21: 131893.31250 loss
Epoch 31: 132564.64062 loss
Epoch 41: 131284.68750 loss
Epoch 51: 130148.89844 loss
Epoch 61: 129524.93750 loss
Epoch 71: 129390.22656 loss
Epoch 81: 128452.21094 loss
Epoch 91: 127785.95312 loss
Epoch 101: 127945.39062 loss
Epoch 111: 127840.60938 loss
Epoch 121: 127329.77344 loss
Epoch 131: 127177.15625 loss
Epoch 141: 127299.17969 loss
Epoch 151: 126832.76562 loss
Epoch 161: 126266.88281 loss
Epoch 171: 126555.24219 loss
Epoch 181: 125976.48438 loss
Epoch 191: 125885.37500 loss
Epoch 201: 125878.05469 loss
892.4 seconds


In [23]:
predict_input_func = tf.estimator.inputs.pandas_input_fn(
      x=X_test,
      batch_size=10,
      num_epochs=1,
      shuffle=False)

pred_gen = MLP_1Layer.predict(predict_input_func)
predictions_MLP1 = list(pred_gen)
final_preds_MLP1 = []
for pred in predictions_MLP1:
    final_preds_MLP1.append(pred['predictions'])

In [27]:
np.random.seed(365)
MAE_MLP1 = []
HIT10_MLP1=[]
AUC10_MLP1 = []
for c in range(10):
    case_index= np.random.randint(0,len(test),100)
    case =test.iloc[case_index]
    y_pred= np.array(final_preds_MLP1)[case_index,0]
    X_case, y_case = ProjectModules.split_X_y(case)
    hit_10 = hit10(y_case ,y_pred)
    HIT10_MLP1.append(hit_10)
    auc_10 = auc10(y_case ,y_pred)
    AUC10_MLP1.append(auc_10)
    mae = mean_absolute_error(y_case, y_pred)
    MAE_MLP1.append(mae)
mae_test_MLP1=round(mean_absolute_error(y_test,final_preds_MLP1),3)
print ("MAE for the test set:", mae_test_MLP1)
print ("AVG MAE for 10 random test cases: " ,round(np.mean(MAE_MLP1),3))
print ("AVG HIT@10 for 10 random test cases: ", np.mean(HIT10_MLP1))
print ("AUC@10 for 10 random test cases: " , round(np.mean(AUC10_MLP1),2))

MAE for the test set: 29.433
AVG MAE for 10 random test cases:  31.233
AVG HIT@10 for 10 random test cases:  4.4
AUC@10 for 10 random test cases:  0.69


### 2.2. TWO hidden Layer, 20 & 4 neurons

In [28]:
np.set_printoptions(precision=2)

tf.logging.set_verbosity(tf.logging.ERROR)

import logging
logging.basicConfig(level=logging.INFO)
logging.info('Tensorflow %s' % tf.__version__) 


# Defining the Tensorflow input functions
# for training
def training_input_fn(batch_size=1):
    return tf.estimator.inputs.pandas_input_fn(
                x=X_train,
                y=y_train ,
                batch_size=batch_size,
                num_epochs=None,
                shuffle=True)
# for test
def test_input_fn():
    return tf.estimator.inputs.pandas_input_fn(
                x=X_test,
                y = y_test,
                batch_size=10,
                num_epochs=1,
                shuffle=False)


# Network Design
# --------------
feature_columns = num_feature_cols + indicator_cat_cols + indicator_Page_Category

STEPS_PER_EPOCH = 10
EPOCHS = 200
BATCH_SIZE = 50

hidden_layers = [20,4]
dropout = 0.0

MODEL_PATH='./DNNRegressors_MLP2L/'
for hl in hidden_layers:
    MODEL_PATH += '%s_' % hl
MODEL_PATH += 'D0%s' % (int(dropout*10))
logging.info('Saving to %s' % MODEL_PATH)


t1 = time.time()
# Building the Network -- high-level API TF.Learn
MLP_2Layer = tf.estimator.DNNRegressor(
                hidden_units=hidden_layers,
                feature_columns=feature_columns,
                model_dir=MODEL_PATH,
                dropout=dropout)

# Train it

logging.info('Train the DNN Regressor...\n')

for epoch in range(EPOCHS+1):

    # Fit the DNNRegressor 
    MLP_2Layer.train(input_fn=training_input_fn(batch_size=BATCH_SIZE),steps=STEPS_PER_EPOCH)


    # Evaluate the DNNRegressor every 10th epoch
    if epoch%10==0:
        eval_dict = MLP_2Layer.evaluate(input_fn=test_input_fn())

        print('Epoch %i: %.5f loss' % (epoch+1, eval_dict['loss']))


t2 = time.time()
print("{:.1f} seconds".format(t2 - t1))
time_taken_MLP2 = t2-t1

INFO:root:Tensorflow 1.6.0
INFO:root:Saving to ./DNNRegressors_MLP2L/20_4_D00
INFO:root:Train the DNN Regressor...



Epoch 1: 140683.67188 loss
Epoch 11: 135773.23438 loss
Epoch 21: 132286.81250 loss
Epoch 31: 128862.76562 loss
Epoch 41: 126485.79688 loss
Epoch 51: 126507.51562 loss
Epoch 61: 125298.60156 loss
Epoch 71: 123265.28906 loss
Epoch 81: 124012.88281 loss
Epoch 91: 122993.92969 loss
Epoch 101: 122307.85156 loss
Epoch 111: 121447.19531 loss
Epoch 121: 121298.13281 loss
Epoch 131: 120180.82812 loss
Epoch 141: 120809.44531 loss
Epoch 151: 119687.96875 loss
Epoch 161: 119164.31250 loss
Epoch 171: 117804.25781 loss
Epoch 181: 118184.74219 loss
Epoch 191: 117486.56250 loss
Epoch 201: 117532.82031 loss
973.3 seconds


In [29]:
predict_input_func = tf.estimator.inputs.pandas_input_fn(
      x=X_test,
      batch_size=10,
      num_epochs=1,
      shuffle=False)

pred_gen = MLP_2Layer.predict(predict_input_func)
predictions_MLP2 = list(pred_gen)
final_preds_MLP2 = []
for pred in predictions_MLP2:
    final_preds_MLP2.append(pred['predictions'])

In [30]:
np.random.seed(365)
MAE_MLP2 = []
HIT10_MLP2=[]
AUC10_MLP2 = []
for c in range(10):
    case_index= np.random.randint(0,len(test),100)
    case =test.iloc[case_index]
    y_pred= np.array(final_preds_MLP2)[case_index,0]
    X_case, y_case = ProjectModules.split_X_y(case)
    hit_10 = hit10(y_case ,y_pred)
    HIT10_MLP2.append(hit_10)
    auc_10 = auc10(y_case ,y_pred)
    AUC10_MLP2.append(auc_10)
    mae = mean_absolute_error(y_case, y_pred)
    MAE_MLP2.append(mae)
    
mae_test_MLP2=round(mean_absolute_error(y_test,final_preds_MLP2),3)
MAE_MLP2_avg = round(np.mean(MAE_MLP2),3)
HIT10_MLP2_avg = np.mean(HIT10_MLP2)
AUC10_MLP2_avg = round(np.mean(AUC10_MLP2),2)
print ("MAE for the test set:", mae_test_MLP2)
print ("AVG MAE for 10 random test cases: " ,round(np.mean(MAE_MLP2),3))
print ("AVG HIT@10 for 10 random test cases: ", np.mean(HIT10_MLP2))
print ("AUC@10 for 10 random test cases: " , round(np.mean(AUC10_MLP2),2))

MAE for the test set: 28.147
AVG MAE for 10 random test cases:  29.324
AVG HIT@10 for 10 random test cases:  5.1
AUC@10 for 10 random test cases:  0.73


## Conclusion:

In [34]:
#Variant 1:

compare_model= pd.DataFrame({"M.A.E": [mae_test_RF, mae_test_TRF, mae_test_MLP1,mae_test_MLP2],
                             "Hits @10": [np.mean(HIT10_RF), np.mean(HIT10_TRF), np.mean(HIT10_MLP1),HIT10_MLP2_avg], 
                             "AUC @10" :[round(np.mean(AUC10_RF),3), round(np.mean(AUC10_TRF),3), round(np.mean(AUC10_MLP1),3),AUC10_MLP2_avg], 
                             "Time Taken": [time_taken_rf,time_taken_TRF,time_taken_MLP1,time_taken_MLP2]},
                            index= ["RF", "Tuned RF","MLP (4)", "MLP(20,4)"])

In [35]:
compare_model

Unnamed: 0,AUC @10,Hits @10,M.A.E,Time Taken
RF,0.794,6.3,31.3,9.559909
Tuned RF,0.733,5.2,27.397,231.895602
MLP (4),0.689,4.4,29.433,892.422689
"MLP(20,4)",0.73,5.1,28.147,973.310087


* Random Forest outperforms Neural Networks in the proposed comment volume prediction model. 