In [None]:
# Project Outline & Problem Definition 

# The purpose of this codelab/notebook is to reproduce, test, and try to improve the findings from the 2015 paper, 
# '[Facebook] Comment Volume Prediction Using Decision Trees & Neural Networks.' 
# The goal of the exercise is to build & compare models that predict how many comments a particular FB post or 
# page will receive over a set period of time, using the provided data sets acquired from the [UC Irvine Machine 
# Learning Repository](https://archive.ics.uci.edu/ml/datasets/Facebook+Comment+Volume+Dataset)

# The predictive modelling techniques used in the original paper were Decision Trees (specifically, REP & M5P Trees) and 
# Neural Networks (specifically, a Multi-Layer Preceptron, and an RBF Network). The main libraries used were pandas,
# numpy, SKlearn, and tensorflow. 

# The hyperparameters used to tune the model were: 
    # Decision Trees:
        # MaxDepth =
        
    # Neural Nets: 
        # Number of hidden layers
        # Number of neurons per hidden layer
        # Activation functions 

# The metrics used to evaluate model performance were: Hits@10, AUC@10, MAE & Evaluation Time

In [1]:
# Import Libraries & Setup Environment
import pandas as pd
import numpy as np
import sklearn as sk
import tensorflow as tf

# to make this notebook's output stable across runs
np.random.seed(123)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [2]:
# Import Training Data into Dataframe
fb_comment_training_data = pd.read_csv("Features_Variant_1.csv",header=0)

In [3]:
# Import Test Data into Dataframe
fb_comment_test_data = pd.read_csv("Features_TestSet.csv",header=0)
y_test = fb_comment_test_data[["output"]].copy()

In [4]:
# Take a Look at Test Data
fb_comment_training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40949 entries, 0 to 40948
Data columns (total 53 columns):
likes            40949 non-null int64
checkins         40949 non-null int64
returns          40949 non-null int64
category         40949 non-null int64
min-cc1          40949 non-null int64
max-cc1          40949 non-null int64
avg-cc1          40949 non-null float64
median-cc1       40949 non-null float64
std-dev-cc1      40949 non-null float64
min-cc2          40949 non-null int64
max-cc2          40949 non-null int64
avg-cc2          40949 non-null float64
median-cc2       40949 non-null float64
std-dev-cc2      40949 non-null float64
min-cc3          40949 non-null int64
max-cc3          40949 non-null int64
avg-cc3          40949 non-null float64
median-cc3       40949 non-null float64
std-dev-cc3      40949 non-null float64
min-cc4          40949 non-null int64
max-cc4          40949 non-null int64
avg-cc4          40949 non-null float64
median-cc4       40949 non-null flo

In [5]:
# Take a Look at Imported Test Data 
fb_comment_test_data.head()

Unnamed: 0,likes,checkins,returns,category,min-cc1,max-cc1,avg-cc1,median-cc1,std-dev-cc1,min-cc2,...,fri_pub,sat_pub,sun_base,mon_base,tue_base,wed_base,thu_base,fri_base,sat_base,output
0,634995,0,463,1,1,17,7.0,2.0,6.663333,1,...,0,0,0,0,0,0,1,0,0,1
1,634995,0,463,1,1,17,7.0,2.0,6.663333,1,...,0,0,0,0,0,0,0,1,0,0
2,634995,0,463,1,1,17,7.0,2.0,6.663333,1,...,1,0,0,0,0,0,0,1,0,0
3,634995,0,463,1,1,17,7.0,2.0,6.663333,1,...,0,1,1,0,0,0,0,0,0,0
4,634995,0,463,1,1,17,7.0,2.0,6.663333,1,...,0,0,0,1,0,0,0,0,0,1


In [6]:
X_train, X_test, y_train, y_test = fb_comment_training_data[:40949], fb_comment_training_data[40949:], fb_comment_training_data[:40949], fb_comment_training_data[40949:]

In [7]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(40949, 53)
(0, 53)
(40949, 53)
(0, 53)


In [8]:
# Remove labels prior to transformations & scaling
from sklearn.model_selection import train_test_split
X_train_prepped = X_train.drop("output", axis=1)
y_train_prepped = X_train["output"].copy()

In [9]:
print(X_train_prepped.shape)
print(y_train_prepped.shape)

(40949, 52)
(40949,)


In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer 

num_pipeline = Pipeline([('imputer', Imputer(strategy="median")), ('std_scaler', StandardScaler()),])
X_train_prepped_transformed = num_pipeline.fit_transform(X_train_prepped)
print(X_train_prepped_transformed)

[[-0.10003712 -0.22707468 -0.39967795 ..., -0.42147708 -0.40996237
   2.42938358]
 [-0.10003712 -0.22707468 -0.39967795 ..., -0.42147708  2.43924827
  -0.41162705]
 [-0.10003712 -0.22707468 -0.39967795 ..., -0.42147708 -0.40996237
   2.42938358]
 ..., 
 [ 0.86303905 -0.22367546  4.07635256 ...,  2.37260824 -0.40996237
  -0.41162705]
 [ 0.86303905 -0.22367546  4.07635256 ..., -0.42147708  2.43924827
  -0.41162705]
 [ 0.86303905 -0.22367546  4.07635256 ..., -0.42147708  2.43924827
  -0.41162705]]


In [None]:
from sklearn.tree import DecisionTreeRegressor
Decision_Tree = DecisionTreeRegressor(criterion='mae',max_depth=2)
Decision_Tree.fit(X_train_prepped_transformed,y_train)

In [None]:
predictions = Decision_Tree.predict(y_train)