## Modeling
For this predictive model, our target variable will be "up_votes", which we will use to test the accuracy of model on a holdout subset of testing data, measured by the RMSE value. We also investigate the feature importance of the explantory variables, to determine which features are most impactful in causing a post to receive more "up_votes" (going viral) and what causes the opposite (less/no "up_votes).

Target Variable: "up_votes"

Tokenization: 
- TF-IDF Vectorizor
- Bert Tokenizer

Train-Test-Split: 80/20

Models: 
- Sklearn's Logistic Regression (baseline)
- Sklearn's Elastic Net
- Sklearn's Random Forest
- XgBoost
- Tensorflow's Bert

In [1]:
!pip install sentencepiece
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import logging
logging.basicConfig(level=logging.INFO)

import tensorflow_hub as hub
import tokenization
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(module_url, trainable=True)

from sklearn.model_selection import train_test_split
#from sklearn.metrics import classification_report
from sklearn import metrics
import keras
import time



INFO:absl:Using /tmp/tfhub_modules to cache modules.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from datetime import datetime

from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/MyDrive/Eluvio_Challenge/')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [70]:
data = pd.read_csv('Eluvio_DS_Challenge_cleaned_wrangled.csv', index_col=0)

In [28]:
data.head(2)

Unnamed: 0,time_created,up_votes,title,over_18,author,clock_time,hour,day_of_week,char_count,word_count,flesch_readability,flesch_grade,month,year
0,2008-01-25 03:34:06,3,Scores killed in Pakistan clashes,False,polar,03:34:06,3,4,33,5,83.32,2.9,1,2008
1,2008-01-25 03:34:35,2,Japan resumes refuelling mission,False,polar,03:34:35,3,4,32,4,33.58,9.6,1,2008


In [29]:
data = data[data.year==2016]

In [30]:
data.shape

(81892, 14)

In [31]:
data = data.iloc[:1000]

In [32]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 427344 to 428343
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   time_created        1000 non-null   object 
 1   up_votes            1000 non-null   int64  
 2   title               1000 non-null   object 
 3   over_18             1000 non-null   bool   
 4   author              1000 non-null   object 
 5   clock_time          1000 non-null   object 
 6   hour                1000 non-null   int64  
 7   day_of_week         1000 non-null   int64  
 8   char_count          1000 non-null   int64  
 9   word_count          1000 non-null   int64  
 10  flesch_readability  1000 non-null   float64
 11  flesch_grade        1000 non-null   float64
 12  month               1000 non-null   int64  
 13  year                1000 non-null   int64  
dtypes: bool(1), float64(2), int64(7), object(4)
memory usage: 110.4+ KB


### Feature Engineering
For all the model except for the Bert Model, we will use a TF-IDF Vectorizer to convert the "title" column into tokens.

In [33]:
train,test =train_test_split(data,test_size = 0.3, random_state=42)
# train.loc[:]['text'] = train.text.astype(str)
# test.loc[:]['text'] = test.text.astype(str)

In [34]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [35]:
def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [36]:
def build_model(bert_layer, max_len=512):
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    net = tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001))(clf_output)
    #net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.001))(net)
    #net = tf.keras.layers.Dropout(0.2)(net)
    #out = tf.keras.layers.Dense(1,kernel_initializer='normal')(net)
    out = tf.keras.layers.Dense(1)(net)


    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    optimizer = keras.optimizers.Adam(learning_rate=0.01)

    model.compile(loss='mean_squared_error', optimizer=optimizer)

    return model

In [37]:
start = time.time()

max_len = 150
train_input = bert_encode(train.title.values, tokenizer, max_len=max_len)

stop = time.time()
print(stop-start)

0.23097801208496094


In [38]:
from tensorflow.keras import regularizers

model = build_model(bert_layer, max_len=max_len)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 150)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 150)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 150)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]           

In [39]:
# file_path/name for saved models

# saved_model_folder = "/content/drive/MyDrive/Eluvio_Challenge/"

#specific file path if saving to a mounted google Drive
model_folder = './'
mode_name = 'bert_model.h5'
file_path = model_folder+ mode_name

In [40]:
checkpoint = tf.keras.callbacks.ModelCheckpoint(file_path, monitor='val_loss', save_best_only=True, verbose=1)
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1)

train_history = model.fit(
    train_input, train.up_votes.values, 
    validation_split=0.2,
    epochs=3,
    callbacks=[checkpoint, earlystopping],
    batch_size=32,
    verbose=1)

Epoch 1/3

Epoch 00001: val_loss improved from inf to 576202.43750, saving model to ./bert_model.h5
Epoch 2/3

Epoch 00002: val_loss improved from 576202.43750 to 561687.18750, saving model to ./bert_model.h5
Epoch 3/3

Epoch 00003: val_loss improved from 561687.18750 to 561605.37500, saving model to ./bert_model.h5


In [42]:
len(train.up_votes.values)

7000

In [41]:
start = time.time()

max_len = 150
test_input = bert_encode(test.title.values, tokenizer, max_len=max_len)

stop = time.time()
print(stop-start)

0.08962225914001465


In [42]:
test_pred = model.predict(test_input)


In [43]:
import sklearn.metrics as metrics
def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    #mean_squared_log_error=metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    #print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [44]:
regression_results(test.up_votes.values, test_pred)

explained_variance:  0.0
r2:  -0.0002
MAE:  321.0181
MSE:  657135.5825
RMSE:  810.639


In [45]:
test.up_votes.values[:10]

array([   0, 3242,   20,    2,    5,  337,    6,    1,    0,  224])

In [46]:
test_pred[:10]

array([[184.53539],
       [184.53535],
       [184.53539],
       [184.53537],
       [184.53539],
       [184.53539],
       [184.53539],
       [184.53535],
       [184.53539],
       [184.53539]], dtype=float32)

In [52]:

i= 900
data.title[i:i+10]

428244    Thai Police Websites Hacked With  Failed Law  ...
428245    Pentagon shutters African drone base, moves ai...
428246    Iran’s role in and contribution to the war on ...
428247    Malaysia Airlines temporarily bans checked bag...
428248    US Pacific Fleet shrinks even as China grows m...
428249    Kuwait recalls envoy to Iran as Saudis say row...
428250     Saudi-Iran crisis widens as Kuwait recalls envoy
428251    Saudi Arabia sees  no effect  from break with ...
428252    Isis executes first female citizen journalist ...
428253    Floating CSIRO robots destined for remote isla...
Name: title, dtype: object

In [66]:
data.iloc[40]['title']

'Stripped of citizenship, Dominicans of Haitian descent face life in limbo'

In [73]:
data[data.over_18 == True]['author'].nunique()

285

In [74]:
data[data.over_18 == False]['author'].nunique()

85754

In [75]:
data.title.nunique()

500720

In [77]:
data.sort_values(by=['title','time_created'])

Unnamed: 0,time_created,up_votes,title,over_18,author,clock_time,hour,day_of_week,char_count,word_count,flesch_readability,flesch_grade,month,year
75745,2011-02-12 08:11:15,15,\t\r\n\r\nIsrael unprepared for Mubarak s fall...,False,Orangutan,08:11:15,8,5,116,18,44.75,11.5,2,2011
64012,2010-09-07 20:01:38,7,\t\r\n\r\nLooted artifacts returned to Iraq: \...,False,Orangutan,20:01:38,20,1,119,17,28.84,13.5,9,2010
75417,2011-02-09 19:41:02,29,\t\r\n\r\nProtests in Egypt escalate: \r\nThou...,False,Orangutan,19:41:02,19,2,102,14,31.89,12.3,2,2011
72611,2011-01-19 20:12:38,5,\t\r\n\r\nSwiss freeze Ben Ali finances: \r\nS...,False,Orangutan,20:12:38,20,2,111,15,64.71,8.0,1,2011
117794,2012-03-24 19:52:57,0,"\t\r\n\r\nTel Aviv: 1,000 march against Iran s...",False,jack_alexander,19:52:57,19,5,183,28,44.07,13.8,3,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37450,2009-06-23 19:30:16,0,"﻿ Shouting for human rights and democracy, whi...",False,WhoKilledTeddyBear,19:30:16,19,1,117,20,59.64,9.9,6,2009
457705,2016-04-30 03:41:11,12,﻿A Western Company Could Finally Be Held Accou...,False,DoremusJessup,03:41:11,3,5,227,39,40.35,17.3,4,2016
419124,2015-12-02 07:43:02,6,"﻿In South Korea, a Dictator’s Daughter Cracks ...",False,RespublicaCuriae,07:43:02,7,2,59,10,69.79,6.0,12,2015
235833,2013-12-12 01:49:42,2,"﻿Israel holds 5,000 Palestinians, including 20...",False,User_Name13,01:49:42,1,3,66,9,29.52,11.1,12,2013
