## Modeling
For this predictive model, our target variable will be "up_votes", which we will use to test the accuracy of model on a holdout subset of testing data, measured by the RMSE value. We also investigate the feature importance of the explantory variables, to determine which features are most impactful in causing a post to receive more "up_votes" (going viral) and what causes the opposite (less/no "up_votes).

Target Variable: "up_votes"

Tokenization: 
- TF-IDF Vectorizor
- Bert Tokenizer

Train-Test-Split: 80/20

Models: 
- Sklearn's Logistic Regression (baseline)
- Sklearn's Elastic Net
- Sklearn's Random Forest
- XgBoost
- Tensorflow's Bert

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from datetime import datetime

from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/MyDrive/Eluvio_Challenge/')

Mounted at /content/drive


In [3]:
data = pd.read_csv('Eluvio_DS_Challenge_cleaned_wrangled.csv', index_col=0)

In [4]:
data.head(2)

Unnamed: 0,time_created,up_votes,title,over_18,author,clock_time,hour,day_of_week,char_count,word_count,flesch_readability,flesch_grade,month,year
0,2008-01-25 03:34:06,3,Scores killed in Pakistan clashes,False,polar,03:34:06,3,4,33,5,83.32,2.9,1,2008
1,2008-01-25 03:34:35,2,Japan resumes refuelling mission,False,polar,03:34:35,3,4,32,4,33.58,9.6,1,2008


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 509236 entries, 0 to 509235
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   time_created        509236 non-null  object 
 1   up_votes            509236 non-null  int64  
 2   title               509236 non-null  object 
 3   over_18             509236 non-null  bool   
 4   author              509236 non-null  object 
 5   clock_time          509236 non-null  object 
 6   hour                509236 non-null  int64  
 7   day_of_week         509236 non-null  int64  
 8   char_count          509236 non-null  int64  
 9   word_count          509236 non-null  int64  
 10  flesch_readability  509236 non-null  float64
 11  flesch_grade        509236 non-null  float64
 12  month               509236 non-null  int64  
 13  year                509236 non-null  int64  
dtypes: bool(1), float64(2), int64(7), object(4)
memory usage: 54.9+ MB


### Feature Engineering
For all the model except for the Bert Model, we will use a TF-IDF Vectorizer to convert the "title" column into tokens.

In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [7]:
w_tokenizer = WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    text = [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]
    text = ' '.join(text)
    return text

title_series = data.title.apply(lemmatize_text)


final_stopwords_list = stopwords.words('english')
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=200000, 
                                   min_df=0.001, stop_words=final_stopwords_list, 
                                   use_idf=True, ngram_range=(1,2))

In [8]:
tfidf_X = tfidf_vectorizer.fit_transform(title_series)

In [9]:
# creating a TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame(tfidf_X.A, columns=tfidf_vectorizer.get_feature_names())

In [10]:
tfidf_df.head()

Unnamed: 0,000,000 people,10,10 000,10 year,100,100 000,11,12,13,14,15,150,16,17,18,19,1st,20,200,2008,2009,2010,2011,2012,2013,2014,2015,2016,2020,21,22,23,24,25,26,27,28,30,300,...,without,witness,woman,women,word,work,worker,workers,working,world,world cup,world largest,world news,world war,worldwide,worse,worst,worth,would,wounded,wrong,xi,yahoo,year,year ago,year old,year prison,years,yemen,yemeni,yet,york,young,youth,youtube,yr,zealand,zika,zimbabwe,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Dropping Colinear Columns and One-Hot Encoding Categorical Variables of Orginal Dataframe

In [11]:
data.head()

Unnamed: 0,time_created,up_votes,title,over_18,author,clock_time,hour,day_of_week,char_count,word_count,flesch_readability,flesch_grade,month,year
0,2008-01-25 03:34:06,3,Scores killed in Pakistan clashes,False,polar,03:34:06,3,4,33,5,83.32,2.9,1,2008
1,2008-01-25 03:34:35,2,Japan resumes refuelling mission,False,polar,03:34:35,3,4,32,4,33.58,9.6,1,2008
2,2008-01-25 03:42:03,3,US presses Egypt on Gaza border,False,polar,03:42:03,3,4,31,6,90.77,2.1,1,2008
3,2008-01-25 03:54:50,1,Jump-start economy: Give health care to all,False,fadi420,03:54:50,3,4,44,7,64.37,6.0,1,2008
4,2008-01-25 15:25:20,4,Council of Europe bashes EU&UN terror blacklist,False,mhermans,15:25:20,15,4,47,7,38.99,9.6,1,2008


In [12]:
data.drop(columns=['time_created', 'title','clock_time', 'author'], inplace=True)

In [13]:
data = pd.get_dummies(data,columns=['over_18','hour','month', 'day_of_week', 'year'],drop_first=True)

In [14]:
data.head()

Unnamed: 0,up_votes,char_count,word_count,flesch_readability,flesch_grade,over_18_True,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,hour_9,hour_10,hour_11,hour_12,hour_13,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,year_2009,year_2010,year_2011,year_2012,year_2013,year_2014,year_2015,year_2016
0,3,33,5,83.32,2.9,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,32,4,33.58,9.6,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,31,6,90.77,2.1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,1,44,7,64.37,6.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,4,47,7,38.99,9.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [15]:
from sklearn.preprocessing import StandardScaler

# scaling numeric variables to a mean of 0 with unit variance (std of 1).

scaler = StandardScaler()
data[['char_count', 'word_count','flesch_readability','flesch_grade']] = scaler.fit_transform(data[['char_count', 'word_count','flesch_readability','flesch_grade']])


In [16]:
tfidf_X.shape

(509236, 1969)

In [17]:
from scipy.sparse import hstack,csr_matrix

sparse_data = csr_matrix(data.drop(columns=['up_votes']).values)
sparse_data = hstack((sparse_data,tfidf_X))

In [18]:
col_names = data.columns.tolist() + tfidf_df.columns.tolist()

# Modeling

In [19]:
from sklearn.model_selection import train_test_split

# declaring target variable: y
y = data.up_votes.values # up_votes

# creating train test split for TFIDF Vectorizer:
tfidf_X_train, tfidf_X_test, tfidf_y_train, tfidf_y_test = train_test_split(sparse_data, y, test_size=0.25, random_state=42)

In [20]:
import sklearn.metrics as metrics
def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    #mean_squared_log_error=metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    #print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [21]:
import numpy as np
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(tfidf_X_train, tfidf_y_train)
lr_preds = lr.predict(tfidf_X_test)

In [22]:
# Baseline Linear Regression Model
regression_results(tfidf_y_test, lr_preds)    

explained_variance:  0.0397
r2:  0.0396
MAE:  190.8411
MSE:  277134.463
RMSE:  526.4356


In [23]:
tfidf_y_test[:10]

array([  23,    0,  291,  794,   10, 6720,    8,    5,    9,    2])

In [24]:
lr_preds[:10]

array([137.08751287, -25.49727647, 273.45683316, 142.64492814,
       105.66919506, 302.16850768, -89.6791952 , 123.4122651 ,
       134.87531644, 206.21739782])

In [25]:
from sklearn.linear_model import ElasticNet

e_net = ElasticNet()
e_net.fit(tfidf_X_train, tfidf_y_train)
e_net_preds = e_net.predict(tfidf_X_test)

In [26]:
regression_results(tfidf_y_test, e_net_preds)    

explained_variance:  0.011
r2:  0.011
MAE:  181.0359
MSE:  285388.8256
RMSE:  534.218


In [28]:
print(e_net_preds[:10])
print(tfidf_y_test[:10])

[ 83.95481729  73.1521116  117.49245929 109.55620959  96.50634366
 164.85189902  74.51716844  93.91287962  84.16448078 105.37205357]
[  23    0  291  794   10 6720    8    5    9    2]


In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

# Use grid search to tune the parameters:

parametersGrid = {"max_iter": [1, 5, 10],
                  "alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                  "l1_ratio": np.arange(0.0, 1.0, 0.1)}

eNet = ElasticNet()
grid = GridSearchCV(eNet, parametersGrid, scoring='neg_root_mean_squared_error', cv=10)
grid.fit(tfidf_X_train, tfidf_y_train)


  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)


In [None]:
Y_pred = grid.predict(X_test)