In [1]:
# inline plotting instead of popping out
%matplotlib inline

import os, itertools, csv, timeit, json, re
from datetime import datetime
from IPython.display import Image
from IPython.display import display
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [2]:
# sklearn
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, TruncatedSVD

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier

from sklearn.metrics import accuracy_score, mean_squared_error, roc_curve, auc, roc_auc_score

In [3]:
from tqdm import tqdm

tqdm.pandas()

In [4]:
dirname = './data'
f_train = 'preprocessed_train_2.csv'
f_test = 'preprocessed_test_2.csv'
f_sub = 'sample_submission.csv'
p_train = os.path.join(dirname, f_train)
p_test = os.path.join(dirname, f_test)
p_sub = os.path.join(dirname, f_sub)

In [6]:
df_train = pd.read_csv(p_train)
df_train.rename(
    columns={'Id': 'id', 'Popularity': 'pop', 'Page content': 'content', 'author scalar': 'author_scalar'}, 
    inplace=True
)
pop_group = df_train.groupby(['pop'])
if 'Unnamed: 0' in df_train.columns:
    df_train.drop(['Unnamed: 0'], axis=1, inplace=True)
df_train.head()

Unnamed: 0,id,pop,content,article,title,topics,data_channel,see_also,p_year,p_month,...,title_pos_tag_7,title_pos_tag_8,title_pos_tag_9,title_pos_tag_10,title_pos_tag_11,title_pos_tag_12,title_pos_tag_13,title_pos_tag_14,title_pos_tag_15,num_topics
0,0,-1,"<html><head><div class=""article-info""> <span c...",There may be killer asteroids headed for Earth...,NASA's Grand Challenge: Stop Asteroids From De...,"['Asteroid', 'Asteroids', 'challenge', 'Earth'...",world,1,2013.0,6.0,...,IN,VBG,NN,NONE,NONE,NONE,NONE,NONE,NONE,73
1,1,1,"<html><head><div class=""article-info""><span cl...",Google took a stand of sorts against patent-la...,Google's New Open Source Patent Pledge: We Won...,"['Apps and Software', 'Google', 'open source',...",tech,1,2013.0,3.0,...,NN,PRP,MD,RB,VB,IN,VBN,RB,NONE,122
2,2,1,"<html><head><div class=""article-info""><span cl...",You've spend countless hours training to be an...,Ballin': 2014 NFL Draft Picks Get to Choose Th...,"['Entertainment', 'NFL', 'NFL Draft', 'Sports'...",entertainment,1,2014.0,5.0,...,TO,VB,PRP$,JJ,NN,NN,NONE,NONE,NONE,61
3,3,-1,"<html><head><div class=""article-info""><span cl...",Tired of the same old sports fails and news f...,Cameraperson Fails Deliver Slapstick Laughs,"['Sports', 'Video', 'Videos', 'Watercooler']",watercooler,1,2013.0,10.0,...,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,44
4,4,-1,"<html><head><div class=""article-info""><span cl...","At 6-foot-5 and 298 pounds, All-Pro NFL star J...",NFL Star Helps Young Fan Prove Friendship With...,"['Entertainment', 'instagram', 'instagram vide...",entertainment,1,2014.0,4.0,...,NN,IN,JJ,NN,NONE,NONE,NONE,NONE,NONE,66


In [7]:
df_test = pd.read_csv(p_test)
df_test.rename(
    columns={'Id': 'id', 'Page content': 'content'}, 
    inplace=True
)
if 'Unnamed: 0' in df_test.columns:
    df_test.drop(['Unnamed: 0'], axis=1, inplace=True)
df_test.head()

Unnamed: 0,id,content,article,title,topics,data_channel,see_also,p_year,p_month,p_day,...,title_pos_tag_7,title_pos_tag_8,title_pos_tag_9,title_pos_tag_10,title_pos_tag_11,title_pos_tag_12,title_pos_tag_13,title_pos_tag_14,title_pos_tag_15,num_topics
0,27643,"<html><head><div class=""article-info""><span cl...",Note to humanity: One Direction fandom ain't ...,Soccer Star Gets Twitter Death Threats After T...,"['Entertainment', 'Music', 'One Direction', 's...",entertainment,1,2013.0,9.0,9.0,...,IN,VBG,CD,NN,NN,NONE,NONE,NONE,NONE,63
1,27644,"<html><head><div class=""article-info""><span cl...",Shortly after announcing a hardware upgrade fo...,Google Glass Gets an Accessory Store,"['Gadgets', 'glass', 'Google', 'Google Glass',...",tech,1,2013.0,10.0,31.0,...,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,70
2,27645,"<html><head><div class=""article-info""><span cl...","Well, that was quick. Just hours after going o...",OUYA Gaming Console Already Sold Out on Amazon,"['amazon', 'amazon kindle', 'Business', 'Gaming']",business,1,2013.0,6.0,25.0,...,IN,NN,NONE,NONE,NONE,NONE,NONE,NONE,NONE,49
3,27646,"<html><head><div class=""article-info""><span cl...",Between Two Ferns: Oscar Buzz Edition Part 1 ...,'Between Two Ferns' Mocks Oscar Nominees,"['Between Two Ferns', 'Movies', 'The Oscars', ...",film,1,2013.0,2.0,13.0,...,NNS,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,94
4,27647,"<html><head><div class=""article-info""><span cl...",Ever since The Hurt Locker it seems like we h...,'American Sniper' Trailer: Looks Like Eastwood...,"['American Sniper', 'Awards', 'Bradley Cooper'...",entertainment,1,2014.0,10.0,3.0,...,NN,MD,VB,RB,IN,DT,NN,NONE,NONE,104


In [8]:
df_test.fillna(0, inplace=True)

In [9]:
def clean_text(text_to_clean, no_punc=True):
    res = text_to_clean.lower()
    res = re.sub(r"i'm", "i am", res)
    res = re.sub(r"he's", "he is", res)
    res = re.sub(r"she's", "she is", res)
    res = re.sub(r"it's", "it is", res)
    res = re.sub(r"that's", "that is", res)
    res = re.sub(r"what's", "what is", res)
    res = re.sub(r"where's", "where is", res)
    res = re.sub(r"how's", "how is", res)
#     res = re.sub(r"\'s", " is", res)
    res = re.sub(r"\'ll", " will", res)
    res = re.sub(r"\'ve", " have", res)
    res = re.sub(r"\'re", " are", res)
    res = re.sub(r"\'d", " would", res)
    res = re.sub(r"\'re", " are", res)
    res = re.sub(r"won't", "will not", res)
    res = re.sub(r"can't", "cannot", res)
    res = re.sub(r"n't", " not", res)
    res = re.sub(r"n'", "ng", res)
    res = re.sub(r"'bout", "about", res)
    res = re.sub(r"'til", "until", res)
    
    if no_punc:
        res = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", res)
    return res

In [10]:
stop_words = stopwords.words('english')

def preprocessor(text):
    porter = PorterStemmer()
    res = clean_text(text)
    
    tokens = [porter.stem(w) for w in re.split('\s+', res.strip()) \
        if w not in stop_words and re.match('[a-zA-Z]+', w)]
    
    return ' '.join(tokens)

## Sample popular & unpopular news

In [53]:
for c in pop_group.get_group(1).sample(n=5, random_state=1)['content']:
    print(c)
    print('\n\n\n')

<html><head><div class="article-info"><span class="byline "><a href="/author/laura-vitto/"><img alt="2016%2f09%2f21%2f3f%2flauravittoheadshotslowres1.b1191" class="author_image" src="http://i.amz.mshcdn.com/OAcgWfEl5UfjasprN9z1m4SEIXo=/90x90/2016%2F09%2F21%2F3f%2FLauraVittoheadshotslowres1.b1191.jpg"/></a><span class="author_name">By <a href="/author/laura-vitto/">Laura Vitto</a></span><time datetime="Tue, 28 Jan 2014 20:15:47 +0000">2014-01-28 20:15:47 UTC</time></span></div></head><body><h1 class="title">Vin Diesel's 7-Minute Dance Video Is His Greatest Work Yet</h1><figure class="article-image"></figure><article data-channel="entertainment"><section class="article-content"> <p>Vin Diesel is an actor, producer and, apparently, a huge Beyoncé fan.</p> <p>The <em>Fast and the Furious</em> star took to his <a href="http://mashable.com/category/facebook/" target="_blank">Facebook</a> page on Tuesday to share his latest (and possibly greatest) piece of work. The self-recorded clip shows t






<html><head><div class="article-info"><span class="byline "><a href="/author/connor-finnegan/"><img alt="2016%2f06%2f30%2fb0%2f201503270cheadshot_20.5abbf.2d457" class="author_image" src="http://i.amz.mshcdn.com/utF-AtG0D7uDgdK1Z4Ak7Ypvir0=/90x90/2016%2F06%2F30%2Fb0%2F201503270cHeadshot_20.5abbf.2d457.jpg"/></a><span class="author_name">By <a href="/author/connor-finnegan/">Connor Finnegan</a></span><time datetime="Tue, 10 Dec 2013 23:17:47 +0000">2013-12-10 23:17:47 UTC</time></span></div></head><body><h1 class="title">Vote for Your Favorite TV Show of 2013</h1><figure class="article-image"><img class="microcontent" data-fragment="lead-image" data-image="http://i.amz.mshcdn.com/KVuYINwWuMBc-jh1xbrwgoBig0Y=/950x534/2013%2F12%2F10%2F24%2Ftopshows.824b1.jpg" data-micro="1" data-url="null" src="http://i.amz.mshcdn.com/KVuYINwWuMBc-jh1xbrwgoBig0Y=/950x534/2013%2F12%2F10%2F24%2Ftopshows.824b1.jpg"/></figure><article data-channel="entertainment"><section class="article-content"> <p>In m

<html><head><div class="article-info"><span class="byline "><a href="/author/vignesh-ramachandran/"><img alt="Default-m" class="author_image" src="http://i.amz.mshcdn.com/raEPzavg65Jzvjy-L4U699QBlmQ=/90x90/default-m.jpg"/></a><span class="author_name">By <a href="/author/vignesh-ramachandran/">Vignesh Ramachandran</a></span><time datetime="Fri, 30 Aug 2013 00:49:06 +0000">2013-08-30 00:49:06 UTC</time></span></div></head><body><h1 class="title">This Basketball Will Coach You With Audible Feedback</h1><figure class="article-image"></figure><article data-channel="sports"><section class="article-content"> <div class="shift-to-hero"><div class="content-mash-video" data-autoplay="" data-embedurl="http://mashable.com/videos/embed?video=xVnmo4vC&amp;player=offsite" data-labels='["basketball", "kickstarter", "newsy", "sensor", "sports", "tech"]' data-playlist="http://content.jwplatform.com/manifests/xVnmo4vC.m3u8" data-template="postlead" data-thumbnail="http://content.jwplatform.com/thumbs/xV

In [54]:
for c in pop_group.get_group(-1).sample(n=5, random_state=1)['content']:
    print(c)
    print('\n\n\n')

<html><head><div class="article-info"><span class="byline basic"><span class="author_name">By <a href="/author/colin-gorenstein/">Colin Gorenstein</a></span><time datetime="Tue, 30 Sep 2014 16:50:49 +0000">2014-09-30 16:50:49 UTC</time></span></div></head><body><h1 class="title">Mindy Kaling and Elmo Are Undeniably Enthusiastic</h1><figure class="article-image"></figure><article data-channel="watercooler"><section class="article-content viral-video"> <div class="viral-video-lead"> <p><iframe allowfullscreen="" frameborder="0" height="360" src="https://www.youtube.com/embed/em5SSa-IVUU?enablejsapi=1&amp;" width="640"></iframe></p> <script src="http://a.amz.mshcdn.com/assets/lib/aab-7ce243b38b9cc2caec816aff811d3153.js" type="text/javascript"></script> </div> <p>Elmo is really enthusiastic about dancing with friends. <a href="http://mashable.com/category/mindy-kaling/">Mindy Kaling</a> is really enthusiastic about dancing with chickens. Chickens are, evidently, really enthusiastic about j

<html><head><div class="article-info"><span class="byline "><a href="/author/sandra-gonzalez/"><img alt="2016%2f06%2f29%2f62%2fhttpsd2mhye01h4nj2n.cloudfront.netmediazgkymde0lza4.a490c" class="author_image" src="http://i.amz.mshcdn.com/tNdeogX8LL5K1appzIRN425c_D0=/90x90/2016%2F06%2F29%2F62%2Fhttpsd2mhye01h4nj2n.cloudfront.netmediaZgkyMDE0LzA4.a490c.jpg"/></a><span class="author_name">By <a href="/author/sandra-gonzalez/">Sandra Gonzalez</a></span><time datetime="Mon, 30 Jun 2014 17:21:40 +0000">2014-06-30 17:21:40 UTC</time></span></div></head><body><h1 class="title">Denis Leary Is Heading Back to TV in 'Sex&Drugs&Rock&Roll' for FX</h1><figure class="article-image"><img alt="(From left) Denis Leary (&quot;Johnny Rock&quot;), John Corbett (&quot;Flash&quot;), John Ales (&quot;Rehab&quot;), and Robert Kelly (&quot;Bam Bam&quot;) star in FX's new comedy, &quot;Sex&amp;Drugs&amp;Rock&amp;Roll.&quot;" class="microcontent" data-fragment="lead-image" data-image="http://i.amz.mshcdn.com/n5fF-v

## Get published date

In [11]:
def get_public_datetime(content):
    """
    input: raw content
    output: article published time
    """
    soup = BeautifulSoup(content, 'html.parser')
    
    t = soup.time
    dt = datetime.strptime(t['datetime'], '%a, %d %b %Y %H:%M:%S %z')
    
    return dt

In [50]:
def populate_public_time_data(df):
    start = timeit.default_timer()
    
    for idx, r in df.iterrows():
        dt = get_public_datetime(r['content'])

        df.loc[idx, 'p_year'] = dt.year
        df.loc[idx, 'p_month'] = dt.month
        df.loc[idx, 'p_day'] = dt.day
        df.loc[idx, 'p_hour'] = dt.hour
        df.loc[idx, 'p_minute'] = dt.minute
        df.loc[idx, 'p_second'] = dt.second
        df.loc[idx, 'p_weekday'] = dt.weekday()
    
    stop = timeit.default_timer()
    print('Time: ', stop - start)  
    
    display(df.head())
    display(df.isna().sum())

In [207]:
populate_public_time_data(df_train)

Time:  184.87019739102107


Unnamed: 0,id,pop,content,p_year,p_month,p_day,p_hour,p_minute,p_second,p_weekday
0,0,-1,"<html><head><div class=""article-info""> <span c...",2013.0,6.0,19.0,15.0,4.0,30.0,2.0
1,1,1,"<html><head><div class=""article-info""><span cl...",2013.0,3.0,28.0,17.0,40.0,55.0,3.0
2,2,1,"<html><head><div class=""article-info""><span cl...",2014.0,5.0,7.0,19.0,15.0,20.0,2.0
3,3,-1,"<html><head><div class=""article-info""><span cl...",2013.0,10.0,11.0,2.0,26.0,50.0,4.0
4,4,-1,"<html><head><div class=""article-info""><span cl...",2014.0,4.0,17.0,3.0,31.0,43.0,3.0


id           0
pop          0
content      0
p_year       0
p_month      0
p_day        0
p_hour       0
p_minute     0
p_second     0
p_weekday    0
dtype: int64

In [171]:
populate_public_time_data(df_sample)

Time:  6.301303349959198


Unnamed: 0,id,pop,content,p_year,p_month,p_day,p_hour,p_minute,p_second,p_weekday
13516,13516,-1,"<html><head><div class=""article-info""><span cl...",2014.0,8.0,4.0,16.0,14.0,55.0,0.0
11607,11607,-1,"<html><head><div class=""article-info""><span cl...",2014.0,2.0,20.0,9.0,24.0,23.0,3.0
11866,11866,1,"<html><head><div class=""article-info""><span cl...",2013.0,4.0,24.0,17.0,43.0,38.0,2.0
25540,25540,-1,"<html><head><div class=""article-info""><span cl...",2013.0,10.0,22.0,15.0,30.0,17.0,1.0
26611,26611,-1,"<html><head><div class=""article-info""><span cl...",2014.0,1.0,6.0,20.0,39.0,46.0,0.0


id           0
pop          0
content      0
p_year       0
p_month      0
p_day        0
p_hour       0
p_minute     0
p_second     0
p_weekday    0
dtype: int64

## Find all tags & get useful tag number

In [None]:
def get_all_tag(content):
    soup = BeautifulSoup(content, 'html.parser')
    
    tags = set()
    for tag in soup.find_all():
        tags.add(tag.name)
        
    return tags

In [None]:
def get_all_df_tag(df):
    tags = set()
    
    start = timeit.default_timer()
    
    for idx, r in df.iterrows():
        tags = tags.union(get_all_tag(r['content']))
        
    stop = timeit.default_timer()
    print('Time: ', stop - start)
        
    return tags

In [None]:
all_tags = get_all_df_tag(df_sample)
all_tags

In [76]:
all_tags

{'a',
 'article',
 'b',
 'blockquote',
 'body',
 'br',
 'center',
 'del',
 'div',
 'div-class',
 'divclass',
 'em',
 'embed',
 'en',
 'figcaption',
 'figure',
 'footer',
 'glowinggoodies',
 'h1',
 'h2',
 'h3',
 'head',
 'header',
 'hr',
 'html',
 'i',
 'iframe',
 'img',
 'li',
 'link',
 'noscript',
 'object',
 'ol',
 'p',
 'param',
 'script',
 'section',
 'small',
 'span',
 'strong',
 'style',
 'sub',
 'time',
 'ul'}

In [None]:
def get_tag_num(content, tag):
    """
    input: raw content
    output: tag numbers
    """
    start = timeit.default_timer()
    
    soup = BeautifulSoup(content, 'html.parser')
    
    return len(soup.find_all(tag))

In [116]:
start = timeit.default_timer()

useful_tags = [
    'blockquote',
    'figcaption',
    'figure',
    'h1', 
    'h2',
    'h3',
    'iframe',
    'img',
    'strong'
]

for idx, r in df_sample.iterrows():
    for tag in useful_tags:
        col = 'num_tag_{}'.format(tag)
        df_sample.loc[idx, col] = get_tag_num(r['content'], tag)
    
stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  39.66087353701005


## Get title of news

In [149]:
# def get_title(content):
#     soup = BeautifulSoup(content, 'html.parser')
#     return soup.h1.string

In [172]:
# df_sample['title'] = df_sample['content'].progress_apply(get_title)

100%|██████████| 1000/1000 [00:04<00:00, 234.43it/s]


In [208]:
# df_train['title'] = df_train['content'].progress_apply(get_title)

100%|██████████| 27643/27643 [02:02<00:00, 226.20it/s]


In [17]:
def get_title_pos_tag(title):
    tag_pair = pos_tag(word_tokenize(clean_text(title)))
    tags = ""
    
    for s, tag in tag_pair:
        tags += tag + ' '
    return tags

In [None]:
df_train['title_pos_tag_seq'] = df_train['title'].progress_apply(get_title_pos_tag)
df_test['title_pos_tag_seq'] = df_test['title'].progress_apply(get_title_pos_tag)

In [19]:
def get_title_pos_tag_cols(df):
    def fill(arr, i):
        try:
            return arr[i]
        except:
            return 'NONE'
        
    start = timeit.default_timer()
    
#     df['title_pos_tag_arr'] =  df['title'].apply(lambda t: get_title_pos_tag(t).split()) 
    df['title_pos_tag_arr'] =  df['title_pos_tag_seq'].apply(lambda seq: seq.split())
    df['title_len'] =  df['title_pos_tag_seq'].apply(lambda seq: len(seq.split()))
    MAX_LEN = 15
        
    for i in range(MAX_LEN):
        col_name = 'title_pos_tag_{}'.format(i+1)
        df[col_name] = df['title_pos_tag_arr'].apply(lambda arr: fill(arr, i))

    stop = timeit.default_timer()
    print('Time: ', stop - start)  
    
    display(df.head())
    display(df.isna().sum())

In [20]:
get_title_pos_tag_cols(df_train)
get_title_pos_tag_cols(df_test)

Time:  0.1642745


Unnamed: 0,Id,Popularity,Page content,article,title,topics,data_channel,see_also,p_year,p_month,...,title_pos_tag_6,title_pos_tag_7,title_pos_tag_8,title_pos_tag_9,title_pos_tag_10,title_pos_tag_11,title_pos_tag_12,title_pos_tag_13,title_pos_tag_14,title_pos_tag_15
0,0,-1,"<html><head><div class=""article-info""> <span c...",There may be killer asteroids headed for Earth...,NASA's Grand Challenge: Stop Asteroids From De...,"['Asteroid', 'Asteroids', 'challenge', 'Earth'...",world,1,2013.0,6.0,...,NN,IN,VBG,NN,NONE,NONE,NONE,NONE,NONE,NONE
1,1,1,"<html><head><div class=""article-info""><span cl...",Google took a stand of sorts against patent-la...,Google's New Open Source Patent Pledge: We Won...,"['Apps and Software', 'Google', 'open source',...",tech,1,2013.0,3.0,...,NN,NN,PRP,MD,RB,VB,IN,VBN,RB,NONE
2,2,1,"<html><head><div class=""article-info""><span cl...",You've spend countless hours training to be an...,Ballin': 2014 NFL Draft Picks Get to Choose Th...,"['Entertainment', 'NFL', 'NFL Draft', 'Sports'...",entertainment,1,2014.0,5.0,...,VBP,TO,VB,PRP$,JJ,NN,NN,NONE,NONE,NONE
3,3,-1,"<html><head><div class=""article-info""><span cl...",Tired of the same old sports fails and news f...,Cameraperson Fails Deliver Slapstick Laughs,"['Sports', 'Video', 'Videos', 'Watercooler']",watercooler,1,2013.0,10.0,...,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
4,4,-1,"<html><head><div class=""article-info""><span cl...","At 6-foot-5 and 298 pounds, All-Pro NFL star J...",NFL Star Helps Young Fan Prove Friendship With...,"['Entertainment', 'instagram', 'instagram vide...",entertainment,1,2014.0,4.0,...,VB,NN,IN,JJ,NN,NONE,NONE,NONE,NONE,NONE


Id                   0
Popularity           0
Page content         0
article              0
title                0
topics               0
data_channel         0
see_also             0
p_year               0
p_month              0
p_day                0
p_hour               0
p_minute             0
p_second             0
p_weekday            0
author               0
author scalar        0
img_num              0
link_num             0
title_pos_tag_seq    0
title_pos_tag_arr    0
title_len            0
title_pos_tag_1      0
title_pos_tag_2      0
title_pos_tag_3      0
title_pos_tag_4      0
title_pos_tag_5      0
title_pos_tag_6      0
title_pos_tag_7      0
title_pos_tag_8      0
title_pos_tag_9      0
title_pos_tag_10     0
title_pos_tag_11     0
title_pos_tag_12     0
title_pos_tag_13     0
title_pos_tag_14     0
title_pos_tag_15     0
dtype: int64

Time:  0.105737


Unnamed: 0,Id,Page content,article,title,topics,data_channel,see_also,p_year,p_month,p_day,...,title_pos_tag_6,title_pos_tag_7,title_pos_tag_8,title_pos_tag_9,title_pos_tag_10,title_pos_tag_11,title_pos_tag_12,title_pos_tag_13,title_pos_tag_14,title_pos_tag_15
0,27643,"<html><head><div class=""article-info""><span cl...",Note to humanity: One Direction fandom ain't ...,Soccer Star Gets Twitter Death Threats After T...,"['Entertainment', 'Music', 'One Direction', 's...",entertainment,1,2013.0,9.0,9.0,...,NNS,IN,VBG,CD,NN,NN,NONE,NONE,NONE,NONE
1,27644,"<html><head><div class=""article-info""><span cl...",Shortly after announcing a hardware upgrade fo...,Google Glass Gets an Accessory Store,"['Gadgets', 'glass', 'Google', 'Google Glass',...",tech,1,2013.0,10.0,31.0,...,NN,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
2,27645,"<html><head><div class=""article-info""><span cl...","Well, that was quick. Just hours after going o...",OUYA Gaming Console Already Sold Out on Amazon,"['amazon', 'amazon kindle', 'Business', 'Gaming']",business,1,2013.0,6.0,25.0,...,RP,IN,NN,NONE,NONE,NONE,NONE,NONE,NONE,NONE
3,27646,"<html><head><div class=""article-info""><span cl...",Between Two Ferns: Oscar Buzz Edition Part 1 ...,'Between Two Ferns' Mocks Oscar Nominees,"['Between Two Ferns', 'Movies', 'The Oscars', ...",film,1,2013.0,2.0,13.0,...,VBP,NNS,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
4,27647,"<html><head><div class=""article-info""><span cl...",Ever since The Hurt Locker it seems like we h...,'American Sniper' Trailer: Looks Like Eastwood...,"['American Sniper', 'Awards', 'Bradley Cooper'...",entertainment,1,2014.0,10.0,3.0,...,IN,NN,MD,VB,RB,IN,DT,NN,NONE,NONE


Id                   0
Page content         0
article              1
title                0
topics               0
data_channel         0
see_also             0
p_year               1
p_month              1
p_day                1
p_hour               1
p_minute             1
p_second             1
p_weekday            1
author               0
author_scalar        0
img_num              0
link_num             0
title_pos_tag_seq    0
title_pos_tag_arr    0
title_len            0
title_pos_tag_1      0
title_pos_tag_2      0
title_pos_tag_3      0
title_pos_tag_4      0
title_pos_tag_5      0
title_pos_tag_6      0
title_pos_tag_7      0
title_pos_tag_8      0
title_pos_tag_9      0
title_pos_tag_10     0
title_pos_tag_11     0
title_pos_tag_12     0
title_pos_tag_13     0
title_pos_tag_14     0
title_pos_tag_15     0
dtype: int64

In [21]:
df_train['num_topics'] = df_train['topics'].apply(lambda arr: len(arr))
df_test['num_topics'] = df_test['topics'].apply(lambda arr: len(arr))

In [26]:
df_test.drop(['title_pos_tag_arr'], axis=1, inplace=True)

## Get first paragraph

In [15]:
def get_paragraph(content, idx):
    soup = BeautifulSoup(content, 'html.parser')
    all_p = soup.find_all('p')
    p_idx = 0
    res = ""
    
    for i in range(idx):
        try:
            p = all_p[p_idx].getText().lower()
            if 'see also' in p:
                p_idx += 1
                p = all_p[p_idx].getText().lower()
                res += ' ' + p
            else:
                res += ' ' + p
        except:
            res += ""
            
        p_idx += 1
        
    return res

In [None]:
df_train['para_1_3'] = df_train['content'].progress_apply(lambda c: get_paragraph(c, 3))
df_test['para_1_3'] = df_test['content'].progress_apply(lambda c: get_paragraph(c, 3))

In [None]:
df_train['para_1_3_len'] = df_train['para_1_3'].progress_apply(lambda s: len(preprocessor(s).split()))
df_test['para_1_3_len'] = df_test['para_1_3'].progress_apply(lambda s: len(preprocessor(s).split()))

In [29]:
df_train['article_len'] = df_train['article'].progress_apply(lambda s: s.split())
df_test['article_len'] = df_test['article'].progress_apply(lambda s: s.split())

100%|█████████████████████████████████████████████████████████████████████████| 27643/27643 [00:01<00:00, 24968.33it/s]
 39%|████████████████████████████▋                                             | 4601/11847 [00:00<00:00, 23640.60it/s]

AttributeError: 'int' object has no attribute 'split'

## tokenize words

In [96]:
count_topics = CountVectorizer(
    ngram_range=(1, 1),
    preprocessor=preprocessor,
#     max_features=100
)

doc_bag = count_topics.fit_transform(df_train['para_1'])

In [97]:
len(count_topics.get_feature_names())

25618

In [92]:
tfidf_topics = TfidfVectorizer(
    ngram_range=(1, 2),
    preprocessor=preprocessor,
    max_features=100
)

doc_tfidf = tfidf_topics.fit_transform(df_train['para_1'])

In [93]:
tfidf_topics.get_feature_names()

['accord',
 'also',
 'announc',
 'app',
 'appl',
 'around',
 'back',
 'big',
 'busi',
 'call',
 'citi',
 'come',
 'compani',
 'could',
 'day',
 'devic',
 'digit',
 'even',
 'event',
 'facebook',
 'featur',
 'final',
 'find',
 'first',
 'friday',
 'game',
 'get',
 'go',
 'good',
 'googl',
 'help',
 'internet',
 'iphon',
 'job',
 'know',
 'last',
 'launch',
 'like',
 'live',
 'look',
 'made',
 'make',
 'mani',
 'market',
 'mashabl',
 'may',
 'media',
 'might',
 'million',
 'mobil',
 'monday',
 'month',
 'much',
 'network',
 'new',
 'new york',
 'news',
 'next',
 'night',
 'one',
 'onlin',
 'peopl',
 'phone',
 'photo',
 'releas',
 'report',
 'said',
 'say',
 'see',
 'seri',
 'servic',
 'show',
 'sinc',
 'smartphon',
 'social',
 'space',
 'start',
 'state',
 'take',
 'technolog',
 'thing',
 'thursday',
 'time',
 'tuesday',
 'twitter',
 'two',
 'updat',
 'us',
 'use',
 'user',
 'video',
 'want',
 'way',
 'wednesday',
 'week',
 'work',
 'world',
 'would',
 'year',
 'york']

## Prepare pipeline

In [11]:
# df = pd.concat([df_train, df_train_oh_dc], axis=1)
df_sample = df_train.sample(n=2000, random_state=4)
df_sample.reset_index(inplace=True)
X = df_sample.drop(['pop'], axis=1)
y = df_sample['pop']

# X = df_train.drop(['pop'], axis=1)
# y = df_train['pop']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [12]:
TITLE_MAX_LEN = 15
title_pos_tag_col = ['title_pos_tag_{}'.format(i+1) for i in range(TITLE_MAX_LEN)]

all_pos_tag = set()

for col in df_train[title_pos_tag_col]:
    all_pos_tag = all_pos_tag.union(df_train[col].unique())     

In [13]:
class TitlePosTagEncodeTransformer():
    def __init__(self):
        self.le = LabelEncoder()
        self.le.fit(list(all_pos_tag))

    def transform(self, df, **transform_params):
        for col in df[title_pos_tag_col]:
            df[col] = self.le.transform(df[col])
        
        return df

    def fit(self, X, y=None, **fit_params):
        return self
    
    def get_params(self, deep=True):
        return {}

In [30]:
names = []
pipes = []
TITLE_MAX_LEN = 5

date_col = ['p_year', 'p_month', 'p_day', 'p_hour', 'p_weekday']
used_col = ['author_scalar', 'num_topics', 'title_len', 'img_num', 'see_also']
title_pos_tag_col = ['title_pos_tag_{}'.format(i+1) for i in range(TITLE_MAX_LEN)]

topic_bow_vec = CountVectorizer(
    ngram_range=(1, 2), 
    preprocessor=preprocessor, 
    max_features=6000
)

title_bow_vec = CountVectorizer(
    ngram_range=(1, 2), 
    preprocessor=preprocessor, 
    max_features=4000
)

article_tfidf_vec = TfidfVectorizer(
    ngram_range=(1, 2), 
    preprocessor=preprocessor, 
    max_features=3000
)

pca_50 = TruncatedSVD(
    n_components=30,
    random_state=0
)

pca_300 = TruncatedSVD(
    n_components=300,
    random_state=0
)

data_channel_one_hot = OneHotEncoder(handle_unknown='ignore')
std_scaler = StandardScaler()
poly_scaler = PolynomialFeatures(2, include_bias=False)
title_pos_tag_trans = TitlePosTagEncodeTransformer()

p1 = Pipeline([
    ('colt', make_column_transformer(
            (data_channel_one_hot, ['data_channel']),
            (std_scaler, date_col+used_col),
            (poly_scaler, date_col+used_col),
            (title_pos_tag_trans, title_pos_tag_col)
    )),
#     ('pca', pca_50),
    ('clf', RidgeClassifier(alpha=10, normalize=True))
])
names.append('Ridge')
pipes.append(p1)

# p2 = Pipeline([
#     ('colt', make_column_transformer(
#             (data_channel_one_hot, ['data_channel']),
#             (std_scaler, date_col+used_col),
#             (poly_scaler, date_col+used_col),
#             (title_pos_tag_trans, title_pos_tag_col)
#     )),
#     ("clf", KNeighborsClassifier(n_neighbors = 100, p = 2))
# ]
# # , verbose=True
# )
# names.append('KNN')
# pipes.append(p2)

# p3 = Pipeline([
#     ('colt', make_column_transformer(
#             (topic_bow_vec, 'topics'),
#             (title_bow_vec, 'title'),
#             (para_tfidf_vec, 'para_1_3'),
#             (data_channel_one_hot, ['data_channel']),
#             (std_scaler, date_col+used_col),
#             (poly_scaler, date_col+used_col),
#             (title_pos_tag_trans, title_pos_tag_col)
#     )),
#     ('clf', SVC(kernel = "linear", random_state = 0, gamma = 0.001, C = 100.0))
# ]
# , verbose=True
# )
# names.append('SVC')
# pipes.append(p3)

# ada_clf = DecisionTreeClassifier(max_depth=1)
# p4 = Pipeline([
#     ('colt', make_column_transformer(
#             (topic_bow_vec, 'topics'),
#             (title_bow_vec, 'title'),
#             (para_tfidf_vec, 'para_1_3'),
#             (data_channel_one_hot, ['data_channel']),
#             (std_scaler, date_col+used_col),
#             (poly_scaler, date_col+used_col),
#             (title_pos_tag_trans, title_pos_tag_col)
#     )),
# #     ('pca', pca_300),
#     ('clf', AdaBoostClassifier(ada_clf, n_estimators=500, random_state=0, learning_rate=0.1))
# ]
# , verbose=True
# )
# names.append('Ada')
# pipes.append(p4)

bag_clf = DecisionTreeClassifier(max_depth=1)
p5 = Pipeline([
    ('colt', make_column_transformer(
            (topic_bow_vec, 'topics'),
            (title_bow_vec, 'title'),
#             (para_tfidf_vec, 'para_1_3'),
            (article_tfidf_vec, 'article'),
            (data_channel_one_hot, ['data_channel']),
            (std_scaler, date_col+used_col),
            (poly_scaler, date_col+used_col),
            (title_pos_tag_trans, title_pos_tag_col)
    )),
    ('clf', BaggingClassifier(
            base_estimator=bag_clf, 
            n_estimators=500, 
            max_samples=0.7,
            n_jobs=8, 
            random_state=0
    ))
]
, verbose=True
)
names.append('Bag')
pipes.append(p5)

#     ("clf", KNeighborsClassifier(n_neighbors = 30, p = 2))
#     ('dt', DecisionTreeClassifier(random_state=0))
#     ('gnb', GaussianNB())
#     ('mnb', MultinomialNB())
#     ('clf', RidgeClassifier(alpha=10, normalize=True))
#     ('clf', LogisticRegression(solver = "liblinear"))

In [26]:
# p3.fit(X_train, y_train)

In [27]:
# from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB

In [28]:
# CV
K = 10
start = timeit.default_timer()

print('[auc (5-fold cv)]')
for name, clf in zip(names, pipes):
    scores = cross_val_score(
        estimator=clf, 
        X=X_train, 
        y=y_train,
        cv=K, 
        scoring='roc_auc'
    )
    print('%s: %.3f (+/-%.3f)' % (name, scores.mean(), scores.std()))
    
stop = timeit.default_timer()
print('Time: ', stop - start) 

[auc (5-fold cv)]
Ridge: 0.584 (+/-0.025)
Bag: 0.561 (+/-0.038)
Time:  152.7318801


In [54]:
def fit_multiple_estimators(classifiers, X_list, y, sample_weights = None):

    # Convert the labels `y` using LabelEncoder, because the predict method is using index-based pointers
    # which will be converted back to original data later.
    le_ = LabelEncoder()
    le_.fit(y)
    transformed_y = le_.transform(y)

    # Fit all estimators with their respective feature arrays
    estimators_ = [clf.fit(X, y) if sample_weights is None \
           else clf.fit(X, y, sample_weights) \
           for clf, X in zip([clf for _, clf in classifiers], X_list)]

    return estimators_, le_


def predict_from_multiple_estimator(estimators, label_encoder, X_list, weights = None):

    # Predict 'soft' voting with probabilities

    pred1 = np.asarray([clf.predict_proba(X) for clf, X in zip(estimators, X_list)])
    pred2 = np.average(pred1, axis=0, weights=weights)
    pred = np.argmax(pred2, axis=1)

    # Convert integer predictions to original labels:
    return label_encoder.inverse_transform(pred)

In [55]:
# Voting
print('[Voting]')
K = 10

weights = list(itertools.permutations(range(0,3)))
# weights = [
#     (1, 0, 0),
#     (4, 0, 1),
#     (4, 1, 0),
#     (9, 0, 1),
#     (9, 1, 0)
# ]
# weights = [
#     (1, 0),
#     (0, 1),
#     (1, 1),
#     (2, 1),
#     (1, 2),
#     (3, 1),
#     (1, 3),
#     (3, 2),
#     (2, 3),
# ]

best_vt, best_w, best_score = None, (), -1
for weight in weights:
    l_weight = list(weight)
    kf = KFold(n_splits=K, random_state=0)
    
    scores = []
    for train_index, test_index in kf.split(df_sample):

        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]
    
        y_preds = []
        for clf in pipes:
            clf.fit(X_train, y_train)
            
            try:
                y_pred = clf.predict_proba(X_test).T[1]
            except:
                d = clf.decision_function(X_test)
                y_pred = np.exp(d) / (1 + np.exp(d))
            
            y_preds.append(y_pred)
        
        y_pred = np.dot(np.transpose(y_preds), l_weight)
            
        score = roc_auc_score(y_test, y_pred)
        scores.append(score)
        
    mean = np.mean(scores)
    std = np.std(scores)
    print('%s: %.3f (+/- %.3f)' % (weight, mean, std))
    if best_score < mean:
        best_vt, best_w, best_score = clf, weight, mean

print('\nBest %s: %.3f' % (best_w, best_score))

[Voting]
[Pipeline] .............. (step 1 of 2) Processing colt, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   2.0s
[Pipeline] .............. (step 1 of 2) Processing colt, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   2.0s
[Pipeline] .............. (step 1 of 2) Processing colt, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   1.9s
[Pipeline] .............. (step 1 of 2) Processing colt, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   2.0s
[Pipeline] .............. (step 1 of 2) Processing colt, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   2.0s
[Pipeline] .............. (step 1 of 2) Processing colt, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   2.0s
[Pipeline] .............. (step 1 of 2) Processing colt, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   2.

In [136]:
p5.fit(X_train, y_train)

try:
    d = p5.decision_function(X_test)
    y_pred = np.exp(d) / (1 + np.exp(d))
except:
    y_pred = p5.predict_proba(X_test).T[1]

roc_auc_score(y_test, y_pred)

[Pipeline] .............. (step 1 of 2) Processing colt, total=   5.5s
[Pipeline] ............... (step 2 of 2) Processing clf, total=  13.9s


0.5846540632777883

## Generate prediction

In [169]:
X = df_train.drop(['pop'], axis=1)
y = df_train['pop']

In [170]:
df_f_test = pd.DataFrame(df_test, columns=X.columns)

In [171]:
X.shape

(27643, 39)

In [172]:
df_f_test.shape

(11847, 39)

In [177]:
p1.fit(X, y)
p4.fit(X, y)

d = p1.decision_function(df_f_test)
y_pred_1 = np.exp(d) / (1 + np.exp(d))

y_pred_4 = p4.predict_proba(df_f_test).T[1]

y_pred = y_pred_1 * 0.5 + y_pred_4 * 0.5

[Pipeline] .............. (step 1 of 2) Processing colt, total=  44.7s
[Pipeline] ............... (step 2 of 2) Processing clf, total= 2.3min


In [174]:
df_sub = pd.read_csv(p_sub)
df_sub['Popularity'] = y_pred
df_sub.head()

Unnamed: 0,Id,Popularity
0,27643,0.464656
1,27644,0.479776
2,27645,0.475722
3,27646,0.485299
4,27647,0.512926


In [178]:
df_sub = pd.read_csv(p_sub)
df_sub['Popularity'] = y_pred
df_sub.head()

Unnamed: 0,Id,Popularity
0,27643,0.48134
1,27644,0.49788
2,27645,0.493316
3,27646,0.503848
4,27647,0.497961


In [180]:
df_sub.to_csv('jason_sub_4.csv', index=False)

In [30]:
df_train.to_csv('./data/train_p.csv', index=False)
df_test.to_csv('./data/test_p.csv', index=False)