In [19]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [20]:
df_train = pd.read_csv('../datasets/train.csv')
df_test = pd.read_csv ('../datasets/test.csv')

In [21]:
X_train = df_train.iloc[:]['Page content'].values
y_train = df_train.iloc[:]['Popularity'].values
y_train[y_train==-1] = 0

X_test = df_test.iloc[:]['Page content'].values

In [22]:
def topic_preprocess(text):
    text = re.sub('topics: ', '', text.lower())
    text = re.sub(',', ' ,', text)
    return text

In [23]:
def author_preprocess(text):
    text = re.sub('By', '', text)
    text = re.sub('by', '', text)
    text = re.sub(',', ' ,', text)
    text = re.sub(' and ', ' , ', text)
    text = re.sub('&', ',', text)
    return text

In [39]:
def get_feature(html):
    soup = BeautifulSoup(html, 'html.parser')
    article_info = soup.head.find('div', {'class': 'article-info'})
    author = article_info.find('span', {'class': 'author_name'})
    if author != None:
        author = author.get_text()
    elif article_info.span != None:
        author = article_info.span.string
    else:
        author = article_info.a.string
    author = author_preprocess(author)

    topics = soup.find('footer', {'class':'article-topics'}).text
    topics = topic_preprocess(topics)

    channel = soup.find('article')['data-channel']

    try:
        date_time = article_info.time['datetime']
    except:
        date_time = 'Wed, 10 Oct 2014 15:00:43'
    
    match_obj = re.search('([\w]+),\s+([\d]+)\s+([\w]+)\s+([\d]+)\s+([\d]+):([\d]+):([\d]+)', date_time)
    day, date, month, year, hour, minute, second = match_obj.groups()
    day, month = day.lower(), month.lower()

    content = soup.find('section', {'class':'article-content'}).text
    len_content = len(content)

    h1_tag = soup.find('h1', {'class': 'title'})
    title = ""
    if h1_tag is not None:
        title = h1_tag.text
    title_bit = len(title)
    words = title.split()
    title_word_count = len(words)
    title_bit_count = title_bit - title_word_count + 1
    images = soup.find_all('img')
    img_count = len(images)

    return author, topics, channel, len_content, hour, day, date, month, year, title_bit_count, img_count

In [40]:
day_map = {'mon': 1, 'tue': 2, 'wed': 3,
           'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7}

month_map = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
             'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}

df_train = pd.DataFrame(columns=['author', 'topics','channel', 'len_content', 'hour', 'day', 'date', 'month', 'year','title_bit_count', 'img_count'])
for idx, x in enumerate(X_train):
    df_train.loc[idx] = get_feature(x)
df_train['day'] = df_train['day'].map(day_map)
df_train['month'] = df_train['month'].map(month_map)
df_train['title_bit_count'] = df_train['title_bit_count'].astype(np.int64)

df_test = pd.DataFrame(columns=['author', 'topics', 'channel','len_content', 'hour', 'day', 'date', 'month', 'year', 'title_bit_count', 'img_count'])
for idx, x in enumerate(X_test):
    df_test.loc[idx] = get_feature(x)
df_test['day'] = df_test['day'].map(day_map)
df_test['month'] = df_test['month'].map(month_map)
df_test['title_bit_count'] = df_test['title_bit_count'].astype(np.int64)


In [43]:
print(df_train.head())

              author                                             topics  \
0    Clara Moskowitz   asteroid , asteroids , challenge , earth , sp...   
1   Christina Warren   apps and software , google , open source , op...   
2          Sam Laird   entertainment , nfl , nfl draft , sports , te...   
3          Sam Laird             sports , video , videos , watercooler    
4    Connor Finnegan   entertainment , instagram , instagram video ,...   

         channel  len_content hour  day date  month  year  img_count  
0          world         3591   15    3   19      6  2013          1  
1           tech         1843   17    4   28      3  2013          2  
2  entertainment         6646   19    3   07      5  2014          2  
3    watercooler         1821   02    5   11     10  2013          1  
4  entertainment         8919   03    4   17      4  2014         52  


In [42]:
df_train.drop(['title_bit_count'], axis=1, inplace=True)
df_test.drop(['title_bit_count'], axis=1, inplace=True)

In [44]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_author(text):
    if type(text) == np.ndarray:
        text = text[0]
    authors = re.split(',', text)
    for idx, author in enumerate(authors):
        authors[idx] = re.sub(' ', '', author)
    return authors

# day/topic/author/ 0.574 +- 0.007
# day/topic/author/month 0.589 +- 0.007
# day/topic/author/month/hour 0.59 +- 0.007
# day/topic/author/month/hour/len_content 0.59 +- 0.008

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mygodimatomato/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer

vect = ColumnTransformer(
    [('author', CountVectorizer(tokenizer=tokenizer_author, lowercase=False), [0]),
     ('topics', CountVectorizer(tokenizer=tokenizer_author, lowercase=False), [1]),
     ('channel', CountVectorizer(tokenizer=tokenizer_author, lowercase=False), [2])],
    n_jobs=-1,
    remainder='passthrough'
)

In [30]:
# vect = ColumnTransformer(
#     [
#      ('topics', CountVectorizer(tokenizer=tokenizer_author, lowercase=False), [0]),
#      ('channel', CountVectorizer(tokenizer=tokenizer_author, lowercase=False), [1])],
#     n_jobs=-1,
#     remainder='passthrough'
# )

In [46]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

n = 100
depth = 10

In [32]:
print(df_train.head())

                                              topics        channel  \
0   asteroid , asteroids , challenge , earth , sp...          world   
1   apps and software , google , open source , op...           tech   
2   entertainment , nfl , nfl draft , sports , te...  entertainment   
3             sports , video , videos , watercooler     watercooler   
4   entertainment , instagram , instagram video ,...  entertainment   

   len_content hour  day date  month  year  
0         3591   15    3   19      6  2013  
1         1843   17    4   28      3  2013  
2         6646   19    3   07      5  2014  
3         1821   02    5   11     10  2013  
4         8919   03    4   17      4  2014  


In [None]:
print(df_train.columns.values)

In [47]:
lbgm = Pipeline([('vect', vect),
                  ('clf', LGBMClassifier(n_estimators=n, 
                                         max_depth=depth, 
                                         learning_rate=0.1, 
                                         random_state=0,
                                         num_leaves=(2**(depth-1)), # 2^depth - 1
                                         min_data_in_leaf=(2**(depth-4)),
                                         n_jobs=-1,
                                         delta=0.1))])

scores = cross_val_score(estimator=lbgm, X=df_train.values, y=y_train, cv=5, scoring='roc_auc')
print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))



[LightGBM] [Info] Number of positive: 10906, number of negative: 11208
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.072122 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 924
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 263
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493172 -> initscore=-0.027315
[LightGBM] [Info] Start training from score -0.027315




[LightGBM] [Info] Number of positive: 10905, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.088695 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 927
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 265
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493127 -> initscore=-0.027496
[LightGBM] [Info] Start training from score -0.027496




[LightGBM] [Info] Number of positive: 10905, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.076149 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 927
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 264
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493127 -> initscore=-0.027496
[LightGBM] [Info] Start training from score -0.027496




[LightGBM] [Info] Number of positive: 10906, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.075993 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 928
[LightGBM] [Info] Number of data points in the train set: 22115, number of used features: 265
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493149 -> initscore=-0.027404
[LightGBM] [Info] Start training from score -0.027404




[LightGBM] [Info] Number of positive: 10906, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.090724 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 930
[LightGBM] [Info] Number of data points in the train set: 22115, number of used features: 265
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493149 -> initscore=-0.027404
[LightGBM] [Info] Start training from score -0.027404
0.593 (+/-0.007)


In [34]:
cat = Pipeline([('vect', vect),
                    ('clf', CatBoostClassifier(iterations=30, learning_rate=0.2, depth =depth, random_state=0))])
scores = cross_val_score(estimator=cat, X=df_train.values, y=y_train, cv=5, scoring='roc_auc')
print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))



0:	learn: 0.6863840	total: 127ms	remaining: 3.67s
1:	learn: 0.6817736	total: 248ms	remaining: 3.46s
2:	learn: 0.6794774	total: 374ms	remaining: 3.37s
3:	learn: 0.6748100	total: 496ms	remaining: 3.23s
4:	learn: 0.6705625	total: 633ms	remaining: 3.17s
5:	learn: 0.6662539	total: 770ms	remaining: 3.08s
6:	learn: 0.6632508	total: 904ms	remaining: 2.97s
7:	learn: 0.6617834	total: 1.03s	remaining: 2.85s
8:	learn: 0.6589587	total: 1.16s	remaining: 2.7s
9:	learn: 0.6579718	total: 1.27s	remaining: 2.54s
10:	learn: 0.6558091	total: 1.39s	remaining: 2.4s
11:	learn: 0.6552908	total: 1.52s	remaining: 2.28s
12:	learn: 0.6550178	total: 1.65s	remaining: 2.16s
13:	learn: 0.6546704	total: 1.77s	remaining: 2.03s
14:	learn: 0.6542466	total: 1.91s	remaining: 1.91s
15:	learn: 0.6525021	total: 2.04s	remaining: 1.78s
16:	learn: 0.6519477	total: 2.16s	remaining: 1.66s
17:	learn: 0.6513100	total: 2.27s	remaining: 1.52s
18:	learn: 0.6500686	total: 2.4s	remaining: 1.39s
19:	learn: 0.6483513	total: 2.54s	remaining:



0:	learn: 0.6862219	total: 106ms	remaining: 3.06s
1:	learn: 0.6827903	total: 228ms	remaining: 3.19s
2:	learn: 0.6775254	total: 355ms	remaining: 3.19s
3:	learn: 0.6729646	total: 492ms	remaining: 3.2s
4:	learn: 0.6691650	total: 627ms	remaining: 3.13s
5:	learn: 0.6664329	total: 750ms	remaining: 3s
6:	learn: 0.6652220	total: 872ms	remaining: 2.87s
7:	learn: 0.6642949	total: 994ms	remaining: 2.73s
8:	learn: 0.6630056	total: 1.13s	remaining: 2.63s
9:	learn: 0.6618352	total: 1.26s	remaining: 2.52s
10:	learn: 0.6614031	total: 1.38s	remaining: 2.39s
11:	learn: 0.6596859	total: 1.52s	remaining: 2.28s
12:	learn: 0.6591171	total: 1.64s	remaining: 2.15s
13:	learn: 0.6573307	total: 1.78s	remaining: 2.03s
14:	learn: 0.6565313	total: 1.9s	remaining: 1.9s
15:	learn: 0.6552470	total: 2.01s	remaining: 1.76s
16:	learn: 0.6543702	total: 2.12s	remaining: 1.62s
17:	learn: 0.6540276	total: 2.25s	remaining: 1.5s
18:	learn: 0.6535251	total: 2.37s	remaining: 1.37s
19:	learn: 0.6532988	total: 2.44s	remaining: 1.2



0:	learn: 0.6857452	total: 110ms	remaining: 3.19s
1:	learn: 0.6828230	total: 229ms	remaining: 3.21s
2:	learn: 0.6798122	total: 359ms	remaining: 3.23s
3:	learn: 0.6753386	total: 483ms	remaining: 3.14s
4:	learn: 0.6724501	total: 619ms	remaining: 3.1s
5:	learn: 0.6693675	total: 749ms	remaining: 2.99s
6:	learn: 0.6659492	total: 875ms	remaining: 2.87s
7:	learn: 0.6651743	total: 1.01s	remaining: 2.77s
8:	learn: 0.6650038	total: 1.02s	remaining: 2.39s
9:	learn: 0.6629969	total: 1.14s	remaining: 2.27s
10:	learn: 0.6622851	total: 1.26s	remaining: 2.19s
11:	learn: 0.6617815	total: 1.38s	remaining: 2.08s
12:	learn: 0.6613019	total: 1.51s	remaining: 1.98s
13:	learn: 0.6603488	total: 1.64s	remaining: 1.88s
14:	learn: 0.6598693	total: 1.77s	remaining: 1.77s
15:	learn: 0.6589028	total: 1.9s	remaining: 1.66s
16:	learn: 0.6579722	total: 2.03s	remaining: 1.55s
17:	learn: 0.6571430	total: 2.16s	remaining: 1.44s
18:	learn: 0.6568717	total: 2.28s	remaining: 1.32s
19:	learn: 0.6565355	total: 2.42s	remaining



0:	learn: 0.6879104	total: 102ms	remaining: 2.94s
1:	learn: 0.6810080	total: 210ms	remaining: 2.94s
2:	learn: 0.6738829	total: 314ms	remaining: 2.83s
3:	learn: 0.6680827	total: 420ms	remaining: 2.73s
4:	learn: 0.6648205	total: 527ms	remaining: 2.63s
5:	learn: 0.6599657	total: 637ms	remaining: 2.55s
6:	learn: 0.6580511	total: 737ms	remaining: 2.42s
7:	learn: 0.6570717	total: 861ms	remaining: 2.37s
8:	learn: 0.6538682	total: 964ms	remaining: 2.25s
9:	learn: 0.6533396	total: 1.06s	remaining: 2.13s
10:	learn: 0.6517646	total: 1.16s	remaining: 2s
11:	learn: 0.6513099	total: 1.26s	remaining: 1.89s
12:	learn: 0.6500747	total: 1.35s	remaining: 1.77s
13:	learn: 0.6493194	total: 1.45s	remaining: 1.66s
14:	learn: 0.6478824	total: 1.55s	remaining: 1.55s
15:	learn: 0.6473533	total: 1.66s	remaining: 1.45s
16:	learn: 0.6469946	total: 1.76s	remaining: 1.35s
17:	learn: 0.6464995	total: 1.86s	remaining: 1.24s
18:	learn: 0.6459930	total: 1.97s	remaining: 1.14s
19:	learn: 0.6449004	total: 2.07s	remaining:



0:	learn: 0.6867157	total: 102ms	remaining: 2.96s
1:	learn: 0.6818964	total: 225ms	remaining: 3.15s
2:	learn: 0.6790286	total: 334ms	remaining: 3.01s
3:	learn: 0.6729256	total: 435ms	remaining: 2.83s
4:	learn: 0.6714334	total: 543ms	remaining: 2.71s
5:	learn: 0.6668700	total: 646ms	remaining: 2.58s
6:	learn: 0.6628875	total: 750ms	remaining: 2.46s
7:	learn: 0.6597836	total: 868ms	remaining: 2.39s
8:	learn: 0.6571923	total: 974ms	remaining: 2.27s
9:	learn: 0.6565510	total: 1.08s	remaining: 2.16s
10:	learn: 0.6561344	total: 1.19s	remaining: 2.06s
11:	learn: 0.6553570	total: 1.29s	remaining: 1.94s
12:	learn: 0.6540694	total: 1.41s	remaining: 1.85s
13:	learn: 0.6536800	total: 1.51s	remaining: 1.73s
14:	learn: 0.6528510	total: 1.61s	remaining: 1.61s
15:	learn: 0.6520746	total: 1.71s	remaining: 1.5s
16:	learn: 0.6516581	total: 1.81s	remaining: 1.39s
17:	learn: 0.6513529	total: 1.91s	remaining: 1.28s
18:	learn: 0.6504209	total: 2.02s	remaining: 1.17s
19:	learn: 0.6500772	total: 2.12s	remainin

In [35]:
xgboost = Pipeline([('vect', vect),
                    ('clf', XGBClassifier(n_estimators=n, max_depth=10, learning_rate=0.1, random_state=0))])
scores = cross_val_score(estimator=xgboost, X=df_train.values, y=y_train, cv=5, scoring='roc_auc')
print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))



0.590 (+/-0.006)


In [36]:
# voting classifier
from sklearn.ensemble import VotingClassifier
vote = VotingClassifier(estimators=[('lbgm', lbgm), ('cat', cat), ('xgboost', xgboost)], voting='soft', weights=[0.5, 0.35,0.35])
scores = cross_val_score(estimator=vote, X=df_train.values, y=y_train, cv=5, scoring='roc_auc')
print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))



[LightGBM] [Info] Number of positive: 10906, number of negative: 11208
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.782307 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 720
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 196
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493172 -> initscore=-0.027315
[LightGBM] [Info] Start training from score -0.027315




0:	learn: 0.6863840	total: 111ms	remaining: 3.21s
1:	learn: 0.6817736	total: 218ms	remaining: 3.05s
2:	learn: 0.6794774	total: 330ms	remaining: 2.97s
3:	learn: 0.6748100	total: 436ms	remaining: 2.84s
4:	learn: 0.6705625	total: 546ms	remaining: 2.73s
5:	learn: 0.6662539	total: 658ms	remaining: 2.63s
6:	learn: 0.6632508	total: 765ms	remaining: 2.51s
7:	learn: 0.6617834	total: 857ms	remaining: 2.35s
8:	learn: 0.6589587	total: 962ms	remaining: 2.24s
9:	learn: 0.6579718	total: 1.06s	remaining: 2.13s
10:	learn: 0.6558091	total: 1.17s	remaining: 2.02s
11:	learn: 0.6552908	total: 1.27s	remaining: 1.91s
12:	learn: 0.6550178	total: 1.39s	remaining: 1.81s
13:	learn: 0.6546704	total: 1.5s	remaining: 1.71s
14:	learn: 0.6542466	total: 1.62s	remaining: 1.62s
15:	learn: 0.6525021	total: 1.73s	remaining: 1.51s
16:	learn: 0.6519477	total: 1.83s	remaining: 1.4s
17:	learn: 0.6513100	total: 1.95s	remaining: 1.3s
18:	learn: 0.6500686	total: 2.05s	remaining: 1.19s
19:	learn: 0.6483513	total: 2.15s	remaining:







[LightGBM] [Info] Number of positive: 10905, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.802488 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 729
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493127 -> initscore=-0.027496
[LightGBM] [Info] Start training from score -0.027496




0:	learn: 0.6862219	total: 116ms	remaining: 3.35s
1:	learn: 0.6827903	total: 245ms	remaining: 3.43s
2:	learn: 0.6775254	total: 373ms	remaining: 3.35s
3:	learn: 0.6729646	total: 492ms	remaining: 3.2s
4:	learn: 0.6691650	total: 610ms	remaining: 3.05s
5:	learn: 0.6664329	total: 729ms	remaining: 2.92s
6:	learn: 0.6652220	total: 827ms	remaining: 2.72s
7:	learn: 0.6642949	total: 922ms	remaining: 2.54s
8:	learn: 0.6630056	total: 1.02s	remaining: 2.39s
9:	learn: 0.6618352	total: 1.13s	remaining: 2.25s
10:	learn: 0.6614031	total: 1.23s	remaining: 2.13s
11:	learn: 0.6596859	total: 1.34s	remaining: 2.01s
12:	learn: 0.6591171	total: 1.45s	remaining: 1.9s
13:	learn: 0.6573307	total: 1.56s	remaining: 1.78s
14:	learn: 0.6565313	total: 1.66s	remaining: 1.66s
15:	learn: 0.6552470	total: 1.77s	remaining: 1.54s
16:	learn: 0.6543702	total: 1.87s	remaining: 1.43s
17:	learn: 0.6540276	total: 1.96s	remaining: 1.31s
18:	learn: 0.6535251	total: 2.06s	remaining: 1.19s
19:	learn: 0.6532988	total: 2.11s	remaining



[LightGBM] [Info] Number of positive: 10905, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.781301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 720
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 196
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493127 -> initscore=-0.027496
[LightGBM] [Info] Start training from score -0.027496




0:	learn: 0.6857452	total: 125ms	remaining: 3.62s
1:	learn: 0.6828230	total: 270ms	remaining: 3.79s
2:	learn: 0.6798122	total: 424ms	remaining: 3.81s
3:	learn: 0.6753386	total: 566ms	remaining: 3.68s
4:	learn: 0.6724501	total: 709ms	remaining: 3.54s
5:	learn: 0.6693675	total: 855ms	remaining: 3.42s
6:	learn: 0.6659492	total: 1s	remaining: 3.29s
7:	learn: 0.6651743	total: 1.15s	remaining: 3.16s
8:	learn: 0.6650038	total: 1.17s	remaining: 2.73s
9:	learn: 0.6629969	total: 1.32s	remaining: 2.64s
10:	learn: 0.6622851	total: 1.45s	remaining: 2.51s
11:	learn: 0.6617815	total: 1.6s	remaining: 2.4s
12:	learn: 0.6613019	total: 1.75s	remaining: 2.29s
13:	learn: 0.6603488	total: 1.9s	remaining: 2.17s
14:	learn: 0.6598693	total: 2.04s	remaining: 2.04s
15:	learn: 0.6589028	total: 2.16s	remaining: 1.89s
16:	learn: 0.6579722	total: 2.28s	remaining: 1.74s
17:	learn: 0.6571430	total: 2.4s	remaining: 1.6s
18:	learn: 0.6568717	total: 2.52s	remaining: 1.46s
19:	learn: 0.6565355	total: 2.63s	remaining: 1.31







[LightGBM] [Info] Number of positive: 10906, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.796535 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 726
[LightGBM] [Info] Number of data points in the train set: 22115, number of used features: 199
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493149 -> initscore=-0.027404
[LightGBM] [Info] Start training from score -0.027404




0:	learn: 0.6879104	total: 115ms	remaining: 3.33s
1:	learn: 0.6810080	total: 219ms	remaining: 3.07s
2:	learn: 0.6738829	total: 344ms	remaining: 3.1s
3:	learn: 0.6680827	total: 455ms	remaining: 2.96s
4:	learn: 0.6648205	total: 564ms	remaining: 2.82s
5:	learn: 0.6599657	total: 660ms	remaining: 2.64s
6:	learn: 0.6580511	total: 754ms	remaining: 2.48s
7:	learn: 0.6570717	total: 850ms	remaining: 2.34s
8:	learn: 0.6538682	total: 948ms	remaining: 2.21s
9:	learn: 0.6533396	total: 1.06s	remaining: 2.12s
10:	learn: 0.6517646	total: 1.16s	remaining: 2s
11:	learn: 0.6513099	total: 1.26s	remaining: 1.9s
12:	learn: 0.6500747	total: 1.37s	remaining: 1.8s
13:	learn: 0.6493194	total: 1.49s	remaining: 1.7s
14:	learn: 0.6478824	total: 1.6s	remaining: 1.6s
15:	learn: 0.6473533	total: 1.72s	remaining: 1.5s
16:	learn: 0.6469946	total: 1.83s	remaining: 1.4s
17:	learn: 0.6464995	total: 1.93s	remaining: 1.29s
18:	learn: 0.6459930	total: 2.04s	remaining: 1.18s
19:	learn: 0.6449004	total: 2.14s	remaining: 1.07s
2







[LightGBM] [Info] Number of positive: 10906, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.799299 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 724
[LightGBM] [Info] Number of data points in the train set: 22115, number of used features: 198
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493149 -> initscore=-0.027404
[LightGBM] [Info] Start training from score -0.027404




0:	learn: 0.6867157	total: 120ms	remaining: 3.48s
1:	learn: 0.6818964	total: 245ms	remaining: 3.43s
2:	learn: 0.6790286	total: 357ms	remaining: 3.21s
3:	learn: 0.6729256	total: 483ms	remaining: 3.14s
4:	learn: 0.6714334	total: 595ms	remaining: 2.98s
5:	learn: 0.6668700	total: 704ms	remaining: 2.81s
6:	learn: 0.6628875	total: 828ms	remaining: 2.72s
7:	learn: 0.6597836	total: 952ms	remaining: 2.62s
8:	learn: 0.6571923	total: 1.06s	remaining: 2.48s
9:	learn: 0.6565510	total: 1.16s	remaining: 2.32s
10:	learn: 0.6561344	total: 1.27s	remaining: 2.19s
11:	learn: 0.6553570	total: 1.39s	remaining: 2.08s
12:	learn: 0.6540694	total: 1.52s	remaining: 1.99s
13:	learn: 0.6536800	total: 1.65s	remaining: 1.88s
14:	learn: 0.6528510	total: 1.77s	remaining: 1.77s
15:	learn: 0.6520746	total: 1.9s	remaining: 1.66s
16:	learn: 0.6516581	total: 2.02s	remaining: 1.54s
17:	learn: 0.6513529	total: 2.13s	remaining: 1.42s
18:	learn: 0.6504209	total: 2.26s	remaining: 1.31s
19:	learn: 0.6500772	total: 2.38s	remainin



0.599 (+/-0.008)


In [37]:
vote.fit(X=df_train.values, y=y_train)



[LightGBM] [Info] Number of positive: 13632, number of negative: 14011
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.952066 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 798
[LightGBM] [Info] Number of data points in the train set: 27643, number of used features: 234
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493145 -> initscore=-0.027423
[LightGBM] [Info] Start training from score -0.027423




0:	learn: 0.6857099	total: 136ms	remaining: 3.95s
1:	learn: 0.6812321	total: 278ms	remaining: 3.9s
2:	learn: 0.6763264	total: 390ms	remaining: 3.51s
3:	learn: 0.6745379	total: 502ms	remaining: 3.27s
4:	learn: 0.6723675	total: 619ms	remaining: 3.1s
5:	learn: 0.6710049	total: 746ms	remaining: 2.98s
6:	learn: 0.6699518	total: 870ms	remaining: 2.86s
7:	learn: 0.6684991	total: 1.01s	remaining: 2.77s
8:	learn: 0.6678560	total: 1.13s	remaining: 2.63s
9:	learn: 0.6673394	total: 1.25s	remaining: 2.5s
10:	learn: 0.6641136	total: 1.36s	remaining: 2.35s
11:	learn: 0.6639111	total: 1.39s	remaining: 2.09s
12:	learn: 0.6617132	total: 1.51s	remaining: 1.97s
13:	learn: 0.6611223	total: 1.61s	remaining: 1.84s
14:	learn: 0.6601364	total: 1.72s	remaining: 1.72s
15:	learn: 0.6599651	total: 1.82s	remaining: 1.59s
16:	learn: 0.6595684	total: 1.92s	remaining: 1.47s
17:	learn: 0.6582361	total: 2.02s	remaining: 1.35s
18:	learn: 0.6557444	total: 2.12s	remaining: 1.23s
19:	learn: 0.6554132	total: 2.23s	remaining:



In [38]:
y_pred = vote.predict_proba(df_test.values)[:, 1]
result = pd.DataFrame(columns=['Id', 'Popularity'])
result['Id'] = np.arange(27643, 27643+len(y_pred))
result['Popularity'] = y_pred
result.to_csv('prediction.csv', index=False)



Predict
popo_version : 
	feature :
		'author', 'topics','channel', 'len_content', 'hour', 'day', 'date', 'month', 'year', 'title_bit_count'
	separate model_acc : 
		lgbm 	: 0.593 (+/-0.007)
		cat  	: 0.596 (+/-0.006)
		xgboost : 0.592 (+/-0.006)
	Final : 	  0.601 (+/-0.007)
	upload : 	  0.59224


popo_version : 
	feature : 
		'author', 'topics','channel', 'len_content', 'hour', 'day', 'date', 'month', 'year','title_bit_count', 'img_count'
	separate model_acc :
		lgbm 	: 0.592 (+/-0.006)
		cat		: 0.595 (+/-0.010)
		xgboost	: 0.592 (+/-0.005)
	Final : 	  0.601 (+/-0.007)
	upload : 	  0.59127

popo_version :
	feature :
		
	separate model_acc :
		lgbm	: 0.593 (+/-0.008)
		cat 	: 0.594 (+/-0.009)
		xgboost : 0.590 (+/-0.006)
	Final :		  0.599 (+/-0.008)
