In [14]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [15]:
# Locations of the raw competition tables, relative to the notebook.
TRAIN_CSV = './datasets/train.csv'
TEST_CSV = './datasets/test.csv'

# Load both tables; later cells read their 'Page content' column and, for the
# training table, the 'Popularity' column.
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

In [16]:
# Raw page HTML for every training article.
X_train = df_train['Page content'].values

# Labels arrive as -1 / 1; recode -1 as 0.
# np.where builds a fresh array instead of assigning into the array returned by
# `.values`: that array can share memory with the DataFrame, and pandas'
# Copy-on-Write mode hands it out read-only, which makes an in-place write fail.
popularity_raw = df_train['Popularity'].values
y_train = np.where(popularity_raw == -1, 0, popularity_raw)

# Raw page HTML for every test article.
X_test = df_test['Page content'].values

In [17]:
def topic_preprocess(text):
    """Lower-case a topics string and prepare it for comma tokenisation.

    Every occurrence of the literal ``'topics: '`` is removed from the
    lower-cased text, and each comma is rewritten as ``' ,'``.

    Parameters
    ----------
    text : str
        Raw text of the topics footer.

    Returns
    -------
    str
        The normalised topics string.
    """
    lowered = text.lower()
    unlabeled = re.sub('topics: ', '', lowered)
    return unlabeled.replace(',', ' ,')

In [18]:
def author_preprocess(text):
    """Turn a raw byline into a comma-separated list of author names.

    The standalone word ``By`` / ``by`` is removed, then ``' and '`` and ``'&'``
    are rewritten as commas so that every author ends up separated by a comma.

    Only the whole word is removed (regex word boundaries). Deleting the bare
    letters would also eat them out of names such as "Abby", "Toby" or "Byron".

    Parameters
    ----------
    text : str or None
        Raw byline text. ``None`` is treated as an empty byline.

    Returns
    -------
    str
        The normalised byline; surrounding whitespace is left untouched.
    """
    if text is None:
        return ''
    text = re.sub(r'\b[Bb]y\b', '', text)
    # Order matters: existing commas are spaced out first, then the other
    # separators are converted into commas.
    text = re.sub(',', ' ,', text)
    text = re.sub(' and ', ' , ', text)
    text = re.sub('&', ',', text)
    return text

In [25]:
# Timestamp substituted when an article carries no usable <time datetime="...">.
# NOTE(review): 10 Oct 2014 was a Friday, not a Wednesday. The literal is kept
# unchanged so extracted features stay the same - confirm which part was intended.
_FALLBACK_DATE_TIME = 'Wed, 10 Oct 2014 15:00:43'

# Matches timestamps shaped like "Wed, 10 Oct 2014 15:00:43" and captures
# (day name, day of month, month name, year, hour, minute, second).
_DATE_TIME_PATTERN = re.compile(
    r'([\w]+),\s+([\d]+)\s+([\w]+)\s+([\d]+)\s+([\d]+):([\d]+):([\d]+)')


def get_feature(html):
    """Extract the model features from one article's HTML.

    Parameters
    ----------
    html : str
        Full HTML of a single article page.

    Returns
    -------
    tuple
        ``(author, topics, channel, len_content, hour, day, date, month, year,
        title_bit_count)`` where

        * ``author`` - byline after ``author_preprocess`` (empty when no byline
          text could be found),
        * ``topics`` - topics footer text after ``topic_preprocess``,
        * ``channel`` - the ``data-channel`` attribute of the ``<article>`` tag,
        * ``len_content`` - character length of the article-content section,
        * ``hour``, ``date``, ``year`` - digit strings captured from the timestamp,
        * ``day``, ``month`` - lower-cased day / month names from the timestamp,
        * ``title_bit_count`` - ``len(title) - number_of_words + 1``.

    Raises
    ------
    AttributeError
        If the ``article-info`` div, the ``article-topics`` footer or the
        ``article-content`` section is missing.
    TypeError, KeyError
        If there is no ``<article>`` tag or it has no ``data-channel`` attribute.
    """
    soup = BeautifulSoup(html, 'html.parser')
    article_info = soup.head.find('div', {'class': 'article-info'})

    # Byline lookup, most specific first: the dedicated author span, then the
    # first <span>, then the first <a> inside the article-info div.
    author_tag = article_info.find('span', {'class': 'author_name'})
    if author_tag is not None:
        author = author_tag.get_text()
    elif article_info.span is not None:
        author = article_info.span.string
    elif article_info.a is not None:
        author = article_info.a.string
    else:
        author = None
    # Tag.string is None when the tag does not wrap exactly one string; treat
    # that (and a missing tag) as an empty byline instead of crashing.
    author = author_preprocess(author if author is not None else '')

    topics = soup.find('footer', {'class': 'article-topics'}).text
    topics = topic_preprocess(topics)

    channel = soup.find('article')['data-channel']

    # A missing <time> tag gives None['datetime'] -> TypeError; a <time> tag
    # without the attribute gives KeyError. Only those two cases fall back.
    try:
        date_time = article_info.time['datetime']
    except (TypeError, KeyError):
        date_time = _FALLBACK_DATE_TIME

    match_obj = _DATE_TIME_PATTERN.search(date_time)
    if match_obj is None:
        # Timestamp present but in an unexpected format: use the fallback too.
        match_obj = _DATE_TIME_PATTERN.search(_FALLBACK_DATE_TIME)
    day, date, month, year, hour, minute, second = match_obj.groups()
    day, month = day.lower(), month.lower()

    content = soup.find('section', {'class': 'article-content'}).text
    len_content = len(content)

    h1_tag = soup.find('h1', {'class': 'title'})
    title = h1_tag.text if h1_tag is not None else ""
    title_word_count = len(title.split())
    title_bit_count = len(title) - title_word_count + 1

    return author, topics, channel, len_content, hour, day, date, month, year, title_bit_count

In [26]:
# Integer codes for the lower-cased day / month names returned by get_feature.
day_map = {'mon': 1, 'tue': 2, 'wed': 3,
           'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7}

month_map = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
             'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}

# Column order matches the tuple returned by get_feature.
FEATURE_COLUMNS = ['author', 'topics', 'channel', 'len_content', 'hour',
                   'day', 'date', 'month', 'year', 'title_bit_count']


def _build_feature_frame(pages):
    """Run get_feature over every page and return the encoded feature table.

    Parameters
    ----------
    pages : iterable of str
        Raw HTML documents, one per article.

    Returns
    -------
    pandas.DataFrame
        One row per page with the columns in ``FEATURE_COLUMNS``; ``day`` and
        ``month`` are mapped to integer codes (names absent from the maps
        become NaN) and ``title_bit_count`` is cast to int64.
    """
    # Collect every row first and build the frame once; enlarging a frame with
    # `.loc[idx] = row` inside a loop re-allocates it on each insertion.
    records = [get_feature(page) for page in pages]
    frame = pd.DataFrame(records, columns=FEATURE_COLUMNS)
    frame['day'] = frame['day'].map(day_map)
    frame['month'] = frame['month'].map(month_map)
    frame['title_bit_count'] = frame['title_bit_count'].astype(np.int64)
    return frame


# NOTE: these assignments replace the raw CSV frames that previously held the
# names df_train / df_test.
df_train = _build_feature_frame(X_train)
df_test = _build_feature_frame(X_test)

In [29]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Make sure the NLTK stop-word corpus is available locally, then load the
# English stop-word list.
# NOTE(review): `stop` is not referenced anywhere else in the visible cells -
# confirm whether it is still needed.
nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_author(text):
    """Split a comma-separated string into tokens with all spaces removed.

    Parameters
    ----------
    text : str or numpy.ndarray
        The string to tokenise. When an ndarray is given, its first element is
        used as the string.

    Returns
    -------
    list of str
        One token per comma-separated piece, in order. Empty pieces are kept
        as empty strings.
    """
    if type(text) is np.ndarray:
        text = text[0]
    return [re.sub(' ', '', piece) for piece in re.split(',', text)]

# day/topic/author/ 0.574 +- 0.007
# day/topic/author/month 0.589 +- 0.007
# day/topic/author/month/hour 0.59 +- 0.007
# day/topic/author/month/hour/len_content 0.59 +- 0.008

[nltk_data] Downloading package stopwords to /home/popo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Names of the text features, in the order of their column positions (0, 1, 2)
# in the matrix handed to the transformer.
text_columns = ('author', 'topics', 'channel')

# Each text column gets its own token-count encoder built on tokenizer_author
# (case is preserved); every remaining column is passed through unchanged.
vect = ColumnTransformer(
    transformers=[
        (name, CountVectorizer(tokenizer=tokenizer_author, lowercase=False), [position])
        for position, name in enumerate(text_columns)
    ],
    remainder='passthrough',
    n_jobs=-1,
)

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Shared hyper-parameters read by the boosted-tree pipelines in later cells.
n = 100  # number of trees, passed as n_estimators
depth = 10  # maximum tree depth

In [32]:
# LightGBM settings. num_leaves is 2**(depth-1), i.e. half of the 2**depth
# leaves a full binary tree of that depth could hold; min_data_in_leaf is
# 2**(depth-4).
# NOTE(review): `delta` does not look like a documented LightGBM parameter -
# confirm that it has any effect.
lgbm_params = dict(
    n_estimators=n,
    max_depth=depth,
    learning_rate=0.1,
    random_state=0,
    num_leaves=2 ** (depth - 1),
    min_data_in_leaf=2 ** (depth - 4),
    n_jobs=-1,
    delta=0.1,
)
lbgm = Pipeline(steps=[('vect', vect), ('clf', LGBMClassifier(**lgbm_params))])

# 5-fold cross-validated ROC AUC, reported as mean (+/- standard deviation).
scores = cross_val_score(lbgm, df_train.values, y_train, cv=5, scoring='roc_auc')
print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))



[LightGBM] [Info] Number of positive: 10906, number of negative: 11208
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011830 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 925
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 263
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493172 -> initscore=-0.027315
[LightGBM] [Info] Start training from score -0.027315




[LightGBM] [Info] Number of positive: 10905, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003446 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 927
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 265
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493127 -> initscore=-0.027496
[LightGBM] [Info] Start training from score -0.027496




[LightGBM] [Info] Number of positive: 10905, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009195 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 926
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 264
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493127 -> initscore=-0.027496
[LightGBM] [Info] Start training from score -0.027496




[LightGBM] [Info] Number of positive: 10906, number of negative: 11209
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011635 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 928
[LightGBM] [Info] Number of data points in the train set: 22115, number of used features: 265
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493149 -> initscore=-0.027404
[LightGBM] [Info] Start training from score -0.027404




[LightGBM] [Info] Number of positive: 10906, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003522 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 926
[LightGBM] [Info] Number of data points in the train set: 22115, number of used features: 265
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493149 -> initscore=-0.027404
[LightGBM] [Info] Start training from score -0.027404
0.593 (+/-0.007)


In [33]:
# CatBoost pipeline: 30 iterations at learning rate 0.2, using the shared
# `depth`, behind the same column encoder as the other models.
catboost_clf = CatBoostClassifier(iterations=30, learning_rate=0.2, depth=depth, random_state=0)
cat = Pipeline(steps=[('vect', vect), ('clf', catboost_clf)])

# 5-fold cross-validated ROC AUC, reported as mean (+/- standard deviation).
scores = cross_val_score(cat, df_train.values, y_train, cv=5, scoring='roc_auc')
print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))



0:	learn: 0.6879861	total: 128ms	remaining: 3.71s
1:	learn: 0.6813579	total: 208ms	remaining: 2.91s
2:	learn: 0.6791610	total: 277ms	remaining: 2.49s
3:	learn: 0.6773376	total: 344ms	remaining: 2.23s
4:	learn: 0.6745119	total: 414ms	remaining: 2.07s
5:	learn: 0.6708197	total: 480ms	remaining: 1.92s
6:	learn: 0.6675173	total: 547ms	remaining: 1.8s
7:	learn: 0.6665346	total: 612ms	remaining: 1.68s
8:	learn: 0.6640268	total: 679ms	remaining: 1.58s
9:	learn: 0.6630005	total: 740ms	remaining: 1.48s
10:	learn: 0.6617715	total: 808ms	remaining: 1.4s
11:	learn: 0.6598526	total: 869ms	remaining: 1.3s
12:	learn: 0.6593806	total: 930ms	remaining: 1.22s
13:	learn: 0.6590016	total: 992ms	remaining: 1.13s
14:	learn: 0.6581765	total: 1.06s	remaining: 1.06s
15:	learn: 0.6569954	total: 1.13s	remaining: 985ms
16:	learn: 0.6563587	total: 1.19s	remaining: 908ms
17:	learn: 0.6556750	total: 1.25s	remaining: 833ms
18:	learn: 0.6550407	total: 1.28s	remaining: 744ms
19:	learn: 0.6546232	total: 1.34s	remaining:

In [34]:
# XGBoost pipeline behind the shared column encoder.
# max_depth now reads the shared `depth` setting instead of a hard-coded 10, so
# changing `depth` in the configuration cell affects this model as well.
xgboost = Pipeline([('vect', vect),
                    ('clf', XGBClassifier(n_estimators=n, max_depth=depth, learning_rate=0.1, random_state=0))])

# 5-fold cross-validated ROC AUC, reported as mean (+/- standard deviation).
scores = cross_val_score(estimator=xgboost, X=df_train.values, y=y_train, cv=5, scoring='roc_auc')
print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))

0.592 (+/-0.006)


In [35]:
# Soft-voting ensemble: averages the class probabilities predicted by the three
# boosted-tree pipelines using the weights below.
from sklearn.ensemble import VotingClassifier

ensemble_members = [('lbgm', lbgm), ('cat', cat), ('xgboost', xgboost)]
ensemble_weights = [0.5, 0.35, 0.35]
vote = VotingClassifier(estimators=ensemble_members, voting='soft', weights=ensemble_weights)

# 5-fold cross-validated ROC AUC, reported as mean (+/- standard deviation).
scores = cross_val_score(vote, df_train.values, y_train, cv=5, scoring='roc_auc')
print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))

[LightGBM] [Info] Number of positive: 10906, number of negative: 11208
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007328 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 925
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 263
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493172 -> initscore=-0.027315
[LightGBM] [Info] Start training from score -0.027315
0:	learn: 0.6879861	total: 61.7ms	remaining: 1.79s
1:	learn: 0.6813579	total: 125ms	remaining: 1.75s
2:	learn: 0.6791610	total: 188ms	remaining: 1.7s
3:	learn: 0.6773376	total: 258ms	remaining: 1.68s
4:	learn: 0.6745119	total: 323ms	remaining: 1.61s
5:	learn: 0.6708197	total: 385ms	remaining: 1.54s
6:	learn: 0.6675173	total: 449ms	remaining: 1.48s
7:	learn: 0.6665346	total: 515ms	remaining: 1.42s
8:	learn: 0.6640268	total: 580ms	remaining

In [36]:
# Refit the ensemble on the complete training feature table.
vote.fit(df_train.values, y_train)

[LightGBM] [Info] Number of positive: 13632, number of negative: 14011
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004749 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1033
[LightGBM] [Info] Number of data points in the train set: 27643, number of used features: 316
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493145 -> initscore=-0.027423
[LightGBM] [Info] Start training from score -0.027423
0:	learn: 0.6842682	total: 83.2ms	remaining: 2.41s
1:	learn: 0.6805248	total: 158ms	remaining: 2.21s
2:	learn: 0.6763087	total: 228ms	remaining: 2.05s
3:	learn: 0.6724134	total: 298ms	remaining: 1.94s
4:	learn: 0.6706803	total: 371ms	remaining: 1.85s
5:	learn: 0.6695040	total: 444ms	remaining: 1.77s
6:	learn: 0.6653593	total: 511ms	remaining: 1.68s
7:	learn: 0.6638339	total: 578ms	remaining: 1.59s
8:	learn: 0.6632792	total: 644ms	remaini

In [37]:
# Id assigned to the first test row; later rows count up from here.
FIRST_TEST_ID = 27643

# Probability of the positive class (column 1) for every test article.
y_pred = vote.predict_proba(df_test.values)[:, 1]

# Two-column submission table, written without the index.
result = pd.DataFrame({
    'Id': np.arange(FIRST_TEST_ID, FIRST_TEST_ID + len(y_pred)),
    'Popularity': y_pred,
})
result.to_csv('prediction.csv', index=False)



Predict