詹博允	112062524	
吳柏諭	112062585	
李呂元	112062677	
張博智	111062704

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [None]:
### **feature engineering**

---
取出新聞的
1.   topics(文章主題標籤，取自 article-topics 區塊)
2.   len_content(文章內文的字元數)
3.   hour(發布時間中的小時)
4.   day(發布時間中的星期幾)
5.   date(發布時間中的日期)
6.   month(發布的月份)
7.   year(發布年分)
8.   title_bit_count(文章標題的字元數 − 單字數 + 1，約等於標題中非空白字元的數量)
9.   author(文章之作者名稱)
### **使用features**

---


最後採用下列項目作為model's features
1.   author(CountVectorizer)
2.   topics(CountVectorizer)
3.   len_content
4.   hour
5.   day
6.   date
7.   month
8.   year
9.   title_bit_count
10.   channel(CountVectorizer)

In [None]:
# Load the raw training and test tables (in that order) from the datasets
# directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../datasets/train.csv', '../datasets/test.csv')
)

In [None]:
# Raw 'Page content' value of every training row.
X_train = df_train['Page content'].to_numpy()

# Take an explicit copy of the labels before rewriting them: the array handed
# back by ``.values`` may share memory with the DataFrame (and is read-only
# when pandas Copy-on-Write is active), so writing into it would either
# silently modify ``df_train`` or raise.
y_train = df_train['Popularity'].to_numpy(copy=True)
# Remap label -1 to 0.
# NOTE(review): presumably this yields 0/1 targets - verify against the data.
y_train[y_train == -1] = 0

# Raw 'Page content' value of every test row.
X_test = df_test['Page content'].to_numpy()

In [None]:
def topic_preprocess(text):
    """Normalise the raw topics string of an article.

    The text is lower-cased, every ``'topics: '`` label is removed and a
    space is inserted in front of each comma.

    Args:
        text: Raw topics text.

    Returns:
        The cleaned topics string.
    """
    lowered = text.lower()
    without_label = lowered.replace('topics: ', '')
    return without_label.replace(',', ' ,')

In [None]:
def author_preprocess(text):
    """Turn a raw by-line into a comma-separated list of author names.

    Only a leading "by" word (any letter case) is stripped. Removing every
    "By"/"by" substring, as was done before, mangles names such as "Abby"
    or "Byron". After that, " and " and "&" are rewritten as commas so that
    every author ends up comma-separated, and a space is inserted in front
    of each comma that was already present.

    Args:
        text: Raw by-line text, or ``None`` when no by-line was found.

    Returns:
        The normalised by-line; an empty string when ``text`` is ``None``.
    """
    if text is None:
        return ''
    # `\b` keeps names that merely start with "By" (e.g. "Byron") intact.
    text = re.sub(r'^\s*by\b\s*', '', text, flags=re.IGNORECASE)
    text = text.replace(',', ' ,')
    text = text.replace(' and ', ' , ')
    text = text.replace('&', ',')
    return text

In [None]:
# Timestamps are expected to look like "Wed, 10 Oct 2014 15:00:43".
_DATETIME_PATTERN = re.compile(
    r'([\w]+),\s+([\d]+)\s+([\w]+)\s+([\d]+)\s+([\d]+):([\d]+):([\d]+)'
)
# Placeholder used when an article carries no usable timestamp.
# NOTE(review): 10 Oct 2014 was a Friday, not a Wednesday. The value is kept
# as-is because it feeds the `day` feature - confirm before changing it.
_DEFAULT_DATETIME = 'Wed, 10 Oct 2014 15:00:43'


def get_feature(html):
    """Extract the model features from one article's HTML.

    Args:
        html: HTML source of a single article.

    Returns:
        An 11-tuple ``(author, topics, channel, len_content, hour, day,
        date, month, year, title_bit_count, img_count)`` where

        * ``author`` / ``topics`` are the outputs of ``author_preprocess`` /
          ``topic_preprocess``;
        * ``channel`` is the ``data-channel`` attribute of the ``<article>``;
        * ``len_content`` is the character count of the article-content text;
        * ``hour``, ``date`` and ``year`` are digit strings and ``day`` /
          ``month`` are the lower-cased tokens captured from the timestamp;
        * ``title_bit_count`` is ``len(title) - number_of_words + 1``;
        * ``img_count`` is the number of ``<img>`` tags in the document.

    Raises:
        AttributeError, TypeError: if the article-info block, topics footer,
            ``<article>`` tag or content section is missing from the page.
    """
    soup = BeautifulSoup(html, 'html.parser')
    article_info = soup.head.find('div', {'class': 'article-info'})

    # Author: prefer the dedicated author_name span, then the first <span>,
    # then the first <a> of the article-info block.
    author_tag = article_info.find('span', {'class': 'author_name'})
    if author_tag is not None:
        author = author_tag.get_text()
    else:
        fallback_tag = article_info.span
        if fallback_tag is None:
            fallback_tag = article_info.a
        author = fallback_tag.string
        if author is None:
            # `.string` is None when the tag wraps several child nodes; use
            # the concatenated text instead of passing None downstream.
            author = fallback_tag.get_text()
    author = author_preprocess(author)

    topics = soup.find('footer', {'class': 'article-topics'}).text
    topics = topic_preprocess(topics)

    channel = soup.find('article')['data-channel']

    try:
        date_time = article_info.time['datetime']
    except (TypeError, KeyError):
        # TypeError: no <time> tag; KeyError: tag without a datetime attribute.
        date_time = _DEFAULT_DATETIME

    match_obj = _DATETIME_PATTERN.search(date_time)
    if match_obj is None:
        # Unparseable timestamp: fall back to the same placeholder that is
        # used for a missing one instead of failing on `.groups()`.
        match_obj = _DATETIME_PATTERN.search(_DEFAULT_DATETIME)
    day, date, month, year, hour, _minute, _second = match_obj.groups()
    day, month = day.lower(), month.lower()

    content = soup.find('section', {'class': 'article-content'}).text
    len_content = len(content)

    h1_tag = soup.find('h1', {'class': 'title'})
    title = h1_tag.text if h1_tag is not None else ""
    # Character count minus word count plus one (evaluates to 1 for an empty
    # title).
    title_bit_count = len(title) - len(title.split()) + 1

    img_count = len(soup.find_all('img'))

    return (author, topics, channel, len_content, hour, day, date, month,
            year, title_bit_count, img_count)

In [None]:
# Lower-cased weekday abbreviation -> ordinal (Monday = 1).
day_map = {'mon': 1, 'tue': 2, 'wed': 3,
           'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7}

# Lower-cased month abbreviation -> month number.
month_map = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
             'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}

# Column names, in the order of the tuple returned by get_feature.
_FEATURE_COLUMNS = ['author', 'topics', 'channel', 'len_content', 'hour',
                    'day', 'date', 'month', 'year', 'title_bit_count',
                    'img_count']


def _build_feature_frame(pages):
    """Run get_feature on every page and collect the rows in one DataFrame.

    Args:
        pages: Iterable of HTML strings, one per article.

    Returns:
        A DataFrame with the ``_FEATURE_COLUMNS`` columns in which ``day``
        and ``month`` have been mapped through ``day_map`` / ``month_map``
        and ``title_bit_count`` is cast to int64.
    """
    # Build the frame in one shot: growing it with ``df.loc[idx] = row``
    # reallocates the whole frame on every insert.
    frame = pd.DataFrame([get_feature(page) for page in pages],
                         columns=_FEATURE_COLUMNS)
    frame['day'] = frame['day'].map(day_map)
    frame['month'] = frame['month'].map(month_map)
    frame['title_bit_count'] = frame['title_bit_count'].astype(np.int64)
    return frame


# From here on df_train / df_test hold the extracted features, replacing the
# raw tables that were read from CSV.
df_train = _build_feature_frame(X_train)
df_test = _build_feature_frame(X_test)


In [None]:
# Preview the first rows of the extracted training features.
print(df_train.head())

In [None]:
# Remove the title_bit_count column from both feature tables, in place.
df_train.drop(columns=['title_bit_count'], inplace=True)
df_test.drop(columns=['title_bit_count'], inplace=True)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Make sure the NLTK 'stopwords' corpus is available.
nltk.download('stopwords')
# English stop-word list.
# NOTE(review): `stop` is not referenced anywhere else in the visible part of
# this file - confirm whether it is still needed.
stop = stopwords.words('english')

def tokenizer_author(text):
    """Split a comma-separated field into tokens with all spaces removed.

    Args:
        text: The field as a string, or a NumPy array whose first element
            is that string.

    Returns:
        A list with one entry per comma-separated piece; every space
        character is removed from each piece (entries may be empty strings).
    """
    if type(text) is np.ndarray:
        # Unwrap the single cell before tokenising.
        text = text[0]
    return [piece.replace(' ', '') for piece in text.split(',')]

# day/topic/author/ 0.574 +- 0.007
# day/topic/author/month 0.589 +- 0.007
# day/topic/author/month/hour 0.59 +- 0.007
# day/topic/author/month/hour/len_content 0.59 +- 0.008

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer

# One token-count encoder per text column (positions 0, 1 and 2 of the
# feature matrix); every other column is passed through unchanged.
vect = ColumnTransformer(
    transformers=[
        (name,
         CountVectorizer(tokenizer=tokenizer_author, lowercase=False),
         [position])
        for position, name in enumerate(('author', 'topics', 'channel'))
    ],
    remainder='passthrough',
    n_jobs=-1,
)

In [None]:
# vect = ColumnTransformer(
#     [
#      ('topics', CountVectorizer(tokenizer=tokenizer_author, lowercase=False), [0]),
#      ('channel', CountVectorizer(tokenizer=tokenizer_author, lowercase=False), [1])],
#     n_jobs=-1,
#     remainder='passthrough'
# )

In [None]:
### How do you build the classifier?
- 將資料通過`ColumnTransformer`，把前處理的資料整合成`vect`。
- 我們使用了三個模型：`LGBMClassifier`/`CatBoostClassifier`/`XGBClassifier` 來做預測。我們原本還嘗試過 `RandomForestClassifier` 等架構，但是發現效果不彰，所以後來沒有採用。
- 然後再把三個模型的預測結果用 `VotingClassifier` 合併投票，他們的權重分別是 [0.5, 0.35, 0.35]。
- 至於投票結果依照`roc_auc`輸出，我們是使用`cross_val_score`，然後將valid/train set依照`1:4`的比例做訓練(`cv=5`)。

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Passed as n_estimators to the LightGBM and XGBoost classifiers.
n = 100
# Tree depth passed to the LightGBM and CatBoost classifiers (also used to
# derive LightGBM's leaf settings).
depth = 10

In [None]:
# Show the first rows of the training feature table.
print(df_train.head())

In [None]:
# List the feature column names; the first three are the text columns that
# the ColumnTransformer addresses by position.
print(df_train.columns.values)

In [None]:
# LightGBM pipeline: encode the text columns, then fit the classifier.
# The former `delta=0.1` argument was dropped because LightGBM has no such
# parameter.
lbgm = Pipeline([
    ('vect', vect),
    ('clf', LGBMClassifier(
        n_estimators=n,
        max_depth=depth,
        learning_rate=0.1,
        random_state=0,
        # 2**(depth-1): half the leaves of a full binary tree of that depth.
        num_leaves=2 ** (depth - 1),
        # scikit-learn API name for LightGBM's `min_data_in_leaf`.
        min_child_samples=2 ** (depth - 4),
        n_jobs=-1,
    )),
])

# 5-fold cross-validated ROC AUC on the training features.
scores = cross_val_score(estimator=lbgm, X=df_train.values, y=y_train, cv=5, scoring='roc_auc')
print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))

In [None]:
# CatBoost pipeline: encode the text columns, then fit the classifier.
cat = Pipeline(steps=[
    ('vect', vect),
    ('clf', CatBoostClassifier(
        iterations=30,
        learning_rate=0.2,
        depth=depth,
        random_state=0,
    )),
])
# 5-fold cross-validated ROC AUC on the training features.
scores = cross_val_score(cat, df_train.values, y_train, scoring='roc_auc', cv=5)
print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))

In [None]:
# XGBoost pipeline: encode the text columns, then fit the classifier.
# NOTE(review): max_depth is the literal 10 here while the other models use
# the shared `depth` variable - confirm whether that is intentional.
xgboost = Pipeline(steps=[
    ('vect', vect),
    ('clf', XGBClassifier(
        n_estimators=n,
        max_depth=10,
        learning_rate=0.1,
        random_state=0,
    )),
])
# 5-fold cross-validated ROC AUC on the training features.
scores = cross_val_score(xgboost, df_train.values, y_train, scoring='roc_auc', cv=5)
print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))

In [None]:
# voting classifier
from sklearn.ensemble import VotingClassifier
# Soft-voting ensemble of the three pipelines; the weights are listed in the
# same order as the estimators.
vote = VotingClassifier(
    estimators=[('lbgm', lbgm), ('cat', cat), ('xgboost', xgboost)],
    voting='soft',
    weights=[0.5, 0.35, 0.35],
)
# 5-fold cross-validated ROC AUC of the ensemble.
scores = cross_val_score(vote, df_train.values, y_train, scoring='roc_auc', cv=5)
print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))

In [None]:
# Fit the voting ensemble on the full training feature table.
vote.fit(X=df_train.values, y=y_train)

In [None]:
# Column 1 of predict_proba (the second class) is the score that gets
# submitted.
y_pred = vote.predict_proba(df_test.values)[:, 1]
# NOTE(review): Ids are generated starting at the hard-coded 27643,
# presumably the first Id of the test set - verify against test.csv.
result = pd.DataFrame({
    'Id': np.arange(27643, 27643 + len(y_pred)),
    'Popularity': y_pred,
})
result.to_csv('prediction.csv', index=False)

Predict
popo_version : 
	feature :
		'author', 'topics','channel', 'len_content', 'hour', 'day', 'date', 'month', 'year', 'title_bit_count'
	separate model_acc : 
		lgbm 	: 0.593 (+/-0.007)
		cat  	: 0.596 (+/-0.006)
		xgboost : 0.592 (+/-0.006)
	Final : 	  0.601 (+/-0.007)
	upload : 	  0.59224


popo_version : 
	feature : 
		'author', 'topics','channel', 'len_content', 'hour', 'day', 'date', 'month', 'year','title_bit_count', 'img_count'
	separate model_acc :
		lgbm 	: 0.592 (+/-0.006)
		cat		: 0.595 (+/-0.010)
		xgboost	: 0.592 (+/-0.005)
	Final : 	  0.601 (+/-0.007)
	upload : 	  0.59127

popo_version :
	feature :
		
	separate model_acc :
		lgbm	: 0.593 (+/-0.008)
		cat 	: 0.594 (+/-0.009)
		xgboost : 0.590 (+/-0.006)
	Final :		  0.599 (+/-0.008)


In [None]:
# Conclusion #
1. 選對 model 比努力更重要 -> lightGBM, XGBoost 很好用！
2. 發現日期很重要 -> 只用日期去 train acc 就 58.5%
3. 如何讓 model 有一般性 -> 因為發現最後在比的是誰的 model 比較 generalize
4. 對資料分析有更深的理解 -> 在 year 這筆 feature, 因為 bias 不大我以為對 prediction 的影響不大, 但是後來發現其 prediction weight 其實很大