In [1]:
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

import pandas as pd
import numpy as np
import re 
import ast
import itertools

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split

from bs4 import BeautifulSoup

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
n = 100  # passed as n_estimators to the LGBMClassifier cells further down

'Month', 'Weekday', 'Year', 'Day', 'Hour', 'title_bit_count', 'img_count', 'link_count', 'title_word_count', 'content_len', 'content_word_count'

-----------------------------------------------------------
括號內是 acc > 0.58 的權重
'Month', 'Weekday', 'Year', 'Hour' -> 100% (100%)

'Day' -> 85% (72%)

'img_count' -> 89% (65%)

'channel' -> 81%

'content_len' -> 49% (48%) -> 59%

'author' -> 56%

'title_bit_count' -> 36% (48%) -> 53%

'link_count' -> 40% (46%) -> 37%

'title_word_count' -> 36% (49%) -> 37% (drop?)

'content_word_count' -> 38% (47%) (drop)

'categories_count' -> 16% (15%) (drop)

'Minutes' -> 跟 categories_count 差不多 (drop)

**author and channel are important**
channel 權重大於 author

In [7]:
# Load the pre-processed training table and show which columns it provides.
df = pd.read_csv('../datasets_processed/train_processed_2.csv')
print(df.columns.values)

['Id' 'Popularity' 'Page content' 'img_count' 'link_count' 'title'
 'title_word_count' 'title_bit_count' 'content' 'content_len'
 'content_word_count' 'categories' 'categories_count' 'channel' 'author'
 'Weekday' 'Year' 'Month' 'Day' 'Hour' 'Minutes' 'Sec' 'Timezone']


In [51]:
# extra features : title, content, categories
# Feature columns used for training. 'channel', 'author' and 'categories' are
# still raw at this point; later cells turn them into token-count columns.
# .copy() gives X_train its own data, so the column assignments made by later
# cells never operate on a slice of df.
X_train = df[['Month', 'Weekday', 'Year', 'Day', 'Hour', 'img_count', 'content_len', 'channel', 'author', 'categories']].copy()

# Binary labels: remap -1 to 0. The array is an explicit copy because writing
# into df[...].values can modify df itself.
y_train = df['Popularity'].to_numpy(copy=True)
y_train[y_train == -1] = 0

In [46]:
# X_train['title'] = X_train['title'].apply(ast.literal_eval)
# Parse 'categories' from its string form into a Python object (presumably a
# list of tags per row -- confirm against the CSV). Only str values are parsed,
# so re-running this cell on already-parsed rows leaves them unchanged instead
# of raising inside ast.literal_eval.
X_train['categories'] = X_train['categories'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)


In [38]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Make sure the NLTK stop-word corpus is available before it is read.
nltk.download('stopwords')
# English stop words and a Porter stemmer; both are read as globals by
# tokenizer_title and tokenizer_categories below.
stop = stopwords.words('english')
stemmer = PorterStemmer()

def tokenizer_author(text):
    """Break a comma-separated author field into space-free tokens.

    If ``text`` is a numpy array, only its first element is used. The value is
    split on every comma and each resulting piece has all space characters
    removed. Returns the pieces as a list, in their original order.
    """
    raw = text[0] if type(text) == np.ndarray else text
    return [re.sub(' ', '', piece) for piece in re.split(',', raw)]
    
def tokenizer_channel(text):
    """Break a comma-separated channel field into space-free tokens.

    A numpy array argument is reduced to its first element. The string is then
    split on commas, and every space is stripped out of each part. The parts
    are returned as a list in input order.
    """
    if type(text) == np.ndarray:
        text = text[0]
    tokens = []
    for part in re.split(',', text):
        tokens.append(re.sub(' ', '', part))
    return tokens

def tokenizer_title(list):
    """Turn the words of one title into stemmed tokens without stop words.

    Parameters
    ----------
    list : iterable of str
        Words of a single title. The parameter name shadows the builtin
        ``list`` inside this function; it is kept so callers are unaffected.

    Returns
    -------
    list of str
        Stems of the words that are not stop words, in input order.

    Reads the notebook-level globals ``stemmer`` and ``stop``.
    """
    cleaned_title = []
    for word in list:
        # Test the surface form first: the stem of a stop word is often not in
        # the stop list itself (e.g. "this" stems to "thi"), so a check made
        # only after stemming lets such words through.
        if word.lower() in stop:
            continue
        stemmed = stemmer.stem(word)
        # Also drop tokens whose stem is a stop word.
        if stemmed not in stop:
            cleaned_title.append(stemmed)
    return cleaned_title


def tokenizer_categories(list):
    """Turn the category entries of one row into stemmed tokens without stop words.

    Parameters
    ----------
    list : iterable of str
        Category strings of a single row. The parameter name shadows the
        builtin ``list`` inside this function; it is kept so callers are
        unaffected.

    Returns
    -------
    list of str
        Stems of the entries that are not stop words, in input order.

    Reads the notebook-level globals ``stemmer`` and ``stop``.
    """
    cleaned_categories = []
    for word in list:
        # Filter on the raw entry before stemming: stems of stop words are
        # frequently absent from the stop list (e.g. "this" stems to "thi"),
        # so checking only the stem would keep them.
        if word.lower() in stop:
            continue
        stemmed = stemmer.stem(word)
        # Also drop entries whose stem is a stop word.
        if stemmed not in stop:
            cleaned_categories.append(stemmed)
    return cleaned_categories

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mygodimatomato/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# X_train['title'] = X_train['title'].apply(tokenizer_title)
# X_train['title'].to_csv('../datasets_processed/title.csv', index=False)

In [47]:
# Token-count encoding of the raw text columns of X_train.
#  * token_pattern=None: a custom tokenizer is supplied, so the default
#    regular expression is not used.
#  * index=X_train.index: each encoded frame carries the same row labels as
#    X_train, so concatenating them along axis=1 lines rows up correctly.

# preprocess the author
vectorizer = CountVectorizer(tokenizer=tokenizer_author, lowercase=False, token_pattern=None)
vectorized_data = vectorizer.fit_transform(X_train['author'])
vectorized_author = pd.DataFrame(
    vectorized_data.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=X_train.index,
)

# preprocess the channel
vectorizer = CountVectorizer(tokenizer=tokenizer_channel, lowercase=False, token_pattern=None)
vectorized_data = vectorizer.fit_transform(X_train['channel'])
vectorized_channel = pd.DataFrame(
    vectorized_data.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=X_train.index,
)

# preprocess the title
# vectorizer = CountVectorizer(tokenizer=tokenizer_title, lowercase=False)
# vectorized_data = vectorizer.fit_transform(X_train['title'])
# vectorized_title = pd.DataFrame(vectorized_data.toarray(), columns=vectorizer.get_feature_names_out())

# preprocess the categories
vectorizer = CountVectorizer(tokenizer=tokenizer_categories, lowercase=False, token_pattern=None)
vectorized_data = vectorizer.fit_transform(X_train['categories'])
vectorized_categories = pd.DataFrame(
    vectorized_data.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=X_train.index,
)

In [None]:
# print(vectorized_categories.shape)

In [21]:
# Show which columns X_train currently holds.
print(X_train.columns.values)

['Month' 'Weekday' 'Year' 'Day' 'Hour' 'img_count' 'content_len' 'channel'
 'author' 'categories']


In [52]:
# Standardise the numeric feature columns; the fitted transformer remains
# available afterwards as `scaler`.
scaler = StandardScaler()
numeric_cols = ['Month', 'Weekday', 'Year', 'Day', 'Hour', 'img_count', 'content_len']
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])

In [53]:
# Remove the raw text columns and attach the encoded categories in their place.
X_train = X_train.drop(['author', 'channel', 'categories'], axis=1)
# X_train = pd.concat([X_train, vectorized_author], axis=1)
# X_train = pd.concat([X_train, vectorized_channel], axis=1)
X_train = pd.concat([X_train, vectorized_categories], axis=1)


In [54]:
# 5-fold cross-validated ROC AUC of a LightGBM classifier on the assembled
# feature matrix; prints mean and standard deviation over the folds.
lbgm = LGBMClassifier(
    n_estimators=n,
    max_depth=10,
    learning_rate=0.1,
    random_state=0,
)
scores = cross_val_score(lbgm, X_train.values, y_train, cv=5, scoring='roc_auc')
print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))

[LightGBM] [Info] Number of positive: 10906, number of negative: 11208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025541 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2283
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 669
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493172 -> initscore=-0.027315
[LightGBM] [Info] Start training from score -0.027315
[LightGBM] [Info] Number of positive: 10905, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023826 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2233
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 656
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493127 -> initscore=-0.027496


In [None]:
# Exhaustive feature-subset search with LightGBM.
# Every non-empty combination of X_train's columns is scored with 5-fold
# ROC AUC. `com` collects the combinations and `acc` the matching
# [mean, std] pairs (stored under the column name 'accuracy').
#
# NOTE(review): the loop visits 2**len(attributes) - 1 subsets, so it is only
# practical while X_train holds the raw feature columns ('author', 'channel'
# and 'categories' not yet replaced by their encoded frames) -- confirm that
# X_train is in that state before running.
com = []
acc = []
attributes = X_train.columns.values

# Raw text columns and the token-count frames substituted for them.
# A 'title' expansion (vectorized_title) is not applied.
encoded_blocks = {
    'author': vectorized_author,
    'categories': vectorized_categories,
    'channel': vectorized_channel,
}

for r in range(1, len(attributes) + 1):
    for combination in itertools.combinations(attributes, r):
        tmp = X_train[list(combination)]
        # Swap each selected raw text column for its encoded columns. The
        # 'categories' case previously had an `if` with no body, which made
        # the whole cell fail to parse.
        for raw_col, encoded in encoded_blocks.items():
            if raw_col in combination:
                tmp = pd.concat([tmp.drop([raw_col], axis=1), encoded], axis=1)
        lbgm = LGBMClassifier(n_estimators=n, max_depth=10, learning_rate=0.1, random_state=1, num_leaves=100)
        scores = cross_val_score(estimator=lbgm, X=tmp.values, y=y_train, cv=5, scoring='roc_auc')
        com.append(combination)
        acc.append([scores.mean(), scores.std()])


# From here on `tmp` is the results table, no longer a feature frame.
tmp = pd.DataFrame({'combination': com, 'accuracy': acc})

In [None]:
# Write the results table `tmp` to CSV, ordered by its 'accuracy' column in
# descending order.
sorted_tmp = tmp.sort_values(by='accuracy', ascending=False)
sorted_tmp.to_csv('../training_output/lgbm_2_acc.csv', index=False)

In [None]:
# 5-fold cross-validated ROC AUC of a small random forest.
# The features come from X_train, as in the LightGBM and KNN cross-validation
# cells. `tmp` is not usable here: by this point it has been rebound to the
# combination/score results table, which is not a feature matrix and does not
# have one row per training sample.
rf = RandomForestClassifier(n_estimators=10, random_state=15)
scores = cross_val_score(estimator=rf, X=X_train.values, y=y_train, cv=5, scoring='roc_auc')
print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))

In [None]:
# Exhaustive feature-subset search with a random forest.
# Every non-empty combination of X_train's columns is scored with 5-fold
# ROC AUC. `com` collects the combinations and `acc` the matching
# [mean, std] pairs (stored under the column name 'accuracy').
#
# NOTE(review): the loop visits 2**len(attributes) - 1 subsets, so it is only
# practical while X_train holds the raw feature columns -- confirm that
# X_train is in that state before running.
com = []
acc = []
attributes = X_train.columns.values

# Raw text columns and the token-count frames substituted for them.
# 'categories' is expanded as well: leaving the raw column in place would hand
# non-numeric values to the estimator. A 'title' expansion is not applied.
encoded_blocks = {
    'author': vectorized_author,
    'categories': vectorized_categories,
    'channel': vectorized_channel,
}

for r in range(1, len(attributes) + 1):
    for combination in itertools.combinations(attributes, r):
        tmp = X_train[list(combination)]
        for raw_col, encoded in encoded_blocks.items():
            if raw_col in combination:
                tmp = pd.concat([tmp.drop([raw_col], axis=1), encoded], axis=1)
        rf = RandomForestClassifier(n_estimators=10, random_state=15)
        scores = cross_val_score(estimator=rf, X=tmp.values, y=y_train, cv=5, scoring='roc_auc')
        com.append(combination)
        acc.append([scores.mean(), scores.std()])


# From here on `tmp` is the results table, no longer a feature frame.
tmp = pd.DataFrame({'combination': com, 'accuracy': acc})

In [None]:
# Rebuild the results table from com/acc (same statement as the one that ends
# the search cell above).
tmp = pd.DataFrame({'combination': com, 'accuracy': acc})

In [None]:
# Write the results table `tmp` to CSV, ordered by its 'accuracy' column in
# descending order.
sorted_tmp = tmp.sort_values(by='accuracy', ascending=False)
sorted_tmp.to_csv('../training_output/random_forest_acc.csv', index=False)

In [55]:
from sklearn.neighbors import KNeighborsClassifier

# 5-fold cross-validated ROC AUC of a 5-nearest-neighbour classifier on
# X_train; prints mean and standard deviation over the folds.
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X_train.values, y_train, cv=5, scoring='roc_auc')
print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))

0.543 (+/-0.005)


In [83]:
from sklearn.neighbors import KNeighborsClassifier
# X_train = pd.concat([X_train, vectorized_author], axis=1)
# X_train = pd.concat([X_train, vectorized_channel], axis=1)
# X_train = pd.concat([X_train, vectorized_categories], axis=1)

# Feature-subset search with KNN. X_train is rebuilt from df, then every
# column combination that contains 'Month', 'Weekday' and 'Year' is scored
# with 5-fold ROC AUC. `com` / `acc` collect the combinations and their
# [mean, std] pairs, and each score is printed as it is produced.
X_train = df[['Month', 'Weekday', 'Year', 'Day', 'Hour', 'img_count', 'categories', 'author', 'channel']]
com = []
acc = []
attributes = X_train.columns.values
for r in range(1, len(attributes) + 1):
    for combination in itertools.combinations(attributes, r):
        # Skip subsets that lack any of the three mandatory date columns.
        if not ('Month' in combination and 'Weekday' in combination and 'Year' in combination):
            continue
        tmp = X_train[list(combination)]
        # Swap each selected raw text column for its token-count columns.
        if 'author' in combination:
            tmp = pd.concat([tmp.drop(['author'], axis=1), vectorized_author], axis=1)
        if 'categories' in combination:
            tmp = pd.concat([tmp.drop(['categories'], axis=1), vectorized_categories], axis=1)
        if 'channel' in combination:
            tmp = pd.concat([tmp.drop(['channel'], axis=1), vectorized_channel], axis=1)
        knn = KNeighborsClassifier(n_neighbors=400)
        scores = cross_val_score(estimator=knn, X=tmp.values, y=y_train, cv=5, scoring='roc_auc')
        com.append(combination)
        acc.append([scores.mean(), scores.std()])
        print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))
      

0.579 (+/-0.009)
0.574 (+/-0.007)
0.575 (+/-0.009)
0.577 (+/-0.011)
0.581 (+/-0.011)
0.582 (+/-0.011)
0.585 (+/-0.011)
0.568 (+/-0.005)
0.568 (+/-0.004)
0.574 (+/-0.006)
0.574 (+/-0.006)
0.573 (+/-0.006)
0.572 (+/-0.009)
0.576 (+/-0.009)
0.576 (+/-0.009)
0.577 (+/-0.009)
0.576 (+/-0.008)
0.578 (+/-0.011)
0.579 (+/-0.011)
0.580 (+/-0.009)
0.581 (+/-0.011)
0.585 (+/-0.012)
0.562 (+/-0.003)
0.570 (+/-0.005)
0.569 (+/-0.004)
0.568 (+/-0.004)
0.570 (+/-0.004)
0.569 (+/-0.004)
0.569 (+/-0.005)
0.574 (+/-0.006)
0.574 (+/-0.006)
0.572 (+/-0.007)
0.572 (+/-0.007)
0.572 (+/-0.009)
0.571 (+/-0.009)
0.578 (+/-0.009)
0.577 (+/-0.008)
0.579 (+/-0.009)
0.577 (+/-0.008)
0.579 (+/-0.009)
0.579 (+/-0.012)
0.583 (+/-0.010)
0.562 (+/-0.003)
0.563 (+/-0.003)
0.562 (+/-0.003)
0.570 (+/-0.004)
0.569 (+/-0.004)
0.569 (+/-0.004)
0.570 (+/-0.003)
0.571 (+/-0.004)
0.570 (+/-0.004)
0.574 (+/-0.006)
0.572 (+/-0.007)
0.573 (+/-0.007)
0.572 (+/-0.009)
0.579 (+/-0.008)
0.579 (+/-0.009)
0.562 (+/-0.004)
0.563 (+/-0.00

In [84]:
# output validation result
# Build the results table from com/acc, order it by the 'accuracy' column
# (each entry is a [mean, std] pair of ROC AUC scores) in descending order,
# and save it as CSV.
# NOTE(review): the file name says 600 while the KNN search cell above uses
# n_neighbors=400 -- confirm which setting this file is meant to record.
tmp = pd.DataFrame({'combination': com, 'accuracy': acc})
sorted_tmp = tmp.sort_values(by=['accuracy'], ascending=False)
sorted_tmp.to_csv('../training_output/KNN_600_acc.csv', index=False)