In [1]:
import pandas as pd
import numpy as np
from numpy import std, mean, sqrt, median
import matplotlib.pyplot as plt
from scipy.stats import expon, reciprocal
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from numpy.random import randint
import warnings
warnings.filterwarnings(action='ignore')
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression 
import lightgbm as lgb
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from konlpy.tag import Kkma, Okt  
import pickle
# Instantiate the two KoNLPy taggers imported above (Kkma and Okt).
# NOTE(review): neither `kkma` nor `okt` is referenced in the cells visible here —
# presumably they are used further down for tokenisation; confirm before removing.
kkma = Kkma()
okt = Okt()

# dataset.csv file 명세

## Text feature
### Structure
* title_length: 제목의 길이
* main_length: 본문의 길이
* celebrity: 본문에 포함된 유명인의 수
* title_scoop_binary: 제목에 ‘단독’, ‘속보’가 있는지 여부
* exclusive_report: 제목에 '단독' 포함 여부
* breaking_news: 제목에 '속보' 포함 여부


### Semantic
* text_positive: (본문 긍정단어 합 / 단어 수)
* text_negative: (본문 부정단어 합 / 단어 수)
* follow_up: 이전 기사들과의 유사도 중 최댓값
* Doc2vec_title: 기사 제목의 Doc2vec 결과벡터(kkma)
* Doc2vec_text: 기사 본문의 Doc2vec 결과벡터(kkma)


## Visual feature
### Image
* image_counts : 이미지의 수
* people_in_image_counts: 이미지 내 사람의 수
* graph_image_counts: 그래프의 수
* picture_image_counts: 사진의 수
* Image_Class: 이미지 입력 시 전이학습모델(Pretrained model)의 마지막 활성화결과값(1000개) -> 미포함


### Video
* video_counts : 비디오의 수


## Meta feature
* publisher: 신문사명
* topic_cate: 뉴스의 카테고리 분류

## LightGBM Kkma

In [2]:
# Load the feature table; 'cluster_7' is the prediction target and 'label' is
# dropped from the feature matrix alongside it.
df = pd.read_csv("dataset.csv")
y = df['cluster_7']
X = df.drop(columns=['label', 'cluster_7'])

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# Columns whose name contains 'publisher' or 'topic_cate' are excluded from scaling.
columns_unscaling = [
    name for name in X.columns
    if 'publisher' in name or 'topic_cate' in name
]

In [3]:
# Separate the columns that must not be standardised from the remaining ones.
passthrough_cols = X_train.columns.intersection(columns_unscaling)
X_train_doc = X_train[passthrough_cols]  # categorical features, kept as-is
X_test_doc = X_test[passthrough_cols]
X_train_for_scaling = X_train.drop(columns=columns_unscaling)  # everything else
X_test_for_scaling = X_test.drop(columns=columns_unscaling)

# Fit the scaler on the training split only, then apply it to the test split.
full_pipeline = Pipeline(steps=[('scaler', StandardScaler())])
train_scaled = full_pipeline.fit_transform(X_train_for_scaling)
test_scaled = full_pipeline.transform(X_test_for_scaling)

# Re-attach the unscaled columns to the right of the scaled ones.
# The resulting column order is: scaled columns first, then the unscaled columns,
# which is not necessarily the order of X.columns.
X_train_prepared = np.hstack([train_scaled, X_train_doc.to_numpy()])
X_test_prepared = np.hstack([test_scaled, X_test_doc.to_numpy()])

for prepared in (X_train_prepared, X_test_prepared):
    print(prepared.shape)

(21599, 295)
(9257, 295)


In [None]:
# Randomised hyper-parameter search for a multiclass LightGBM classifier.

# Distributions shared by every boosting type.
# NOTE(review): learning_rate is sampled log-uniformly up to 1000; values far above 1
# are unlikely to train sensibly — consider tightening the upper bound.
common_space = {
    'num_leaves': list(range(10, 10000)),
    'learning_rate': reciprocal(.0001, 1000),
    'n_estimators': list(range(10, 10000)),
    'objective': ['multiclass'],
    'min_child_weight': reciprocal(.0001, 1000),
    'min_child_samples': list(range(1, 10000)),
    'subsample_for_bin': list(range(1, 100000)),
}

# LightGBM's random-forest mode ('rf') refuses to train unless bagging is enabled
# (subsample < 1 together with subsample_freq > 0). With the estimator defaults every
# 'rf' candidate would error out and — because warnings are silenced — silently score
# NaN. 'rf' therefore gets its own sub-space that switches bagging on, while
# 'gbdt'/'dart' keep the original space unchanged.
param_distribs = [
    {**common_space, 'boosting_type': ['gbdt', 'dart']},
    {
        **common_space,
        'boosting_type': ['rf'],
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
        'subsample_freq': [1],
    },
]
model = lgb.LGBMClassifier()

# random_state makes the sampled candidates and CV result reproducible, matching the
# fixed seed already used for train_test_split.
grid = RandomizedSearchCV(
    model,
    param_distributions=param_distribs,
    scoring='accuracy',
    n_iter=10,
    cv=5,
    random_state=0,
)
grid.fit(X_train_prepared, y_train)

# best_score_ is the mean cross-validated accuracy of the best candidate (it is not a
# score on the full training set, despite the label printed below).
print('optimal train score: {:.3f}'.format(grid.best_score_))
print('test score: {:.3f}'.format(grid.score(X_test_prepared, y_test)))
print('optimal parameter: {}'.format(grid.best_params_))

In [None]:
# Evaluate the refit best estimator on the held-out test split.
final_model = grid.best_estimator_

# Predict once and reuse the predictions for both metrics; a classifier's .score()
# is the mean accuracy of its own predictions, so the printed value is unchanged.
y_pred = final_model.predict(X_test_prepared)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# NOTE(review): this cell repeats the evaluation performed in the cell directly above
# (same estimator, same test split); consider keeping only one of the two.
final_model = grid.best_estimator_
test_accuracy = final_model.score(X_test_prepared, y_test)
print(test_accuracy)

# Per-class precision / recall / F1 on the test split.
y_pred = final_model.predict(X_test_prepared)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Pairwise Pearson correlation between the columns of the unscaled feature frame.
feature_corr = X.corr(method='pearson')
feature_corr

In [None]:
# Feature-importance table for the tuned model, most important first.
#
# The model was trained on X_train_prepared, whose columns are the scaled features
# followed by the unscaled (publisher / topic_cate) features. Labelling the
# importances with X.columns would mis-assign names whenever that differs from the
# original column order, so the names are rebuilt in the training-matrix order.
feature_names = list(X_train_for_scaling.columns) + list(X_train_doc.columns)
tmp2 = pd.DataFrame({'features': feature_names, 'importance': final_model.feature_importances_})
tmp2.sort_values(by='importance', ascending=False)