In [None]:
!pip3 install -U numpy pandas sklearn matplotlib bs4 yfinance pandas-datareader gensim wordcloud

In [None]:
import os
import sys
import numpy as np
import pandas as pd
pd.set_option('max_colwidth', 100)

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn.cluster import *
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
class NewsVisual(object):
    """Cluster news articles by their text and visualise the clusters.

    The ``content`` column of a CSV file is turned into fixed-size feature
    vectors (TF-IDF or Doc2Vec), clustered with K-Means, and displayed as
    per-cluster word clouds plus a 3-D PCA scatter plot.
    """

    def __init__(self, file_path:str, method:str="tf-idf", n_features:int=256):
        """Load the article texts.

        Args:
            file_path: Path of a CSV file that has a ``content`` column.
            method: Feature extraction back-end, ``"tf-idf"`` or ``"doc2vec"``.
            n_features: Dimensionality of the feature vectors (maximum
                vocabulary size for TF-IDF, vector size for Doc2Vec).
        """
        df = pd.read_csv(file_path)
        # Rows with missing text would crash both the vectorisers and the
        # word-cloud join, so drop them and make sure every entry is a str.
        self.X = df["content"].dropna().astype(str).to_numpy()
        self.method = method
        self.n_features = n_features

    def tf_idf_method(self):
        """Return a dense ``(n_documents, <=n_features)`` TF-IDF matrix."""
        vec = TfidfVectorizer(max_features=self.n_features, stop_words="english")
        return vec.fit_transform(self.X).toarray()

    def doc2vec_method(self):
        """Train a Doc2Vec (PV-DM) model on the texts and return its document vectors.

        Returns:
            Array with one ``n_features``-dimensional row per document, in
            the same order as ``self.X``.
        """
        # TaggedDocument expects a list of tokens; a raw string would be
        # iterated character by character.
        corpus = [
            TaggedDocument(gensim.utils.simple_preprocess(text), [idx])
            for idx, text in enumerate(self.X)
        ]

        # The corpus is deliberately NOT passed to the constructor: doing so
        # already trains the model, and the explicit train() call below would
        # then train it a second time.
        d2v_model = Doc2Vec(
            vector_size=self.n_features,
            window=5,
            min_count=4,
            workers=4,
            dm=1,
            alpha=0.025,
            epochs=32,
        )
        d2v_model.build_vocab(corpus)
        d2v_model.train(
            corpus,
            total_examples=d2v_model.corpus_count,
            epochs=d2v_model.epochs,
        )

        # gensim >= 4 exposes the document vectors as `dv.vectors`;
        # gensim 3.x used `docvecs.vectors_docs`.
        if hasattr(d2v_model, "dv"):
            return d2v_model.dv.vectors
        return d2v_model.docvecs.vectors_docs

    def words2img(self, df):
        """Build a word cloud from the ``content`` column of ``df``."""
        text = " ".join(i for i in df.content)
        wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)
        return wordcloud

    @staticmethod
    def _grid_shape(n_panels):
        """Return ``(rows, cols)`` of a near-square grid holding ``n_panels`` plots."""
        cols = 1
        while cols * cols < n_panels:
            cols += 1
        rows = -(-n_panels // cols)  # ceiling division
        return rows, cols

    def _vectorize(self):
        """Compute the feature matrix with the back-end selected by ``self.method``.

        Raises:
            ValueError: If ``self.method`` is not a supported back-end.
        """
        if self.method == "tf-idf":
            return self.tf_idf_method()
        if self.method == "doc2vec":
            return self.doc2vec_method()
        raise ValueError(
            "unknown method {!r}; expected 'tf-idf' or 'doc2vec'".format(self.method)
        )

    def show(self, n_clusters:int=5, word_cloud:bool=True, cluster_news:bool=True, label2cluster=None):
        """Cluster the articles and plot the result.

        Args:
            n_clusters: Number of K-Means clusters.
            word_cloud: If true, draw one word cloud per cluster.
            cluster_news: If true, print up to 20 sample articles per cluster.
                Only takes effect when ``word_cloud`` is true.
            label2cluster: Optional sequence giving a display label for each
                cluster index; the cluster index itself is shown when omitted.

        Raises:
            ValueError: If ``self.method`` is not a supported back-end.
        """
        random_state = 1234

        X = self._vectorize()

        model = KMeans(n_clusters=n_clusters, random_state=random_state)
        labels = model.fit_predict(X)

        if word_cloud:
            df = pd.DataFrame({"cluster": labels, "content": self.X})
            # Size the subplot grid from n_clusters so any cluster count fits
            # (a fixed 2x2 grid only has room for four panels).
            rows, cols = self._grid_shape(n_clusters)

            ## word cloud graph
            plt.figure(figsize=(12, 7))
            for i in range(n_clusters):
                c_df = df[df["cluster"] == i]
                if cluster_news:
                    print("cluster: {}".format(i))
                    # Clusters may hold fewer than 20 articles; cap the sample size.
                    print(c_df["content"].sample(min(20, len(c_df)), random_state=random_state))

                wordcloud = self.words2img(c_df)

                plt.subplot(rows, cols, i + 1)
                plt.title('cluster: {}, size: {}'.format(i, c_df.shape[0]))
                plt.imshow(wordcloud, interpolation="bilinear")
                plt.axis("off")

            plt.show()

        # reduce the features to 3D
        pca = PCA(n_components=3, random_state=random_state)
        reduced_features = pca.fit_transform(X)
        cluster_centers = pca.transform(model.cluster_centers_)

        # draw 3D
        ax = plt.figure().add_subplot(111, projection='3d')
        ax.scatter(
            reduced_features[:,0],
            reduced_features[:,1],
            reduced_features[:,2],
            c=labels,
            cmap='viridis',
            alpha=0.5
        )
        # Annotate every projected cluster centre with its label.
        for idx, center in enumerate(cluster_centers):
            cx, cy, cz = center.tolist()
            tmp_label = label2cluster[idx] if label2cluster else str(idx)
            ax.text(cx, cy, cz, tmp_label, color="red", weight='bold')

        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel('Z')
        plt.show()
    

In [None]:
# Cluster the MSFT news with TF-IDF features and label the four clusters.
nv = NewsVisual(
    # os.path.join keeps the path valid on POSIX as well as Windows
    # (a literal backslash is only a separator on Windows).
    os.path.join("news", "MSFT_20201118.csv"),
    method="tf-idf",
    n_features=1024
)
nv.show(4, word_cloud=True, label2cluster=['related', 'negative', 'positive', 'objective'])

## cluster: 0 -> related
## cluster: 1 -> negative
## cluster: 2 -> positive
## cluster: 3 -> objective

In [None]:
# Same clustering, but on Doc2Vec document embeddings instead of TF-IDF.
nv = NewsVisual(
    # os.path.join keeps the path valid on POSIX as well as Windows
    # (a literal backslash is only a separator on Windows).
    os.path.join("news", "MSFT_20201118.csv"),
    method="doc2vec",
    n_features=1024
)
nv.show(4)

## 