In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns

In [None]:
# Set matplotlib's default font family to 'gulim'.
# NOTE(review): presumably chosen so Korean text renders in figures — confirm
# the font is installed; later sns.set(...) calls may reset font settings.
plt.rc('font', family='gulim')

In [None]:
# Load the input file 'split_sen.csv' into the working DataFrame `train`.
# NOTE(review): later cells read a column named '0' — presumably one sentence
# per row; confirm against the file.
train = pd.read_csv('split_sen.csv')

In [None]:
# Display the frame as loaded, before any cleaning.
train

In [None]:
def remove_punct(_df, col):
    """Strip punctuation from a text column in place and drop emptied rows.

    Applied to ``_df[col]`` in order:
      1. remove every character that is not an ASCII letter, a Hangul
         syllable (가-힣), a digit or whitespace;
      2. remove leading whitespace;
      3. replace strings that became empty with NaN.
    Afterwards every row of ``_df`` holding a NaN in *any* column is dropped.

    Args:
        _df: DataFrame to clean. It is modified in place.
        col: Name of the string column to clean.

    Returns:
        None; the result is the mutated ``_df``.
    """
    # Raw strings: '\s' inside a normal literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on recent Python versions).
    _df[col] = _df[col].str.replace(r'[^a-zA-Z가-힣0-9\s]', '', regex=True)
    _df[col] = _df[col].str.replace(r'^\s+', '', regex=True)
    # Mark rows whose text vanished entirely so dropna removes them.
    _df[col] = _df[col].replace('', np.nan)
    # Drops rows with a NaN in any column, not only `col`.
    _df.dropna(inplace=True)

In [None]:
# Clean column '0' of `train` in place, then show the remaining NaN count per
# column. remove_punct mutates `train` and drops rows, so this cell changes
# the frame displayed by the earlier cell.
remove_punct(train, col='0')
train.isnull().sum()

In [None]:
# Display the frame after punctuation removal and row dropping.
train

In [None]:
# Read the stop-word file (no header row) and keep its first column as a
# plain Python list, used by the tokenizer to filter tokens.
# NOTE(review): presumably one stop word per line — confirm the file format.
df_stopwords = pd.read_csv('stopword_ko.txt', header=None)
stop_words = df_stopwords[0].tolist()


In [None]:
# Create the shared Okt tagger instance used by `tokenizer`.
from konlpy.tag import Okt
okt = Okt()

In [None]:
def tokenizer(_words):
    """Split text into stemmed tokens and filter out stop words.

    Runs ``okt.pos(_words, stem=True)`` and keeps the word of every
    (word, tag) pair whose word is not in the module-level ``stop_words``.

    Args:
        _words: Text to tokenize.

    Returns:
        List of the retained words, in tagger order.
    """
    kept = []
    for morpheme, _tag in okt.pos(_words, stem=True):
        if morpheme in stop_words:
            continue
        kept.append(morpheme)
    return kept

In [None]:
# Tokenize every sentence in column '0'; the result is one token list per row.
X_train = list(map(tokenizer, train['0']))
# Preview the first five token lists.
X_train[:5]

# GloVe 모델

In [None]:
from glove import Corpus, Glove

In [None]:
# !pip install glove_python_binary

In [None]:
# Build the corpus: fit a Corpus on the tokenized sentences with window=20.
# The fitted corpus exposes `matrix` and `dictionary`, used by the training cell.
corpus = Corpus()
corpus.fit(X_train, window=20)

In [None]:
# Train the GloVe model: 128-dimensional vectors, 50 epochs, 4 threads.
# The `%time` magic reports how long the fit takes.
glove = Glove(no_components=128, learning_rate=0.01)     # 0.05
%time glove.fit(corpus.matrix, epochs=50, no_threads=4, verbose=False)    # Wall time: 8min 32s
# Attach the corpus dictionary to the model (read back later as `dictionary`).
glove.add_dictionary(corpus.dictionary)

# save (disabled variant that writes under DATA_DIR; DATA_DIR is not defined in this notebook)
# glove.save(DATA_DIR + '/glove_w20_epoch50.model')

In [None]:
# Persist the trained model to the working directory.
glove.save('glove_w20_epoch50.model')

In [None]:
# Load the saved GloVe model back from disk; later cells use `glove_model`
# rather than the in-memory `glove` object.
glove_model = Glove.load('glove_w20_epoch50.model')

In [None]:
# Build the word -> embedding lookup: `glove_model.dictionary` maps each word
# to an index, which selects that word's entry in `glove_model.word_vectors`.
# Iterating items() avoids re-indexing the dictionary for every word.
word_dict = {
    word: glove_model.word_vectors[index]
    for word, index in glove_model.dictionary.items()
}
print('[Success !] Lengh of word dict... : ', len(word_dict))

In [None]:
# Spot-check: show the embedding stored for one word.
# Raises KeyError if the word is not in the GloVe dictionary.
word_dict['우울증']

In [None]:
def sent2vec_glove(tokens, embedding_dim=128):
    """Embed tokenized sentences by averaging their word vectors.

    Each sentence becomes the mean of the vectors found in the module-level
    ``word_dict`` for its tokens. Tokens missing from ``word_dict`` are
    ignored; a sentence with no known token keeps an all-zero row.

    Args:
        tokens: Sequence of sentences, each an iterable of token strings.
        embedding_dim: Length of the word vectors (width of the result).

    Returns:
        ``numpy.ndarray`` of shape ``(len(tokens), embedding_dim)``.
    """
    n_sentences = len(tokens)
    sentence_matrix = np.zeros((n_sentences, embedding_dim))
    lookup = word_dict     # GloVe word -> vector table

    for row, sentence in enumerate(tokens):
        known_vectors = [lookup[tok] for tok in sentence if tok in lookup]
        stacked = np.array(known_vectors)

        # Nothing to average: leave the zero row in place.
        if stacked.size == 0:
            continue

        sentence_matrix[row] = np.mean(stacked, axis=0)

    return sentence_matrix

In [None]:
# sklearn
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
# 시각화
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Sentence embedding: one averaged GloVe vector per tokenized sentence
# (default embedding_dim=128 matches no_components used for training).
sentence_glove = sent2vec_glove(X_train)
# Show the resulting matrix shape.
sentence_glove.shape

# k-means

In [None]:
from tqdm import tqdm

In [None]:
# Find the optimal K: pick the number of clusters with the elbow method.
from sklearn import metrics
from scipy.spatial.distance import cdist

# Distortion = mean Euclidean distance from each sentence vector to its
# nearest cluster centre, computed for k = 2 .. 49.
distortions = []
K = range(2, 50)
tqdm.pandas()

for k in K:
    # Fit each model exactly once; a second fit on the same data with the same
    # random_state only repeats the work.
    k_means = KMeans(n_clusters=k, random_state=42).fit(sentence_glove)
    distortions.append(
        np.min(cdist(sentence_glove, k_means.cluster_centers_, 'euclidean'), axis=1).mean()
    )

    print('Found distortion for {} clusters'.format(k))

# Visualization
# Straight line between the first and last points, as a reference for
# spotting the elbow.
X_line = [K[0], K[-1]]
Y_line = [distortions[0], distortions[-1]]

sns.set(rc={'figure.figsize':(20,20)})
# Plot the elbow
plt.plot(K, distortions, 'b-')
plt.plot(X_line, Y_line, 'r')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('Optimal K')
plt.savefig("Optimal_K.png")
plt.show()

In [None]:
# Clustering: assign every sentence vector to one of k clusters.
k = 20
kmeans = KMeans(n_clusters=k, random_state=2021)
y_pred = kmeans.fit_predict(sentence_glove)

# t-SNE: project the sentence vectors to 2-D for plotting.
tsne = TSNE(verbose=1, perplexity=100, random_state=2021)     # perplexity: original note says "degree of similarity"
X_embedded = tsne.fit_transform(sentence_glove)
print('Embedding shape 확인', X_embedded.shape)

# Visualization
sns.set(rc={'figure.figsize':(20,20)})
# One colour per cluster; sized from k so it stays in sync if k changes.
palette = sns.hls_palette(k, l=.4, s=.9)
# x and y must be passed by keyword: recent seaborn releases reject
# positional data vectors.
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], hue=y_pred,
                legend='full', palette=palette)     # coloured by KMeans prediction

plt.title('t-SNE with KMeans Labels and Glove Embedding')
plt.savefig("t-sne_question_glove_embedding.png")
plt.show()

In [None]:
# Attach the KMeans cluster label of each sentence as a new 'result' column.
# y_pred is assigned by position, so it must have one entry per row of `train`.
train['result'] = y_pred

In [None]:
# Display the frame with the cluster labels added.
train

In [None]:
# Save the labelled sentences; 'utf-8-sig' writes a BOM at the start of the file.
# NOTE(review): presumably so spreadsheet tools detect the encoding of the Korean text — confirm.
train.to_csv('glove-kmeans.csv', encoding = 'utf-8-sig')

In [None]:
# Sorted copy of the labelled frame, ordered by cluster label ('result').
a = train.sort_values(by='result')

In [None]:
# Save the cluster-sorted frame to CSV (UTF-8 with BOM).
a.to_csv('kmeans20_result.csv', encoding = 'utf-8-sig')