## 8-7. 문서 군집화 소개와 실습(Opinion Review 데이터 세트)
비슷한 텍스트 구성의 문서를 군집화. 동일한 군집에 속하는 문서를 같은 카테고리 소속으로 분류. 비지도 학습 기반 동작.

 ### Opinion Review 데이터 세트를 이용한 문서 군집화 수행하기

In [1]:
import pandas as pd
import glob, os

from google.colab import drive

# Mount Google Drive so the review files stored under MyDrive can be read.
drive.mount('/content/drive')

# Directory that holds the Opinion Review topic files; every *.data file is loaded.
path = r"/content/drive/MyDrive/ESAA/topics"
all_files = glob.glob(os.path.join(path, '*.data'))
filename_list = []
opinion_text = []

# Read each topic file and keep its bare name next to its full text.
for file_ in all_files:
    df = pd.read_table(file_, index_col=None, header=0, encoding='latin1')
    # os.path.basename follows the platform's path separator, whereas
    # splitting on '/' only strips the directory on POSIX-style paths.
    filename_ = os.path.basename(file_)
    # Keep only the part before the first dot as the document name.
    filename = filename_.split('.')[0]
    filename_list.append(filename)
    # The whole table is rendered as one string, i.e. one document per file.
    opinion_text.append(df.to_string())

# One row per file: its name and the concatenated review text.
document_df = pd.DataFrame({'filename': filename_list, 'opinion_text': opinion_text})
document_df.head()

Mounted at /content/drive


Unnamed: 0,filename,opinion_text
0,display_garmin_nuvi_255W_gps,...
1,features_windows7,...
2,fonts_amazon_kindle,...
3,accuracy_garmin_nuvi_255W_gps,...
4,comfort_toyota_camry_2007,...


In [2]:
from nltk.stem import WordNetLemmatizer
import nltk
import string
# Download the NLTK 'punkt' and 'wordnet' packages.
nltk.download('punkt')
nltk.download('wordnet')

# Translation table for str.translate: each punctuation code point maps to
# None, which deletes that character.
remove_punct_dict = {ord(symbol): None for symbol in string.punctuation}
lemmar = WordNetLemmatizer()

def LemTokens(tokens):
    """Return a list with every item of *tokens* passed through ``lemmar.lemmatize``."""
    return list(map(lemmar.lemmatize, tokens))

def LemNormalize(text):
    """Lower-case *text*, delete punctuation, tokenize it and lemmatize the tokens.

    Punctuation is removed with the module-level ``remove_punct_dict`` table,
    tokenization is done by ``nltk.word_tokenize`` and the result is whatever
    ``LemTokens`` returns for those tokens.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    words = nltk.word_tokenize(cleaned)
    return LemTokens(words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer using the lemmatizing tokenizer, English stop words and
# unigrams plus bigrams, with document-frequency cut-offs of 0.05 and 0.85.
tfidf_vect = TfidfVectorizer(
    tokenizer=LemNormalize,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.05,
    max_df=0.85,
)
# Vectorize the opinion_text column into the feature matrix.
feature_vect = tfidf_vect.fit_transform(document_df['opinion_text'])



In [4]:
from sklearn.cluster import KMeans

# Cluster the documents into 5 groups; fit() returns the fitted estimator.
km_cluster = KMeans(n_clusters=5, max_iter=10000, random_state=0).fit(feature_vect)
cluster_centers = km_cluster.cluster_centers_
cluster_label = km_cluster.labels_



In [5]:
# Store the cluster_label array as a column of document_df, then preview the first rows.
document_df['cluster_label']=cluster_label
document_df.head()

Unnamed: 0,filename,opinion_text,cluster_label
0,display_garmin_nuvi_255W_gps,...,0
1,features_windows7,...,3
2,fonts_amazon_kindle,...,3
3,accuracy_garmin_nuvi_255W_gps,...,0
4,comfort_toyota_camry_2007,...,1


In [7]:
# Documents assigned to cluster 0, ordered by file name.
document_df.loc[document_df['cluster_label'] == 0].sort_values('filename')

Unnamed: 0,filename,opinion_text,cluster_label
3,accuracy_garmin_nuvi_255W_gps,...,0
10,directions_garmin_nuvi_255W_gps,...,0
0,display_garmin_nuvi_255W_gps,...,0
24,satellite_garmin_nuvi_255W_gps,...,0
48,speed_garmin_nuvi_255W_gps,...,0
49,updates_garmin_nuvi_255W_gps,...,0
46,voice_garmin_nuvi_255W_gps,...,0


In [8]:
# Documents assigned to cluster 1, ordered by file name.
document_df.loc[document_df['cluster_label'] == 1].sort_values('filename')

Unnamed: 0,filename,opinion_text,cluster_label
14,comfort_honda_accord_2008,...,1
4,comfort_toyota_camry_2007,...,1
5,gas_mileage_toyota_camry_2007,...,1
15,interior_honda_accord_2008,...,1
30,interior_toyota_camry_2007,...,1
25,mileage_honda_accord_2008,...,1
38,performance_honda_accord_2008,...,1
23,quality_toyota_camry_2007,...,1
33,seats_honda_accord_2008,...,1
50,transmission_toyota_camry_2007,...,1


In [9]:
# Documents assigned to cluster 2, ordered by file name.
document_df.loc[document_df['cluster_label'] == 2].sort_values('filename')

Unnamed: 0,filename,opinion_text,cluster_label
6,bathroom_bestwestern_hotel_sfo,...,2
20,room_holiday_inn_london,...,2
19,rooms_bestwestern_hotel_sfo,...,2
28,rooms_swissotel_chicago,...,2


In [10]:
# Documents assigned to cluster 3, ordered by file name.
document_df.loc[document_df['cluster_label'] == 3].sort_values('filename')

Unnamed: 0,filename,opinion_text,cluster_label
7,battery-life_amazon_kindle,...,3
12,battery-life_ipod_nano_8gb,...,3
11,battery-life_netbook_1005ha,...,3
9,buttons_amazon_kindle,...,3
17,eyesight-issues_amazon_kindle,...,3
1,features_windows7,...,3
2,fonts_amazon_kindle,...,3
35,keyboard_netbook_1005ha,...,3
37,navigation_amazon_kindle,...,3
41,performance_netbook_1005ha,...,3


In [11]:
# Documents assigned to cluster 4, ordered by file name.
document_df.loc[document_df['cluster_label'] == 4].sort_values('filename')

Unnamed: 0,filename,opinion_text,cluster_label
16,food_holiday_inn_london,...,4
13,food_swissotel_chicago,...,4
8,free_bestwestern_hotel_sfo,...,4
27,location_bestwestern_hotel_sfo,...,4
29,location_holiday_inn_london,...,4
36,parking_bestwestern_hotel_sfo,...,4
39,price_holiday_inn_london,...,4
22,service_bestwestern_hotel_sfo,...,4
18,service_holiday_inn_london,...,4
34,service_swissotel_hotel_chicago,...,4


In [12]:
from sklearn.cluster import KMeans

# Re-cluster the same feature matrix into 3 groups; fit() returns the fitted estimator.
km_cluster = KMeans(n_clusters=3, max_iter=10000, random_state=0).fit(feature_vect)
cluster_centers = km_cluster.cluster_centers_
cluster_label = km_cluster.labels_

# Store each document's cluster in the cluster_label column and list the rows
# sorted by that column.
document_df['cluster_label'] = cluster_label
document_df.sort_values('cluster_label')



Unnamed: 0,filename,opinion_text,cluster_label
25,mileage_honda_accord_2008,...,0
38,performance_honda_accord_2008,...,0
33,seats_honda_accord_2008,...,0
30,interior_toyota_camry_2007,...,0
23,quality_toyota_camry_2007,...,0
15,interior_honda_accord_2008,...,0
14,comfort_honda_accord_2008,...,0
50,transmission_toyota_camry_2007,...,0
5,gas_mileage_toyota_camry_2007,...,0
4,comfort_toyota_camry_2007,...,0


### 군집별 핵심 단어 추출하기
KMeans 객체의 `cluster_centers_` 속성은 각 군집을 구성하는 단어 피처가 군집 중심을 기준으로 얼마나 가깝게 위치해 있는지를 나타내는 상대 값을 제공한다. 값이 클수록 해당 단어 피처가 그 군집의 중심에 가깝다.

In [13]:
# Centroids of the fitted K-Means model; the shape printed by this cell is
# (3, 4611), i.e. one row per cluster and one column per feature.
cluster_centers = km_cluster.cluster_centers_
print('cluster_centers shape : ', cluster_centers.shape)
print(cluster_centers)

cluster_centers shape :  (3, 4611)
[[0.         0.00092551 0.         ... 0.         0.         0.        ]
 [0.         0.00099499 0.00174637 ... 0.         0.00183397 0.00144581]
 [0.01005322 0.         0.         ... 0.00706287 0.         0.        ]]


In [14]:
# Return, per cluster, its top-n key words, their centroid values and the member file names.
def get_cluster_details(cluster_model, cluster_data, feature_names, clusters_num, top_n_features=10):
    """Collect the key words and member documents of every cluster.

    Args:
        cluster_model: Object exposing a 2-D ``cluster_centers_`` array
            (one row per cluster, one column per feature).
        cluster_data: DataFrame with ``cluster_label`` and ``filename`` columns.
        feature_names: Sequence indexed by feature column, giving each feature's name.
        clusters_num: Number of clusters to report; clusters ``0 .. clusters_num - 1``.
        top_n_features: How many of the highest-valued features to keep per cluster.

    Returns:
        dict keyed by cluster number. Each value is a dict with the keys
        ``cluster``, ``top_features``, ``top_features_value`` and ``filenames``.
    """
    centers = cluster_model.cluster_centers_
    # For each centroid row: feature indices ordered from the largest centroid
    # value down to the smallest (ascending argsort, reversed along columns).
    ranked_indexes = centers.argsort()[:, ::-1]

    details = {}
    for cluster_id in range(clusters_num):
        # Indices of the top-n features of this cluster's centroid.
        best_indexes = ranked_indexes[cluster_id, :top_n_features]
        details[cluster_id] = {
            'cluster': cluster_id,
            'top_features': [feature_names[idx] for idx in best_indexes],
            # Centroid values of those same features, in the same order.
            'top_features_value': centers[cluster_id, best_indexes].tolist(),
            # File names of the rows labelled with this cluster.
            'filenames': cluster_data.loc[
                cluster_data['cluster_label'] == cluster_id, 'filename'
            ].values.tolist(),
        }

    return details

In [15]:
def print_cluster_details(clsuter_details):
    """Print a short report for every cluster in the given details mapping.

    Args:
        clsuter_details: Mapping of cluster number to a dict that has at least
            the keys ``top_features`` and ``filenames`` (the structure built
            by ``get_cluster_details``). The parameter name keeps its original
            spelling so existing keyword callers continue to work.

    For each cluster this prints a header line, the top features, the first
    seven file names and a separator line. Nothing is returned.
    """
    # Iterate the argument itself. The previous body read a global named
    # ``cluster_details``, which ignored the argument and raised NameError
    # whenever that global did not exist.
    for cluster_num, cluster_detail in clsuter_details.items():
        print('###### Cluster {0}'.format(cluster_num))
        print('Top features:', cluster_detail['top_features'])
        # Only the first 7 file names are shown to keep the report short.
        print('Reviews 파일명 :', cluster_detail['filenames'][:7])
        print('==================================================')

In [17]:
# Feature (term) names produced by the fitted TF-IDF vectorizer.
feature_names = tfidf_vect.get_feature_names_out()

# Build the per-cluster summary for the 3-cluster model (top 10 features each)
# and print it.
cluster_details = get_cluster_details(
    cluster_model=km_cluster,
    cluster_data=document_df,
    feature_names=feature_names,
    clusters_num=3,
    top_n_features=10,
)
print_cluster_details(cluster_details)

###### Cluster 0
Top features: ['interior', 'seat', 'mileage', 'comfortable', 'gas', 'gas mileage', 'transmission', 'car', 'performance', 'quality']
Reviews 파일명 : ['comfort_toyota_camry_2007', 'gas_mileage_toyota_camry_2007', 'comfort_honda_accord_2008', 'interior_honda_accord_2008', 'quality_toyota_camry_2007', 'mileage_honda_accord_2008', 'interior_toyota_camry_2007']
###### Cluster 1
Top features: ['room', 'hotel', 'service', 'staff', 'food', 'location', 'bathroom', 'clean', 'price', 'parking']
Reviews 파일명 : ['bathroom_bestwestern_hotel_sfo', 'free_bestwestern_hotel_sfo', 'food_swissotel_chicago', 'food_holiday_inn_london', 'service_holiday_inn_london', 'rooms_bestwestern_hotel_sfo', 'room_holiday_inn_london']
###### Cluster 2
Top features: ['screen', 'battery', 'keyboard', 'battery life', 'life', 'kindle', 'direction', 'video', 'size', 'voice']
Reviews 파일명 : ['display_garmin_nuvi_255W_gps', 'features_windows7', 'fonts_amazon_kindle', 'accuracy_garmin_nuvi_255W_gps', 'battery-life_a