# 2차 클러스터링 

## 필요한 파일, 라이브러리 로드

In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.cluster import KMeans

In [None]:
# Load the review table that the later cells read from and add columns to.
realfinal_kmeans = pd.read_csv(filepath_or_buffer='realfinal_kmeans.csv')

In [None]:
# Original author's note: x and y are the sentence vectors embedded in 2-D;
# xy is those two values reduced to a single dimension.
realfinal_kmeans

## 차원축소

In [None]:
from sklearn.decomposition import PCA
# PCA projector that keeps two components; it is fitted in a later cell.
pca = PCA(n_components=2)

In [None]:
# Feature table for the second clustering pass: the 1-D text embedding (`xy`)
# plus five context columns. Going by their names these are rating, watched
# flag, spoiler flag, date difference and comment length (TODO confirm).
realfinal_kmeans2 = realfinal_kmeans[
    [
        'xy',
        '평점',
        '관람여부',
        '스포여부',
        '날짜차이',
        '댓글길이',
    ]
]

In [None]:
# Fit the PCA on the six-column feature table and project it in one step,
# then print the shape of the result followed by the projected values.
data = pca.fit_transform(realfinal_kmeans2)
print(data.shape, data, sep='\n')

In [None]:
# Show the share of total variance captured by each fitted PCA component.
pca.explained_variance_ratio_

## 군집화

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Three-cluster K-Means; the fixed seed keeps the labels repeatable.
kmeans = KMeans(
    n_clusters=3,
    random_state=10,
)

In [None]:
# Fit the K-Means model on the array currently bound to `data`
# (the PCA projection when the notebook is run top to bottom).
kmeans.fit(data)

In [None]:
# Assign every projected row to its nearest fitted centroid and show the
# resulting label array.
cluster = kmeans.predict(data)
cluster

In [None]:
# Tabular view of the projection: one column per PCA component plus the
# cluster label of each row.
df = pd.DataFrame(data, columns=['PCA1', 'PCA2']).assign(cluster=cluster)

## k 찾기

In [None]:
# IPython shell magic: installs yellowbrick into the notebook environment.
# NOTE(review): yellowbrick is not referenced in the visible cells - confirm
# it is still needed.
!pip install yellowbrick

In [None]:
# Elbow method: fit K-Means for k = 1..9 and record each fit's inertia so the
# bend in the curve can guide the choice of k.
ks = range(1, 10)
inertias = []

for k in ks:
    # Use the same fixed seed as the main KMeans cells; without it every run
    # starts from different random centroids and the curve is not
    # reproducible.
    model = KMeans(n_clusters=k, random_state=10)
    model.fit(realfinal_kmeans2)
    inertias.append(model.inertia_)

# NOTE(review): this sweep runs on the raw feature table, while the
# clustering cells fit on the PCA output `data` - confirm that is intended.

# Plot ks vs inertias
plt.figure(figsize=(4, 4))

plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

## 시각화

In [None]:
import seaborn as sns

In [None]:
# Scatter of the two PCA components coloured by cluster label; the
# regression line is switched off so only the points are drawn.
sns.lmplot(
    data=df,
    x='PCA1',
    y='PCA2',
    hue='cluster',
    fit_reg=False,
)

## 라벨링 붙이기

In [None]:
# Store the cluster labels on the main table.
# Original author's note: context info and comment text were used together.
realfinal_kmeans['라벨'] = cluster

In [None]:
# Label distribution among the rows whose origin_index equals 3000000.
# Original author's note: 461 of the 500 fake reviews fell into one cluster.
realfinal_kmeans.loc[
    realfinal_kmeans['origin_index'] == 3000000,
    '라벨',
].value_counts()

In [None]:
# Write the PCA coordinates and cluster labels to disk without the index.
df.to_csv(path_or_buf='context_embedding_pca.csv', index=False)

In [None]:
# Write the main table, including the columns added above, without the index.
realfinal_kmeans.to_csv(path_or_buf='context_embedding.csv', index=False)

In [None]:
# `fake` is not assigned in any visible cell, so a top-to-bottom run stopped
# here with NameError. Rebuild it from the rows that the label-count cell
# treats as fake reviews (origin_index == 3000000).
# NOTE(review): confirm this is the intended definition of `fake`.
fake = realfinal_kmeans[realfinal_kmeans['origin_index'] == 3000000]
fake.to_csv('fake.csv', index=False)

# 3차 클러스터링 (정황정보도 차원축소)

## 정황 컬럼을 PCA로 1차원으로 축소하기

In [None]:
# Context-only feature table (the `xy` text embedding is left out here).
realfinal_kmeans3 = realfinal_kmeans[
    [
        '평점',
        '관람여부',
        '스포여부',
        '날짜차이',
        '댓글길이',
    ]
]

In [None]:
from sklearn.decomposition import PCA
# Rebind `pca` to a single-component projector for the context columns.
pca = PCA(n_components=1)

In [None]:
# Fit the one-component PCA on the context columns and project them, then
# print the shape of the result followed by the projected values.
# Note that this rebinds `data`, replacing the earlier two-component array.
data = pca.fit_transform(realfinal_kmeans3)
print(data.shape, data, sep='\n')

In [None]:
# Show the share of total variance captured by the fitted PCA component(s).
pca.explained_variance_ratio_

In [None]:
# Display the array currently bound to `data`.
data

In [None]:
# Attach the reduced context feature to the main table as a new column.
realfinal_kmeans['정황축소'] = data

In [None]:
# Two-feature table for the third clustering pass: the 1-D text embedding
# and the 1-D reduced context feature.
# Take an explicit copy: a later cell adds a `cluster` column to this frame,
# and writing into a bare column subset of `realfinal_kmeans` triggers
# pandas' SettingWithCopyWarning.
final_df = realfinal_kmeans[['xy', '정황축소']].copy()

## 군집화&차원축소

In [None]:
# Fresh three-cluster K-Means for the third pass, seeded for repeatability.
kmeans = KMeans(
    n_clusters=3,
    random_state=10,
)

In [None]:
# Display the feature frame that the next cell clusters.
final_df

In [None]:
# Fit on the two feature columns by name. A later cell adds a `cluster`
# column to `final_df`; selecting the features explicitly keeps that label
# from being fed back in as a third feature if this cell is re-run.
kmeans.fit(final_df[['xy', '정황축소']])

In [None]:
# Take the labels assigned during fitting and record them next to the
# features they were computed from.
cluster = kmeans.labels_
final_df['cluster'] = cluster

## 시각화

In [None]:
# Scatter of text embedding against reduced context feature, coloured by
# cluster label; the regression line is switched off.
sns.lmplot(
    data=final_df,
    x='xy',
    y='정황축소',
    hue='cluster',
    fit_reg=False,
)

In [None]:
# Store the current label array on the main table under `cluster2`.
realfinal_kmeans['cluster2'] = cluster

In [None]:
# Copy the `cluster` column of `final_df` onto the main table; pandas aligns
# the values by index on assignment.
realfinal_kmeans['라벨3'] = final_df.loc[:, 'cluster']

In [None]:
# Third-pass label distribution among rows whose origin_index is 3000000.
# Original author's note: 461 of the 500 fake reviews fell into one cluster.
realfinal_kmeans.loc[
    realfinal_kmeans['origin_index'] == 3000000,
    '라벨3',
].value_counts()

In [None]:
# The original built this frame from `data`, but in a top-to-bottom run
# `data` is by now the single-column output of PCA(n_components=1), so
# passing two column names raised ValueError. The two features this
# clustering was fitted on are `xy` and `정황축소`; they are exposed here
# as PCA1 / PCA2.
# NOTE(review): confirm the PCA1/PCA2 naming is what downstream code expects.
df_final = pd.DataFrame(
    final_df[['xy', '정황축소']].to_numpy(),
    columns=['PCA1', 'PCA2'],
)
df_final['cluster'] = cluster
df_final