In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
data = pd.read_csv("https://raw.githubusercontent.com/haram4th/ablearn/main/shopping-data.csv")

In [12]:
data

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
...,...,...,...,...,...
195,196,Female,35,120,79
196,197,Female,45,126,28
197,198,Male,32,126,74
198,199,Male,32,137,18


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Genre                   200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


In [14]:
data.describe(include='all')

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200,200.0,200.0,200.0
unique,,2,,,
top,,Female,,,
freq,,112,,,
mean,100.5,,38.85,60.56,50.2
std,57.879185,,13.969007,26.264721,25.823522
min,1.0,,18.0,15.0,1.0
25%,50.75,,28.75,41.5,34.75
50%,100.5,,36.0,61.5,50.0
75%,150.25,,49.0,78.0,73.0


# 계층적 군집 분석
* 비지도학습 
* 데이터가 적을 경우 사용
* 거리 계산 지표: 유클리드, 맨하탄, 코사인유사도
    * 단일연결법(single linkage)-최단연결법: 군집과 군집 사이에서 가장 가까운 데이터를 기준으로 묶음
    * 완전연결법(complete linkage)-최장연결법: 군집과 군집 사이에서 가장 먼 데이터를 기준으로 묶음.
    * 평균연결법(average linkage): 군집과 군집 사이의 모든 데이터의 평균 데이터로 계산(이상치에 덜 민감하다)
    * 중심연결법(centroid linkage): 군집의 중심점 사이의 거리를 거리로 측정한 방법, 계산이 빠르다
    * 와드연결법(ward linkage): 군집내 오차가 최소가 되는 데이터로 계산, 군집내 분산이 최소, 조밀한 군집
    
* 계층적 군집의 시각화는 덴드로그램으로 한다

In [15]:
import scipy.cluster.hierarchy as shc

In [16]:
data = pd.get_dummies(data)
data

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Genre_Female,Genre_Male
0,1,19,15,39,False,True
1,2,21,15,81,False,True
2,3,20,16,6,True,False
3,4,23,16,77,True,False
4,5,31,17,40,True,False
...,...,...,...,...,...,...
195,196,35,120,79,True,False
196,197,45,126,28,True,False
197,198,32,126,74,False,True
198,199,32,137,18,False,True


In [17]:
data2=data.copy()
data2

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Genre_Female,Genre_Male
0,1,19,15,39,False,True
1,2,21,15,81,False,True
2,3,20,16,6,True,False
3,4,23,16,77,True,False
4,5,31,17,40,True,False
...,...,...,...,...,...,...
195,196,35,120,79,True,False
196,197,45,126,28,True,False
197,198,32,126,74,False,True
198,199,32,137,18,False,True


In [18]:
data=data.drop("CustomerID", axis=1)
data

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100),Genre_Female,Genre_Male
0,19,15,39,False,True
1,21,15,81,False,True
2,20,16,6,True,False
3,23,16,77,True,False
4,31,17,40,True,False
...,...,...,...,...,...
195,35,120,79,True,False
196,45,126,28,True,False
197,32,126,74,False,True
198,32,137,18,False,True


In [19]:
plt.switch_backend('TkAgg')

In [20]:
plt.figure(figsize=(30,10))
plt.title("Single linkage Dendrogram")
dend = shc.dendrogram(shc.linkage(data, method = 'single'))
plt.show()

In [21]:
plt.figure(figsize=(30,10))
plt.title("Complete linkage Dendrogram")
dend = shc.dendrogram(shc.linkage(data, method = 'complete'))
plt.show()

In [23]:
plt.figure(figsize=(30,10))
plt.title("Centroid linkage Dendrogram")
dend = shc.dendrogram(shc.linkage(data, method = 'centroid'))
plt.show()

In [24]:
plt.figure(figsize=(30,10))
plt.title("Ward linkage Dendrogram")
dend = shc.dendrogram(shc.linkage(data, method = 'ward'))
plt.show()

In [25]:
from sklearn.cluster import AgglomerativeClustering

In [28]:
cluster = AgglomerativeClustering(n_clusters=5, metric='euclidean', linkage='single')
result=cluster.fit_predict(data)
#plt.figure(figsize=(10,5))
#plt.scatter(data[:,0], data[:,1], c=cluster.labels_, cmap='rainbow')
#plt.show()
result

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 3, 1, 2, 1, 2, 1,
       4, 1], dtype=int64)

In [30]:
data['result']=result

In [31]:
data.columns

Index(['Age', 'Annual Income (k$)', 'Spending Score (1-100)', 'Genre_Female',
       'Genre_Male', 'result'],
      dtype='object')

In [32]:
data.groupby('result')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)', 'Genre_Female',
       'Genre_Male']].mean()

Unnamed: 0_level_0,Age,Annual Income (k$),Spending Score (1-100),Genre_Female,Genre_Male
result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,40.369427,52.490446,43.101911,0.566879,0.433121
1,32.692308,86.538462,82.128205,0.538462,0.461538
2,46.0,123.0,22.0,1.0,0.0
3,33.0,113.0,8.0,0.0,1.0
4,32.0,137.0,18.0,0.0,1.0


# 비계층적 군집 - 데이터 수가 많을 때 사용
* k-means(k평균 군집): 거리를 기반으로 군집을 형성, 이상치에 민감함

In [52]:
data = pd.read_csv("https://raw.githubusercontent.com/haram4th/ADsP/main/06%EA%B3%A0%EA%B0%9D%EC%9D%B4%ED%83%88%EC%98%88%EC%B8%A1.csv", encoding='cp949')


In [54]:
data

Unnamed: 0,회원ID,성별,고연령,배우자,피부양자,가입기간,전화서비스,2회선이상,인터넷서비스,온라인보안,...,기기보호서비스,기술지원,스트리밍TV,스트리밍Movies,약정옵션,온라인고지서,지불수단,월요금,합산요금,이탈여부
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


하향식방법 vs 상향식방법

In [55]:
data.info(0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   회원ID        7043 non-null   object 
 1   성별          7043 non-null   object 
 2   고연령         7043 non-null   int64  
 3   배우자         7043 non-null   object 
 4   피부양자        7043 non-null   object 
 5   가입기간        7043 non-null   int64  
 6   전화서비스       7043 non-null   object 
 7   2회선이상       7043 non-null   object 
 8   인터넷서비스      7043 non-null   object 
 9   온라인보안       7043 non-null   object 
 10  온라인백업       7043 non-null   object 
 11  기기보호서비스     7043 non-null   object 
 12  기술지원        7043 non-null   object 
 13  스트리밍TV      7043 non-null   object 
 14  스트리밍Movies  7043 non-null   object 
 15  약정옵션        7043 non-null   object 
 16  온라인고지서      7043 non-null   object 
 17  지불수단        7043 non-null   object 
 18  월요금         7043 non-null   float64
 19  합산요금        7043 non-null  

In [56]:
data2=data.copy()

In [57]:
data=data.drop('회원ID',axis=1)
data

Unnamed: 0,성별,고연령,배우자,피부양자,가입기간,전화서비스,2회선이상,인터넷서비스,온라인보안,온라인백업,기기보호서비스,기술지원,스트리밍TV,스트리밍Movies,약정옵션,온라인고지서,지불수단,월요금,합산요금,이탈여부
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [58]:
data['합산요금'] = data['합산요금'].apply(lambda x: x.replace(' ', '0') if x == ' ' else x)

In [59]:
data['합산요금'] = data['합산요금'].astype('float')

In [61]:
data = pd.get_dummies(data, drop_first=True)

In [62]:
from sklearn.cluster import KMeans

In [63]:
km = KMeans(n_clusters=5)
km.fit(data) 
result = km.labels_
result

array([2, 0, 2, ..., 2, 2, 4])

In [64]:
data['cluster'] = result

In [65]:
data[['이탈여부_Yes', 'cluster']]

Unnamed: 0,이탈여부_Yes,cluster
0,False,2
1,False,0
2,True,2
3,False,0
4,True,2
...,...,...
7038,False,0
7039,False,4
7040,False,2
7041,True,2


In [66]:
from sklearn.metrics import accuracy_score

In [67]:
print(accuracy_score(data['이탈여부_Yes'], data['cluster']))

0.20303847792134033


In [68]:
data[data['cluster'] == 0]

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check,이탈여부_Yes,cluster
1,0,34,56.95,1889.50,True,False,False,True,False,False,...,False,False,True,False,False,False,False,True,False,0
3,0,45,42.30,1840.75,True,False,False,False,True,False,...,False,False,True,False,False,False,False,False,False,0
6,0,22,89.10,1949.40,True,False,True,True,False,True,...,False,False,False,False,True,True,False,False,False,0
16,0,52,20.65,1022.95,False,False,False,True,False,False,...,True,False,True,False,False,False,False,True,False,0
19,0,21,90.05,1862.90,False,False,False,True,False,False,...,False,True,False,False,True,False,True,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7017,0,51,20.65,1020.75,False,False,False,True,False,False,...,True,False,False,True,False,False,False,False,False,0
7025,0,18,95.05,1679.40,False,False,False,True,False,True,...,False,True,False,False,True,False,False,False,False,0
7035,0,19,78.70,1495.10,True,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,0
7037,0,72,21.15,1419.40,False,False,False,True,False,False,...,True,False,False,True,True,False,False,False,False,0


In [69]:
cluster1 = data[data['cluster'] == 0]
cluster1

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check,이탈여부_Yes,cluster
1,0,34,56.95,1889.50,True,False,False,True,False,False,...,False,False,True,False,False,False,False,True,False,0
3,0,45,42.30,1840.75,True,False,False,False,True,False,...,False,False,True,False,False,False,False,False,False,0
6,0,22,89.10,1949.40,True,False,True,True,False,True,...,False,False,False,False,True,True,False,False,False,0
16,0,52,20.65,1022.95,False,False,False,True,False,False,...,True,False,True,False,False,False,False,True,False,0
19,0,21,90.05,1862.90,False,False,False,True,False,False,...,False,True,False,False,True,False,True,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7017,0,51,20.65,1020.75,False,False,False,True,False,False,...,True,False,False,True,False,False,False,False,False,0
7025,0,18,95.05,1679.40,False,False,False,True,False,True,...,False,True,False,False,True,False,False,False,False,0
7035,0,19,78.70,1495.10,True,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,0
7037,0,72,21.15,1419.40,False,False,False,True,False,False,...,True,False,False,True,True,False,False,False,False,0


Elbow method: inertia_(응집도)를 출력하고 시각화 한뒤 응집도의 감소폭이 급격한 변화가 사라지는 지점을 최적 군집의 수로 채택하는 것

In [70]:
print(km.inertia_)

1330299698.5389411


In [71]:
distance=[]
for i in range(1,21):
    km=KMeans(n_clusters=i)
    km.fit(data)
    distance.append(km.inertia_)
distance

[36194983012.77158,
 7935386722.034387,
 3706749365.289404,
 2117700648.2745929,
 1330880292.9130325,
 920881835.7943147,
 672298214.8165377,
 527379586.83259696,
 417121475.7176914,
 330099747.3525141,
 270671546.3646239,
 237835436.32074022,
 197488750.39418453,
 169063469.32123616,
 149613047.24520317,
 133407427.0305265,
 121051154.01777913,
 108199713.60658118,
 94405108.7546079,
 87090402.56285428]

In [73]:
plt.figure(figsize=(20,10))
sns.lineplot(x=range(1,21),y=distance, markers='o',markersize=10,markerfacecolor='red')
plt.show()

실루엣 지수를 출력해서 최적 군집 개수 구하기
* 실루엣 지수를 구하기 위해서는 최소 2개 이상의 군집이 필요

In [75]:
from sklearn.metrics import silhouette_score

In [76]:
silhouette_scores=[]
for i in range(2,21):
    km2 = KMeans(n_clusters=i)
    km2.fit(data)
    labels=km2.labels_
    silhouette_scores.append(silhouette_score(data, labels))
silhouette_scores

[0.7031208633546057,
 0.6447861695984312,
 0.602124604907737,
 0.596883069223226,
 0.59498384829488,
 0.5844653042817637,
 0.5698416866541225,
 0.5641868645858997,
 0.56291720632056,
 0.5619124283365252,
 0.5615737736941645,
 0.5494232168773625,
 0.5505737641873526,
 0.5414772451845266,
 0.5352784115300979,
 0.5353179928287777,
 0.5334048549205427,
 0.5295927919413034,
 0.5232303557367441]

In [79]:
plt.figure(figsize=(20,10))
sns.lineplot(x=range(2,21),y=silhouette_scores, markers='o',markersize=10, markerfacecolor='red')
plt.show()

# K-means로 군집분석을 할 때 주으점
* k_means는 평균값을 이용하고 주로 유클리드 거리를 사용하기 때문에 이상치에 민감
* 군집분석하기 전에 데이터의 스케일을 미리 맞춰주는 것이 중요

In [80]:
data3=data.drop('cluster', axis=1)

In [81]:
data3

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍TV_Yes,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check,이탈여부_Yes
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,False,True,False,True,False,False
1,0,34,56.95,1889.50,True,False,False,True,False,False,...,False,False,False,True,False,False,False,False,True,False
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,False,True,False,False,True,True
3,0,45,42.30,1840.75,True,False,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
4,0,2,70.70,151.65,False,False,False,True,False,False,...,False,False,False,False,False,True,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,1990.50,True,True,True,True,False,True,...,True,False,True,True,False,True,False,False,True,False
7039,0,72,103.20,7362.90,False,True,True,True,False,True,...,True,False,True,True,False,True,True,False,False,False
7040,0,11,29.60,346.45,False,True,True,False,True,False,...,False,False,False,False,False,True,False,True,False,False
7041,1,4,74.40,306.60,True,True,False,True,False,True,...,False,False,False,False,False,True,False,False,True,True


In [85]:
X=data3.drop('이탈여부_Yes', axis=1)

In [82]:
from sklearn.preprocessing import MinMaxScaler

In [86]:
mm = MinMaxScaler()
scaled_X = mm.fit_transform(data3)
scaled_X

array([[0.        , 0.01388889, 0.11542289, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.47222222, 0.38507463, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.02777778, 0.35422886, ..., 0.        , 1.        ,
        1.        ],
       ...,
       [0.        , 0.15277778, 0.11293532, ..., 1.        , 0.        ,
        0.        ],
       [1.        , 0.05555556, 0.55870647, ..., 0.        , 1.        ,
        1.        ],
       [0.        , 0.91666667, 0.86965174, ..., 0.        , 0.        ,
        0.        ]])

In [87]:
scaled_X=pd.DataFrame(scaled_X, columns=X.columns)

ValueError: Shape of passed values is (7043, 31), indices imply (7043, 30)