In [8]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import weight_norm
from torch.utils.data import Dataset, DataLoader
import os
import sys
#import umap
import umap.umap_ as umap
from sklearn import mixture
import easydict
import datetime
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from google.colab import files
from sklearn.cluster import KMeans, SpectralClustering
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import StandardScaler
from matplotlib import font_manager, rc
import warnings; warnings.filterwarnings("always"); warnings.filterwarnings(action='ignore')


In [9]:
# Load the clustering table and keep only the columns used downstream, in a
# fixed order (creditDataloader slices its features by position).
CSVDATA = pd.read_csv(
    '/content/clustering_data_1901_2004.csv',
    index_col=0,  # first CSV column is the saved index; keeps 'Unnamed: 0' out of the frame
)[[
    'year', 'month', '가맹점소재지2', '성별', '연령대별', '연평균소득추정',
    '가구 및 전자 제품', '교육', '교통', '기타',
    '대형 판매', '보험', '서비스', '식료품', '여행 및 숙박', '오락 및 문화', '음식점', '의료',
    '의류 및 잡화', '자동차', '전자상거래', '종합소매', '예외처리', '합계%',
]]

In [None]:
# Seed the NumPy and PyTorch generators so a run can be repeated.
# NOTE(review): the following cell assigns SEED = 2021 and re-seeds both
# libraries, so when the notebook is run top to bottom this value is
# overridden before any random numbers are drawn.
SEED = 87
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    # Also seed the current CUDA device's generator.
    torch.cuda.manual_seed(SEED)

In [None]:
# Seed the NumPy and PyTorch generators; this is the last seeding before the
# model classes and the training cell below.
SEED = 2021
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    # Also seed the current CUDA device's generator.
    torch.cuda.manual_seed(SEED)

class creditDataloader(Dataset):
    """Map-style dataset over a positional slice of feature columns.

    Despite the name this is a ``Dataset`` (it is what gets handed to a
    ``DataLoader``). Each item is one row of the columns at positions 6..21
    of the frame passed in, returned as a NumPy vector.
    """

    def __init__(self, csv):
        # For the original data the slice is 4:21 (including the '예외처리' column).
        feature_frame = csv.iloc[:, 6:22]
        self.data = feature_frame.to_numpy()

    def __len__(self):
        # Number of rows in the feature matrix.
        return self.data.shape[0]

    def __getitem__(self, idx):
        # One row of the feature matrix.
        return self.data[idx]

class Autoencoder(nn.Module):
    """Fully connected autoencoder kept as one flat list of layers.

    ``numLayers`` lists the layer widths from the input down to the
    bottleneck, e.g. ``[16, 300, 300, 600, 5]``. The encoder is Linear+ReLU
    for every hidden width followed by a plain Linear into the bottleneck.
    Unless ``encoders`` is true, a mirrored decoder (Linear+ReLU back up
    through the widths, ending in a plain Linear to the input width) is
    appended after it.
    """

    def __init__(self, numLayers, encoders=False):
        super().__init__()
        self.layers = nn.ModuleList()

        # Encoder half -- built identically whether or not a decoder follows.
        for fan_in, fan_out in zip(numLayers[:-2], numLayers[1:-1]):
            self.layers.extend([nn.Linear(fan_in, fan_out), nn.ReLU()])
        self.layers.append(nn.Linear(numLayers[-2], numLayers[-1]))

        if encoders:
            return

        # Decoder half: walk the widths back from the bottleneck to index 1,
        # then finish with an un-activated projection to the input width.
        for pos in reversed(range(2, len(numLayers))):
            self.layers.extend([nn.Linear(numLayers[pos], numLayers[pos - 1]), nn.ReLU()])
        self.layers.append(nn.Linear(numLayers[1], numLayers[0]))

    def forward(self, x):
        """Apply every stored layer in order and return the result."""
        y = x
        for layer in self.layers:
            y = layer(y)
        return y


def Clu_model(hl, cluster, n_components=3, random_state=20205289):
    """Cluster latent codes: UMAP projection followed by a Gaussian mixture.

    Args:
        hl: torch tensor of latent codes (expected 2-D, samples x latent
            dimensions); it is detached and copied to the CPU here.
        cluster: number of Gaussian mixture components to fit.
        n_components: dimensionality of the UMAP embedding (default 3).
        random_state: seed passed to both UMAP and the Gaussian mixture.

    Returns:
        ``(y_predict, hle)`` -- the index of the most probable mixture
        component for every sample, and the UMAP embedding the mixture was
        fitted on.
    """
    # .detach() is the supported way to drop autograd tracking (replaces the
    # legacy `.data` attribute) before handing the values to NumPy.
    latent = hl.detach().cpu().numpy()
    hle = umap.UMAP(random_state=random_state, n_components=n_components).fit_transform(latent)
    # Clustering on the new manifold of the autoencoded embedding.
    gmm = mixture.GaussianMixture(
        covariance_type="full",
        n_components=cluster,
        random_state=random_state,
    ).fit(hle)
    y_pred_prob = gmm.predict_proba(hle)
    y_predict = y_pred_prob.argmax(1)
    return y_predict, hle

In [None]:
if __name__ == '__main__':

    def train(model, optimizer, loss_fn, train_loader, n_epochs, device):
        """Train `model` to reconstruct its own input.

        Args:
            model: network called as ``model(batch)``.
            optimizer: optimizer already bound to ``model``'s parameters.
            loss_fn: callable ``loss_fn(outputs, inputs)`` returning a scalar tensor.
            train_loader: iterable of batches (tensors) to train on.
            n_epochs: number of passes over ``train_loader``.
            device: device each batch is moved to before the forward pass.

        Returns:
            The same ``model`` object, trained in place. The mean batch loss
            is printed after every epoch.
        """
        for epoch in range(n_epochs):
            loss_train = 0.0
            for data in train_loader:
                # Move and cast in one step; float32 matches the model's
                # parameters and avoids copy-constructing a tensor from a tensor.
                data = data.to(device=device, dtype=torch.float32)
                data = data.view(data.shape[0], -1)
                outputs = model(data)
                loss = loss_fn(outputs, data)
                loss_train += loss.item()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            print('{} Epoch {}, Training loss {}'.format(datetime.datetime.now(), epoch + 1, loss_train / len(train_loader)))
        return model

    for cl in [5, 7]:  # tunable: cluster counts to try
        args = easydict.EasyDict({
            "epochs": 30,  # tunable
            "ae_weights": None,
            "data": CSVDATA,
            "gpu": 0,
            "n_clusters": cl,
            "batch_size": 64})  # tunable: batch size, 32 ~ 64

        dataset = creditDataloader(CSVDATA)
        trainLoader = DataLoader(dataset=dataset, batch_size=args.batch_size, shuffle=True, num_workers=16)

        print(trainLoader)

        device = (torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))
        # tunable; for the original data the input width is 17 instead of 16
        # (the '예외처리' column included).
        layer_sizes = [16, 300, 300, 600, args.n_clusters]
        net = Autoencoder(numLayers=layer_sizes)

        optimizer = torch.optim.Adam(net.parameters(), lr=0.001)  # alternative: torch.optim.AdamW(net.parameters(), lr=0.001)
        net.to(device)

        # Alternative loss: nn.SmoothL1Loss
        trained_model = train(net, optimizer, loss_fn=nn.MSELoss(), train_loader=trainLoader, n_epochs=args.epochs, device=device)

        # The encoder half of Autoencoder is (Linear, ReLU) per hidden width
        # plus one final Linear into the bottleneck. Derive that count from
        # the size list instead of hard-coding 7, so changing `layer_sizes`
        # cannot silently cut the encoder at the wrong place.
        n_encoder_layers = 2 * (len(layer_sizes) - 2) + 1
        encoder = nn.Sequential(*list(net.layers)[:n_encoder_layers])
        encoder.to(device)

        # One batch containing the whole dataset, in its original row order,
        # so the saved latent codes / labels line up with CSVDATA's rows.
        trainloader_GMM = DataLoader(dataset=dataset, batch_size=len(dataset), shuffle=False, num_workers=16)
        for data in trainloader_GMM:
            data = data.to(device=device, dtype=torch.float32)
            data = data.view(data.shape[0], -1)
            # Inference only: no autograd graph is needed for the encoding pass.
            with torch.no_grad():
                hl = encoder(data)
            latent = hl.detach().cpu().numpy()
            #pd.DataFrame(latent).to_csv("latent_{}.csv".format(args.n_clusters))
            pd.DataFrame(latent).to_csv("ulatent_epochs{0}_batchsize{1}_cluster{2}.csv".format(args.epochs,args.batch_size,args.n_clusters))
            print('살려라')  # progress marker before the (slow) UMAP + GMM step
            pred, hle = Clu_model(hl, args.n_clusters)

        # Persist this run's cluster labels and UMAP embedding.
        save1 = pd.DataFrame(pred)
        save2 = pd.DataFrame(hle)
        save1.to_csv('upred_epochs{0}_batchsize{1}_cluster{2}.csv'.format(args.epochs,args.batch_size,args.n_clusters))
        save2.to_csv('uhle_epochs{0}_batchsize{1}_cluster{2}.csv'.format(args.epochs,args.batch_size,args.n_clusters))

In [None]:
# Write the labels (save1) and embedding (save2) currently in memory again,
# this time under file names without the 'u' prefix. `args` is whatever the
# training cell assigned last.
_run_id = (args.epochs, args.batch_size, args.n_clusters)
save1.to_csv('pred_epochs{0}_batchsize{1}_cluster{2}.csv'.format(*_run_id))
save2.to_csv('hle_epochs{0}_batchsize{1}_cluster{2}.csv'.format(*_run_id))

In [None]:
import pandas as pd

# Reload saved cluster labels and embedding from disk.
# NOTE(review): the file names are fixed to an 8-cluster run, while the
# training cell above loops over [5, 7] -- confirm these files exist.
pred = pd.read_csv('pred_epochs30_batchsize64_cluster8.csv')
hle = pd.read_csv('hle_epochs30_batchsize64_cluster8.csv')

print(len(pred), len(hle))

# Discard the first column of each file (presumably the index that to_csv
# wrote) and continue with plain arrays: labels as a 1-D vector, embedding 2-D.
pred = pred.iloc[:, 1:].to_numpy()[:, 0]
hle = hle.iloc[:, 1:].to_numpy()

In [None]:
""" 클러스터링 2차원 시각화 """

import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (10,6)

# Embedding coordinates as columns 0, 1, ... plus the cluster label; only the
# first two embedding dimensions are drawn, coloured by label.
visual = pd.DataFrame(hle).assign(pred=pred)
visual.plot.scatter(x=0, y=1, c='pred', colormap='viridis')

In [None]:
""" 클러스터 특성 시각화 """

# Attach the cluster label of every row to the source table (in place);
# `pred` needs one entry per row of CSVDATA, in the same row order.
CSVDATA['pred'] = pred
# Disabled: alternative English column names, kept for reference only.
#CSVDATA.columns = ['가맹점소재지2', '성별', '연령대별', '연평균소득추정', 'furniture and appliance', 'education', 'transportation', 'etc',
#       'mart', 'insurance', 'service', '수지하투', 'grocery', 'travel', 'fun', 'restaurant', 'hospital',
#       'clothes', 'vehicle', 'e-commerce', 'retail', '합계%', 'pred']

import matplotlib.pyplot as plt
import collections
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from matplotlib import font_manager, rc

def stat_summary(df, feature):
    """Per-cluster mean and standard deviation of selected columns, long form.

    Args:
        df: DataFrame with a 'pred' column holding cluster labels and the
            columns named in `feature`.
        feature: list of column names to summarise.

    Returns:
        DataFrame with one row per (cluster, feature) pair, ordered by
        cluster first and then by the order of `feature`, with columns
        'predict' (cluster label as a string), 'column_name', 'avg' and 'std'.
    """
    features = list(feature)

    # Aggregate only the requested columns. The frame may also carry text
    # columns, and recent pandas raises instead of silently dropping them
    # when mean()/std() is applied to the whole frame.
    grouped = df.groupby('pred')[features]
    df_avg = grouped.mean()   # rows: cluster labels (sorted), columns: features
    df_std = grouped.std()

    # Use the labels that are actually present rather than assuming they are
    # exactly 0..k-1 (a cluster that received no rows would break that).
    labels = list(df_avg.index)

    dfdf = pd.DataFrame()
    dfdf['predict'] = [str(label) for label in labels for _ in features]
    dfdf['column_name'] = features * len(labels)
    # Row-major flattening yields cluster-major / feature-minor order,
    # matching the two label columns built above.
    dfdf['avg'] = df_avg.to_numpy().ravel()
    dfdf['std'] = df_std.to_numpy().ravel()

    return dfdf

def error_bar(df, feature):
    """Draw a grouped bar chart of per-cluster feature means and save it.

    One group of bars per feature, one bar per cluster. The standard
    deviations are computed and kept alongside the means, but drawing them
    as error bars is currently disabled (see the commented `yerr`).

    Args:
        df: DataFrame with a 'pred' cluster-label column and the columns in
            `feature` (passed straight to `stat_summary`).
        feature: list of column names to plot.

    Side effects:
        Sets ``plt.rcParams["figure.figsize"]`` to (30, 10), writes
        'error_bar_all.png' to the working directory and shows the figure.
    """
    summary = stat_summary(df, feature)
    summary['predict'] = summary['predict'].astype(int)

    # Unique cluster labels / feature names in first-seen order.
    predict_list = list(dict.fromkeys(summary['predict']))
    column_list = list(dict.fromkeys(summary['column_name']))

    plt.rcParams["figure.figsize"] = (30,10)

    # Wide table: one row per feature, an avg and an sd column per cluster.
    # Columns are named after the cluster label itself, and the same labels
    # are used to pick the columns to plot, so the two can never disagree.
    wide = pd.DataFrame()
    wide['feature'] = column_list
    for label in predict_list:
        rows = summary[summary['predict'] == label]
        wide['cluster_{0}_avg'.format(label)] = list(rows['avg'])
        wide['cluster_{0}_sd'.format(label)] = list(rows['std'])
    avg_cols = ['cluster_{0}_avg'.format(label) for label in predict_list]

    #tab20c
    ax = wide.plot.bar(x = 'feature',
                       y = avg_cols,
                       #yerr = wide[['cluster_{0}_sd'.format(label) for label in predict_list]].T.values,
                       rot=15, fontsize=23, ecolor='lightgrey', #capsize=7,
                       color=tuple([
                           'tomato', 'khaki',
                           'mediumseagreen', 'mediumpurple', "mediumorchid", "silver", "palegreen", 'olive']))

    # Other palette candidates:
    #     ["dodgerblue", "deepskyblue", "lightskyblue", "lightblue", "paleturquoise",
    #     "sienna", "orangered", "tomato", "coral", "darksalmon", "salmon", "lightsalmon", "mistyrose",
    #     "darkgreen", "green", "seagreen", "mediumseagreen", "darkseagreen", "lightgreen", "palegreen",
    #     "rebeccapurple", "blueviolet", "darkorchid", "mediumorchid", "mediumpurple", "plum", "thistle",
    #     "dimgrey", "grey", "darkgrey", "silver", "lightgrey", "whitesmoke"
    #     'darkgoldenrod','goldenrod','olive','darkkhaki', 'gold' ,'khaki']

    ax.legend(loc='upper center', bbox_to_anchor=(1.025, 1),
              ncol=1, fancybox=True, shadow=True)
    ax.yaxis.grid(True)

    plt.tight_layout()

    #plt.savefig('error_bar_{0}.png'.format(feature[0]), dpi=300)
    plt.savefig('error_bar_all.png', dpi=300)
    plt.show()


""" error bar 실행 """

# Spending-category columns to summarise per cluster.
#feature = ['furniture and appliance', 'education', 'transportation', 'etc','mart', 'insurance', 'service', 'grocery', 'travel', 'fun', 'hospital','clothes', 'vehicle', 'e-commerce', 'retail']
feature = [
    '가구 및 전자 제품', '교육', '교통', '기타',
    '대형 판매', '보험', '서비스', '식료품',
    '여행 및 숙박', '오락 및 문화', '음식점', '의료',
    '의류 및 잡화', '자동차', '전자상거래', '종합소매',
]

# `df` is a second name for CSVDATA (no copy is made), so later edits to
# `df` also change CSVDATA.
df = CSVDATA
error_bar(df, feature)

In [None]:
# Recode the gender column to English labels; values not listed are left as-is.
df['성별'] = df['성별'].replace(to_replace={'남성': 'male', '여성': 'female'})

In [None]:
# Collapse the eleven estimated-income bands (B1..B11) into five coarser
# groups, stored as the strings '1'..'5'; values not listed are left as-is.
df['연평균소득추정'] = df['연평균소득추정'].replace(to_replace={
    'B1': '1', 'B2': '1',
    'B3': '2', 'B4': '2',
    'B5': '3', 'B6': '3',
    'B7': '4', 'B8': '4',
    'B9': '5', 'B10': '5', 'B11': '5',
})

In [None]:
def pie_chart(cluster, feature, data=None):
  """Draw a pie chart of how `feature` is distributed within one cluster.

  Args:
    cluster: cluster label to select (rows where ``pred == cluster``).
    feature: name of the column whose value counts are plotted.
    data: DataFrame with a 'pred' column and `feature`; defaults to the
      notebook-level ``df`` when omitted.

  Side effects:
    Sets ``plt.rcParams["figure.figsize"]`` to (6, 6) and creates a new
    figure.
  """
  if data is None:
    data = df  # notebook-level frame, as before

  plt.rcParams["figure.figsize"] = (6,6)
  # Use a dedicated figure/axes for every call and title that axes
  # explicitly, instead of titling whichever axes pyplot considers current
  # before the plot has been created.
  _, ax = plt.subplots()
  ax.set_title('cluster {0}'.format(cluster))
  data[data.pred==cluster][feature].value_counts().plot.pie(autopct='%.1f', ax=ax)