In [None]:
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline
import pandas as pd 

from sklearn import svm 
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Load the raw array from 'data.npy'. The name `data` is re-bound by later cells.
data = np.load('data.npy')

In [None]:
# Average over axis 1, which removes that axis from the array.
# NOTE(review): this re-binds `data`, so the cell is not idempotent — re-running
# it without reloading averages the already-reduced array again.
data = np.mean(data, axis=1)

In [None]:
# Inspect the shape after averaging over axis 1.
data.shape

In [None]:
# Persist the averaged array (np.save appends '.npy' to the name).
# The same file name is written again further down, after one row is removed.
np.save('classification_data', data)

In [None]:
# Load the lookup table; its 'ccf' column is used below as the area of each entry.
# NOTE(review): presumably one row per sample in `data` — confirm the alignment.
key = pd.read_csv('key.csv')

In [None]:
# Area value of every row, taken from the 'ccf' column (a pandas Series here).
area_list = key['ccf']

In [None]:
# Number of area entries read from the table.
len(area_list)

In [None]:
# Distinct areas in order of first appearance. dict keys keep insertion order,
# so this de-duplicates in one pass instead of re-scanning a growing list.
# Requires hashable values (e.g. strings).
area_list_unique = list(dict.fromkeys(area_list))

In [None]:
# Display the distinct areas in order of first appearance.
area_list_unique

In [None]:
# For every unique area, collect the positions in `area_list` that belong to it.
# The loop variable is deliberately NOT called `key`: that name holds the
# DataFrame loaded from 'key.csv', and shadowing it breaks re-running the cell
# that reads key['ccf'].
area_ind_list = [
    [i for i, area in enumerate(area_list) if area == area_name]
    for area_name in area_list_unique
]

In [None]:
# Number of samples found for each unique area. Iterates over whatever areas
# exist instead of assuming there are exactly 8.
area_ind_len = [len(area_ind) for area_ind in area_ind_list]

In [None]:
# Show the sample positions belonging to the unique area at index 5.
# NOTE(review): presumably this is the area whose sample (index 222) is
# removed in the following cells — confirm against the displayed output.
area_ind_list[5]

In [None]:
# Drop row 222 (along axis 0) from the data array.
# NOTE(review): 222 is a hard-coded index — presumably the lone sample of the
# area at `area_ind_list[5]`; confirm. Because `data` is re-bound, re-running
# this cell removes a further row.
data = np.delete(data, 222, axis=0)

In [None]:
# Inspect the shape after removing row 222.
data.shape

In [None]:
# Overwrite 'classification_data.npy' with the array that has row 222 removed.
np.save('classification_data', data)

In [None]:
# Convert to an ndarray so the entry can be removed positionally with np.delete.
area_list = np.asarray(area_list)

In [None]:
# Remove entry 222 so the area labels stay aligned with `data`, which had the
# same index removed. Not idempotent: re-running removes another entry.
area_list = np.delete(area_list, 222)

In [None]:
# Inspect the number of area labels after the removal.
area_list.shape

In [None]:
# Persist the per-sample area labels to 'classification_area.npy'.
# NOTE(review): if the array has object dtype (e.g. strings read by pandas),
# loading it back requires np.load(..., allow_pickle=True) — confirm the dtype.
np.save('classification_area', area_list)

In [None]:
# Drop the entry at position 5 from the unique areas; the result is an ndarray.
# Re-binds `area_list_unique`, so running this cell twice removes two entries.
area_list_unique = np.delete(np.asarray(area_list_unique), 5)

In [None]:
# Persist the remaining unique areas to 'classification_area_list_unique.npy'.
np.save('classification_area_list_unique', area_list_unique)

In [None]:
# Back to a plain Python list so that `.index()` lookups can be used below.
area_list_unique = area_list_unique.tolist()

In [None]:
# Encode every sample's area as its position in `area_list_unique`.
# This re-binds `area_ind_list`: from here on it is a flat list of integer
# labels (one per sample), no longer a list of index lists per area.
area_ind_list = [area_list_unique.index(area) for area in area_list]

In [None]:
# Persist the integer area label of every sample to 'classification_area_ind_list.npy'.
np.save('classification_area_ind_list', area_ind_list)

In [None]:
# Number of integer area labels (one per sample).
len(area_ind_list)

## Unsupervised Classification 

### k-means

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Cluster the samples into 7 groups; random_state=0 makes the run repeatable.
# The 7 matches the number of clusters drawn by plot() below.
kmeans = KMeans(n_clusters=7, random_state=0).fit(data)

In [None]:
# Cluster index that k-means assigned to each sample it was fitted on.
prediction = kmeans.labels_

In [None]:
def plot(prediction, title, n_clusters=7):
    """Draw one pie chart per cluster showing which areas its members come from.

    Relies on two notebook-level names: ``area_ind_list`` (for every sample,
    its index into ``area_list_unique``) and ``area_list_unique`` (the wedge
    labels of each pie).

    Parameters
    ----------
    prediction : sequence of int
        Cluster label of every sample, aligned with ``area_ind_list``.
    title : str
        Base name of the output image; the figure is saved as ``title + '.png'``.
    n_clusters : int, optional
        Number of cluster labels (``0 .. n_clusters - 1``) to draw. Defaults
        to 7, the value used for both clustering models in this notebook.
    """
    cluster_of_sample = np.asarray(prediction)
    area_of_sample = np.asarray(area_ind_list)
    n_areas = len(area_list_unique)
    # Panels are laid out in two rows; 7 clusters give a 2x4 grid.
    n_cols = (n_clusters + 1) // 2

    plt.figure(figsize=(14,6))
    for i_label in range(n_clusters):
        # Count how many members of this cluster fall into each area.
        area_num = np.bincount(area_of_sample[cluster_of_sample == i_label],
                               minlength=n_areas)

        plt.subplot(2, n_cols, i_label+1)
        plt.pie(area_num, labels=area_list_unique, autopct='%1.1f%%')
        plt.axis('equal')
        plt.title('Cluster '+str(i_label+1))

    # Write the file once, after every panel has been drawn.
    plt.savefig(title+'.png', dpi=200)

In [None]:
# Pie chart of area composition for each k-means cluster; also writes 'k-means.png'.
plot(prediction, 'k-means')

### GMM

In [None]:
from sklearn.mixture import GaussianMixture

In [None]:
# Seven-component Gaussian mixture. random_state is fixed, as in the k-means
# cell, so that the fitted components and figure are reproducible across runs.
gmm = GaussianMixture(n_components=7, random_state=0)

In [None]:
# Check the shape of the array the mixture model is about to be fitted on.
data.shape

In [None]:
# Fit the mixture model to the samples.
gmm.fit(data)

In [None]:
# Component index predicted for every sample by the fitted mixture model.
labels = gmm.predict(data)

In [None]:
# Pie chart of area composition for each GMM component; also writes 'GMM.png'.
plot(labels, 'GMM')

## Supervised Classification

In [None]:
# Check the shape of the feature array before building train/test splits.
data.shape

In [None]:
# Integer area label of every sample, as an ndarray; `label` is the name the
# classifier helpers below receive.
area_ind_list = np.asarray(area_ind_list)
label = area_ind_list
# Shape check placed last so the notebook actually displays it.
label.shape

In [None]:
def get_data(data, label, num_train=250, num_test=100):
    """Draw a random, non-overlapping train/test split.

    Sampling uses NumPy's global random state (``np.random.choice``), so seed
    it with ``np.random.seed`` beforehand for a repeatable split.

    Parameters
    ----------
    data : array-like
        Samples along axis 0.
    label : array-like
        One label per sample, aligned with ``data``.
    num_train : int, optional
        Number of training samples to draw without replacement.
    num_test : int, optional
        Number of test samples to draw, without replacement, from the samples
        not used for training.

    Returns
    -------
    train_data, train_label, test_data, test_label : numpy.ndarray
        The selected rows of ``data`` and the matching entries of ``label``.

    Raises
    ------
    ValueError
        If ``num_train + num_test`` exceeds the number of samples.
    """
    data = np.asarray(data)
    label = np.asarray(label)
    num_samples = data.shape[0]
    if num_train + num_test > num_samples:
        raise ValueError(
            'num_train + num_test (%d) exceeds the number of samples (%d)'
            % (num_train + num_test, num_samples))

    train_ind = np.random.choice(num_samples, num_train, replace=False)
    # Sorted indices not used for training (set difference instead of a
    # per-element membership scan).
    left_ind = np.setdiff1d(np.arange(num_samples), train_ind)
    # Pick positions within the left-over pool, then map back to sample indices.
    test_i = np.random.choice(num_samples - num_train, num_test, replace=False)
    test_ind = left_ind[test_i]

    return data[train_ind], label[train_ind], data[test_ind], label[test_ind]

In [None]:
def test_KNeigh(data, label, num_trial=100):
    """Measure 3-nearest-neighbour accuracy over repeated random splits.

    Parameters
    ----------
    data, label
        Samples and their labels, forwarded unchanged to ``get_data``.
    num_trial : int, optional
        Number of independent random train/test splits to evaluate.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_trial,)`` with the test accuracy of every trial,
        in percent.
    """
    acc_list = np.zeros((num_trial,))
    for i_trial in range(num_trial):
        train_data, train_label, test_data, test_label = get_data(data, label)
        neigh = KNeighborsClassifier(n_neighbors=3)
        neigh.fit(train_data, train_label)
        pred = neigh.predict(test_data)
        # Normalise by the real test-set size rather than a fixed constant
        # (get_data draws 100 test samples by default), so the value is a true
        # percentage in [0, 100].
        acc_list[i_trial] = 100 * np.mean(pred == test_label)
    return acc_list

In [None]:
def test_NaiveB(data, label, num_trial=100):
    """Measure Gaussian naive Bayes accuracy over repeated random splits.

    Parameters
    ----------
    data, label
        Samples and their labels, forwarded unchanged to ``get_data``.
    num_trial : int, optional
        Number of independent random train/test splits to evaluate.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_trial,)`` with the test accuracy of every trial,
        in percent.
    """
    acc_list = np.zeros((num_trial,))
    for i_trial in range(num_trial):
        train_data, train_label, test_data, test_label = get_data(data, label)
        gnb = GaussianNB()
        pred = gnb.fit(train_data, train_label).predict(test_data)
        # Normalise by the real test-set size rather than a fixed constant
        # (get_data draws 100 test samples by default), so the value is a true
        # percentage in [0, 100].
        acc_list[i_trial] = 100 * np.mean(pred == test_label)
    return acc_list

In [None]:
from sklearn.svm import SVC
def test_SVC(data, label, num_trial=100):
    """Measure support-vector classifier accuracy over repeated random splits.

    Every trial draws a fresh split with ``get_data``, fits ``SVC(gamma='auto')``
    on the training part and scores it on the test part.

    Returns an array of shape ``(num_trial,)`` holding each trial's accuracy
    in percent.
    """
    def run_trial():
        x_train, y_train, x_test, y_test = get_data(data, label)
        model = SVC(gamma='auto')
        model.fit(x_train, y_train)
        return 100*model.score(x_test, y_test)

    return np.array([run_trial() for _ in range(num_trial)], dtype=float)

In [None]:
from sklearn.tree import DecisionTreeClassifier
def test_Tree(data, label, num_trial=100):
    """Measure decision-tree accuracy over repeated random splits.

    Every trial draws a fresh split with ``get_data``, fits a default
    ``DecisionTreeClassifier`` on the training part and scores it on the
    test part.

    Returns an array of shape ``(num_trial,)`` holding each trial's accuracy
    in percent.
    """
    percentages = []
    for _ in range(num_trial):
        x_train, y_train, x_test, y_test = get_data(data, label)
        tree = DecisionTreeClassifier()
        tree.fit(x_train, y_train)
        percentages.append(100*tree.score(x_test, y_test))
    return np.asarray(percentages, dtype=float)

In [None]:
# Per-trial accuracies of the four classifiers, evaluated in this order:
# k-nearest neighbours, naive Bayes, SVC, decision tree.
acc_list = [
    test_KNeigh(data, label),
    test_NaiveB(data, label),
    test_SVC(data, label),
    test_Tree(data, label),
]

In [None]:
# Mean and standard deviation of the per-trial accuracy for each of the four
# classifiers, in the same order as `acc_list`.
acc_mean = np.array([np.mean(acc_list[i]) for i in range(4)])
acc_std = np.array([np.std(acc_list[i]) for i in range(4)])

In [None]:
# Bar chart of mean accuracy per classifier, with the standard deviation over
# trials as error bars.
plt.figure(figsize=(3,3))
plt.bar(np.arange(4), acc_mean, 0.5, yerr=acc_std)
plt.xlabel('Classification Methods')
plt.ylabel('Accuracy')
# Accuracies are stored as percentages, so the axis spans 0-100.
plt.ylim([0,100])
# Tick labels follow the order in which `acc_list` was filled.
plt.xticks(np.arange(4), ('KNeigh', 'NaiveB', 'SVC', 'Tree'))