In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os
import json
import pprint
from sklearn.mixture import BayesianGaussianMixture
from skbayes.mixture_models import VBBMM

In [3]:
import seaborn as sns
# Activate seaborn's default styling; color_codes=True also makes matplotlib's
# shorthand colors ("b", "g", "r", ...) follow the seaborn palette.
sns.set(color_codes=True)

# Preprocess

### まず実験結果の読み込み

In [4]:
# Locations of the "Result1" experiment files, relative to this notebook.
# targetDir receives the final cluster assignment (clusters.csv); every other
# directory is read as `<dir>result.csv` when the data frames are built.
targetDir = "../data/output/Result1/Result/"
speedDir = "../data/output/Result1/Speed/"
brakeDir = "../data/output/Result1/Brake/"
# NOTE(review): spelled "accelDIr" (capital I). The loading cell refers to this
# exact name, so it is left unchanged here.
accelDIr = "../data/output/Result1/Accel/"
steeringPositiveDir = "../data/output/Result1/SteeringPositive/"
steeringNegativeDir = "../data/output/Result1/SteeringNegative/"
# Also a directory path, despite the missing "Dir" suffix.
aheadDistance = "../data/output/Result1/AheadDistance/"

### 読み込んだデータを元に data-frame を作成

In [5]:
# Load one result.csv per feature group; the frames are joined on "id" afterwards.
#
# Column labels are assigned as FLAT lists. A nested list such as
# [["id", "brake"]] makes recent pandas build a (one-level) MultiIndex instead
# of plain labels, which can break plain-label operations on these frames
# (merging on "id", selecting a column by name).

# Speed frame: keeps the header stored in the file (later cells expect the
# columns id / red / yellow / green / stop to come from here).
df1 = pd.read_csv(speedDir + "result.csv")

df2 = pd.read_csv(brakeDir + "result.csv")
df2.columns = ["id", "brake"]

df3 = pd.read_csv(accelDIr + "result.csv")
df3.columns = ["id", "accel"]

df4 = pd.read_csv(steeringPositiveDir + "result.csv")
df4.columns = ["id", "steeringPositive"]

df5 = pd.read_csv(steeringNegativeDir + "result.csv")
df5.columns = ["id", "steeringNegative"]

df6 = pd.read_csv(aheadDistance + "result.csv")
df6.columns = ["id", "near", "intermediate", "far"]

IOError: File ../data/output/Result1/Speed/result.csv does not exist

### Merge

In [None]:
# Join all per-feature frames on their shared "id" key, one frame at a time,
# starting from the speed frame (pd.merge's default join type is used).
df = df1
for featureFrame in (df2, df3, df4, df5, df6):
    df = pd.merge(df, featureFrame, on="id")

# Row count of the merged frame; the save cell uses it to size the result array.
gridSize = len(df)

### Rename

In [None]:
# Final column names of the feature table, in order. They are assigned
# positionally to the selected columns, which renames "stop" to "slow".
features = [
    # speed
    "red",
    "yellow",
    "green",
    "slow",
    # pedals
    "brake",
    "accel",
    # steering
    "steeringPositive",
    "steeringNegative",
    # distance to the vehicle ahead
    "near",
    "intermediate",
    "far",
]

### 必要なデータだけ取り出す (通った経路だけ)

In [None]:
# A row counts as "visited" when at least one of its speed counters is non-zero.
visitedRows = (df["red"] + df["yellow"] + df["green"] + df["stop"]) > 0

# Columns to keep, in the order that lines up with `features`.
sourceColumns = [
    "red", "yellow", "green", "stop",
    "brake", "accel",
    "steeringPositive", "steeringNegative",
    "near", "intermediate", "far",
]
mf = df.loc[visitedRows, sourceColumns]

# Positional rename to the final feature names ("stop" becomes "slow").
mf.columns = features

### 確認

In [None]:
# Summary statistics of the selected feature table, as a quick sanity check.
mf.describe()

In [None]:
# Bar colors for the per-cluster plots: the plotting cell picks colors[i] for
# component i, so this list needs one entry per mixture component (24 here).
colors = [
    "#FE2400",
    "#EC2127",
    "#D21E4E",
    "#B21D72",
    "#951E93",
    "#7421B1",
    "#5023D0",
    "#3025EA",
    "#082EFD",
    "#1147CE",
    "#247395",
    "#389D51",
    "#49C900",
    "#81D503",
    "#AFE400",
    "#DAF201",
    "#FFFF01",
    "#FEE600",
    "#FECD02",
    "#FEAF07",
    "#FF9400",
    "#FE7701",
    "#FD5800",
    "#FF3900",
]

# Human-readable color labels, 24 entries.
# Presumably names[i] labels colors[i] -- same length and order; confirm against
# wherever the labels are consumed.
# Spelling fixes: "CardinaL" -> "Cardinal", "Olieve" -> "Olive".
names = [
    "Red",
    "Carmine",
    "Cardinal",
    "Violet",
    "Magenta",
    "Purple",
    "Azure",
    "Ultramarine",
    "Blue",
    "Teal",
    "Turquoise",
    "Pine",
    "Green",
    "Chartreuse",
    "Olive",
    "Khaki",
    "Yellow",
    "Mustard",
    "Fulvous",
    "Apricot",
    "Orange",
    "Tangerine",
    "Titan",
    "Cerise"
]

# VBGMM

### 各特徴量を正規化する下準備

In [None]:
# Feature columns grouped by kind. The normalization cell divides every column
# by the per-row total of its own group.
speeds = ["red", "yellow", "green", "slow"]
pedals = ["brake", "accel"]
steerings = ["steeringPositive", "steeringNegative"]
distances = ["near", "intermediate", "far"]

### 各特徴量を正規化

In [None]:
# Work on a copy so the raw values in mf stay untouched.
bf = mf.copy()

# (helper total column, member columns) for every feature group.
featureGroups = (
    ("sumSpeed", speeds),
    ("sumPedal", pedals),
    ("sumSteering", steerings),
    ("sumDistance", distances),
)

# Pass 1: per-row total of each group, accumulated left to right from the
# still-unnormalized values.
for totalColumn, members in featureGroups:
    groupTotal = bf[members[0]]
    for member in members[1:]:
        groupTotal = groupTotal + bf[member]
    bf[totalColumn] = groupTotal

# Pass 2: replace each value by its share of the group's total.
for totalColumn, members in featureGroups:
    for member in members:
        bf[member] = bf[member] / bf[totalColumn]

# A group whose total is 0 produces NaN (0/0); treat that as 0. Selecting the
# feature columns afterwards drops the helper total columns again.
bf = bf.fillna(0)[features]

### ハイパーパラメータの設定

In [None]:
# Settings passed to BayesianGaussianMixture by the grid-search and fitting cells.

# n_components: number of mixture components made available to the model
# (later cells count how many of them are actually assigned to data).
components = 24
# weight_concentration_prior
prior = 0.0001
# covariance_type
cov_type = 'full'
# covariance_prior: 0.3 times an identity matrix with one row/column per feature.
cov_prior = 0.3* np.identity(len(features))
# max_iter
max_iter = 200

### 一番良い seed 値をグリッドサーチする

In [None]:
def gridSearch(n_seeds=20):
    """Fit the mixture model once per seed and report how each seed performed.

    Every seed in 0 .. n_seeds - 1 is handed to the model through
    `random_state`, so a fit neither depends on nor alters the state of the
    global NumPy / `random` generators.

    Parameters
    ----------
    n_seeds : int, optional
        How many consecutive seeds to try, starting at 0. Defaults to 20,
        the range that used to be hard-coded.

    Returns
    -------
    list of dict
        One entry per seed with the keys "seed", "classes" (number of
        distinct labels predicted for `bf`) and "lower bounds" (the fitted
        model's `lower_bound_`). The list is sorted by ASCENDING lower bound,
        so the seed with the highest bound is the last element.
    """
    result = []
    for sd in range(n_seeds):
        vb = BayesianGaussianMixture(
            n_components=components,
            weight_concentration_prior=prior,
            covariance_type=cov_type,
            covariance_prior=cov_prior,
            max_iter=max_iter,
            random_state=sd
        ).fit(bf)
        # Predict once; only the number of distinct labels is reported.
        usedLabels = set(vb.predict(bf))
        result.append({
            "seed": sd,
            "classes": len(usedLabels),
            "lower bounds": vb.lower_bound_
        })

    return sorted(result, key=lambda res: res["lower bounds"])

pprint.pprint(gridSearch())

### 上で得られた一番良いシード値に設定

In [None]:
# Seed picked by hand from the grid-search output printed above.
sd = 15
# Seed both the NumPy and the stdlib `random` global generators so the
# following cells start from a known RNG state.
np.random.seed(sd)
random.seed(sd)

### 実際に Bayesian Gaussian Mixture model でクラスタリング

In [None]:
# Fit the final model with the chosen seed. random_state is passed explicitly so
# the result stays reproducible even if another cell consumed random numbers
# after the global generators were seeded.
vb = BayesianGaussianMixture(
    n_components=components,
    weight_concentration_prior=prior,
    covariance_type=cov_type,
    covariance_prior=cov_prior,
    max_iter=max_iter,
    random_state=sd
).fit(bf)

# Predict once and reuse the labels for both summaries below.
usedClasses = set(vb.predict(bf))

# Single-argument print(...) is valid both as a Python 2 statement and as a
# Python 3 function call; the format strings reproduce the original spacing.
print("class size:       %d" % len(usedClasses))
print("classes:           %s" % usedClasses)
print("lower bound:  %s" % vb.lower_bound_)

### 結果の確認

In [None]:
# Display the mixture weight of every component of the fitted model.
vb.weights_

In [None]:
# Format each mixture weight as text. "%02f" only sets a minimum width of 2
# (zero padded) and keeps the default six decimals, e.g. "0.123456".
# NOTE(review): "%.2f" (two decimals) may have been intended -- confirm before
# changing, because these strings are later used as a sort key.
ws = ["%02f" % w for w in vb.weights_]

In [None]:
# Table of the fitted component means, one column per feature.
# The labels are passed as a FLAT list: wrapping them in another list
# (`[features]`) makes recent pandas build a MultiIndex, which can break the
# plain-label access used on this table ("weight", `ref[features]`).
ref = pd.DataFrame(vb.means_, columns=features)
ref["weight"] = ws
# Heaviest components first (the sort key is the formatted weight string).
ref.sort_values("weight", ascending=False)

In [None]:
# Show only as many of the heaviest components as there are distinct labels
# actually assigned to the data.
assignedClusterCount = len(set(vb.predict(bf)))
ref.sort_values("weight", ascending=False).head(assignedClusterCount)

# Plot histogram of each cluster

### ヒストグラムの表示と保存

In [None]:
# Output folder for this seed's figures. The name `dir` shadows the builtin but
# is kept because the save cell reuses it for the settings file.
dir = "./img/Result1/clusters-" + str(sd)
if not os.path.exists(dir):
    # makedirs also creates missing parent folders (./img, ./img/Result1);
    # os.mkdir would raise when they do not exist yet.
    os.makedirs(dir)

# Keep only the feature columns (drops "weight") so each row plots as one bar chart.
ref = ref[features]
for i in range(components):
    # .loc replaces the removed .ix indexer; the rows carry the default
    # integer labels 0 .. components - 1.
    ref.loc[i].plot(kind='bar', color=colors[i])
    plt.title(i)
    plt.ylim(0, 1)
    plt.savefig(dir + "/cluster" + str(i) + ".png")
    plt.show()

# Save

### 結果の保存

In [None]:
# One label per row of the merged frame: rows that were filtered out keep 0,
# the remaining rows get their predicted component number shifted up by one.
clusterLabels = vb.predict(bf)
result = np.zeros(gridSize)
result[mf.index.values] = clusterLabels + 1

np.savetxt(targetDir + "clusters.csv", result, delimiter=",", fmt="%d")

# Record the settings behind this clustering next to the figures.
settings = {
    "n_components": components,
    "weight_concentration_prior": prior,
    "seed": sd
}
with open(dir + '/VBSetting.json', 'w') as settingsFile:
    json.dump(settings, settingsFile)