In [39]:
import json
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
%matplotlib inline
import math
import random as rand


def l2_pairwise_distance(v1, v2):
    """Return the matrix of distances between every scalar in v1 and every scalar in v2.

    Args:
        v1: iterable of numbers; one output row per element.
        v2: iterable of numbers; one output column per element.

    Returns:
        A list of lists of floats where ``result[i][j] == |v1[i] - v2[j]|``.
        An empty ``v1`` gives ``[]``; an empty ``v2`` gives one empty row per
        element of ``v1``.
    """
    # For scalars the L2 distance is the absolute difference. Taking abs()
    # directly (instead of sqrt of the square) keeps the result exact: squaring
    # underflows to 0.0 for very small differences and overflows for very large
    # ones.
    return [[float(abs(a - b)) for b in v2] for a in v1]

def calculate_error(k_mean_matrix):
    """Add up the smallest entry of every row of ``k_mean_matrix``.

    Each row is expected to hold one point's distances to all centers, so the
    total is the sum of every point's distance to its nearest center. An empty
    matrix yields 0.
    """
    total = 0
    for distances in k_mean_matrix:
        total += min(distances)
    return total


def _k_mean_single_run(points, candidates, n_center, n_iter, rng, max_restarts=1000):
    """Run the k-means update loop from one random start and return the final centers.

    The initial centers are ``n_center`` distinct values sampled from
    ``candidates`` and sorted ascending. If an update leaves any cluster
    without points, the run is discarded and started again from a fresh
    sample, at most ``max_restarts`` times.

    Raises:
        RuntimeError: if every one of the ``max_restarts`` attempts produced an
            empty cluster.
    """
    for _attempt in range(max_restarts):
        centers = sorted(rng.sample(candidates, n_center))
        for _step in range(n_iter):
            dist_mat = l2_pairwise_distance(points, centers)
            # Float accumulators so integer data is never divided with
            # truncating integer division under Python 2.
            sums = [0.0] * n_center
            counts = [0] * n_center
            for value, row in zip(points, dist_mat):
                nearest = row.index(min(row))
                sums[nearest] += value
                counts[nearest] += 1
            if 0 in counts:
                # An empty cluster has no mean; abandon this start.
                break
            updated = [total / count for total, count in zip(sums, counts)]
            if updated == centers:
                # Fixed point reached: further iterations would repeat it.
                return centers
            centers = updated
        else:
            # n_iter updates were applied without hitting an empty cluster.
            return centers
    raise RuntimeError(
        "k_mean could not find a start without empty clusters after %d attempts"
        % max_restarts)


def k_mean(X_, n_center=3, n_iter=100, n_init=5, seed=None, verbose=True):
    """Cluster one-dimensional data with k-means and keep the best of several starts.

    Args:
        X_: iterable of scalar values to cluster.
        n_center: number of clusters.
        n_iter: maximum number of center updates per start; a start stops
            earlier once the centers no longer change.
        n_init: number of independent random starts (previously fixed at 5).
        seed: seed for a private ``random.Random`` instance. ``None`` seeds it
            from system entropy. The module-level generator is left untouched.
        verbose: when true, print the list of per-start results.

    Returns:
        The ``(centers, labels, error)`` tuple with the smallest ``error`` among
        the starts. ``centers`` is a list of cluster centers, ``labels[i]`` is
        the index of the center nearest to ``X_[i]``, and ``error`` is the sum
        of each point's distance to its nearest center. Labels and error are
        computed for the returned centers.

    Raises:
        ValueError: if ``X_`` is empty, ``n_init`` is below 1, or ``n_center``
            is not between 1 and the number of distinct values in ``X_``
            (such inputs can never give every cluster a point).
        RuntimeError: if a start repeatedly collapses to an empty cluster.
    """
    points = list(X_)
    if not points:
        raise ValueError("k_mean requires at least one data point")
    if n_init < 1:
        raise ValueError("n_init must be at least 1, got %r" % (n_init,))
    # Initial centers are drawn from the distinct data values rather than a
    # fixed numeric interval, so every cluster starts with at least one point
    # whatever the range of the data.
    candidates = sorted(set(points))
    if not 1 <= n_center <= len(candidates):
        raise ValueError(
            "n_center must be between 1 and the number of distinct values (%d), got %r"
            % (len(candidates), n_center))

    rng = rand.Random(seed)
    results = []
    for _run in range(n_init):
        centers = _k_mean_single_run(points, candidates, n_center, n_iter, rng)
        dist_mat = l2_pairwise_distance(points, centers)
        min_dist_idx = [row.index(min(row)) for row in dist_mat]
        results.append((centers, min_dist_idx, calculate_error(dist_mat)))
    if verbose:
        print(results)

    return min(results, key=lambda x: x[2])

In [3]:
# Locations of the three JSON inputs, relative to the working directory.
RANKING_FILE = "./CSSPropertiesRanking.json"
RareNonInherit_FILE = "./CSSRareNonInheritedProperties.json"
RareInherit_FILE = "./CSSRareInheritedProperties.json"

# Each name is first bound to an empty default and then replaced by the
# parsed content of its file.
ranking = dict()
with open(RANKING_FILE, "r") as fo:
    ranking = json.loads(fo.read())

RareNonInherit = list()
with open(RareNonInherit_FILE, "r") as fo:
    RareNonInherit = json.loads(fo.read())

RareInherit = list()
with open(RareInherit_FILE, "r") as fo:
    RareInherit = json.loads(fo.read())

In [27]:
# Order every key of `ranking` from highest to lowest value, then map each key
# to its position divided by the number of keys (0.0 for the top entry).
sorted_properties_name = sorted(ranking, key=lambda name: float(ranking[name]), reverse=True)
sorted_properties_name = dict(
    zip(sorted_properties_name, np.arange(len(ranking)) / float(len(ranking))))

# Author's note kept from the original: "best is"
#   sorted(RareNonInherit, key=lambda x: -float(ranking[x]))
# and sorted(ranking.keys(), key=lambda x: -float(ranking[x])) was another input noted here.
css_properties_name = sorted(RareInherit, key=lambda name: float(ranking[name]), reverse=True)
css_properties_score = sorted(
    (float(ranking[name]) for name in css_properties_name), reverse=True)
css_properties = zip(css_properties_name, css_properties_score)
print(css_properties_score)

# Two-cluster KMeans on the natural log of the scores, as a single column.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(np.log(np.array(css_properties_score)).reshape(-1, 1))

# (name, score, cluster label) triples ordered by descending score.
css_properties = sorted(
    zip(css_properties_name, css_properties_score, kmeans.labels_),
    key=lambda triple: triple[1], reverse=True)
print(kmeans.cluster_centers_.reshape(1, -1))
print(kmeans.labels_)


[42.436061, 41.777993, 31.392838, 26.97879, 12.555355, 10.367758, 10.040729, 10.00433, 9.807688, 8.624961, 7.946455, 3.891355, 2.522299, 1.573846, 0.758564, 0.112396, 0.073329, 0.053684, 0.026315, 0.002809, 0.000804, 1e-06]
[[-5.45387586  2.17814601]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0]


In [41]:
# Run the hand-written k_mean on the log of the scores with two centers;
# presumably a cross-check of the scikit-learn clustering -- confirm intent.
k_mean(np.log(css_properties_score), n_center=2, n_iter=100)

[([-5.4538758649328418, 2.1781460078616202], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], 33.67017651936835), ([-5.4538758649328418, 2.1781460078616202], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], 33.67017651936835), ([-5.4538758649328418, 2.1781460078616202], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], 33.67017651936835), ([-5.4538758649328418, 2.1781460078616202], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], 33.67017651936835), ([-5.4538758649328418, 2.1781460078616202], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], 33.67017651936835)]


([-5.4538758649328418, 2.1781460078616202],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 33.67017651936835)

In [42]:
# def bump_dectection(list_with_score, number_of_bump):#, conv_mask=[1 -1]):
#     bump_point = [((list_with_score[i][1] - list_with_score[i+10][1]), i) for i in np.arange(len(list_with_score)-10)]
#     bump_point = sorted(bump_point, key=lambda x: -x[0])
# #     print bump_point
#     return [list_with_score[bump_point[i][1]] for i in np.arange(min(number_of_bump, len(bump_point)))]

# # bump_dectection(css_properties, 3)
# plt.hist(css_properties_score, bins=10);
# plt.show()
# Walk consecutive pairs of css_properties and print the sorted_properties_name
# value of each entry whose successor carries a different cluster label; then
# print the value for the final entry. cnt counts the pairs visited.
cnt = 0
for entry, successor in zip(css_properties[:-1], css_properties[1:]):
    cnt += 1
    if entry[2] != successor[2]:
        print(sorted_properties_name[entry[0]])
print(sorted_properties_name[css_properties[-1][0]])

0.488245931284
0.902350813743


In [5]:
# Three-cluster KMeans on the raw (not log-transformed) scores of RareInherit.
css_properties_name = RareInherit
css_properties_score = [float(ranking[name]) for name in css_properties_name]
# css_properties = zip(css_properties_name, css_properties_score)

kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(np.array(css_properties_score).reshape(-1, 1))

# (name, score, cluster label) triples ordered by descending score.
css_properties = sorted(
    zip(css_properties_name, css_properties_score, kmeans.labels_),
    key=lambda triple: triple[1], reverse=True)
print(kmeans.cluster_centers_)
print(kmeans.labels_)

# def bump_dectection(list_with_score, number_of_bump):#, conv_mask=[1 -1]):
#     bump_point = [((list_with_score[i][1] - list_with_score[i+10][1]), i) for i in np.arange(len(list_with_score)-10)]
#     bump_point = sorted(bump_point, key=lambda x: -x[0])
# #     print bump_point
#     return [list_with_score[bump_point[i][1]] for i in np.arange(min(number_of_bump, len(bump_point)))]

# # bump_dectection(css_properties, 3)
# plt.hist(css_properties_score, bins=10);
# plt.show()
# Walk consecutive pairs of css_properties and print the sorted_properties_name
# value of each entry whose successor carries a different cluster label; then
# print the value for the final entry. cnt counts the pairs visited.
cnt = 0
for entry, successor in zip(css_properties[:-1], css_properties[1:]):
    cnt += 1
    if entry[2] != successor[2]:
        print(sorted_properties_name[entry[0]])
print(sorted_properties_name[css_properties[-1][0]])

[[  0.819582  ]
 [ 35.6464205 ]
 [  9.90675371]]
[2 2 0 2 0 1 1 0 0 1 2 0 0 0 2 0 0 0 1 2 2 0]
0.191681735986
0.332730560579
0.902350813743


In [50]:
# Two-cluster KMeans over every value in `ranking`, as a single column.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(np.array(ranking.values()).reshape(-1, 1))
print(kmeans.cluster_centers_)
print(kmeans.labels_)

# Pair each key with its cluster label and list the pairs in the order given by
# sorted_properties_name.
temp = zip(ranking.keys(), kmeans.labels_)
print(sorted(temp, key=lambda pair: sorted_properties_name[pair[0]]))
print(sorted_properties_name["border-left-color"])

[[  3.26019559]
 [ 51.25934548]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0
 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0
 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 1
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0
 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1
 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 