In [19]:
import numpy as np
import math

# VADS from NRC mapping
VADS =           [(0.969, 0.583, 0.726),
                  (0.929, 0.837, 0.803),
                  (0.167, 0.865, 0.657),
                  (0.167, 0.718, 0.342),
                  (0.854, 0.46, 0.889),
                  (0.635, 0.469, 0.5),
                  (0.255, 0.667, 0.277),
                  (0.75, 0.755, 0.463),
                  (0.896, 0.692, 0.647),
                  (0.115, 0.49, 0.336),
                  (0.085, 0.551, 0.367),
                  (0.052, 0.775, 0.317),
                  (0.143, 0.685, 0.226),
                  (0.896, 0.684, 0.731),
                  (0.073, 0.84, 0.293),
                  (0.885, 0.441, 0.61),
                  (0.07, 0.64, 0.474),
                  (0.98, 0.824, 0.794),
                  (1.0, 0.519, 0.673),
                  (0.163, 0.915, 0.241),
                  (0.949, 0.565, 0.814),
                  (0.729, 0.634, 0.848),
                  (0.554, 0.51, 0.836),
                  (0.844, 0.278, 0.481),
                  (0.103, 0.673, 0.377),
                  (0.052, 0.288, 0.164),
                  (0.875, 0.875, 0.562),
                  (0.469, 0.184, 0.357)]
vads = np.array(VADS)
vs = vads[:,0]

In [75]:
import plotly.express as px

# we want to make this plot sparser
def plot(vector):
    fig = px.histogram(vector, nbins=100, marginal="rug")
    fig.show()

# we want to max this metric:
def min_dist_pair(vector):
    min_dist = math.inf
    for i in range(len(vector)):
        for j in range(i + 1, len(vector)):
            if (vector[i] - vector[j]) != 0:
                min_dist = min(min_dist, abs(vector[i] - vector[j]))
    return min_dist

plot(vs)

In [76]:
# Test many scaler, to see who behaves as we desire
from sklearn.preprocessing import MinMaxScaler,RobustScaler,StandardScaler, Normalizer, MaxAbsScaler,QuantileTransformer, PowerTransformer

scalers = [MinMaxScaler(),RobustScaler(), StandardScaler(), Normalizer(), MaxAbsScaler(), QuantileTransformer(), PowerTransformer()]
vs = vs.reshape(-1, 1)
for scaler in scalers:
    print(f">>> Trying scaler: {scaler}")
    scaled_vs = scaler.fit_transform(vs)
    print(scaled_vs)
    plot(scaled_vs)
    print(f"Min-Dist-Pair metric returned {min_dist_pair(scaled_vs)}")

>>> Trying scaler: MinMaxScaler()
[[0.96729958]
 [0.92510549]
 [0.12130802]
 [0.12130802]
 [0.84599156]
 [0.6149789 ]
 [0.21413502]
 [0.73628692]
 [0.89029536]
 [0.0664557 ]
 [0.03481013]
 [0.        ]
 [0.09599156]
 [0.89029536]
 [0.0221519 ]
 [0.87869198]
 [0.01898734]
 [0.97890295]
 [1.        ]
 [0.11708861]
 [0.94620253]
 [0.71413502]
 [0.52953586]
 [0.83544304]
 [0.05379747]
 [0.        ]
 [0.86814346]
 [0.43987342]]


Min-Dist-Pair metric returned [0.00316456]
>>> Trying scaler: RobustScaler()
[[ 0.49817093]
 [ 0.44496176]
 [-0.5686731 ]
 [-0.5686731 ]
 [ 0.34519455]
 [ 0.05387429]
 [-0.4516129 ]
 [ 0.20685068]
 [ 0.40106418]
 [-0.63784503]
 [-0.67775191]
 [-0.72164948]
 [-0.6005986 ]
 [ 0.40106418]
 [-0.69371467]
 [ 0.38643166]
 [-0.69770535]
 [ 0.51280346]
 [ 0.53940805]
 [-0.57399401]
 [ 0.47156635]
 [ 0.17891586]
 [-0.05387429]
 [ 0.33189225]
 [-0.65380778]
 [-0.72164948]
 [ 0.37312936]
 [-0.1669438 ]]


Min-Dist-Pair metric returned [0.00399069]
>>> Trying scaler: StandardScaler()
[[ 1.2055331 ]
 [ 1.09728351]
 [-0.96487107]
 [-0.96487107]
 [ 0.89431554]
 [ 0.30164907]
 [-0.72672199]
 [ 0.61286662]
 [ 1.00797761]
 [-1.10559553]
 [-1.18678272]
 [-1.27608863]
 [-1.02982083]
 [ 1.00797761]
 [-1.2192576 ]
 [ 0.97820897]
 [-1.22737632]
 [ 1.23530173]
 [ 1.28942653]
 [-0.97569603]
 [ 1.15140831]
 [ 0.55603559]
 [ 0.08244366]
 [ 0.86725315]
 [-1.13807041]
 [-1.27608863]
 [ 0.95114657]
 [-0.14758671]]


Min-Dist-Pair metric returned [0.00811872]
>>> Trying scaler: Normalizer()
[[1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]]


Min-Dist-Pair metric returned inf
>>> Trying scaler: MaxAbsScaler()
[[0.969]
 [0.929]
 [0.167]
 [0.167]
 [0.854]
 [0.635]
 [0.255]
 [0.75 ]
 [0.896]
 [0.115]
 [0.085]
 [0.052]
 [0.143]
 [0.896]
 [0.073]
 [0.885]
 [0.07 ]
 [0.98 ]
 [1.   ]
 [0.163]
 [0.949]
 [0.729]
 [0.554]
 [0.844]
 [0.103]
 [0.052]
 [0.875]
 [0.469]]


Min-Dist-Pair metric returned [0.003]
>>> Trying scaler: QuantileTransformer()
[[0.92592593]
 [0.85185185]
 [0.33950617]
 [0.33950617]
 [0.66666667]
 [0.51851852]
 [0.40740741]
 [0.59259259]
 [0.81481481]
 [0.22222222]
 [0.14814815]
 [0.        ]
 [0.25925926]
 [0.81481481]
 [0.11111111]
 [0.74074074]
 [0.07407407]
 [0.96296296]
 [1.        ]
 [0.2962963 ]
 [0.88888889]
 [0.55555556]
 [0.48148148]
 [0.62962963]
 [0.18518519]
 [0.        ]
 [0.7037037 ]
 [0.44444444]]



n_quantiles (1000) is greater than the total number of samples (28). n_quantiles is set to n_samples.



Min-Dist-Pair metric returned [0.03703704]
>>> Trying scaler: PowerTransformer()
[[ 1.18652586]
 [ 1.0866709 ]
 [-0.9577399 ]
 [-0.9577399 ]
 [ 0.89777366]
 [ 0.33270769]
 [-0.70483741]
 [ 0.63204622]
 [ 1.00382878]
 [-1.10982961]
 [-1.19852667]
 [-1.29693582]
 [-1.02768057]
 [ 1.00382878]
 [-1.23420799]
 [ 0.9761202 ]
 [-1.24314673]
 [ 1.21388043]
 [ 1.26350118]
 [-0.96936692]
 [ 1.13667415]
 [ 0.57782933]
 [ 0.11812364]
 [ 0.87241813]
 [-1.14522267]
 [-1.29693582]
 [ 0.95088906]
 [-0.11064802]]


Min-Dist-Pair metric returned [0.00893874]


In [98]:
# The scaler we chose to experiment with: QuantileTransformer
# As it evenly distributes the VAD space, hence should ease kNN classification

q_scaler = QuantileTransformer(n_quantiles=15, output_distribution='uniform')
scaled_vads = q_scaler.fit_transform(vads)
plot(scaled_vads[:, 1])
print(f"Min-Dist-Pair metric returned {min_dist_pair(scaled_vads[:, 1])}")
scaled_vads

Min-Dist-Pair metric returned 0.006721775847268338


array([[0.92709174, 0.38864971, 0.76226688],
       [0.85145134, 0.85655335, 0.85527433],
       [0.35714286, 0.92676768, 0.65507876],
       [0.35714286, 0.69590062, 0.24889543],
       [0.65801887, 0.14972875, 1.        ],
       [0.52015928, 0.17251356, 0.50737005],
       [0.38703416, 0.53982301, 0.12356841],
       [0.57925126, 0.74744578, 0.42388759],
       [0.78571429, 0.65068323, 0.63910564],
       [0.21838035, 0.22467832, 0.22827688],
       [0.14705882, 0.34238876, 0.33497537],
       [0.        , 0.77253988, 0.18841297],
       [0.26296633, 0.62460897, 0.05832549],
       [0.78571429, 0.6173097 , 0.76860402],
       [0.09243697, 0.86363636, 0.14681852],
       [0.7424072 , 0.13601441, 0.6002401 ],
       [0.07773109, 0.47668394, 0.44444444],
       [0.95271868, 0.83867164, 0.84457279],
       [1.        , 0.28992974, 0.68549701],
       [0.32773109, 1.        , 0.0729355 ],
       [0.88942696, 0.36399217, 0.87800253],
       [0.56694873, 0.46632124, 0.94383562],
       [0.