In [1]:
import pandas as pd
import numpy as np
import scipy.spatial.distance as scidist
from scipy.stats import zscore
from sklearn.preprocessing import normalize
import json
import os
import random
import re

In [32]:
# Read the car table from disk and keep only the rows that have no missing values.
mtcars = pd.read_csv('mtcars.csv').dropna()

In [33]:
# Split each car name at its first space: the text before it is the brand,
# the text after it is the model ('' when the name has no space at all).
# partition() drops the separator itself, so the model no longer starts with
# the leading space that slicing from n.index(' ') used to keep.
name_parts = mtcars['name'].str.partition(' ')
mtcars['brand'] = name_parts[0]
mtcars['model'] = name_parts[2]
mtcars

Unnamed: 0,name,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,brand,model
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet,chevelle malibu
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,70,usa,buick,skylark 320
2,plymouth satellite,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth,satellite
3,amc rebel sst,16.0,8,304.0,150.0,3433,12.0,70,usa,amc,rebel sst
4,ford torino,17.0,8,302.0,140.0,3449,10.5,70,usa,ford,torino
...,...,...,...,...,...,...,...,...,...,...,...
393,ford mustang gl,27.0,4,140.0,86.0,2790,15.6,82,usa,ford,mustang gl
394,vw pickup,44.0,4,97.0,52.0,2130,24.6,82,europe,vw,pickup
395,dodge rampage,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge,rampage
396,ford ranger,28.0,4,120.0,79.0,2625,18.6,82,usa,ford,ranger


In [34]:
# Columns that get a z-scored companion column named '<column>_unit'.
numeric_cols = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration']
for feature in numeric_cols:
    # nan_policy='omit' makes zscore ignore NaNs when computing mean and std.
    standardized = zscore(mtcars[feature], nan_policy='omit')
    mtcars[f'{feature}_unit'] = standardized
mtcars

Unnamed: 0,name,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,brand,model,cylinders_unit,displacement_unit,horsepower_unit,weight_unit,acceleration_unit
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet,chevelle malibu,1.483947,1.077290,0.664133,0.620540,-1.285258
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,70,usa,buick,skylark 320,1.483947,1.488732,1.574594,0.843334,-1.466724
2,plymouth satellite,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth,satellite,1.483947,1.182542,1.184397,0.540382,-1.648189
3,amc rebel sst,16.0,8,304.0,150.0,3433,12.0,70,usa,amc,rebel sst,1.483947,1.048584,1.184397,0.536845,-1.285258
4,ford torino,17.0,8,302.0,140.0,3449,10.5,70,usa,ford,torino,1.483947,1.029447,0.924265,0.555706,-1.829655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,ford mustang gl,27.0,4,140.0,86.0,2790,15.6,82,usa,ford,mustang gl,-0.864014,-0.520637,-0.480448,-0.221125,0.021294
394,vw pickup,44.0,4,97.0,52.0,2130,24.6,82,europe,vw,pickup,-0.864014,-0.932079,-1.364896,-0.999134,3.287676
395,dodge rampage,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge,rampage,-0.864014,-0.568479,-0.532474,-0.804632,-1.430430
396,ford ranger,28.0,4,120.0,79.0,2625,18.6,82,usa,ford,ranger,-0.864014,-0.712005,-0.662540,-0.415627,1.110088


## TODO

Limit to a couple of brands, otherwise we don't have enough colors to draw with.

In [35]:

# Build two square car-by-car distance matrices: one from the standardized
# numeric columns, one from brand membership.
car_names = mtcars['name']

# Element distances: pdist with its default metric (Euclidean) over the
# '<column>_unit' columns, expanded to a full square matrix.
unit_cols = [f'{c}_unit' for c in numeric_cols]
car_dists = scidist.squareform(scidist.pdist(mtcars[unit_cols]))

car_brands = mtcars['brand']
all_brands = set(mtcars['brand'])

# One-hot brand membership, one row per car and one column per brand.
# The comparison must be equality: both operands are strings, so `in` would be
# a substring test and would also flag any brand whose name happens to be
# contained in the car's brand name.
setdist_table = [
    [1 if brand == car_brand else 0 for brand in all_brands]
    for car_brand in car_brands
]
setdist_table = np.array(setdist_table)

# Jaccard distance between membership rows. The built-in 'jaccard' metric on a
# boolean view gives the same values as the Python callable without invoking
# a Python function for every pair of rows.
setdists = scidist.squareform(scidist.pdist(setdist_table.astype(bool), 'jaccard'))


In [36]:
# Assemble the export before touching the file, so a failure while building it
# does not leave a truncated mtcars.json behind.
jsonstr = {
    # Car names, in the same row order as the distance matrices.
    'E': car_names.tolist(),
    # Car-by-car distances over the standardized numeric columns.
    'EA': car_dists.tolist(),
    # Brands each car belongs to. Every car has exactly one brand, so wrap it
    # in a one-element list; list(brand) would split the string into single
    # characters that match none of the names in 'S'.
    'SR': [[brand] for brand in car_brands],
    # All distinct brands, sorted so the file is identical from run to run
    # (plain set iteration order is not stable across kernels).
    'S': sorted(all_brands),
    # Car-by-car Jaccard distances over brand membership.
    'SA': setdists.tolist(),
}
with open('mtcars.json', 'w', encoding='utf8') as f:
    json.dump(jsonstr, f, ensure_ascii=False)