In [36]:
import pandas as pd
import altair as alt
import numpy as np
from sklearn.decomposition import PCA
import umap

In [103]:
# Read the tab-separated language metadata file into a DataFrame.
data = pd.read_csv("lang-meta.tsv", sep="\t")

In [92]:
# Every column pulled out of the raw table for the analysis.
keys = [
    'System', 'Carrier', 'Output Type', 'Conceptual Model',
    'Abstraction Mechanism', 'Source', 'Language Form', 'Coded Domain',
    'Execution Model', 'Alt API Available', 'Extensible',
    'Formal Definition Available', 'Language', 'Data manipulation',
    'Provides Accessibility', 'Juxtaposition strategy', 'Allowed Data Type',
    'Data model', 'Interaction source', 'Open Source', 'Dependent',
    'Mark Types', 'Series Types', 'Output Type Coded', 'Embedded language',
    'Coordinate Systems',
]

# Columns that are re-coded to a single 0/1 value per row.
binary_columns = [
    'Abstraction Mechanism', 'Alt API Available', 'Extensible',
    'Formal Definition Available', 'Provides Accessibility', 'Open Source',
    'Dependent',
]

# Categorical columns that are expanded with `pd.get_dummies`.
dumb_cols = [
    'Carrier', 'Output Type', 'Conceptual Model', 'Source', 'Language Form',
    'Coded Domain', 'Execution Model', 'Language', 'Data manipulation',
    'Juxtaposition strategy', 'Allowed Data Type', 'Data model',
    'Interaction source', 'Output Type Coded', 'Embedded language',
]

# Comma-separated multi-valued columns that get one indicator column per value.
# 'Series Types' is currently not expanded.
split_cols = ['Mark Types', 'Coordinate Systems']

def _split_tokens(cell):
    """Return the set of stripped, non-empty comma-separated tokens in `cell`.

    Missing cells yield an empty set.
    """
    if pd.isna(cell):
        return set()
    return {tok.strip() for tok in str(cell).split(',') if tok.strip()}


def _yes_no_to_int(value):
    """Encode a yes/no style answer as 1/0.

    Answers containing "no" (case-insensitive) and missing values map to 0;
    anything else maps to 1.
    """
    if pd.isna(value):
        return 0
    return 0 if "no" in str(value).lower() else 1


# Work on a copy restricted to the coded columns so `data` is left untouched
# and re-running this cell always starts from the same input.
df = data[keys].copy()

# Expand each multi-valued column into one 0/1 indicator column per value.
gen_split_cols = []
for col in split_cols:
    row_tokens = df[col].apply(_split_tokens)
    # sorted() keeps the generated column order stable between kernel
    # restarts (plain set iteration order is not).
    col_types = sorted(set().union(*row_tokens))
    for col_type in col_types:
        new_col = str(col + col_type + 'new')
        gen_split_cols.append(new_col)
        # Exact token membership: a substring test would also flag values
        # that merely contain this value's text.
        df[new_col] = row_tokens.apply(lambda toks, t=col_type: int(t in toks))

# Re-code the yes/no columns as 1/0.
for bin_col in binary_columns:
    df[bin_col] = df[bin_col].apply(_yes_no_to_int)

# Assemble the feature matrix: one-hot encode the categorical columns and
# drop the 'System' identifier, which is a label rather than a feature.
keep_cols = ['System'] + dumb_cols + binary_columns + gen_split_cols
out_df = pd.get_dummies(df[keep_cols], columns=dumb_cols)
ana_df = out_df.drop(columns=['System'])

In [93]:
# Fit a two-component PCA on the transposed feature matrix, so that
# `pca.components_` has one column per row of `ana_df`.
pca = PCA(n_components=2).fit(ana_df.T)
pca

(2, 56)

In [94]:
# Plot the PCA result, coloured by conceptual model.
simple_plot(pca.components_, color="Conceptual Model")

In [97]:
# UMAP is stochastic; pin the seed so the embedding (and the plot built from
# it) is reproducible across kernel restarts.
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(ana_df)

In [102]:
# Plot the UMAP embedding (transposed to 2 x n), coloured by conceptual model.
simple_plot(embedding.T, color="Conceptual Model")

In [101]:
def simple_plot(inp, color, palette=("#EE0000", "#AF00DB", "#267f99"), width=900, meta=None):
    """Draw a labelled scatter plot of a two-row projection of the systems.

    NOTE: this cell has to be executed before any cell that calls
    `simple_plot`.

    Parameters
    ----------
    inp : array-like with two rows
        Row 0 is used as the x coordinate and row 1 as the y coordinate.
        Columns are labelled with ``meta['System']``, so they must be in the
        same order as the rows of `meta`.
    color : str
        Name of the `meta` column used to colour the points.
    palette : sequence of str or None, optional
        Colours used as the range of the colour scale. Pass ``None`` to let
        Altair choose its default scheme (useful when `color` has more
        categories than the three default colours).
    width : int, optional
        Chart width passed to ``properties``.
    meta : pandas.DataFrame or None, optional
        Frame holding a 'System' column plus the `color` column. Defaults to
        the notebook-level `df`.

    Returns
    -------
    A layered Altair chart: circles plus a text label per system.
    """
    if meta is None:
        meta = df

    # Turn the 2 x n array into one row per system with columns x / y,
    # then attach the metadata so it can drive colour and tooltips.
    coords = (
        pd.DataFrame(inp, columns=meta['System'])
        .T
        .reset_index()
        .rename(columns={0: "x", 1: 'y'})
    )
    local_df = coords.merge(meta, on=['System'])

    if palette is None:
        color_encoding = alt.Color(color)
    else:
        color_encoding = alt.Color(color, scale=alt.Scale(range=list(palette)))

    base = alt.Chart(local_df).encode(
        x=alt.X("x", scale=alt.Scale(zero=False)),
        y=alt.Y("y", scale=alt.Scale(zero=False)),
        color=color_encoding,
        tooltip=["System"],
    ).properties(width=width)

    circles = base.mark_circle()
    # Shift labels upwards so they do not sit on top of their point.
    txts = base.mark_text(dy=-10).encode(text="System")
    return circles + txts

In [52]:
# Show which columns the merged plotting frame ends up with.
(
    pd.DataFrame(embedding.T, columns=df['System'])
    .T
    .reset_index()
    .rename(columns={0: "x", 1: 'y'})
    .merge(df, on=['System'])
    .columns
)


Index(['System', 'x', 'y', 'Carrier', 'Output Type', 'Conceptual Model',
       'Abstraction Mechanism', 'Source', 'Language Form', 'Coded Domain',
       'Execution Model', 'Alt API Available', 'Extensible',
       'Formal Definition Available', 'Language', 'Data manipulation',
       'Provides Accessibility', 'Juxtaposition strategy', 'Allowed Data Type',
       'Data model', 'Interaction source', 'Open Source', 'Dependent',
       'Mark Types', 'Series Types', 'Output Type Coded', 'Embedded language',
       'Coordinate Systems'],
      dtype='object')