<a href="https://colab.research.google.com/github/pkolachi/geodist2typfeat/blob/master/exptnbs/sigtyp-st2020-part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%autosave 60
%matplotlib inline
%pylab

import sys
%pip install -q --user pandas==1.0.3 seaborn=0.10.0 scikit-learn==0.22.2.post1
import pandas    as pd
import seaborn   as sns
import itertools as it
from collections import Counter, defaultdict
from operator    import itemgetter
from IPython.display import display as pd_displayHTML

In [0]:
# https://people.mpi-inf.mpg.de/~pmiettin/slides/BooleanMatrixFactorizationsForDataMining_Antwerp_slides.pdf
# https://csustan.csustan.edu/~tom/Clustering/GraphLaplacian-tutorial.pdf
# https://towardsdatascience.com/spectral-clustering-aba2640c0d5b 


### Loading the dataset

In [0]:
fpurl   = 'https://raw.githubusercontent.com/sigtyp/ST2020/master/data/train.csv'
# The header row of the csv is not properly tab-separated, so the column names
# are hard-coded here and the file's own first row is skipped.
header  = ['wals_code', 'name', 
           'latitude', 'longitude', 
           'genus', 'family', 'countrycodes', 
           'features'
          ]
# `error_bad_lines=True` is not passed: raising on malformed rows is already
# the default, and the keyword no longer exists in recent pandas releases.
df = pd.read_csv(fpurl, sep='\t', header=None, names=header,
                 #index_col=0,
                 skiprows=[0])
missingVal, missingLbl = '*-missing-*', '*-unknown-*'

# Metadata columns (everything but the last). `astype` with a mapping returns a
# new frame, so `featsFull` is independent of `df` and converting the string
# columns to categoricals does not write into a slice of `df`.
catCols   = ['wals_code', 'name', 'genus', 'family', 'countrycodes']
featsFull = df.iloc[:, 0:-1].astype({incol: 'category' for incol in catCols})
# Last column: one '|'-separated string of 'feature=value' items per row.
clablFull = df.iloc[:, -1]

# Frequency of every individual 'feature=value' item over all rows ...
alablInst = Counter(albl for inst in clablFull for albl in inst.split('|'))
# ... tabulated by item, with ids assigned in most_common() order from 1.
alablTabl = pd.DataFrame([{'name': n, 'id': i, 'freq': f}
                          for i,(n,f) in enumerate(alablInst.most_common(), start=1)
                         ]).set_index('name')
# One column per feature holding that row's value; features a row does not
# list are filled with `missingLbl` instead of NaN.
alablFull = pd.DataFrame([dict(albl.split('=', 1) for albl in inst.split('|'))
                          for inst in clablFull
                         ]).fillna(missingLbl)
clablFull = clablFull.astype('category')
alablFull = alablFull.astype('category')
# Indicator frame with one column per 'feature=value' item: 1 where the row has
# that item. Items a row does not have are simply absent from its dict, so
# those cells are NaN (not 0) in the resulting frame.
slablFull = pd.DataFrame([{'{0}={1}'.format(fcn, lbl): 1
                           for fcn, lbl in row.items() if lbl != missingLbl
                          } for row in alablFull.to_dict(orient='records')
                         ])
print(featsFull.shape, clablFull.shape, alablFull.shape, slablFull.shape, 
      alablTabl.shape)
slablMat_ = slablFull 

### Sampling

In [0]:
N = 3
import random
# Fixed seed so the same rows are drawn on every Restart & Run All; a private
# Random instance leaves the global `random` state untouched.
sampleSeed = 20200408
# Alternative (evenly spaced rows instead of a random draw):
#subsid = list(range(0, featsFull.shape[0], featsFull.shape[0]//N))[:N]
# Draw N distinct *row* positions: `subsid` is used as a row indexer below, so
# the population is the number of rows (shape[0]), not the number of columns.
subsid = sorted(random.Random(sampleSeed).sample(range(slablFull.shape[0]), N))
# Keep only the label columns that are set in at least one sampled row. Absent
# labels may be NaN, and NaN is truthy in Python, so test for non-zero values
# explicitly after mapping NaN to 0.
subact = slablFull.iloc[subsid, :].fillna(0).ne(0).any(axis=0).to_numpy()
subfci = [idx for idx, active in enumerate(subact) if active]  # ascending column positions
subfcn = [slablFull.columns[idx] for idx in subfci]            # matching column names
print(len(subfci), len(subfcn))
spsLblMat_ = slablFull.iloc[subsid,subfci]

62 62


In [0]:
# Small hand-made label-indicator frame: each row carries exactly two labels
# (value 1); every other label column is filled with 0.
syndata = pd.DataFrame(
    [dict.fromkeys(labels, 1)
     for labels in (('l1', 'l2'),
                    ('l2', 'l3'),
                    ('l4', 'l5'),
                    ('l6', 'l7'),
                    ('l6', 'l8'),
                    ('l7', 'l9'))]
).fillna(0)
# Use the synthetic frame as the sampled label matrix.
spsLblMat_ = syndata

### Clustering labels

In [0]:
# The indicator frame may hold NaN for labels a row does not have; map those to
# 0 first, otherwise NaN propagates through the matrix product below.
# NOTE(review): this reads the full matrix `slablMat_`, not the sampled /
# synthetic `spsLblMat_` built in the cells above -- confirm that is intended.
spsLblMat = slablMat_.fillna(0).to_numpy().T  # (n_labels, n_samples)
# Co-occurrence counts: entry (i, j) = number of samples carrying both labels;
# the diagonal therefore holds each label's own frequency.
lablComat = np.dot(spsLblMat, spsLblMat.T)
lablFreqv = np.diag(lablComat)
# P(l1,l2) = #S(l1,l2)/N, with N the number of samples (axis 1 after the
# transpose above); the diagonal (self co-occurrence) is zeroed out.
lablProbs = (lablComat - np.diag(lablFreqv))/spsLblMat.shape[1]
# Unweighted label graph: an edge wherever two labels co-occur at least once.
lablAdjcy = (lablProbs > 0).astype(int)
# Degree matrix: number of non-zero entries in each row of `lablProbs`.
lablDegrs = np.diag(np.count_nonzero(lablProbs, axis=1))
# Graph Laplacian L = D - A.
lablLapcn = lablDegrs - lablAdjcy
# L is symmetric, so use `eigh`: it returns real eigenvalues in ascending
# order, whereas `eig` may return complex values with round-off imaginary parts.
eigvals, eigvecs = np.linalg.eigh(lablLapcn)
# Number of (numerically) zero eigenvalues of the Laplacian.
print(np.count_nonzero(eigvals < 1e-7))

In [0]:
from sklearn.cluster import (
    AffinityPropagation,
    AgglomerativeClustering,
    KMeans,
    SpectralClustering,
)

# Short display names for the clustering algorithms.
clstnms = ['kmns', 'spectral']
# Estimator instances; the two that take `random_state` here are seeded.
# NOTE(review): four estimators are built but only two names are listed above,
# so pairing the two lists (e.g. with zip) would silently skip
# AffinityPropagation and AgglomerativeClustering -- confirm which is intended.
clstobj = [
    estimator(**params)
    for estimator, params in (
        (KMeans,                  {'random_state': 20200408}),
        (SpectralClustering,      {'random_state': 20200408}),
        (AffinityPropagation,     {}),
        (AgglomerativeClustering, {}),
    )
]