In [1]:
from hdbscan import HDBSCAN
import nltk
from nltk.corpus import verbnet as vn
import pandas as pd
from sklearn.cluster import SpectralClustering, MeanShift
from sklearn.decomposition import PCA
#from sklearn.manifold import TSNE
from sklearn.metrics import f1_score, adjusted_rand_score, adjusted_mutual_info_score, v_measure_score
from sklearn.metrics import fowlkes_mallows_score
from umap import UMAP


from cluster import ClusterVerbs
from eval_tensor import VerbTensorEvaluator

import matplotlib.pyplot as plt
# `%pylab inline` pulls numpy and matplotlib names into the interactive namespace;
# later cells rely on the `np` and `pylab` names it provides.
%pylab inline
# Default size for figures drawn in this notebook.
pylab.rcParams['figure.figsize'] = (8, 4)

Populating the interactive namespace from numpy and matplotlib


# Compared to VerbNet

In [3]:
def compare_to_verbnet(non_negative=False, decomp_algo='tucker', rank=64, cutoff=100000, weight='npmi',
                       min_cluster_size=50, min_samples=5, random_state=None):
    """Cluster verb embeddings and score the clustering against VerbNet classes.

    Loads the embeddings selected by the decomposition settings, reduces them
    to 10 dimensions with UMAP (cosine metric), clusters the reduced vectors
    with HDBSCAN and compares the cluster labels with each verb's VerbNet
    class ids.

    Parameters
    ----------
    non_negative, decomp_algo, rank, cutoff, weight
        Forwarded unchanged to ``VerbTensorEvaluator``.
    min_cluster_size, min_samples
        Forwarded unchanged to ``HDBSCAN``.
    random_state : int, RandomState or None
        Forwarded to ``UMAP``. The default ``None`` leaves the projection
        unseeded, so clusters and scores can differ between runs; pass an
        int to make a run repeatable.

    Returns
    -------
    (pandas.DataFrame, float)
        The frame has one row per verb with columns ``verb``, ``vnet`` (the
        list returned by ``vn.classids``) and ``cluster`` (HDBSCAN label).
        The float is the adjusted Rand score between the two labelings.

    Notes
    -----
    * The VerbNet label of a verb is the *string form of its whole class-id
      list*, so two verbs only agree when their lists are identical, and all
      verbs without any class share the label ``'[]'``.
    * HDBSCAN's ``-1`` label is scored like any other cluster.
    """
    evalor = VerbTensorEvaluator(non_negative=non_negative, decomp_algo=decomp_algo,
                                 rank=rank, cutoff=cutoff, weight=weight)
    evalor.load_embeddings()
    df = pd.DataFrame(evalor.index['ROOT'], columns=['verb'])
    df['vnet'] = df.verb.apply(lambda verb: vn.classids(lemma=verb))
    mapper = UMAP(n_neighbors=30, min_dist=0.0, n_components=10, metric='cosine',
                  random_state=random_state)
    # presumably factors[1] is the factor matrix whose rows line up with
    # evalor.index['ROOT'] -- TODO confirm against VerbTensorEvaluator
    lowdim_vecs = mapper.fit_transform(evalor.decomped_tns.factors[1])
    clusser = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples)
    df['cluster'] = clusser.fit_predict(lowdim_vecs)
    return df, adjusted_rand_score(df.vnet.astype(str), df.cluster.astype(str))

In [36]:
def show_clusters(df, top_n=20):
    """Summarise the largest clusters together with their member verbs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``cluster`` column (cluster label per row) and a
        ``verb`` column (string per row).
    top_n : int
        How many of the largest clusters to include (default 20).

    Returns
    -------
    pandas.DataFrame
        Indexed by cluster label (index name ``'index'``), ordered by
        descending cluster size, with columns ``n_verbs`` (cluster size) and
        ``verbs`` (the members joined with ``', '``).
    """
    largest = df.groupby('cluster').size().sort_values(ascending=False).head(top_n)
    rows = [
        (label, n_verbs, ', '.join(df.loc[df.cluster == label, 'verb'].values))
        for label, n_verbs in largest.items()
    ]
    return pd.DataFrame(rows, columns=['index', 'n_verbs', 'verbs']).set_index('index')

In [40]:
# Cluster with min_cluster_size=5 instead of the function default of 50;
# `score` receives the adjusted Rand score returned alongside the frame.
df_posibneg, score = compare_to_verbnet(min_cluster_size=5)

In [42]:
# Display the largest clusters of that run with their member verbs.
show_clusters(df_posibneg)

Unnamed: 0_level_0,n_verbs,verbs
index,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,514,"be, take, provide, include, mean, keep, put, p..."
9,114,"kill, catch, shoot, treat, trust, bear, feed, ..."
18,97,"wish, wonder, care, listen, gon, gather, pray,..."
34,80,"break, pull, push, lay, stick, roll, touch, pr..."
3,78,"commit, repeat, expose, separate, heal, distin..."
11,76,"tell, ask, call, thank, please, join, contact,..."
57,69,"check, view, click, display, generate, update,..."
26,67,"leave, enter, visit, fill, reserve, clean, cro..."
19,62,"live, wait, sleep, laugh, sing, cry, smile, re..."
10,61,"remind, strike, worry, blow, inspire, bother, ..."


In [32]:
# Association-weighting schemes tried in the grid search.
weights = [
    "iact", "iact_sali", "ldice", "ldice_sali", "log_freq",
    "niact", "npmi", "pmi", "pmi_sali",
]

# Cutoff grid: each multiplier (1, 2, 3, 5) times 10**2 .. 10**8,
# grouped by multiplier.
cutoffs = [
    cutoff
    for multiplier in [1, 2, 3, 5]
    for cutoff in multiplier * 10**np.arange(2, 9)
]

def eval_agains_verbnet_for(algo='tucker'):
    """Grid-search the decomposition settings and score each against VerbNet.

    Calls ``compare_to_verbnet`` for every combination of ``non_negative``
    (0 and 1), each value of the module-level ``cutoffs``, ranks
    ``2**0 .. 2**9`` and each scheme of the module-level ``weights``.

    Combinations raising ``FileNotFoundError`` are skipped quietly
    (presumably no stored decomposition exists for them -- confirm).
    ``AttributeError`` and ``ValueError`` are logged as warnings together
    with the configuration that failed, and the search continues.

    Parameters
    ----------
    algo : str
        Passed to ``compare_to_verbnet`` as ``decomp_algo`` and stored in
        every result row.

    Returns
    -------
    pandas.DataFrame
        One row per successful combination with columns ``cutoff``, ``rank``,
        ``weight``, ``non_negative``, ``algo`` and ``corr`` (the score
        returned by ``compare_to_verbnet``), sorted by ``corr`` descending.
    """
    # Imported here so the function does not depend on another cell having
    # brought `logging` into the kernel namespace.
    import logging
    from itertools import product

    records = []
    # non_negative is kept as the ints 0/1 (not booleans) so the values handed
    # to compare_to_verbnet and stored in the result stay as before.
    for non_negative, cutoff, exp, weight in product(range(2), cutoffs, range(10), weights):
        rank = 2**exp
        # Built before the call so the handlers below always describe the
        # configuration that actually failed.
        config = (cutoff, rank, weight, non_negative, algo)
        try:
            _, score = compare_to_verbnet(non_negative=non_negative, decomp_algo=algo,
                                          rank=rank, cutoff=cutoff, weight=weight)
        except FileNotFoundError:
            # Best-effort grid: combinations without data are simply skipped.
            continue
        except (AttributeError, ValueError) as err:
            logging.warning((config, err))
            continue
        record = config + (score,)
        logging.info(record)
        records.append(record)
    df = pd.DataFrame(records, columns=['cutoff', 'rank', 'weight', 'non_negative', 'algo', 'corr'])
    return df.sort_values('corr', ascending=False)

In [64]:
# Run the grid search with the default algo ('tucker'); %time reports how long it takes.
%time score_df = eval_agains_verbnet_for()

Disconnection_distance = 1 has removed 340 edges.
It has only fully disconnected 4 vertices.
Use umap.utils.disconnected_vertices() to identify them.
  f"A few of your vertices were disconnected from the manifold.  This shouldn't cause problems.\n"


CPU times: user 54min 53s, sys: 24.6 s, total: 55min 17s
Wall time: 6min 46s


In [37]:
# Rank the configurations by score; 'corr' holds the adjusted Rand score
# produced by compare_to_verbnet. eval_agains_verbnet_for already returns
# the frame in this order, so this only re-displays it.
score_df.sort_values('corr', ascending=False)

Unnamed: 0,cutoff,rank,weight,non_negative,algo,corr
31,50000,32,log_freq,0,tucker,0.022485
3,100000,64,iact,0,tucker,0.021839
40,50000,64,pmi_sali,0,tucker,0.020478
9,100000,64,npmi,0,tucker,0.018066
1,1000,4,log_freq,0,tucker,0.015387
...,...,...,...,...,...,...
53,300000,64,log_freq,1,tucker,-0.012484
59,500000,64,log_freq,1,tucker,-0.015166
60,500000,64,npmi,1,tucker,-0.017200
26,300000,64,log_freq,0,tucker,-0.018261


In [34]:
# Sizes of the ten largest clusters.
# NOTE(review): `df` is not assigned in any visible cell -- presumably a frame
# returned by compare_to_verbnet; confirm before relying on Restart & Run All.
df.groupby('cluster').size().sort_values(ascending=False).head(10)

cluster
-1     547
 29    100
 21     98
 30     94
 45     88
 25     81
 16     77
 5      71
 19     70
 4      69
dtype: int64

In [None]:
#df.groupby(df.vnet.astype(str)).size().sort_values(ascending=False).head(10)

In [None]:
# Cluster sizes in descending order, drawn on log-log axes.
ser_clust = df.groupby(df.cluster).size().sort_values(ascending=False).values
# NOTE(review): only y values are passed, so x is the position 0..n-1; the
# largest cluster sits at x=0, which a log x-axis cannot place -- confirm
# whether the +1 was meant for the rank rather than the size.
plt.plot(ser_clust + 1)
plt.xscale('log')
plt.yscale('log')

In [None]:
# Number of verbs per distinct VerbNet class-id list (lists compared by their
# string form), in descending order, drawn on log-log axes.
ser_class = df.groupby(df.vnet.astype(str)).size().sort_values(ascending=False)
# NOTE(review): a Series is plotted here, so its string index supplies the x
# data -- confirm the log x-scale behaves as intended for these labels.
plt.plot(ser_class+1)
plt.xscale('log')
plt.yscale('log')

In [None]:
# Keep verbs that have at least one VerbNet class id and whose cluster label
# is not -1, then list the 20 most frequent (cluster, class-id list) pairs.
df1 = df[(df.vnet.str.len()!=0)&(df.cluster!=-1)]
df1.groupby(['cluster', df1.vnet.astype(str)]).size().sort_values(ascending=False).head(20)

In [None]:
# Adjusted mutual information between the stringified VerbNet class-id lists
# and the cluster labels.
adjusted_mutual_info_score(df.vnet.astype(str), df.cluster.astype(str))

In [None]:
# Fowlkes-Mallows score between the stringified VerbNet class-id lists and
# the cluster labels.
fowlkes_mallows_score(df.vnet.astype(str), df.cluster.astype(str))

In [None]:
# F1 of using membership in cluster 14 as a prediction of membership in the
# VerbNet class 'amuse-31.1'.
# NOTE(review): the cluster id 14 is hard-coded, so it is only meaningful for
# the clustering run it was read from -- confirm before re-running.
f1_score(df.vnet.apply(lambda l: 'amuse-31.1' in l), df.cluster==14)

In [None]:
# Replace the index with a fresh 0..n-1 range; the previous index is kept as a column.
# The cell rebinds `df`, so running it a second time adds yet another column.
df = df.reset_index()

In [None]:
# Use a larger default figure size for the plots that follow.
plt.rcParams['figure.figsize'] = [16, 8]

In [None]:
# Scatter plot of the rows of verb_mx.T (unpacked as the x and y arguments),
# coloured by df's index; the `[0:]` slice keeps every row.
# NOTE(review): `verb_mx` is not assigned in any visible cell -- presumably a
# two-column embedding with one row per verb in df; confirm.
plt.scatter(*verb_mx.T[0:], s=3, c=df.index)
plt.colorbar()

In [None]:
# 2-D histogram of df's index against the length of each verb's `vnet` entry;
# assigning to `_` keeps the returned arrays out of the cell output.
_ = plt.hist2d(df.index, df.vnet.str.len())

In [None]:
def least_class(classes):
    """Return the smallest leading class number among VerbNet class ids.

    For each id, the text between the first and second '-' is taken and cut
    at its first '.', then converted to int; e.g. 'amuse-31.1' gives 31.

    Parameters
    ----------
    classes : sequence of str
        Class ids of one verb; may be empty.

    Returns
    -------
    int or float
        The minimum class number, or ``np.nan`` when ``classes`` is empty.
    """
    if not classes:
        return np.nan
    numbers = []
    for class_id in classes:
        numbered_part = class_id.split('-')[1]
        numbers.append(int(numbered_part.split('.')[0]))
    return min(numbers)

In [None]:
# Add a column with each verb's smallest class number (NaN where the verb has no class ids).
df['least_class'] = df.vnet.apply(least_class)

In [None]:
# Give verbs without a class number a sentinel one above the largest real
# number. Only the least_class column is filled, so missing values in any
# other column are left untouched.
df = df.fillna({'least_class': df.least_class.max() + 1})

In [None]:
# Peek at verbs whose `vnet` list is empty.
df[df.vnet.str.len()==0].head()

In [None]:
# Number of verbs per least_class value.
ser = df.groupby('least_class').size()

In [None]:
# Name the Series so that joining it onto df yields a column called 'lclass_size'.
ser.name = 'lclass_size'

In [None]:
# Attach each verb's class size as `lclass_size`, matching df's least_class
# column against ser's index. Joining with on= keeps least_class as a regular
# column (set_index would move it into the index, while later cells read
# df.least_class) and leaves the row order unchanged.
df = df.join(ser, on='least_class')

In [None]:
# Peek at verbs whose smallest class number is 9; this needs `least_class`
# to be available as a column of df.
df[df.least_class==9].head()

In [None]:
# Boolean mask (as a plain array) of verbs whose least_class group has more than 50 members.
part = (df.lclass_size > 50).values
# Scatter the masked rows of verb_mx, coloured by least_class.
# NOTE(review): `verb_mx` is not assigned in any visible cell, and the mask is
# applied by position -- presumably its rows are in the same order as df; confirm.
plt.scatter(*verb_mx[part].T, s=5, c=df.least_class[part])
plt.colorbar()