In [1]:
from hdbscan import HDBSCAN
import nltk
from nltk.corpus import verbnet as vn
import pandas as pd
from sklearn.cluster import SpectralClustering, MeanShift
from sklearn.decomposition import PCA
#from sklearn.manifold import TSNE
from sklearn.metrics import f1_score, adjusted_rand_score, adjusted_mutual_info_score, v_measure_score
from sklearn.metrics import fowlkes_mallows_score
import sparse
from umap import UMAP


from cluster import ClusterVerbs
from eval_tensor import VerbTensorEvaluator

import matplotlib.pyplot as plt
%pylab inline
pylab.rcParams['figure.figsize'] = (8, 4)



Populating the interactive namespace from numpy and matplotlib


# Compared otVerbNet

In [2]:
def get_evalor_based_on_svo_sim(non_negative=True, decomp_algo='parafac'):
    if decomp_algo == 'parafac':
        if non_negative:
            evalor = VerbTensorEvaluator(non_negative=non_negative)
        else: # general
            evalor = VerbTensorEvaluator(non_negative=non_negative, decomp_algo=decomp_algo, 
                                         weight='npmi', include_empty=False, cutoff=300000, rank=256)    
    else: # tucker
        if non_negative:
            evalor = VerbTensorEvaluator(non_negative=non_negative, decomp_algo=decomp_algo,
                                         weight='npmi', include_empty=False, cutoff=1000000, rank=64)
        else: # general
            evalor = VerbTensorEvaluator(non_negative=non_negative, decomp_algo=decomp_algo,
                                         weight='npmi', include_empty=True, cutoff=100000, rank=64)
    evalor.load_embeddings()
    return evalor

In [3]:
def lookup_in_verbnet(verb):
    classes = vn.classids(lemma=verb)
    if classes:
        return classes#[0]
    else:
        return ''

In [4]:
def compare_to_verbnet(evalor, n_components=16, min_cluster_size=15):
    evalor.load_embeddings()
    df = pd.DataFrame(evalor.index['ROOT'], columns=['verb'])
    embed_vecs = evalor.decomped_tns.factors[1]
    if isinstance(embed_vecs, sparse.COO):
        embed_vecs = embed_vecs.todense()
    mapper = UMAP(n_neighbors=30, n_components=n_components, metric='euclidean', min_dist=0.0, random_state=42)# TODO n_components=16
    embed_vecs = mapper.fit_transform(embed_vecs)
    #plt.scatter(*embed_vecs.T, s=10)
    clusser = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=5) # TODO min_cluster_size, min_samples
    df['cluster'] = clusser.fit_predict(embed_vecs)
    df['vnet'] = df.verb.apply(lambda verb: vn.classids(lemma=verb))
    return df, adjusted_rand_score(df.vnet.astype(str), df.cluster.astype(str))

In [5]:
def show_clusters(evalor, n_components=16, min_cluster_size=15):
    df, score = compare_to_verbnet(evalor)#, n_components=n_components, min_cluster_size=min_cluster_size)
    print(score)
    cluster_df = pd.DataFrame(
        [(i, size, ', '.join(df[df.cluster==i].verb.values))
         for i, size in df.groupby('cluster').size().sort_values(ascending=False).head(20).to_dict().items()],
        columns=['index', 'n_verbs', 'verbs']).set_index('index')
    return cluster_df#.head()

In [6]:
evalor = get_evalor_based_on_svo_sim(non_negative=True, decomp_algo='parafac')
show_clusters(evalor, n_components=32, min_cluster_size=5)

0.003810098481864747


Unnamed: 0_level_0,n_verbs,verbs
index,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,220,"have, get, make, take, need, give, provide, he..."
12,102,"do, say, think, know, want, look, try, feel, m..."
5,62,"be, go, come, work, happen, seem, allow, conti..."
1,49,"tell, ask, call, thank, let, kill, hate, conta..."
8,46,"keep, put, break, hit, remove, cut, pull, thro..."
4,43,"read, write, check, post, contain, visit, view..."
7,40,"add, eat, cover, fill, drink, feed, clean, spr..."
11,39,"agree, move, decide, stand, return, die, sit, ..."
0,37,"start, support, carry, perform, complete, fini..."
3,33,"use, include, change, set, choose, develop, se..."


In [7]:
pd.set_option('lw', 90)

In [8]:
evalor = get_evalor_based_on_svo_sim(non_negative=False, decomp_algo='tucker')
show_clusters(evalor, n_components=16, min_cluster_size=15)#to_latex())

0.013850777052012504


Unnamed: 0_level_0,n_verbs,verbs
index,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,702,"have, do, get, go, take, think, know, want, need, give, look, work, provide, try, feel..."
32,131,"live, talk, stand, die, walk, wait, sit, stay, wonder, care, arrive, fly, gon, sleep, ..."
20,86,"kill, catch, trust, bear, email, marry, fuck, date, judge, bless, honor, forgive, beg,..."
7,85,"add, eat, produce, deliver, prepare, drink, spread, cook, burn, taste, wash, supply, s..."
29,80,"use, develop, manage, perform, complete, replace, install, connect, test, conduct, lau..."
17,80,"let, reach, hit, cost, exceed, rate, approach, /, -lsb-_VBD, rank, -lsb-_VB, \, -lsb-_..."
2,79,"put, break, pull, throw, push, lay, stick, grab, touch, press, suck, kick, shake, stre..."
0,77,"identify, commit, defend, repeat, expose, separate, dig, heal, dress, distinguish, kid..."
31,76,"send, check, view, click, display, generate, update, access, search, store, delete, ed..."
5,65,"leave, enter, visit, fill, explore, ride, clean, cross, surround, locate, clear, rent,..."


In [9]:
evalor = get_evalor_based_on_svo_sim(non_negative=False, decomp_algo='parafac')
show_clusters(evalor, n_components=16, min_cluster_size=5)

0.02342129279005374


Unnamed: 0_level_0,n_verbs,verbs
index,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,402,"have, do, take, use, give, provide, want, include, offer, like, bring, hear, write, ad..."
26,157,"be, think, look, happen, guess, care, count, misspell, appear, sleep, online, suppose,..."
21,77,"kill, hate, express, hurt, trust, imagine, blame, separate, promise, prove, feed, repe..."
23,64,"wear, work, turn, move, shake, clean, adore, pack, dig, dislike, hang, don, scratch, l..."
20,52,"cause, allow, drive, involve, prevent, recognize, define, promote, enable, limit, oppo..."
16,49,"make, know, recommend, consider, pass, forget, approve, implement, establish, extend, ..."
6,49,"find, show, understand, support, describe, develop, present, discuss, remove, publish,..."
11,49,"get, buy, lose, win, run, pick, build, own, purchase, control, stand, steal, retain, p..."
22,48,"put, pull, push, touch, point, grab, ride, kick, press, wash, lift, kiss, stick, suck,..."
13,33,"create, share, visit, choose, enter, view, select, edit, delete, install, access, stor..."


In [10]:
evalor = get_evalor_based_on_svo_sim(non_negative=True, decomp_algo='tucker')
show_clusters(evalor, n_components=16, min_cluster_size=15)

0.0031965275514762035


Unnamed: 0_level_0,n_verbs,verbs
index,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,147,"give, be, include, say, like, mean, meet, miss, contain, go, understand, follow, look,..."
15,93,"know, think, try, believe, guess, happen, figure, seem, care, count, wait, sound, trav..."
9,66,"require, affect, allow, drive, live, improve, hurt, control, involve, claim, encourage..."
7,65,"take, make, write, hear, change, set, turn, accept, consider, enter, pass, order, thro..."
10,46,"provide, offer, enjoy, bring, serve, kill, catch, feature, draw, deserve, hope, welcom..."
5,46,"have, use, need, buy, play, create, learn, sell, appreciate, build, choose, develop, p..."
1,46,"put, visit, talk, pull, thank, stop, let, trust, strike, bother, blame, inspire, assis..."
2,40,"tell, love, ask, help, call, teach, remind, lead, hate, join, contact, treat, invite, ..."
14,38,"do, feel, cause, experience, face, notice, fix, solve, prevent, encounter, suffer, fig..."
11,36,"get, receive, send, keep, share, win, own, view, return, seek, collect, edit, maintain..."


In [11]:
evalor = VerbTensorEvaluator(non_negative=False, decomp_algo='parafac', include_empty=False, 
                             weight='npmi', cutoff=300000, rank=256)
show_clusters(evalor)

0.02342129279005374


Unnamed: 0_level_0,n_verbs,verbs
index,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,402,"have, do, take, use, give, provide, want, include, offer, like, bring, hear, write, ad..."
26,157,"be, think, look, happen, guess, care, count, misspell, appear, sleep, online, suppose,..."
21,77,"kill, hate, express, hurt, trust, imagine, blame, separate, promise, prove, feed, repe..."
23,64,"wear, work, turn, move, shake, clean, adore, pack, dig, dislike, hang, don, scratch, l..."
20,52,"cause, allow, drive, involve, prevent, recognize, define, promote, enable, limit, oppo..."
16,49,"make, know, recommend, consider, pass, forget, approve, implement, establish, extend, ..."
6,49,"find, show, understand, support, describe, develop, present, discuss, remove, publish,..."
11,49,"get, buy, lose, win, run, pick, build, own, purchase, control, stand, steal, retain, p..."
22,48,"put, pull, push, touch, point, grab, ride, kick, press, wash, lift, kiss, stick, suck,..."
13,33,"create, share, visit, choose, enter, view, select, edit, delete, install, access, stor..."


In [12]:
evalor = VerbTensorEvaluator(non_negative=False, decomp_algo='parafac', include_empty=False, 
                             weight='npmi', cutoff=300000, rank=256)
show_clusters(evalor, n_components=32)

0.02342129279005374


Unnamed: 0_level_0,n_verbs,verbs
index,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,402,"have, do, take, use, give, provide, want, include, offer, like, bring, hear, write, ad..."
26,157,"be, think, look, happen, guess, care, count, misspell, appear, sleep, online, suppose,..."
21,77,"kill, hate, express, hurt, trust, imagine, blame, separate, promise, prove, feed, repe..."
23,64,"wear, work, turn, move, shake, clean, adore, pack, dig, dislike, hang, don, scratch, l..."
20,52,"cause, allow, drive, involve, prevent, recognize, define, promote, enable, limit, oppo..."
16,49,"make, know, recommend, consider, pass, forget, approve, implement, establish, extend, ..."
6,49,"find, show, understand, support, describe, develop, present, discuss, remove, publish,..."
11,49,"get, buy, lose, win, run, pick, build, own, purchase, control, stand, steal, retain, p..."
22,48,"put, pull, push, touch, point, grab, ride, kick, press, wash, lift, kiss, stick, suck,..."
13,33,"create, share, visit, choose, enter, view, select, edit, delete, install, access, stor..."


In [13]:
evalor = VerbTensorEvaluator(non_negative=False, decomp_algo='parafac', include_empty=False, 
                             weight='npmi', cutoff=300000, rank=256)
show_clusters(evalor, min_cluster_size=5)

0.02342129279005374


Unnamed: 0_level_0,n_verbs,verbs
index,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,402,"have, do, take, use, give, provide, want, include, offer, like, bring, hear, write, ad..."
26,157,"be, think, look, happen, guess, care, count, misspell, appear, sleep, online, suppose,..."
21,77,"kill, hate, express, hurt, trust, imagine, blame, separate, promise, prove, feed, repe..."
23,64,"wear, work, turn, move, shake, clean, adore, pack, dig, dislike, hang, don, scratch, l..."
20,52,"cause, allow, drive, involve, prevent, recognize, define, promote, enable, limit, oppo..."
16,49,"make, know, recommend, consider, pass, forget, approve, implement, establish, extend, ..."
6,49,"find, show, understand, support, describe, develop, present, discuss, remove, publish,..."
11,49,"get, buy, lose, win, run, pick, build, own, purchase, control, stand, steal, retain, p..."
22,48,"put, pull, push, touch, point, grab, ride, kick, press, wash, lift, kiss, stick, suck,..."
13,33,"create, share, visit, choose, enter, view, select, edit, delete, install, access, stor..."


# Attic

In [None]:
df, score = compare_to_verbnet()
show_clusters(df)

In [None]:
df_posibneg, score = compare_to_verbnet()
score

In [None]:
compare_to_verbnet(decomp_algo='parafac', rank=256, include_empty=False, cutoff=100000)[1]

In [None]:
compare_to_verbnet(non_negative=True, decomp_algo='parafac', cutoff=1000000)[1]

In [None]:
show_clusters(df_posibneg)

In [None]:
weights = ["iact" ,"iact_sali" ,"ldice" ,"ldice_sali" ,"log_freq" ,"niact" ,"npmi" ,"pmi" ,"pmi_sali"]

cutoffs = []
for i in [1, 2, 3, 5]:
    cutoffs += list(i*10**np.arange(2, 9))

def eval_agains_verbnet_for(algo='tucker'):    
    data = []
    for non_negative in range(2):
        for cutoff in cutoffs:
            for exp in range(10):
                rank = 2**exp
                for weight in weights:
                    try:
                        _, score = compare_to_verbnet(non_negative=non_negative, decomp_algo=algo,
                                                   rank=rank, cutoff=cutoff, weight=weight)
                        record = (cutoff, rank, weight, non_negative, algo, score)
                        logging.info((record))
                        data.append(record)
                    except FileNotFoundError:
                        pass
                    except AttributeError as e:
                        logging.warning((record, e))
                    except ValueError:
                        logging.warning(record)
    df = pd.DataFrame(data, columns=['cutoff', 'rank', 'weight', 'non_negative', 'algo', 'corr'])
    return df.sort_values('corr', ascending=False)

In [None]:
%time score_df = eval_agains_verbnet_for()

In [None]:
score_df.sort_values('corr', ascending=False)

In [None]:
df.groupby('cluster').size().sort_values(ascending=False).head(10)

In [None]:
#df.groupby(df.vnet.astype(str)).size().sort_values(ascending=False).head(10)

In [None]:
ser_clust = df.groupby(df.cluster).size().sort_values(ascending=False).values
plt.plot(ser_clust + 1)
plt.xscale('log')
plt.yscale('log')

In [None]:
ser_class = df.groupby(df.vnet.astype(str)).size().sort_values(ascending=False)
plt.plot(ser_class+1)
plt.xscale('log')
plt.yscale('log')

In [None]:
df1 = df[(df.vnet.str.len()!=0)&(df.cluster!=-1)]
df1.groupby(['cluster', df1.vnet.astype(str)]).size().sort_values(ascending=False).head(20)

In [None]:
adjusted_mutual_info_score(df.vnet.astype(str), df.cluster.astype(str))

In [None]:
fowlkes_mallows_score(df.vnet.astype(str), df.cluster.astype(str))

In [None]:
f1_score(df.vnet.apply(lambda l: 'amuse-31.1' in l), df.cluster==14)

In [None]:
df = df.reset_index()

In [None]:
plt.rcParams['figure.figsize'] = [16, 8]

In [None]:
plt.scatter(*verb_mx.T[0:], s=3, c=df.index)
plt.colorbar()

In [None]:
_ = plt.hist2d(df.index, df.vnet.str.len())

In [None]:
def least_class(classes):
    if classes:
        return min([int(class_.split('-')[1].split('.')[0]) for class_ in classes])
    else:
        return np.nan

In [None]:
df['least_class'] = df.vnet.apply(least_class)

In [None]:
df = df.fillna(df.least_class.max()+1)

In [None]:
df[df.vnet.str.len()==0].head()

In [None]:
ser = df.groupby('least_class').size()

In [None]:
ser.name = 'lclass_size'

In [None]:
df = df.set_index('least_class').join(ser)

In [None]:
df[df.least_class==9].head()

In [None]:
part = (df.lclass_size > 50).values
plt.scatter(*verb_mx[part].T, s=5, c=df.least_class[part])#==9)
plt.colorbar()