In [10]:
# Remove any previous checkout so the clone in the next cell starts from a clean directory.
!rm -rf probabilistic-lexicon-classification

In [11]:
# Fetch the project sources into ./probabilistic-lexicon-classification.
! git clone https://github.com/perathambkk/probabilistic-lexicon-classification.git

Cloning into 'probabilistic-lexicon-classification'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects:   6% (1/15)[Kremote: Counting objects:  13% (2/15)[Kremote: Counting objects:  20% (3/15)[Kremote: Counting objects:  26% (4/15)[Kremote: Counting objects:  33% (5/15)[Kremote: Counting objects:  40% (6/15)[Kremote: Counting objects:  46% (7/15)[Kremote: Counting objects:  53% (8/15)[Kremote: Counting objects:  60% (9/15)[Kremote: Counting objects:  66% (10/15)[Kremote: Counting objects:  73% (11/15)[Kremote: Counting objects:  80% (12/15)[Kremote: Counting objects:  86% (13/15)[Kremote: Counting objects:  93% (14/15)[Kremote: Counting objects: 100% (15/15)[Kremote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 36 (delta 5), reused 0 (delta 0), pack-reused 21[K
Unpacking objects: 100% (36/36), done.


In [None]:
# Switch into the checkout so the script run next and the local modules imported
# further down (admm, bayeslex_*) are found.
# NOTE(review): the absolute /content prefix assumes the clone landed there
# (looks like a Colab runtime) — confirm before running elsewhere.
%cd /content/probabilistic-lexicon-classification

In [12]:
# Run the reference script end to end on the cornell data with the two Liu
# lexicons (ADMM optimizer, prefiltering on, 250 epochs). The cells below
# repeat the same pipeline step by step, using the parser default of 200 epochs.
! python bayeslex.py --epochs 250 cornell liu-pos.utf8 liu-neg.utf8 --optimizer admm --prefilter



Namespace(admm_rho=1.0, epochs=250, extra=None, grad_based=False, iters_per_epoch=5, max_k=0.9, neglex='liu-neg.utf8', optimizer='admm', poslex='liu-pos.utf8', prefilter=True, prefix='cornell', verbosity=0, vocab_size=50000)
docs: 2000	 vocabulary: 38097	 tokens per doc: 636.835
lexicon sizes: 1448	3080
OOO baseline:	0.696	0.765
OOO presence:	0.711	0.770
OOO pmi:	0.712	0.761
c_hat= 545.7194884208576
prefiltering from 1448,3080 to 874,1868
ADMM optimization
done!	it=50	dual=7.03e-10<min(2.32e-08,3.39e-08)	primal=7.50e-12<7.92e-10
OOO LexiMom:	0.737	0.810
OOO LexiMom-Bayes:	0.750	0.824


In [17]:
from scipy.sparse import csr_matrix
import numpy as np
import scipy as sp
import sys
import argparse

import admm
from bayeslex_data import getLex, loadData, loadExtraData
from bayeslex_baselines import pmiPredictor, getLexClassifier
from bayeslex_eval import threeClassAcc, resultString
from bayeslex_opt import BayesLexOptimizer
from bayeslex_stats import estimateDCMFromMOM,computeR,computeRNonBayes,scale,makePredictionsKPerWord

In [20]:
# Command-line interface for the experiment. The three positionals are the
# dataset prefix (handed to loadData) and the positive / negative lexicon
# files (handed to getLex).
parser = argparse.ArgumentParser()
parser.add_argument('prefix')
parser.add_argument('poslex')
parser.add_argument('neglex')
parser.add_argument('--vocab_size',default=50000,type=int)
parser.add_argument('--epochs',default=200,type=int)
parser.add_argument('--iters_per_epoch',default=5,type=int)
# Restrict the optimizer to the two values the dispatch code handles, so a typo
# is rejected at parse time rather than after data loading and the baselines.
parser.add_argument('--optimizer',default='admm',choices=('admm','slsqp'),
                    help="""Optimizer used to fit the model (admm or slsqp)""")
parser.add_argument('--admm_rho',default=1.0,type=float)
parser.add_argument('--max_k',default=0.9,type=float)
parser.add_argument('--verbosity',default=0,type=int)
parser.add_argument('--extra',default=None,type=str)

# --prefilter / --no-prefilter write the same destination and cannot be
# combined. The default is True, so only --no-prefilter changes the outcome.
prefilter_group = parser.add_mutually_exclusive_group(required=False)
prefilter_group.add_argument('--prefilter',dest='prefilter',action='store_true',
                             help="""Prefilter the vocabulary to only include items
                             whose observed cross-lexicon counts are lower than expected.
                             Does not seem to make things better.""")
prefilter_group.add_argument('--no-prefilter',dest='prefilter',action='store_false')
parser.set_defaults(prefilter=True)

# --grad / --quadratic pick the solver used inside the ADMM inner loop.
# The default (grad_based=False) is the closed-form quadratic variant.
grad_based_group = parser.add_mutually_exclusive_group(required=False)
grad_based_group.add_argument('--grad',dest='grad_based',action='store_true',
                              help="""Use gradient-based optimization inside ADMM inner loop""")
grad_based_group.add_argument('--quadratic',dest='grad_based',action='store_false',
                              help="""Use closed-form quadratic optimization inside ADMM inner loop""")
parser.set_defaults(grad_based=False)

In [23]:
# Parse an explicit argument list (instead of sys.argv) so the notebook runs
# with the cornell dataset prefix and the two Liu lexicon files.
args = parser.parse_args(
    args=[
        "cornell",
        "liu-pos.utf8",
        "liu-neg.utf8",
    ]
)

In [24]:
# Echo the parsed namespace to confirm which defaults are in effect.
args

Namespace(admm_rho=1.0, epochs=200, extra=None, grad_based=False, iters_per_epoch=5, max_k=0.9, neglex='liu-neg.utf8', optimizer='admm', poslex='liu-pos.utf8', prefilter=True, prefix='cornell', verbosity=0, vocab_size=50000)

In [25]:
# Load the labels, the document-term matrix and the vocabulary for the dataset
# named on the command line.
y, x, vocab = loadData(args.prefix, args.vocab_size)

# Echo the run configuration, then basic corpus statistics.
print(args)
print("====================================")
print(
    "docs: %d\t vocabulary: %d\t tokens per doc: %.3f"
    % (
        x.shape[0],
        x.shape[1],
        x.sum(axis=1).mean(),
    )
)

# Read the positive lexicon first, then the negative one, against the vocabulary.
pos_lex, neg_lex = (getLex(lex_path, vocab) for lex_path in (args.poslex, args.neglex))
print("lexicon sizes: %d\t%d" % (len(pos_lex), len(neg_lex)))

Namespace(admm_rho=1.0, epochs=200, extra=None, grad_based=False, iters_per_epoch=5, max_k=0.9, neglex='liu-neg.utf8', optimizer='admm', poslex='liu-pos.utf8', prefilter=True, prefix='cornell', verbosity=0, vocab_size=50000)
docs: 2000	 vocabulary: 38097	 tokens per doc: 636.835
lexicon sizes: 1448	3080


In [26]:
# Lexicon classifier shared by the two counting baselines below.
clf = getLexClassifier(pos_lex, neg_lex, vocab)

# Baseline 1: apply the classifier to the raw count matrix.
pred_baseline = np.asarray(clf.dot(x.T).todense())[0]
print(resultString(scale(pred_baseline, x), y, "baseline"))

# Baseline 2: same classifier on the binary presence matrix (x > 0).
pred_presence = np.asarray(clf.dot((x > 0).T).todense())[0]
print(resultString(scale(pred_presence, x), y, "presence"))

# Baseline 3: PMI-based predictor.
pred_pmi = pmiPredictor(x, pos_lex, neg_lex)
print(resultString(scale(pred_pmi, x), y, "pmi"))

# Method-of-moments estimate; c_hat is reused by the prediction cells further down.
e_mu, c_hat = estimateDCMFromMOM(x)
print('c_hat=', c_hat)

OOO baseline:	0.696	0.765
OOO presence:	0.711	0.770
OOO pmi:	0.712	0.761
c_hat= 545.7194884208576


In [27]:
# Train on the labelled matrix alone unless --extra was given, in which case
# the extra documents are stacked underneath it.
x_train = (
    x
    if args.extra is None
    else sp.sparse.vstack([x, loadExtraData(args.extra, vocab)])
)

In [28]:
# Validate the optimizer name before building the optimizer object: the
# constructor already does work (its prefiltering message is printed before the
# optimizer banner), so an unknown name should fail before that is spent.
# ValueError is a subclass of Exception, so existing handlers still catch it.
if args.optimizer not in ('admm', 'slsqp'):
    raise ValueError('Valid optimizers are admm and slsqp only')

opt = BayesLexOptimizer(x_train,pos_lex,neg_lex,
                        prefilter=args.prefilter,
                        max_k=args.max_k,
                        verbosity=args.verbosity)

if args.optimizer == 'admm':
    print('ADMM optimization')
    opt.estimateADMM(max_iter=args.iters_per_epoch,
                     n_epochs=args.epochs,
                     rho=args.admm_rho,
                     grad_based=args.grad_based)
else:
    # Only 'slsqp' can reach this branch after the check above.
    print('SLSQP optimization (warning, slow!)')
    opt.estimateSLSQP(max_iter=args.iters_per_epoch,
                      n_epochs=args.epochs)

prefiltering from 1448,3080 to 874,1868
ADMM optimization
done!	it=50	dual=7.03e-10<min(2.32e-08,3.39e-08)	primal=7.50e-12<7.92e-10


In [29]:
# Non-Bayesian predictions, using the lexicons and k_pos / k_neg held by the
# optimizer object (presumably the prefiltered lexicons — confirm) plus c_hat.
pred_khat = makePredictionsKPerWord(
    x,
    opt.pos_lex,
    opt.neg_lex,
    opt.k_pos,
    opt.k_neg,
    c_hat,
    bayesian=False,
)
print(resultString(pred_khat, y, "LexiMom"))

OOO LexiMom:	0.737	0.810


In [30]:
# Same prediction call as the LexiMom cell, but with the Bayesian variant
# switched on.
pred_khat_bayes = makePredictionsKPerWord(
    x,
    opt.pos_lex,
    opt.neg_lex,
    opt.k_pos,
    opt.k_neg,
    c_hat,
    bayesian=True,
)
print(resultString(pred_khat_bayes, y, "LexiMom-Bayes"))

OOO LexiMom-Bayes:	0.750	0.824
