In [1]:
import csv
import operator
import os
import re
from glob import glob

import numpy as np
from IPython import display
from scipy.sparse import diags, issparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils.validation import check_array

from coloredweighteddoc import ColoredWeightedDoc

In [2]:
def _read_labeled_reviews(pattern, label, corpus, labels):
    """Append the text of every file matching ``pattern`` to ``corpus`` and ``label`` to ``labels``."""
    # glob() returns matches in arbitrary, OS-dependent order; sorting keeps
    # document indices stable from one run (and one machine) to the next.
    for filename in sorted(glob(pattern)):
        # Decode explicitly as UTF-8 instead of the platform default
        # (assumes the review files are UTF-8 -- confirm for other datasets).
        # Undecodable bytes are replaced rather than raising.
        with open(filename, 'r', encoding='utf-8', errors='replace') as f:
            corpus.append(f.read())
        labels.append(label)


def load_imdb_data(path_to_imdb):
    """Load the IMDB review corpus from a directory tree.

    Expects ``train/neg``, ``train/pos``, ``test/neg`` and ``test/pos``
    sub-directories below ``path_to_imdb``, each holding one ``*.txt`` file
    per review.

    Parameters
    ----------
    path_to_imdb : str
        Root directory of the dataset.

    Returns
    -------
    train_corpus : list of str
        Training reviews, negatives first, then positives.
    y_train : list of int
        Labels aligned with ``train_corpus`` (0 = negative, 1 = positive).
    test_corpus : list of str
        Test reviews, negatives first, then positives.
    y_test : list of int
        Labels aligned with ``test_corpus``.
    """
    print("Loading the imdb reviews data")

    train_corpus, y_train = [], []
    _read_labeled_reviews(path_to_imdb + r"/train/neg/*.txt", 0, train_corpus, y_train)
    _read_labeled_reviews(path_to_imdb + r"/train/pos/*.txt", 1, train_corpus, y_train)

    test_corpus, y_test = [], []
    _read_labeled_reviews(path_to_imdb + r"/test/neg/*.txt", 0, test_corpus, y_test)
    _read_labeled_reviews(path_to_imdb + r"/test/pos/*.txt", 1, test_corpus, y_test)

    print("Data loaded.")
    return train_corpus, y_train, test_corpus, y_test

In [3]:
def load_vocabulary(filename):
    """Read a vocabulary file that lists one term per line.

    Leading and trailing whitespace is stripped from every line. Blank lines
    are skipped so that a stray empty line (for example at the end of the
    file) does not end up as an empty-string term in the vocabulary.

    Parameters
    ----------
    filename : str
        Path of the vocabulary file, read as UTF-8.

    Returns
    -------
    list of str
        The terms in file order.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        return [term for term in stripped if term]

In [4]:
# Root directory of the extracted aclImdb dataset. Set the IMDB_DATA_DIR
# environment variable to point somewhere else; the original machine-specific
# location remains the fallback so existing setups keep working.
path_to_imdb = os.environ.get("IMDB_DATA_DIR", "C:/Users/mbilgic/Desktop/aclImdb")

In [5]:
# Read the train/test reviews and their labels (0 = negative, 1 = positive) from disk.
train_corpus, y_train, test_corpus, y_test = load_imdb_data(path_to_imdb)

Loading the imdb reviews data
Data loaded.


In [6]:
# Turn the document and label lists into NumPy arrays so that later cells can
# select documents with boolean masks and index arrays.
train_corpus, test_corpus = np.array(train_corpus), np.array(test_corpus)
y_train, y_test = np.array(y_train), np.array(y_test)

In [7]:
# Token pattern shared by the vectorizers and the document-display calls below.
#tp=r"(?u)\b\w\w+\b" # default
# Customized: the character class also accepts apostrophes and slashes, so
# terms such as "1/10" or "don't" stay single tokens, and the "+" quantifier
# lets one-character tokens through as well.
tp=r"(?u)\b[\w\'/]+\b" # customized

In [136]:
# Unigram vocabulary, read from a text file with one term per line.
unigrams = load_vocabulary("imdb-unigrams.txt")

In [137]:
# Display the loaded unigram vocabulary.
unigrams

['1/10',
 '2/10',
 '3/10',
 '4/10',
 '5/10',
 '6/10',
 '7/10',
 '8/10',
 '9/10',
 '10/10',
 'amazing',
 'annoying',
 'avoid',
 'awful',
 'bad',
 'badly',
 'beautiful',
 'beautifully',
 'best',
 'bland',
 'boring',
 'brilliant',
 'cheap',
 'disappointed',
 'disappointing',
 'disappointment',
 'dreadful',
 'dull',
 'enjoyable',
 'enjoyed',
 'excellent',
 'fails',
 'fantastic',
 'fascinating',
 'favorite',
 'forgettable',
 'fun',
 'funny',
 'funniest',
 'gem',
 'great',
 'horrible',
 'incredible',
 'insult',
 'lacks',
 'lame',
 'laughable',
 'lousy',
 'loved',
 'mediocre',
 'mess',
 'mst3k',
 'noir',
 'obnoxious',
 'pathetic',
 'perfect',
 'perfectly',
 'pointless',
 'poor',
 'poorly',
 'predictable',
 'rare',
 'recommended',
 'redeeming',
 'refreshing',
 'ridiculous',
 'sadly',
 'solid',
 'stupid',
 'subtle',
 'superb',
 'surprisingly',
 'tedious',
 'terrible',
 'unfortunately',
 'unfunny',
 'waste',
 'wasted',
 'weak',
 'wonderful',
 'wonderfully',
 'worse',
 'worst']

In [9]:
# Binary (present / absent) features restricted to the fixed unigram vocabulary.
vectorizer = CountVectorizer(
    vocabulary=unigrams,
    token_pattern=tp,
    ngram_range=(1, 1),
    lowercase=True,
    binary=True,
)

In [10]:
# The vocabulary is fixed, so train and test matrices share the same columns
# (one per vocabulary term).
X_train = vectorizer.fit_transform(train_corpus)
X_test = vectorizer.transform(test_corpus)

In [11]:
# (number of training documents, number of vocabulary terms)
X_train.shape

(25000, 83)

In [12]:
# Vocabulary terms in column order. get_feature_names() no longer exists in
# newer scikit-learn releases (get_feature_names_out() replaces it), so use
# whichever is available and keep `words` a plain list in both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

In [13]:
# Document frequency of every term: column sums of the binary matrix,
# flattened into a 1-D array.
freq = np.asarray(X_train.sum(axis=0)).ravel()

In [14]:
# Column indices ordered from the least to the most frequent term.
fi = np.argsort(freq)

In [15]:
# L1-regularised logistic regression on the unigram features.
# The solver is named explicitly: only some solvers support the L1 penalty and
# recent scikit-learn versions default to one (lbfgs) that rejects it.
clf = LogisticRegression(penalty='l1', C=0.5, solver='liblinear')
clf.fit(X_train, y_train)
weights = clf.coef_[0]     # one coefficient per vocabulary term
bias = clf.intercept_[0]

# Alternative model, kept for reference:
#clf = MultinomialNB(alpha=100)
#clf.fit(X_train, y_train)
#weights = clf.feature_log_prob_[1] - clf.feature_log_prob_[0]

In [16]:
# Accuracy on the training set.
clf.score(X_train, y_train)

0.82016

In [17]:
# Accuracy on the held-out test set.
clf.score(X_test, y_test)

0.81559999999999999

In [18]:
# Number of terms whose coefficient is non-zero.
np.sum(weights != 0)

82

In [19]:
# Column indices ordered by increasing absolute weight.
wi = np.argsort(abs(weights))

In [20]:
# (term, document frequency, weight) triples in vocabulary order.
list(zip(words, freq, weights))

[('1/10', 158, -2.956906254202583),
 ('2/10', 121, -2.4775399377300573),
 ('3/10', 170, -3.5222232351512512),
 ('4/10', 173, -4.2043167783156594),
 ('5/10', 90, 0.29784363899764604),
 ('6/10', 29, 0.0),
 ('7/10', 198, 4.2769770120795361),
 ('8/10', 222, 3.2961716557129623),
 ('9/10', 153, 2.2242802100998489),
 ('10/10', 256, 2.2812518502971146),
 ('amazing', 1109, 1.0992629837113479),
 ('annoying', 881, -1.0065289765469307),
 ('avoid', 728, -1.1680118369150152),
 ('awful', 1444, -1.6269356479344206),
 ('bad', 5892, -0.92505027637421033),
 ('badly', 593, -1.135498547868991),
 ('beautiful', 1803, 0.75962780569902288),
 ('beautifully', 413, 1.2472732901992858),
 ('best', 4892, 0.78068641989742649),
 ('bland', 257, -0.94527913892289317),
 ('boring', 1508, -1.1131748811122091),
 ('brilliant', 1028, 0.84651463434808349),
 ('cheap', 789, -0.781522527036003),
 ('disappointed', 857, -0.6385252136443097),
 ('disappointing', 394, -1.234165085599765),
 ('disappointment', 388, -1.3591361317419783),

In [21]:
# (term, document frequency, weight) triples, largest absolute weight first.
[(words[i], freq[i], weights[i]) for i in wi[::-1]]

[('7/10', 198, 4.2769770120795361),
 ('4/10', 173, -4.2043167783156594),
 ('3/10', 170, -3.5222232351512512),
 ('8/10', 222, 3.2961716557129623),
 ('1/10', 158, -2.956906254202583),
 ('2/10', 121, -2.4775399377300573),
 ('10/10', 256, 2.2812518502971146),
 ('waste', 1302, -2.2256168492087305),
 ('9/10', 153, 2.2242802100998489),
 ('mst3k', 112, -1.8719553539988083),
 ('unfunny', 233, -1.8674942607747189),
 ('worst', 2264, -1.8486357723817051),
 ('poorly', 629, -1.8440987495137737),
 ('pointless', 458, -1.7588507758785035),
 ('refreshing', 197, 1.7190158753611384),
 ('wonderfully', 312, 1.6980241667274627),
 ('lousy', 202, -1.6860786578186409),
 ('laughable', 398, -1.6809203781327537),
 ('redeeming', 315, -1.6591362971403929),
 ('awful', 1444, -1.6269356479344206),
 ('mess', 592, -1.5295084130630565),
 ('wasted', 524, -1.5057639961610727),
 ('tedious', 210, -1.4753195036519626),
 ('insult', 207, -1.4250241908247441),
 ('dreadful', 224, -1.3790081385168673),
 ('excellent', 1779, 1.378436

In [22]:
# Pre-compiled version of the token pattern `tp`.
tokenizer = re.compile(tp)

# Evidences

In [23]:
def compute_evidences_nonnegative_matrix(weights, X, bias=0):
    """Split a linear model's score into negative and positive evidence.

    For every row of ``X`` the contributions of the negatively weighted
    features and of the positively weighted features are summed separately.
    As the name says, ``X`` is expected to be non-negative; in that case
    ``neg_evi`` is <= 0 and ``pos_evi`` is >= 0 before the bias is applied.

    The bias is added to the side that matches its sign, so that
    ``neg_evi + pos_evi`` always equals ``X.dot(weights) + bias``.

    Parameters
    ----------
    weights : array-like of shape (n_features,)
        Coefficients of the linear model.
    X : array-like or scipy sparse matrix of shape (n_docs, n_features)
        Feature matrix; validated with ``check_array`` (sparse input is
        accepted as CSR).
    bias : float, default 0
        Intercept of the linear model.

    Returns
    -------
    neg_evi, pos_evi : ndarray of shape (n_docs,)
        Negative and positive evidence per document.
    """
    X = check_array(X, accept_sparse="csr")
    weights = np.asarray(weights)
    # Zero out the coefficients of the opposite sign.
    neg_weights = weights * (weights < 0)
    pos_weights = weights * (weights > 0)

    # .dot() is a matrix-vector product for dense arrays and for scipy sparse
    # matrices alike, so no dense/sparse branching is needed.
    neg_evi = X.dot(neg_weights)
    pos_evi = X.dot(pos_weights)

    # Negative evidence is stored as non-positive numbers, so a negative bias
    # has to be *added* to it (subtracting would push it towards the positive
    # side). Out-of-place addition avoids casting errors on integer results.
    if bias > 0:
        pos_evi = pos_evi + bias
    else:
        neg_evi = neg_evi + bias
    return neg_evi, pos_evi

In [24]:
def frequency_of_a_phrase(p, corpus):
    """Return the number of documents in ``corpus`` for which ``p in doc`` is true."""
    return sum(1 for doc in corpus if p in doc)

## Most

In [25]:
# Class probabilities for the test set (column 0 = negative class, column 1 =
# positive class) and the per-document negative / positive evidence.
probs = clf.predict_proba(X_test)
neg_evi, pos_evi = compute_evidences_nonnegative_matrix(weights, X_test, bias)

### Most Negative wrt Probs

In [26]:
# Test document with the highest predicted probability for the negative class.
j = probs[:, 0].argmax()
print(probs[j], neg_evi[j], pos_evi[j], sep="\n")
display.display(
    ColoredWeightedDoc(test_corpus[j], words, weights, token_pattern=tp, binary=True)
)

[  9.99999986e-01   1.42727824e-08]
-20.9943073849
2.92939595367


### Most Positive wrt Probs

In [27]:
# Test document with the highest predicted probability for the positive class.
j = probs[:, 1].argmax()
print(probs[j], neg_evi[j], pos_evi[j], sep="\n")
display.display(
    ColoredWeightedDoc(test_corpus[j], words, weights, token_pattern=tp, binary=True)
)

[  1.80472696e-07   9.99999820e-01]
-3.4025902141
18.930276373


### Most-evidence

In [28]:
# Total amount of evidence per document, regardless of its sign.
total_abs_evi = pos_evi + abs(neg_evi)

In [29]:
# Test document with the largest total amount of evidence.
j = total_abs_evi.argmax()
print(probs[j], neg_evi[j], pos_evi[j], sep="\n")
display.display(
    ColoredWeightedDoc(test_corpus[j], words, weights, token_pattern=tp, binary=True)
)

[  9.99993028e-01   6.97216442e-06]
-22.90510327
11.0315253947


### Least-evidence

In [30]:
# Test document with the smallest total amount of evidence.
j = total_abs_evi.argmin()
print(probs[j], neg_evi[j], pos_evi[j], sep="\n")
display.display(
    ColoredWeightedDoc(test_corpus[j], words, weights, token_pattern=tp, binary=True)
)

[ 0.44535359  0.55464641]
0.0
0.219462269995


## Bigrams

In [138]:
# Bigram vocabulary, read from a text file with one phrase per line.
bigrams = load_vocabulary("imdb-bigrams.txt")

In [141]:
# Number of phrases in the bigram vocabulary.
len(bigrams)

54

In [32]:
# Binary (present / absent) features restricted to the fixed bigram vocabulary.
vectorizer = CountVectorizer(
    vocabulary=bigrams,
    token_pattern=tp,
    ngram_range=(2, 2),
    lowercase=True,
    binary=True,
)

In [33]:
# Rebuild the train/test matrices with the bigram vectorizer; this overwrites
# the unigram matrices from the previous section.
X_train = vectorizer.fit_transform(train_corpus)
X_test = vectorizer.transform(test_corpus)

In [34]:
# Bigram phrases in column order. get_feature_names() no longer exists in
# newer scikit-learn releases (get_feature_names_out() replaces it), so use
# whichever is available and keep `words` a plain list in both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

In [35]:
# Document frequency of every bigram (column sums of the binary matrix as a
# 1-D array) and the column indices ordered from least to most frequent.
freq = np.asarray(X_train.sum(axis=0)).ravel()
fi = freq.argsort()

In [36]:
# L1-regularised logistic regression on the bigram features.
# The solver is named explicitly: only some solvers support the L1 penalty and
# recent scikit-learn versions default to one (lbfgs) that rejects it.
clf = LogisticRegression(penalty='l1', C=0.5, solver='liblinear')
clf.fit(X_train, y_train)
weights = clf.coef_[0]     # one coefficient per bigram
bias = clf.intercept_[0]

In [37]:
# Training accuracy of the bigram-only model.
clf.score(X_train, y_train)

0.65680000000000005

In [38]:
# Test accuracy of the bigram-only model.
clf.score(X_test, y_test)

0.65259999999999996

In [39]:
# Column indices ordered by increasing absolute weight.
wi = np.argsort(abs(weights))

In [40]:
# (bigram, document frequency, weight) triples, most frequent first.
[(words[i], freq[i], weights[i]) for i in fi[::-1]]

[('the worst', 1820, -1.8669900901650722),
 ('so bad', 672, -1.74560860516686),
 ('an excellent', 532, 1.3505772179695021),
 ('a must', 438, 2.1315693517802758),
 ('i recommend', 383, 1.208512424826709),
 ('love this', 350, 1.4156881730977708),
 ('worst movie', 343, -1.6694442520267236),
 ('bad acting', 324, -2.1480098993190673),
 ('is excellent', 309, 1.6382002521731909),
 ('how bad', 271, -2.2561588430505677),
 ('loved it', 265, 1.7140276243895571),
 ('love it', 262, 1.0067052562314303),
 ('highly recommend', 255, 1.914453985389313),
 ('that bad', 238, -1.460581323125455),
 ('well worth', 237, 2.7428214927121783),
 ('avoid this', 221, -2.6906370602097032),
 ('great job', 218, 2.0046214856904307),
 ('not funny', 216, -2.273088383115998),
 ('highly recommended', 213, 2.8179902011556814),
 ('not worth', 208, -2.222313655626563),
 ('worst movies', 201, -1.9914049197154224),
 ('worst film', 194, -1.7652603187828542),
 ('is perfect', 188, 2.248306741366787),
 ('are great', 185, 1.026866988

In [41]:
# (bigram, document frequency, weight) triples, largest absolute weight first.
[(words[i], freq[i], weights[i]) for i in wi[::-1]]

[('this crap', 144, -2.9739106471838048),
 ('definitely worth', 123, 2.9733291183297497),
 ('is awful', 158, -2.9733101553247931),
 ('this mess', 119, -2.8709658366538204),
 ('highly recommended', 213, 2.8179902011556814),
 ('well worth', 237, 2.7428214927121783),
 ('avoid this', 221, -2.6906370602097032),
 ('skip this', 112, -2.5923307968732714),
 ('was terrible', 150, -2.4114692668426589),
 ('was awful', 107, -2.3246080589026747),
 ('not funny', 216, -2.273088383115998),
 ('poorly written', 106, -2.2573487429075079),
 ('how bad', 271, -2.2561588430505677),
 ('is perfect', 188, 2.248306741366787),
 ('not worth', 208, -2.222313655626563),
 ('loved this', 162, 2.2130920907761675),
 ('first rate', 112, 2.1502445074968461),
 ('bad acting', 324, -2.1480098993190673),
 ('a must', 438, 2.1315693517802758),
 ('no plot', 166, -2.1171230652697401),
 ('not recommend', 101, -2.1114629584970013),
 ('great job', 218, 2.0046214856904307),
 ('worst movies', 201, -1.9914049197154224),
 ('a disappointm

In [42]:
from coloredweighteddoc import ColoredWeightedDocBigram

In [43]:
# Display one test document (index 2) through ColoredWeightedDocBigram using
# the bigram model's weights.
# NOTE(review): presumably this colours the matched bigrams by weight --
# confirm in the coloredweighteddoc module.
j=2
display.display(ColoredWeightedDocBigram(test_corpus[j], words, weights, token_pattern=tp, binary = True))

# Both (unigrams and bigrams combined)

In [44]:
# Combined vocabulary: the unigram list followed by the bigram list.
both = unigrams + bigrams

In [45]:
# Binary (present / absent) features over the combined unigram + bigram vocabulary.
vectorizer = CountVectorizer(
    vocabulary=both,
    token_pattern=tp,
    ngram_range=(1, 2),
    lowercase=True,
    binary=True,
)

In [46]:
# Rebuild the train/test matrices with the combined vocabulary; this overwrites
# the bigram matrices from the previous section.
X_train = vectorizer.fit_transform(train_corpus)
X_test = vectorizer.transform(test_corpus)

In [47]:
# Combined vocabulary terms in column order. get_feature_names() no longer
# exists in newer scikit-learn releases (get_feature_names_out() replaces it),
# so use whichever is available and keep `words` a plain list in both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

In [48]:
# Document frequency of every term (column sums of the binary matrix as a 1-D
# array) and the column indices ordered from least to most frequent.
freq = np.asarray(X_train.sum(axis=0)).ravel()
fi = freq.argsort()

In [49]:
# L1-regularised logistic regression on the combined unigram + bigram features.
# The solver is named explicitly: only some solvers support the L1 penalty and
# recent scikit-learn versions default to one (lbfgs) that rejects it.
clf = LogisticRegression(penalty='l1', C=0.5, solver='liblinear')
clf.fit(X_train, y_train)
weights = clf.coef_[0]     # one coefficient per vocabulary term
bias = clf.intercept_[0]

In [50]:
# Training accuracy of the combined model.
clf.score(X_train, y_train)

0.83243999999999996

In [51]:
# Test accuracy of the combined model.
clf.score(X_test, y_test)

0.82776000000000005

In [52]:
# Column indices ordered by increasing absolute weight.
wi = np.argsort(abs(weights))

In [53]:
# (term, document frequency, weight) triples, most frequent first.
[(words[i], freq[i], weights[i]) for i in fi[::-1]]

[('great', 6311, 0.75969924382075549),
 ('bad', 5892, -0.78050325860243464),
 ('best', 4892, 0.75710935997331608),
 ('funny', 3136, 0.15008383892587471),
 ('worst', 2264, -1.3507447711836096),
 ('fun', 2138, 0.5547565164990903),
 ('the worst', 1820, -0.27598090917399953),
 ('beautiful', 1803, 0.73227380536238673),
 ('excellent', 1779, 1.2791283423502928),
 ('poor', 1597, -0.88681831736129335),
 ('boring', 1508, -1.057514637566966),
 ('awful', 1444, -1.5123245242977448),
 ('wonderful', 1440, 1.0995938887338348),
 ('perfect', 1358, 1.080237569744821),
 ('stupid', 1350, -1.1125308737372526),
 ('terrible', 1333, -0.87373518038249187),
 ('waste', 1302, -2.1690924033933685),
 ('worse', 1271, -0.94542796955805075),
 ('loved', 1245, 0.48124135478121743),
 ('unfortunately', 1242, -0.76558862313374243),
 ('enjoyed', 1141, 0.7103947327027017),
 ('amazing', 1109, 0.97512444626747408),
 ('favorite', 1086, 0.93228658370107231),
 ('brilliant', 1028, 0.82875878267079095),
 ('horrible', 1002, -1.192072

In [54]:
# (term, document frequency, weight) triples, largest absolute weight first.
[(words[i], freq[i], weights[i]) for i in wi[::-1]]

[('7/10', 198, 4.3517849133830238),
 ('4/10', 173, -4.1511388353914187),
 ('3/10', 170, -3.5677310447750807),
 ('8/10', 222, 3.0640787158833094),
 ('definitely worth', 123, 2.8401698233588051),
 ('1/10', 158, -2.7195584603459086),
 ('this crap', 144, -2.5389668020261973),
 ('pleasantly surprised', 104, 2.4997076217998879),
 ('well worth', 237, 2.4611110639898448),
 ('skip this', 112, -2.4097881383584481),
 ('10/10', 256, 2.2446002306678858),
 ('waste', 1302, -2.1690924033933685),
 ('9/10', 153, 2.1486879343361935),
 ('2/10', 121, -2.1405949084163294),
 ('not recommend', 101, -2.0813803594130187),
 ('first rate', 112, 1.9054242891732254),
 ('a must', 438, 1.8855367732228165),
 ('poorly', 629, -1.8476906380481228),
 ('mst3k', 112, -1.8159624939464181),
 ('unfunny', 233, -1.7829755077453804),
 ('not funny', 216, -1.7533885589936742),
 ('no plot', 166, -1.742404913244423),
 ('not worth', 208, -1.7399759097157241),
 ('highly recommended', 213, 1.7374117493429768),
 ('lousy', 202, -1.7224799

In [55]:
# Display the first test document twice with the combined model's weights:
# once through ColoredWeightedDoc and once through ColoredWeightedDocBigram.
j=0
print("Unigram")
display.display(ColoredWeightedDoc(test_corpus[j], words, weights, token_pattern=tp, binary = True))
print("\n\nBigram")
display.display(ColoredWeightedDocBigram(test_corpus[j], words, weights, token_pattern=tp, binary = True))

Unigram




Bigram


# Documents that contain none of the vocabulary words/phrases

In [155]:
# Row sums of the binary matrix: how many vocabulary terms each training
# document contains.
counts = np.sum(X_train, axis=1)

In [156]:
# Flatten the row sums into a 1-D array. np.asarray(...).ravel() gives the same
# result as `.A1` on the matrix returned by the sum, but also works when this
# cell is run again and `counts` is already a plain ndarray (which has no `.A1`).
counts = np.asarray(counts).ravel()

In [157]:
# Number of training documents that contain no vocabulary term at all.
np.sum(counts==0)

2001

In [158]:
# No fixed vocabulary this time: unigrams and bigrams are learned from the
# documents the vectorizer is fitted on, keeping terms that occur in at least
# 10 of them.
vectorizer = CountVectorizer(
    min_df=10,
    token_pattern=tp,
    ngram_range=(1, 2),
    lowercase=True,
    binary=True,
)

In [159]:
# Fit the new vectorizer only on the training documents that matched no
# vocabulary term (counts == 0).
X_empty = vectorizer.fit_transform(train_corpus[counts==0])

In [160]:
# Learned terms in column order. get_feature_names() no longer exists in newer
# scikit-learn releases (get_feature_names_out() replaces it), so use whichever
# is available and keep `words` a plain list in both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

In [161]:
# L1-regularised logistic regression trained only on the documents that
# matched no vocabulary term.
# The solver is named explicitly: only some solvers support the L1 penalty and
# recent scikit-learn versions default to one (lbfgs) that rejects it.
clf = LogisticRegression(penalty='l1', C=0.5, solver='liblinear')
clf.fit(X_empty, y_train[counts==0])
weights = clf.coef_[0]     # one coefficient per learned term
bias = clf.intercept_[0]

In [162]:
# Number of learned terms whose coefficient is non-zero.
np.sum(weights!=0)

693

In [163]:
# Column indices ordered by increasing absolute weight.
wi = np.argsort(abs(weights))

In [164]:
# Document frequency of every learned term within the zero-match documents:
# column sums of the binary matrix, flattened into a 1-D array.
freq = np.asarray(X_empty.sum(axis=0)).ravel()

In [165]:
# (term, document frequency, weight) triples, largest absolute weight first.
[(words[i], freq[i], weights[i]) for i in wi[::-1]]

[('oh', 56, -1.852160102352566),
 ('outstanding', 29, 1.4549736736742811),
 ('money', 91, -1.3536988703125243),
 ('nothing', 171, -1.2117504479141665),
 ('co', 33, -1.178242904327063),
 ('the us', 15, -1.1769631659552784),
 ('on dvd', 34, 1.1034627689293059),
 ('think the', 41, 1.0901622350942839),
 ("don't go", 12, -1.0209021772790512),
 ('okay', 29, -1.0071936768102998),
 ('material', 28, -0.94951546025153677),
 ('and to', 34, 0.90698344491712557),
 ('a way', 39, 0.8916700783541528),
 ('rape', 23, -0.88349961337916327),
 ('grade', 34, -0.87903573349085939),
 ('stands', 21, 0.87839158206514156),
 ('theme', 32, 0.87708317849462392),
 ('pass', 23, -0.83404556370827032),
 ('bored', 28, -0.81828006267770104),
 ('to find', 71, 0.81326958477101485),
 ("don't", 383, -0.80653757969418727),
 ('storyline', 29, -0.79906371319927694),
 ('effort', 32, -0.78314686926460308),
 ('him in', 34, 0.77039812915087758),
 ('memorable', 29, 0.76132327901339314),
 ('ten', 34, -0.75905259534558145),
 ('4', 56,