In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [2]:
def load_list(filename):
    vocabulary = []
    with open(filename, 'r') as f:
        for l in f:
            vocabulary.append(l.strip())
    return np.asarray(vocabulary)

pos_related = load_list('./sentence_data/pos_related.txt')
neg_related = load_list('./sentence_data/neg_related.txt')
pos_unrelated = load_list('./sentence_data/pos_unrelated.txt')
neg_unrelated = load_list('./sentence_data/neg_unrelated.txt')

In [61]:
print(pos_related.shape)
print(neg_related.shape)
print(pos_unrelated.shape)
print(neg_unrelated.shape)

(466,)
(83,)
(388,)
(34,)


In [62]:
print(pos_related[:10])

['not perfect by a long shot, but definitely good for a smile on a bad day.'
 'the whole cast was great, each character had their own personality and charm.'
 'even though it has one of the standard "revenge price plot," this film is my favorite of vincent price\'s work.'
 'i really enjoyed this movie, it is really fun to watch get elvira into all these adventure, she is just great.'
 'with more laugh than any other third-in-a-disney-series movie, hakuna matata is worth watching - if only for the hot tub scene which is still funny despite being a little bit predictable.'
 'it is really a wonderful thriller i enjoyed very much'
 'when my sister said this movie was gonna be good i had second thought but i watched it and it was actually funny'
 'it touched me in a way that, even all these year later, still affects me.'
 'i strongly recommend seeing for all'
 'without a doubt, the best late night television ever.']


In [63]:
print(neg_related[:10])

['i had numerous problem with this film'
 'dear god i do not know where to start why this movie sucked too much'
 'i was pretty disappointed'
 'if you are tempted to watch this movie, rip your eyeball out and flush them down the toilet'
 'the music there was was annoying, and boring'
 'someone must have been seriously joking when they made this film'
 'ugly then, uglier now' 'this film is predictable'
 'even the supporting male character are all "bad"'
 'trust me, this is one let down movie that you want to avoid and this comes from one huge denzel washington fan']


In [64]:
related_set = np.hstack((pos_related, neg_related))
print(related_set.shape)
y_related = np.ones(related_set.shape)
print(y_related.shape)

(549,)
(549,)


In [65]:
unrelated_set = np.hstack((pos_unrelated, neg_unrelated))
print(unrelated_set.shape)
y_unrelated = np.zeros(unrelated_set.shape)
print(y_unrelated.shape)

(422,)
(422,)


In [66]:
X = np.hstack((related_set, unrelated_set))
y = np.hstack((y_related, y_unrelated))

X.shape

(971,)

In [80]:
from sklearn.feature_extraction.text import CountVectorizer

token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=1, binary=True, token_pattern=token)
tf_vectorizer.set_params(ngram_range=(1,1))

X_vectorized = tf_vectorizer.fit_transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.33, random_state=42)

In [81]:
words = tf_vectorizer.get_feature_names()
print(len(words))
print(words[500:600])

3685
['buscemi', 'business', 'but', 'button', 'buy', 'buying', 'by', 'cab', 'cable', 'cabot', 'cage', 'calhoun', 'call', 'called', 'callous', 'came', 'camera', 'camerawork', 'camp', 'campeone', 'camper', 'can', 'canada', 'canary', 'canceled', 'canister', 'cannot', 'canvas', 'canyon', 'capable', 'capano', 'capra', 'capture', 'captured', 'captures', 'car', 'care', 'career', 'cares', 'carl', 'carlisle', 'carnal', 'carpet', 'carries', 'cartoon', 'case', 'cassavetes', 'cassidy', 'cast', 'casting', 'cat', "cat's", 'catalog', 'catastrophe', 'catch', 'catchy', 'category', 'catherine', 'catholic', 'causes', 'cbc', 'cell', 'central', 'centre', 'century', "century's", 'certain', 'certainly', 'cgi', 'challenge', 'chambara', "chamberlain's", 'champion', 'championed', 'chance', 'chang', 'change', 'changed', 'channel', 'character', 'characterized', 'charismatic', 'charlie', 'charm', 'chase', 'chasing', 'cheap', 'cheating', 'check', 'checking', 'checkpoint', 'cheesiest', 'cheesy', 'chemistry', 'cheste

In [82]:
print(y_train.shape)
print(y_test.shape)

(650,)
(321,)


In [83]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)

print('Train : ', np.around(clf.score(X_train, y_train),5))
print('Test : ', np.around(clf.score(X_test, y_test),5))

Train :  0.99538
Test :  0.75078


In [122]:
from sklearn.metrics import confusion_matrix, recall_score, classification_report

confusion_matrix(y_test, y_predict, labels=[1,0])

array([[142,  46],
       [ 39,  94]], dtype=int64)

In [123]:
# Balanced Accuracy

np.around(recall_score(y_test, y_predict,average='weighted'), 3)

0.735

In [124]:
print(classification_report(y_test,y_predict))

             precision    recall  f1-score   support

        0.0       0.67      0.71      0.69       133
        1.0       0.78      0.76      0.77       188

avg / total       0.74      0.74      0.74       321



In [131]:
weights = clf.coef_.flatten()
def negative_positive_counts(X, y, word_index):
    neg_count = np.sum(X[y==0, word_index])
    pos_count = np.sum(X[y==1, word_index])    
    return neg_count, pos_count

In [140]:
not_related_indices = np.argsort(weights)
related_indices = not_related_indices[::-1]

print("word\tweight\trelated\tunrelated")
for i in not_related_indices[:20]:
    unrel_cnt, rel_cnt = negative_positive_counts(X_train, y_train, i)
    print("%s\t%0.2f\t%d\t%d" %(words[i], weights[i], rel_cnt, unrel_cnt))

word	weight	related	unrelated
remember	-1.37	0	12
saw	-1.32	2	11
when	-1.17	6	27
man	-1.13	1	15
into	-1.09	2	15
he	-1.06	4	33
big	-0.99	5	8
because	-0.99	6	13
have	-0.98	24	29
to	-0.94	82	126
about	-0.91	11	21
am	-0.89	6	16
too	-0.85	2	7
his	-0.85	10	34
which	-0.84	10	15
there	-0.83	9	18
fear	-0.79	0	3
we	-0.79	2	9
after	-0.78	2	13
want	-0.75	1	6


In [143]:
print("word\t\tweight\trelated\tunrelated")
for i in related_indices[:20]:
    unrel_cnt, rel_cnt = negative_positive_counts(X_train, y_train, i)
    print("%s\t\t%0.2f\t%d\t%d" %(words[i], weights[i], rel_cnt, unrel_cnt))

word		weight	related	unrelated
performance		1.25	14	0
scene		1.19	14	1
this		1.17	147	50
loved		1.09	8	0
best		1.06	25	4
recommend		1.05	13	0
excellent		0.95	15	1
worth		0.91	9	0
movie		0.82	85	28
great		0.82	26	6
disappointed		0.82	4	0
most		0.81	14	4
ever		0.80	14	2
character		0.75	14	4
acting		0.75	13	1
pretty		0.72	8	0
it		0.70	100	57
watch		0.67	14	3
must		0.66	11	2
wonderful		0.64	7	1


In [144]:
abs_indices = np.argsort(np.absolute(weights))[::-1]

print("word\tweight\trelated\tunrelated")
for i in abs_indices[:20]:
    unrel_cnt, rel_cnt = negative_positive_counts(X_train, y_train, i)
    print("%s\t%0.2f\t%d\t%d" %(words[i], weights[i], rel_cnt, unrel_cnt))

word	weight	related	unrelated
remember	-1.37	0	12
saw	-1.32	2	11
performance	1.25	14	0
scene	1.19	14	1
when	-1.17	6	27
this	1.17	147	50
man	-1.13	1	15
into	-1.09	2	15
loved	1.09	8	0
best	1.06	25	4
he	-1.06	4	33
recommend	1.05	13	0
big	-0.99	5	8
because	-0.99	6	13
have	-0.98	24	29
excellent	0.95	15	1
to	-0.94	82	126
about	-0.91	11	21
worth	0.91	9	0
am	-0.89	6	16


In [14]:
for sentence in X[:10] :
    print(sentence)

not perfect by a long shot, but definitely good for a smile on a bad day.
the whole cast was great, each character had their own personality and charm.
even though it has one of the standard "revenge price plot," this film is my favorite of vincent price's work.
i really enjoyed this movie, it is really fun to watch get elvira into all these adventure, she is just great.
with more laugh than any other third-in-a-disney-series movie, hakuna matata is worth watching - if only for the hot tub scene which is still funny despite being a little bit predictable.
it is really a wonderful thriller i enjoyed very much
when my sister said this movie was gonna be good i had second thought but i watched it and it was actually funny
it touched me in a way that, even all these year later, still affects me.
i strongly recommend seeing for all
without a doubt, the best late night television ever.


In [15]:
def save_pickle(path, X):
    with open(path, 'wb') as f:
        pickle.dump(X, f)
def open_pickle(path):
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

X_te_clean = open_pickle("./pickles/imdb_x_te_clean.pickle")
y_te = open_pickle("./pickles/imdb_y_te.pickle")

In [16]:
def print_sentence(corpus):
    from textblob import TextBlob
    text = TextBlob(corpus)
    i = 0
    sent = []
    for sentence in text.raw_sentences:
        sent.append(sentence)
    return sent

In [17]:
# import nltk
# nltk.download('punkt')
test = print_sentence(X_te_clean[0])

test_matrix = tf_vectorizer.transform(test) 
test_matrix.shape

y_pred_test = clf.predict(test_matrix)
for i in range(len(test)):
    print(y_pred_test[i], ':' , test[i])

1.0 : this was an excellent show.
0.0 : it came on pbs back home in chicago and i remember cindy herron (from envogue) played the teen aged daughter.
0.0 : the show dealt with subject such as sex, peer pressure and puberty.
0.0 : it was about a middle class black family who had a teen aged daughter and son who moved to a middle class neighborhood from oakland or somewhere (i can not remember).
0.0 : i remember several episode but the one i remember most was when their cousin got her period for the first time.
0.0 : i was probably 7-8 when i first watched it and i was able to keep up with the program.
1.0 : this was a great show.
0.0 : i can not remember the name of the guy who played the son on the show, but i always got him confused with kevin hook.


In [18]:
test = print_sentence(X_te_clean[1])

test_matrix = tf_vectorizer.transform(test) 
test_matrix.shape

y_pred_test = clf.predict(test_matrix)
x_extracted_1 = ''
print(y_te[1])
for i in range(len(test)):
    if y_pred_test[i] == 1:
        x_extracted_1 += test[i]
        print('-', test[i])

1
- age cannot tarnish the beauty of this east-west love story for me.
- with hong kong as the backdrop, this movie tells the story of a eurasian doctor and a u.s. journalist who meet and fall in love during the korean war.
- as mark elliott, william holden is intelligent, breezy and a bit weak; jennifer jone is perhaps well-nigh-perfect as dr. han suyin, by turns doubt-torn and ecstatic, eager and hesitant.
- other in the large cast include torin thatcher, isobel elsom, murray matheson, virginia gregg, richard loo, soo yong, philip ahn, jorja curtright and donna martell; many of hollywood's best oriental actor played smaller uncredited part also.
- the film is unarguably physically busy, interesting and often beautiful also.
- with cinematography by leon shamroy, ben nye's makeup and helen turpin's hairstyle, the great work by set decorator, sound and lighting, art department and all concerned, this has to be one of the most memorable production set in a major non-u.s. city of all tim

In [19]:
test = print_sentence(X_te_clean[3])

test_matrix = tf_vectorizer.transform(test) 
test_matrix.shape

y_pred_test = clf.predict(test_matrix)
x_extracted_3 = ''
print(y_te[3])
for i in range(len(test)):
    if y_pred_test[i] == 1:
        x_extracted_3 += test[i]
        print('-',test[i])

0
- a clear sign that this is unimpressive is that it was directed by a visual effect creator, whose only other credit in that field is a henry rooker film that was not well received.
- )'s russian accent laughable and/or irritating.
- the action is not terrible.
- cinematography and editing are fine.
- the music is cool enough.
- language is infrequent, if even that.
- violence is fairly bloody.
- i recommend this solely to fan of b-movie, and i will say that you can do worse than this.
- 1/10


In [20]:
## Test the clf 3

X_tr_clean = open_pickle("./pickles/imdb_x_tr_clean.pickle")
y_tr = open_pickle("./pickles/imdb_y_tr.pickle")
cv = CountVectorizer(min_df=100, token_pattern=token)
X_train_ = cv.fit_transform(X_tr_clean)

In [21]:
X_train_.shape
y_tr.shape

(25000,)

In [22]:
x_extracted = np.hstack((x_extracted_1, x_extracted_3))
X_test_ = cv.transform(x_extracted)
y_test_ = [1, 0]

In [23]:
X_test_.shape

(2, 3689)

In [24]:
clf_2 = LogisticRegression()
clf_2.fit(X_train_, y_tr)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
clf_2.score(X_test_, y_test_)

1.0