In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [2]:
def load_list(filename):
    vocabulary = []
    with open(filename, 'r') as f:
        for l in f:
            vocabulary.append(l.strip())
    return np.asarray(vocabulary)

pos_related = load_list('./sentence_data/pos_related.txt')
neg_related = load_list('./sentence_data/neg_related.txt')
pos_unrelated = load_list('./sentence_data/pos_unrelated.txt')
neg_unrelated = load_list('./sentence_data/neg_unrelated.txt')

In [3]:
print(pos_related.shape)
print(neg_related.shape)
print(pos_unrelated.shape)
print(neg_unrelated.shape)

(466,)
(198,)
(388,)
(149,)


In [4]:
print(pos_related[:10])

['not perfect by a long shot, but definitely good for a smile on a bad day.'
 'the whole cast was great, each character had their own personality and charm.'
 'even though it has one of the standard "revenge price plot," this film is my favorite of vincent price\'s work.'
 'i really enjoyed this movie, it is really fun to watch get elvira into all these adventure, she is just great.'
 'with more laugh than any other third-in-a-disney-series movie, hakuna matata is worth watching - if only for the hot tub scene which is still funny despite being a little bit predictable.'
 'it is really a wonderful thriller i enjoyed very much'
 'when my sister said this movie was gonna be good i had second thought but i watched it and it was actually funny'
 'it touched me in a way that, even all these year later, still affects me.'
 'i strongly recommend seeing for all'
 'without a doubt, the best late night television ever.']


In [5]:
print(neg_related[:10])

['ï»¿i had numerous problem with this film'
 'dear god i do not know where to start why this movie sucked too much'
 'i was pretty disappointed'
 'if you are tempted to watch this movie, rip your eyeball out and flush them down the toilet'
 'the music there was was annoying, and boring'
 'someone must have been seriously joking when they made this film'
 'ugly then, uglier now' 'this film is predictable'
 'even the supporting male character are all "bad"'
 'trust me, this is one let down movie that you want to avoid and this comes from one huge denzel washington fan']


In [6]:
related_set = np.hstack((pos_related, neg_related))
print(related_set.shape)
y_related = np.ones(related_set.shape)
print(y_related.shape)

(664,)
(664,)


In [7]:
unrelated_set = np.hstack((pos_unrelated, neg_unrelated))
print(unrelated_set.shape)
y_unrelated = np.zeros(unrelated_set.shape)
print(y_unrelated.shape)

(537,)
(537,)


In [8]:
X = np.hstack((related_set, unrelated_set))
y = np.hstack((y_related, y_unrelated))

X.shape

(1201,)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=1, binary=True, token_pattern=token)
tf_vectorizer.set_params(ngram_range=(1,1))

X_vectorized = tf_vectorizer.fit_transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.33, random_state=42)

In [10]:
words = tf_vectorizer.get_feature_names()
print(len(words))
print(words[500:600])

4018
['brian', 'bridge', 'brief', 'brilliant', 'bring', 'britain', 'britian', 'british', 'britton', 'bro', 'broadcast', 'brooklyn', 'broomstick', 'brother', 'brought', 'brown', 'brussel', 'brutally', 'btw', 'buck', 'buckaroo', 'budget', 'buff', 'building', 'builds', 'bull', 'bully', 'bullying', 'bumbling', 'bunch', 'burgeoning', 'burn', 'burned', 'burnt', 'burrow', 'burton', 'busby', 'buscemi', 'business', 'but', 'button', 'buy', 'buying', 'by', 'caan', 'cab', 'cable', 'cabot', 'cage', 'calhoun', 'call', 'called', 'callous', 'came', 'camera', 'camerawork', 'camp', 'campeone', 'camper', 'can', 'canada', 'canary', 'canceled', 'canister', 'cannot', 'canvas', 'canyon', 'capable', 'capano', 'capra', 'capt', 'captain', 'captivating', 'capture', 'captured', 'captures', 'car', 'care', 'career', 'careless', 'cares', 'carl', 'carlisle', 'carnal', 'carpet', 'carries', 'cartoon', 'case', 'cassavetes', 'cassidy', 'cast', 'casting', 'cat', "cat's", 'catalog', 'catastrophe', 'catch', 'catchy', 'categ

In [11]:
print(y_train.shape)
print(y_test.shape)

(804,)
(397,)


In [12]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)

y_predict = clf.predict(X_test)

print('Train : ', np.around(clf.score(X_train, y_train),5))
print('Test : ', np.around(clf.score(X_test, y_test),5))

Train :  0.98632
Test :  0.74307


In [13]:
print(np.sum(y_train==0))

369


In [14]:
188./(188+133)

0.5856697819314641

In [15]:
from sklearn.metrics import confusion_matrix, recall_score, classification_report

print(confusion_matrix(y_test, y_predict, labels=[1,0]))

[[180  49]
 [ 53 115]]


In [16]:
# Balanced Accuracy

np.around(recall_score(y_test, y_predict,average='weighted'), 3)

0.743

In [17]:
print(classification_report(y_test,y_predict))

             precision    recall  f1-score   support

        0.0       0.70      0.68      0.69       168
        1.0       0.77      0.79      0.78       229

avg / total       0.74      0.74      0.74       397



In [18]:
weights = clf.coef_.flatten()
def negative_positive_counts(X, y, word_index):
    neg_count = np.sum(X[y==0, word_index])
    pos_count = np.sum(X[y==1, word_index])    
    return neg_count, pos_count

In [19]:
not_related_indices = np.argsort(weights)
related_indices = not_related_indices[::-1]

print("word\tweight\trelated\tunrelated")
for i in not_related_indices[:20]:
    unrel_cnt, rel_cnt = negative_positive_counts(X_train, y_train, i)
    print("%s\t%0.2f\t%d\t%d" %(words[i], weights[i], rel_cnt, unrel_cnt))

word	weight	related	unrelated
when	-1.38	5	35
saw	-1.28	3	10
remember	-1.22	0	10
friend	-1.18	0	13
he	-1.15	8	47
about	-1.03	13	21
his	-0.97	9	39
back	-0.94	3	8
could	-0.91	4	12
first	-0.90	6	20
person	-0.86	10	14
man	-0.86	2	12
because	-0.83	7	17
through	-0.81	3	13
child	-0.80	0	4
in	-0.79	70	100
as	-0.78	30	42
have	-0.78	29	37
woman	-0.76	0	6
we	-0.75	1	12


In [20]:
print("word\t\tweight\trelated\tunrelated")
for i in related_indices[:20]:
    unrel_cnt, rel_cnt = negative_positive_counts(X_train, y_train, i)
    print("%s\t\t%0.2f\t%d\t%d" %(words[i], weights[i], rel_cnt, unrel_cnt))

word		weight	related	unrelated
recommend		1.37	16	0
ever		1.28	22	2
performance		1.18	13	0
worth		1.17	13	0
acting		1.12	17	0
this		1.10	164	65
loved		1.09	8	0
perfect		1.02	7	0
pretty		1.01	12	0
here		0.89	10	5
watch		0.88	15	4
scene		0.88	12	3
quite		0.87	7	1
film		0.85	89	39
piece		0.84	6	1
best		0.83	24	8
any		0.83	14	4
excellent		0.82	13	1
very		0.80	27	6
all		0.78	35	22


In [21]:
abs_indices = np.argsort(np.absolute(weights))[::-1]

print("word\tweight\trelated\tunrelated")
for i in abs_indices[:20]:
    unrel_cnt, rel_cnt = negative_positive_counts(X_train, y_train, i)
    print("%s\t%0.2f\t%d\t%d" %(words[i], weights[i], rel_cnt, unrel_cnt))

word	weight	related	unrelated
when	-1.38	5	35
recommend	1.37	16	0
saw	-1.28	3	10
ever	1.28	22	2
remember	-1.22	0	10
performance	1.18	13	0
friend	-1.18	0	13
worth	1.17	13	0
he	-1.15	8	47
acting	1.12	17	0
this	1.10	164	65
loved	1.09	8	0
about	-1.03	13	21
perfect	1.02	7	0
pretty	1.01	12	0
his	-0.97	9	39
back	-0.94	3	8
could	-0.91	4	12
first	-0.90	6	20
here	0.89	10	5


In [22]:
for sentence in X[:10] :
    print(sentence)

not perfect by a long shot, but definitely good for a smile on a bad day.
the whole cast was great, each character had their own personality and charm.
even though it has one of the standard "revenge price plot," this film is my favorite of vincent price's work.
i really enjoyed this movie, it is really fun to watch get elvira into all these adventure, she is just great.
with more laugh than any other third-in-a-disney-series movie, hakuna matata is worth watching - if only for the hot tub scene which is still funny despite being a little bit predictable.
it is really a wonderful thriller i enjoyed very much
when my sister said this movie was gonna be good i had second thought but i watched it and it was actually funny
it touched me in a way that, even all these year later, still affects me.
i strongly recommend seeing for all
without a doubt, the best late night television ever.


In [23]:
def save_pickle(path, X):
    with open(path, 'wb') as f:
        pickle.dump(X, f)
def open_pickle(path):
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

X_te_clean = open_pickle("./pickles/imdb_x_te_clean.pickle")
y_te = open_pickle("./pickles/imdb_y_te.pickle")

In [24]:
def print_sentence(corpus):
    from textblob import TextBlob
    text = TextBlob(corpus)
    i = 0
    sent = []
    for sentence in text.raw_sentences:
        sent.append(sentence)
    return sent

In [25]:
# import nltk
# nltk.download('punkt')
test = print_sentence(X_te_clean[0])

test_matrix = tf_vectorizer.transform(test) 
test_matrix.shape

y_pred_test = clf.predict(test_matrix)
for i in range(len(test)):
    print(y_pred_test[i], ':' , test[i])

1.0 : this was an excellent show.
0.0 : it came on pbs back home in chicago and i remember cindy herron (from envogue) played the teen aged daughter.
0.0 : the show dealt with subject such as sex, peer pressure and puberty.
0.0 : it was about a middle class black family who had a teen aged daughter and son who moved to a middle class neighborhood from oakland or somewhere (i can not remember).
0.0 : i remember several episode but the one i remember most was when their cousin got her period for the first time.
0.0 : i was probably 7-8 when i first watched it and i was able to keep up with the program.
1.0 : this was a great show.
0.0 : i can not remember the name of the guy who played the son on the show, but i always got him confused with kevin hook.


In [34]:
test = print_sentence(X_te_clean[1])

test_matrix = tf_vectorizer.transform(test) 
test_matrix.shape

y_pred_test = clf.predict(test_matrix)
x_extracted_1 = ''
print(y_te[1])
for i in range(len(test)):
    if y_pred_test[i] == 1:
        x_extracted_1 += test[i]
        print('-', test[i])

1
- age cannot tarnish the beauty of this east-west love story for me.
- as mark elliott, william holden is intelligent, breezy and a bit weak; jennifer jone is perhaps well-nigh-perfect as dr. han suyin, by turns doubt-torn and ecstatic, eager and hesitant.
- other in the large cast include torin thatcher, isobel elsom, murray matheson, virginia gregg, richard loo, soo yong, philip ahn, jorja curtright and donna martell; many of hollywood's best oriental actor played smaller uncredited part also.
- the film is unarguably physically busy, interesting and often beautiful also.
- with cinematography by leon shamroy, ben nye's makeup and helen turpin's hairstyle, the great work by set decorator, sound and lighting, art department and all concerned, this has to be one of the most memorable production set in a major non-u.s. city of all time, and one of the most difficult to capture on film.
- truly, love is a many-splendored thing, dr. han says; and this movie stands as one of that doctrin

In [27]:
test = print_sentence(X_te_clean[3])

test_matrix = tf_vectorizer.transform(test) 
test_matrix.shape

y_pred_test = clf.predict(test_matrix)
x_extracted_3 = ''
print(y_te[3])
for i in range(len(test)):
    if y_pred_test[i] == 1:
        x_extracted_3 += test[i]
        print('-',test[i])

0
- i have yet to watch the first entry in this series, however, fortunately, i was still able to follow the complex and intricate plot, with all its unexpected twist and turn, and i applaud them for the utter originality of the concept herein.
- a clear sign that this is unimpressive is that it was directed by a visual effect creator, whose only other credit in that field is a henry rooker film that was not well received.
- the acting is average at best, and i defy anyone to not find... scottish computer-woman(come on, seriously, what is with that last name?
- )'s russian accent laughable and/or irritating.
- the action is not terrible.
- cinematography and editing are fine.
- the music is cool enough.
- language is infrequent, if even that.
- violence is fairly bloody.
- i recommend this solely to fan of b-movie, and i will say that you can do worse than this.
- 1/10


In [28]:
## Test the clf 3

X_tr_clean = open_pickle("./pickles/imdb_x_tr_clean.pickle")
y_tr = open_pickle("./pickles/imdb_y_tr.pickle")
cv = CountVectorizer(min_df=100, token_pattern=token)
X_train_ = cv.fit_transform(X_tr_clean)

In [29]:
X_train_.shape
y_tr.shape

(25000,)

In [30]:
x_extracted = np.hstack((x_extracted_1, x_extracted_3))
X_test_ = cv.transform(x_extracted)
y_test_ = [1, 0]

In [31]:
X_test_.shape

(2, 3689)

In [32]:
clf_2 = LogisticRegression()
clf_2.fit(X_train_, y_tr)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [33]:
clf_2.score(X_test_, y_test_)

1.0