In [1]:
import os
import sys
import csv
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import operator
import re

In [2]:
def load_data():
    print("Loading the imdb reviews data...")
    train_corpus=[]
    y_train=[]
    directory = os.path.normpath("C:/Users/meghu/Desktop/train/neg")
    for subdir, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".txt"):
                f=open(os.path.join(subdir, file),'r')
                try:
                    a = f.read() 
                    train_corpus.append(a)
                    y_train.append(0)
                except:
                    pass
                f.close()
    directory = os.path.normpath("C:/Users/meghu/Desktop/train/pos")
    for subdir, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".txt"):
                f=open(os.path.join(subdir, file),'r')
                try:
                    a = f.read() 
                    train_corpus.append(a)
                    y_train.append(1)
                except:
                    pass
                f.close()
    
    print("Data loaded successfully!")
    return train_corpus, y_train

In [3]:
train_corpus, y_train=load_data()

Loading the imdb reviews data...
Data loaded successfully!


In [4]:
def parse_human_phrases(phrase_file, to_lower=False):
    with open(phrase_file) as csvfile:
        reader=csv.reader(csvfile, delimiter='\t')
        phrases={} # key:phrase value:number of times appeared in the collection
        for row in reader:            
            for phrase in row:
                if phrase is not "":
                    if to_lower:
                        phrase = phrase.lower()
                    if phrase not in phrases:
                        phrases[phrase] = 1
                    else:
                        phrases[phrase] += 1
    return phrases

In [5]:
train_corpus = np.array(train_corpus)
y_train = np.array(y_train)

In [6]:
token_pattern = r"(?u)\b[\w\'/]+\b"
tokenizer = re.compile(token_pattern)

In [7]:
neg_phrases_dict = parse_human_phrases('Negative shortened for parsing.tsv', to_lower=True)
sorted(neg_phrases_dict.items(), key=operator.itemgetter(1), reverse=True)

[('acting was terrible', 3),
 ('avoid this movie', 2),
 ("don't waste your time", 2),
 ('acting sucks', 2),
 ('2 out of 10', 2),
 ('incomprehensible', 2),
 ('had nothing to do with the story', 2),
 ('ï»¿terrific example', 1),
 ('violent mob', 1),
 ('unfortunately it stays absurd ', 1),
 ('no general narrative', 1),
 ('too off putting', 1),
 ('should be turned off', 1),
 ('cryptic dialogue', 1),
 ('absurd comedy', 1),
 ('crazy chantings', 1),
 ('most unbelievably intelligent illiterate ', 1),
 ('so wasteful of talent', 1),
 ('truly disgusting', 1),
 ('script is unbelievable', 1),
 ('dialog is unbelievable', 1),
 ('character is a caricature of herself', 1),
 ("movie moves at a snail's pace", 1),
 ('photographed in an ill-advised manner', 1),
 ('insufferably preachy', 1),
 ('movie is this worthless', 1),
 ('waste of time and tape', 1),
 ('not a funny one', 1),
 ('horrible performances', 1),
 ('zero chemistry', 1),
 ('worst effort', 1),
 ('illiterate of all time', 1),
 ('uncomfortably drif

In [8]:
pos_phrases_dict = parse_human_phrases('Positive shortened for parsing.tsv', to_lower=True)
sorted(pos_phrases_dict.items(), key=operator.itemgetter(1), reverse=True)

[('highly recommended', 4),
 ('liked stanley & iris very much', 2),
 ('acting was very good', 2),
 ('story had a unique and interesting arrangement', 2),
 ('absence of violence and porno sex was refreshing', 2),
 ('very enjoyable movie', 2),
 ('i recommend this movie', 2),
 ('i loved it', 2),
 ('good one', 2),
 ('one of the best', 2),
 ('it was great', 2),
 ('sends a powerful message', 2),
 ('great movie', 2),
 ('i recommend', 2),
 ('excellent movie', 2),
 ('watch it', 2),
 ('one of the best movies', 2),
 ('i highly recommend', 2),
 ('much closer to reality', 1),
 ("is far fetched. what a pity that it isn't!", 1),
 ('good leader', 1),
 ('cast is also fantastic', 1),
 ('very good!', 1),
 ('nothing short of brilliant', 1),
 ('expertly scripted', 1),
 ('leaves you literally rolling with laughter', 1),
 ('superbly caricatured cross section', 1),
 ("doesn't shy away from parodying every imaginable subject", 1),
 ('will not disappoint!', 1),
 ('perfectly delivered', 1),
 ('??', 1),
 ('only s

In [9]:
def print_phrase_stats(human_vocab):
    vectorizer = CountVectorizer(vocabulary=human_vocab, lowercase=True, ngram_range=(1,20), binary=True)
    train_phrases_X = vectorizer.fit_transform(train_corpus)
    vocab = vectorizer.get_feature_names()
    all_counts = np.sum(train_phrases_X, axis=0)
    all_counts_array = all_counts.A1
    all_counts_sorted_indices = np.argsort(all_counts_array)[::-1]
    in_neg_counts = np.sum(train_phrases_X[y_train==0], axis=0)
    in_neg_counts = in_neg_counts.A1
    in_pos_counts = np.sum(train_phrases_X[y_train==1], axis=0)
    in_pos_counts = in_pos_counts.A1
    print("The Phrase\tneg_count\tpos_count\ttotal_count")
    for i in all_counts_sorted_indices:
        print("%s\t%d\t%d\t%d" %(vocab[i], in_neg_counts[i], in_pos_counts[i], all_counts_array[i]))

In [10]:
print_phrase_stats(list(neg_phrases_dict.keys()))

The Phrase	neg_count	pos_count	total_count
the worst	1660	164	1824
stupid	1116	236	1352
unfortunately	849	392	1241
predictable	561	213	774
bad movie	319	47	366
worst movie	333	11	344
too long	183	114	297
no plot	152	14	166
stereotypical	114	51	165
convoluted	75	44	119
very disappointed	80	13	93
incomprehensible	71	16	87
slow moving	48	24	72
terrible movie	67	0	67
unimaginative	54	6	60
save your money	58	1	59
skip this one	56	1	57
avoid this movie	53	3	56
should be ashamed	53	3	56
made no sense	46	4	50
psychedelic	22	25	47
this movie does not	26	12	38
badly done	34	4	38
the worst movie ever made	36	0	36
there is no plot	34	1	35
just horrible	32	3	35
no redeeming qualities	32	3	35
only saving grace	32	1	33
cheaper	28	5	33
acting was terrible	30	2	32
lack thereof	22	10	32
save your time	28	2	30
was slow	26	4	30
not in good way	29	0	29
movie was terrible	26	1	27
movie sucked	24	2	26
very obvious	17	9	26
new low	22	3	25
weak plot	23	2	25
very cheesy	20	4	24
rips off	19	5	24
movie was bad	20

In [11]:
print_phrase_stats(list(pos_phrases_dict.keys()))

The Phrase	neg_count	pos_count	total_count
the best	911	2009	2920
special	941	854	1795
watch it	527	681	1208
watch this	683	467	1150
enjoyable	252	533	785
very well	177	574	751
one of the best	86	614	700
my favorite	147	534	681
good movie	287	296	583
was good	250	241	491
was great	133	338	471
watch this movie	249	170	419
super	212	192	404
wow	246	146	392
great movie	82	289	371
great film	55	241	296
remarkable	62	219	281
highly recommend	29	226	255
enjoyed it	51	164	215
highly recommended	8	205	213
fun to watch	92	116	208
big fan	105	83	188
one of the greatest	31	129	160
really enjoyed	36	118	154
even better	39	106	145
very entertaining	20	116	136
her best	49	87	136
good one	65	70	135
was excellent	26	107	133
comic relief	58	65	123
lot of fun	26	92	118
it was great	23	91	114
see it again	20	84	104
great fun	11	77	88
great performances	9	77	86
best work	18	60	78
one of the best movies	4	71	75
great to see	9	66	75
excellent movie	2	64	66
bravo	11	44	55
oscar worthy	15	30	45
excellent perf

In [12]:
neg_phrases_list = list(neg_phrases_dict.keys())
neg_list=[]
for neg_phrase in neg_phrases_list:
    neg_list.append(" ".join(tokenizer.findall(neg_phrase)))

In [13]:
pos_phrases_list = list(pos_phrases_dict.keys())
pos_list = []
for pp in pos_phrases_list:
    pos_list.append(" ".join(tokenizer.findall(pp)))

In [14]:
def print_phrase_stats(human_vocab, token_pattern=r"(?u)\b[\w\'/]+\b"):
    vectorizer = CountVectorizer(vocabulary=human_vocab, lowercase=True, ngram_range=(1,20), binary=True, token_pattern=token_pattern)
    train_phrases_X = vectorizer.fit_transform(train_corpus)
    vocab = vectorizer.get_feature_names()
    all_counts = np.sum(train_phrases_X, axis=0)
    all_counts_array = all_counts.A1
    all_counts_sorted_indices = np.argsort(all_counts_array)[::-1]
    in_neg_counts = np.sum(train_phrases_X[y_train==0], axis=0)
    in_neg_counts = in_neg_counts.A1
    in_pos_counts = np.sum(train_phrases_X[y_train==1], axis=0)
    in_pos_counts = in_pos_counts.A1
    print("The Phrase\tneg_count\tpos_count\ttotal_count")
    for i in all_counts_sorted_indices:
        print("%s\t%d\t%d\t%d" %(vocab[i], in_neg_counts[i], in_pos_counts[i], all_counts_array[i]))

In [15]:
neg_set = set(neg_list) # can't have duplicate entries

In [16]:
print_phrase_stats(list(neg_set), token_pattern=token_pattern)

The Phrase	neg_count	pos_count	total_count
the worst	1657	163	1820
poor	1228	368	1596
stupid	1114	236	1350
unfortunately	849	392	1241
ridiculous	710	165	875
predictable	561	213	774
bad movie	311	42	353
worst movie	332	11	343
too long	183	114	297
1/2	150	79	229
didn't like	114	100	214
don't waste your time	179	7	186
3/10	166	4	170
no plot	152	14	166
stereotypical	114	51	165
bad movies	150	13	163
convoluted	75	44	119
don't bother	99	15	114
implausible	71	29	100
4 out of 10	89	6	95
very disappointed	80	13	93
1 out of 10	92	1	93
incomprehensible	71	16	87
2 out of 10	72	2	74
slow moving	48	24	72
terrible movie	67	0	67
don't watch it	47	16	63
unimaginative	54	6	60
save your money	58	1	59
skip this one	56	1	57
avoid this movie	53	3	56
should be ashamed	53	3	56
made no sense	46	4	50
psychedelic	21	25	46
it's nothing	30	16	46
badly done	34	4	38
this movie does not	26	12	38
the worst movie ever made	36	0	36
no redeeming qualities	32	3	35
there is no plot	34	1	35
cheaper	28	5	33
only saving grace

In [17]:
pos_set = set(pos_list) # can't have duplicate entries

In [18]:
print_phrase_stats(list(pos_set), token_pattern=token_pattern)

The Phrase	neg_count	pos_count	total_count
the best	910	2008	2918
special	940	853	1793
watch it	520	673	1193
watch this	683	467	1150
very good	351	717	1068
enjoyable	251	533	784
very well	177	574	751
one of the best	85	613	698
my favorite	147	532	679
i loved	136	448	584
good movie	279	293	572
see this movie	227	202	429
watch this movie	249	170	419
super	211	192	403
wow	246	146	392
i recommend	104	279	383
great movie	80	288	368
was good	190	153	343
was great	86	222	308
great film	55	239	294
remarkable	62	219	281
10/10	18	238	256
highly recommend	29	226	255
enjoyed it	51	163	214
highly recommended	8	205	213
fun to watch	92	116	208
a masterpiece	49	152	201
i highly recommend	21	169	190
big fan	105	81	186
one of the greatest	31	129	160
really enjoyed	36	118	154
even better	39	105	144
her best	49	87	136
very entertaining	20	116	136
good one	65	69	134
was excellent	26	106	132
comic relief	58	65	123
i loved it	10	109	119
lot of fun	26	92	118
see it again	20	84	104
did a great job	17	74	91
gre

In [19]:
def print_document_stats(human_vocab, corpus, token_pattern):
    vectorizer = CountVectorizer(vocabulary=human_vocab, lowercase=True, ngram_range=(1,20), binary=True, token_pattern=token_pattern)
    X=vectorizer.fit_transform(corpus)
    counts = np.sum(X, axis=1)
    ca = counts.A1
    print("Has zero phrases:\t%d" %(np.sum(ca==0)))
    print("Has one phrase:\t%d" %(np.sum(ca==1)))
    print("Has two ore more phrases:\t%d" %(np.sum(ca>=2)))

In [20]:
print_document_stats(list(neg_set), train_corpus[y_train==0], token_pattern)

Has zero phrases:	5857
Has one phrase:	4039
Has two ore more phrases:	2599


In [21]:
print_document_stats(list(pos_set), train_corpus[y_train==1], token_pattern)

Has zero phrases:	4880
Has one phrase:	3691
Has two ore more phrases:	3918
