### Mike Ogrysko
### CS 766 Information Retrieval and Natural Language Processing

Parsing the IMDB movie reviews for sentiment
- IMDB movie review data
- Top 20 most frequent words in reviews grouped by sentiment
- Top 20 most frequent bigrams in reviews grouped by sentiment
- Top 20 most frequent bigrams in which both words carry an 'NN' POS tag, in reviews grouped by sentiment
- 4-grams that have counts 2 or more in reviews grouped by sentiment
- Probabilities of words that come after "worst film ever" and "best movie ever"

In [None]:
from collections import defaultdict
import csv
from string import punctuation
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import nltk
import numpy as np
import operator
import re


In [None]:
# Load the movie-review CSV into two parallel lists: column 0 of every data
# row is kept as the review text, column 1 is converted to an int label.
Reviews, Sentiments = [], []

# newline='' hands line-ending handling to the csv module, which is what the
# csv documentation requires so that newlines embedded inside quoted fields
# are parsed correctly.
with open('../Datasets-20220907/movie_data.csv', 'r', encoding='utf8', newline='') as fin:
    reader = csv.reader(fin, delimiter=',', quotechar='"')
    # Consume the header row; yields None instead of raising on an empty file.
    header = next(reader, None)
    for line in reader:
        Reviews.append(line[0])
        Sentiments.append(int(line[1]))

N = len(Reviews)
M = len(Sentiments)
print('Total reviews loaded', N)
print('Total sentiments loaded', M)

**Top 20 most frequent words in reviews grouped by sentiment**

In [None]:
# Tokens to discard: NLTK's English stop words, every punctuation character,
# plus 'br' (presumably residue of HTML <br> tags) and the capitalised
# 'The' / 'This', which the lowercase stop-word list does not cover.
stop_words = [*stopwords.words('english'), *punctuation]
stop_words_set = {*stop_words, 'br', 'The', 'This'}

# Filters and lemmatizer are built once here instead of on every tokenize() call.
# Token starts with non-word characters, or carries an apostrophe fragment
# (e.g. "n't", "'s").
_CONTRACTION_OR_SYMBOL_RE = re.compile(r'^\W+|\w\'\w+|\'\w+$')
# Token contains no lowercase ASCII letter at all.
_NO_LOWERCASE_RE = re.compile(r'^[^a-z]+$')
# Used with fullmatch: the WHOLE token is one or two word characters, or only digits.
_SHORT_OR_NUMERIC_RE = re.compile(r'\w{1,2}|\d+')
_LEMMATIZER = WordNetLemmatizer()


def tokenize(text):
    """Split *text* into filtered, noun-lemmatized terms.

    The text is tokenized with ``word_tokenize``; a token is dropped when it
    is in ``stop_words_set``, is all digits, starts with non-word characters
    or contains an apostrophe fragment, has no lowercase ASCII letter, or is
    only one or two word characters long. Surviving tokens are lemmatized as
    nouns.

    The short-token test is applied to the whole token. The previous pattern
    was anchored only at the start, so hyphenated tokens with a one- or
    two-character prefix (e.g. "so-called") were discarded as well; those
    are now kept.

    Returns:
        list of lemmatized term strings, in their original order.
    """
    terms = [
        w for w in word_tokenize(text)
        if w not in stop_words_set
        and not w.isdigit()
        and not _CONTRACTION_OR_SYMBOL_RE.search(w)
        and not _NO_LOWERCASE_RE.search(w)
        and not _SHORT_OR_NUMERIC_RE.fullmatch(w)
    ]
    # Always lemmatize as a noun ('n'): per-word POS lookup was tried earlier
    # and abandoned because of memory issues.
    return [_LEMMATIZER.lemmatize(w, 'n') for w in terms]



In [None]:
# Partition the reviews by their sentiment label, keeping the original order:
# Bag0 holds the reviews labelled 0, Bag1 the reviews labelled 1.
Bag0 = [text for idx, text in enumerate(Reviews) if Sentiments[idx] == 0]
Bag1 = [text for idx, text in enumerate(Reviews) if Sentiments[idx] == 1]

In [None]:
def tokenize_dict(_reviews, _pos):
    """Count, for every term, the number of reviews it appears in.

    _reviews: list of review texts.
    _pos: 1 to count (term, POS tag) pairs produced by nltk.pos_tag,
          anything else to count the plain terms.

    Each review contributes at most 1 to a term's count, because its terms
    are de-duplicated first. Returns a defaultdict(int).
    """
    review_counts = defaultdict(int)
    for text in _reviews:
        tokens = tokenize(text)
        unique_terms = set(nltk.pos_tag(tokens)) if _pos == 1 else set(tokens)
        for entry in unique_terms:
            review_counts[entry] += 1
    return review_counts

In [None]:
# Per-term review counts for each sentiment bag, on plain terms (no POS tags).
vocab_counts_bag0 = tokenize_dict(_reviews=Bag0, _pos=0)
vocab_counts_bag1 = tokenize_dict(_reviews=Bag1, _pos=0)

In [None]:
# Order each bag's terms by count, highest first, and keep the first 20.
sort_vocab_counts_bag0 = dict(
    sorted(vocab_counts_bag0.items(), key=operator.itemgetter(1), reverse=True)[:20]
)
sort_vocab_counts_bag1 = dict(
    sorted(vocab_counts_bag1.items(), key=operator.itemgetter(1), reverse=True)[:20]
)


In [None]:
# Report the top-20 terms for sentiment 0, one "<count> <term>" line each.
print("Sentiment 0 - 20 most frequent")
for term, n_reviews in sort_vocab_counts_bag0.items():
    print(f"{n_reviews} {term}")

In [None]:
# Report the top-20 terms for sentiment 1, one "<count> <term>" line each.
print("Sentiment 1 - 20 most frequent")
for term, n_reviews in sort_vocab_counts_bag1.items():
    print(f"{n_reviews} {term}")

**Top 20 most frequent bigrams in reviews grouped by sentiment**

In [None]:
def grams_dict(_text, _n):
    """Count every n-gram of tokenized terms across a list of reviews.

    _text: list of review texts.
    _n: number of consecutive terms per gram.

    Each review is tokenized with tokenize(); every window of _n consecutive
    terms is joined with single spaces and counted once per occurrence.
    Reviews with fewer than _n terms contribute nothing.
    Returns a defaultdict(int) mapping gram string -> occurrence count.
    """
    occurrences = defaultdict(int)
    for review in _text:
        tokens = tokenize(review)
        # The range is empty when the review has fewer than _n tokens.
        for start in range(len(tokens) - _n + 1):
            occurrences[' '.join(tokens[start:start + _n])] += 1
    return occurrences

In [None]:
# Bigram occurrence counts for each sentiment bag (plain terms, no POS tags).
bigram_counts_bag0 = grams_dict(_text=Bag0, _n=2)
bigram_counts_bag1 = grams_dict(_text=Bag1, _n=2)

In [None]:
# Order each bag's bigrams by count, highest first, and keep the first 20.
sort_bigram_counts_bag0 = dict(
    sorted(bigram_counts_bag0.items(), key=operator.itemgetter(1), reverse=True)[:20]
)
sort_bigram_counts_bag1 = dict(
    sorted(bigram_counts_bag1.items(), key=operator.itemgetter(1), reverse=True)[:20]
)


In [None]:
# Report the top-20 bigrams for sentiment 0, one "<count> <bigram>" line each.
print("Sentiment 0 - 20 most frequent bigrams")
for bigram, hits in sort_bigram_counts_bag0.items():
    print(f"{hits} {bigram}")

In [None]:
# Report the top-20 bigrams for sentiment 1, one "<count> <bigram>" line each.
print("Sentiment 1 - 20 most frequent bigrams")
for bigram, hits in sort_bigram_counts_bag1.items():
    print(f"{hits} {bigram}")

**Top 20 most frequent bigrams in which both words carry an 'NN' POS tag, in reviews grouped by sentiment**

In [None]:
def grams_dict_NN(_text, _n):
    """Count n-grams in which every term's POS tag contains 'NN'.

    _text: list of review texts.
    _n: number of consecutive terms per gram.

    Each review is tokenized with tokenize() and tagged with nltk.pos_tag.
    A window of _n consecutive tagged terms is counted only when all _n tags
    contain the substring 'NN'; it is stored under the key
    "term (TAG) term (TAG) ...".
    Returns a defaultdict(int) mapping that key -> occurrence count.
    """
    nn_counts = defaultdict(int)
    for review in _text:
        tagged = nltk.pos_tag(tokenize(review))
        # The range is empty when the review has fewer than _n tagged terms.
        for start in range(len(tagged) - _n + 1):
            window = tagged[start:start + _n]
            nn_hits = sum(1 for _, tag in window if 'NN' in tag)
            if nn_hits != _n:
                continue
            label = ' '.join(f"{word} ({tag})" for word, tag in window)
            nn_counts[label.strip()] += 1
    return nn_counts

In [None]:
# Counts of sentiment-0 bigrams whose two terms both have an 'NN' POS tag.
bigramNN_counts_bag0 = grams_dict_NN(_text=Bag0, _n=2)


In [None]:
# Counts of sentiment-1 bigrams whose two terms both have an 'NN' POS tag.
bigramNN_counts_bag1 = grams_dict_NN(_text=Bag1, _n=2)


In [None]:
# Order the sentiment-0 'NN' bigrams by count, highest first, and keep 20.
sort_bigramNN_counts_bag0 = dict(
    sorted(bigramNN_counts_bag0.items(), key=operator.itemgetter(1), reverse=True)[:20]
)


In [None]:
# Order the sentiment-1 'NN' bigrams by count, highest first, and keep 20.
sort_bigramNN_counts_bag1 = dict(
    sorted(bigramNN_counts_bag1.items(), key=operator.itemgetter(1), reverse=True)[:20]
)


In [None]:
# Report the top-20 'NN' bigrams for sentiment 0 as "<count> <bigram>" lines.
print("Sentiment 0 - 20 most frequent bigrams w NN")
for bigram, hits in sort_bigramNN_counts_bag0.items():
    print(f"{hits} {bigram}")

In [None]:
# Report the top-20 'NN' bigrams for sentiment 1 as "<count> <bigram>" lines.
print("Sentiment 1 - 20 most frequent bigrams w NN")
for bigram, hits in sort_bigramNN_counts_bag1.items():
    print(f"{hits} {bigram}")

**4-grams that have counts 2 or more in reviews grouped by sentiment**

In [None]:
# Sentiment-0 4-grams, keeping only those that occur at least twice.
bag0_4gram_dict = grams_dict(Bag0, 4)
bag0_4gram_dict = dict(
    (gram, hits) for gram, hits in bag0_4gram_dict.items() if hits >= 2
)


In [None]:
# Sentiment-1 4-grams, keeping only those that occur at least twice.
bag1_4gram_dict = grams_dict(Bag1, 4)
bag1_4gram_dict = dict(
    (gram, hits) for gram, hits in bag1_4gram_dict.items() if hits >= 2
)


In [None]:
# Turn each filtered 4-gram dict into a list of (4-gram, count) pairs ordered
# by count, highest first. Nothing is truncated here.
sorted_bag0_4gram = sorted(bag0_4gram_dict.items(), key=operator.itemgetter(1), reverse=True)
sorted_bag1_4gram = sorted(bag1_4gram_dict.items(), key=operator.itemgetter(1), reverse=True)


In [None]:
# Print the first 20 entries of the sorted sentiment-0 4-gram list as
# "<count> <4-gram>" lines.
print('Sentiment 0 4grams - all\n')
for gram, hits in sorted_bag0_4gram[:20]:
    print(f"{hits} {gram}")


In [None]:
# Print the first 20 entries of the sorted sentiment-1 4-gram list as
# "<count> <4-gram>" lines.
print('Sentiment 1 4grams - all\n')
for gram, hits in sorted_bag1_4gram[:20]:
    print(f"{hits} {gram}")


**Probabilities of words that come after "worst film ever" and "best movie ever"**

In [None]:
# 4-gram occurrence counts over all reviews, regardless of sentiment.
vocab_4gram_dict = grams_dict(_text=Reviews, _n=4)

In [None]:
# Select the 4-grams whose first three terms are exactly "worst film ever";
# the fourth term is the word that follows the phrase.
# A prefix test is used rather than a substring test so that a 4-gram whose
# first term merely ends in "worst" is not picked up.
vocab_4gram_dict_worst = {
    gram: count
    for gram, count in vocab_4gram_dict.items()
    if gram.startswith('worst film ever ')
}
# Same entries, ordered by count with the most frequent first.
sorted_vocab_4gram_dict_worst = dict(
    sorted(vocab_4gram_dict_worst.items(), key=lambda kv: kv[1], reverse=True)
)


In [None]:
# Convert the "worst film ever <word>" counts into relative frequencies:
# each 4-gram's count divided by the total count of all selected 4-grams.
sum_worst = sum(sorted_vocab_4gram_dict_worst.values())
print(f"Count 'worst film ever': {sum_worst}\n")

sum_worst_prob = {
    gram: hits / sum_worst
    for gram, hits in sorted_vocab_4gram_dict_worst.items()
}

print(f"Probabilities of 'worst film ever': \n")
for gram, prob in sum_worst_prob.items():
    print(f"{gram} {prob:.3f}")

In [None]:
# Select the 4-grams whose first three terms are exactly "best movie ever";
# the fourth term is the word that follows the phrase.
# A prefix test is used rather than a substring test, so 4-grams whose first
# term merely ends in "best" (e.g. "dumbest movie ever ...") are excluded
# without needing a per-word exception.
vocab_4gram_dict_best = {
    gram: count
    for gram, count in vocab_4gram_dict.items()
    if gram.startswith('best movie ever ')
}
# Same entries, ordered by count with the most frequent first.
sorted_vocab_4gram_dict_best = dict(
    sorted(vocab_4gram_dict_best.items(), key=lambda kv: kv[1], reverse=True)
)



In [None]:
#calculate probabilities of best movie ever
sum_best = 0
for k in sorted_vocab_4gram_dict_best:
    sum_best += sorted_vocab_4gram_dict_best[k]
print(f"Count 'best movie ever': {sum_best}\n")

sum_best_prob = {}
for k in sorted_vocab_4gram_dict_best:
    sum_best_prob[k] = sorted_vocab_4gram_dict_best[k]/sum_best

print(f"Probabilities of 'best movie ever': \n")
for i in sum_best_prob:
    print(f"{i} {sum_best_prob[i]:.3f}")