validation
- number to word
- sword-&-shield = sword-and-sorcery 

open neg+pos reviews, afinn

store each review as a list of sentences

sentiment score

ver 1 0-10 scale
- clean reviews based on AFINN: words that are not in AFINN are removed

ver 2 0-1 scale
- each review has a score: based on the BoW and imdbEr, calculate the score of each review



In [19]:
'''
pip install contractions spacy beautifulsoup4 nltk
python -m spacy download en_core_web_sm
nltk.download('stopwords')
nltk.download('vader_lexicon')
'''
import contractions # expands contractions, e.g. don't -> do not
from bs4 import BeautifulSoup # removes html tags

import spacy # for sentence splitting and tokenization
nlp = spacy.load('en_core_web_sm') # load spaCy English model

import nltk
from nltk.corpus import stopwords # for stopwords
english_stopwords = set(stopwords.words('english')) 

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Open imdb.vocab, imdbEr.txt & AFINN-en-165.txt

In [None]:
def imdbvocab_dict(filepath):
    '''
    function
    > read a vocabulary file (one word per line) into a dict of word:line index

    variable
    > filepath : str, path of the vocabulary file (read as UTF-8)

    return
    > dict, every stripped line mapped to its 0-based line number;
      when a stripped line repeats, the last line number wins

    e.g. a file with the lines 'the', 'and', 'a' -> {'the': 0, 'and': 1, 'a': 2}
    '''
    word_to_index = {}
    with open(filepath, encoding='utf-8') as vocab_file:
        for line_number, raw_line in enumerate(vocab_file):
            # strip() only trims surrounding whitespace (incl. the newline);
            # blank lines are NOT skipped, so they keep their line number
            word_to_index[raw_line.strip()] = line_number
    return word_to_index
    
# Load the IMDB vocabulary; each word's value is its 0-based line index in imdb.vocab
vocab_dict = imdbvocab_dict('dataset/imdb.vocab') # 89527 words
    
def imdbEr_list(filepath):
    '''
    function
    > read a score file (one number per line) into a list of float

    variable
    > filepath : str, path of the score file (read as UTF-8)

    return
    > list of float, in file order (position i holds the value of line i)

    raise
    > ValueError if a line is not a number after stripping (a blank line included)

    e.g. a file with the lines '0.0', '-0.47' -> [0.0, -0.47]
    '''
    values = []
    with open(filepath, encoding='utf-8') as score_file:
        for raw_line in score_file:
            # blank lines are NOT skipped: float('') raises ValueError
            values.append(float(raw_line.strip()))
    return values
    
# Load the scores from imdbEr.txt (one float per line); looked up below by vocab index
imdbEr = imdbEr_list('dataset/imdbEr.txt') # 89527 scores

def afinn_dict(filepath):
    '''
    function
    > open a tab-separated lexicon file (term<TAB>score per line) as dict with word:score

    variable
    > filepath : str, path of the lexicon file (read as UTF-8)

    return
    > dict, term (text before the first tab) mapped to its int score
      (text between the first and second tab)

    raise
    > IndexError if a non-blank line has no tab
    > ValueError if the score field is not an integer

    e.g. {'abandon': -2, 'abandoned': -2, ...}
    '''
    lexicon = {}
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. trailing empty lines) instead of crashing
            fields = line.split('\t')  # split once; the term itself may contain spaces
            lexicon[fields[0]] = int(fields[1])  # int() ignores the trailing newline
    return lexicon
    
# Load the AFINN lexicon as word -> integer score
afinn = afinn_dict('dataset/AFINN-en-165.txt') # 3382 words

# combine index & score from vocab in a dict: word -> (index, imdbEr[index])
# (requires imdbEr to have an entry for every index stored in vocab_dict)
combined = {word: (idx, imdbEr[idx]) for word, idx in vocab_dict.items()} # 89527 words

# report the size of each structure that was just built
print('shapes')
print(f'> vocab_dict : {len(vocab_dict)} words')
print(f'> imdbEr : {len(imdbEr)} scores')
print(f'> afinn : {len(afinn)} words')
print(f'> combined : {len(combined)} words')

def wordinfo(word):
    '''
    function
    > print word info from afinn & imdb.vocab

    variable
    > word : str, word to search

    note
    > a word missing from either lexicon is reported as None instead of raising
      (afinn and combined are the module-level dicts built earlier in the notebook)

    e.g. wordinfo('terrific')
    '''
    # combined maps word -> (index, score); fall back to a (None, None) pair so
    # unknown words print None, matching what afinn.get() shows for them
    index, score = combined.get(word, (None, None))

    print(f'\n{word}')
    print('> AFINN-en-165.txt')
    print(f'   > sentiment score : {afinn.get(word)}')
    print('> imdb.vocab')
    print(f'   > index : {index}')
    print(f'   > score : {score}')
    
# example lookup
wordinfo('terrific') # change word to search index & score

shapes
> vocab_dict : 89527 words
> imdbEr : 89527 scores
> afinn : 3382 words
> combined : 89527 words

terrific
> AFINN-en-165.txt
   > sentiment score : 4
> imdb.vocab
   > index : 1256
   > score : 1.62327578759


In [None]:
def opentextfile(filepath):
    '''
    function
    > read one review file and split it into sentences, both as raw text and as tokens

    steps
    > strip HTML markup with BeautifulSoup, expand contractions, lowercase,
      then let the spaCy pipeline (nlp) segment sentences and tokenize

    variable
    > filepath : str, path of the review text file (read as UTF-8)

    return
    > sentences : list of token lists, one per sentence (whitespace tokens dropped)
                  e.g. [['token1', 'token2', ...], [...], ...]
    > fullsentences : list of sentence strings, same order and length as sentences
                  e.g. ['sentence1', 'sentence2', ...]
    '''
    with open(filepath, 'r', encoding='utf-8') as review_file:
        raw = review_file.read()

    plain = BeautifulSoup(raw, 'html.parser').get_text() # remove HTML tags
    normalized = contractions.fix(plain).lower() # expand contractions, then lowercase
    spans = list(nlp(normalized).sents) # sentence spans of the parsed document

    fullsentences = [span.text for span in spans] # full sentences, used for sia
    sentences = [
        [tok.text for tok in span if not tok.is_space] # tokens, used for afinn & vocab
        for span in spans
    ]
    return sentences, fullsentences

# Demo: preprocess one negative training review and show both sentence views
filepath = 'dataset/train/neg/0_3.txt'  
sentences, fullsentences = opentextfile(filepath)
print(fullsentences)
print(sentences)

['story of a man who has unnatural feelings for a pig.', 'starts out with a opening scene that is a terrific example of absurd comedy.', 'a formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it is singers.', 'unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting.', 'even those from the era should be turned off.', 'the cryptic dialogue would make shakespeare seem easy to a third grader.', 'on a technical level it is better than you might think with some good cinematography by future great vilmos zsigmond.', 'future stars sally kirkland and frederic forrest can be seen briefly.']


In [None]:
def sentiment_score(sentences, scoretype): # sentiment score of each word and sentence
    '''
    function
    > score every sentence as the sum of the lexicon scores of its words

    variable
    > sentences : list of tokenized sentences, e.g. [['token1', 'token2', ...], [...], ...]
    > scoretype : dict used as lexicon, in one of two layouts
        - word -> score              (afinn layout)
        - word -> (index, score)     (combined layout); for these entries, words in
                                     english_stopwords are skipped

    return
    > sentencescore_list : one score per sentence (0 when no word is in the lexicon)
    > wordscores_list : per sentence, the (word, score) pairs that were counted

    note
    > the layout is detected from each entry (tuple or not) rather than by comparing
      scoretype with the global afinn / combined objects, so any lexicon with one of
      the two layouts works

    e.g. sentencescore_list = [3, -1, 0, ...]
            wordscores_list = [[('word1', 2), ('word2', 1)], [('word3', -1)], [], ...]
    '''
    sentencescore_list = []
    wordscores_list = []

    for sentence in sentences:
        wordscores = []
        for word in sentence:
            entry = scoretype.get(word)
            if entry is None:
                continue # word not in the lexicon
            if isinstance(entry, tuple):
                # combined layout: stopwords are not scored, score is the 2nd field
                if word in english_stopwords:
                    continue
                entry = entry[1]
            wordscores.append((word, entry))
        sentencescore_list.append(sum(score for _, score in wordscores))
        wordscores_list.append(wordscores)

    return sentencescore_list, wordscores_list

def slidingwindow(sentencescore_list, windowsize=3): # sliding window for most pos/neg segment
    '''
    function
    > find the run of windowsize consecutive sentences with the highest / lowest summed score

    variable
    > sentencescore_list : list of numbers, one score per sentence
    > windowsize : int, number of consecutive sentences per window (default 3)

    return
    > (maxidx, maxscore), (minidx, minscore) : start index and summed score of the most
      positive and most negative window; on ties the earliest window wins
    > when there are fewer scores than windowsize, the whole list is used as one window
    > for an empty list both scores are None and both indices are 0

    e.g. slidingwindow([1, -2, 3, 4], 2) -> ((2, 7), (0, -1))
    '''
    maxscore = None
    minscore = None
    maxidx = minidx = 0

    if not sentencescore_list:
        return (maxidx, maxscore), (minidx, minscore)

    # a list shorter than the window would otherwise produce no window at all
    span = min(windowsize, len(sentencescore_list))

    for i in range(len(sentencescore_list) - span + 1):
        windowscore = sum(sentencescore_list[i:i + span])

        if maxscore is None or windowscore > maxscore:
            maxscore = windowscore
            maxidx = i

        if minscore is None or windowscore < minscore:
            minscore = windowscore
            minidx = i

    return (maxidx, maxscore), (minidx, minscore)

In [None]:
def _printsegment(label, sentences, window, windowsize):
    '''print the header and the numbered sentences of one (start index, score) window'''
    start, score = window
    print(f'\nmost {label} segment ({score}) :')
    # stop at the last sentence so a window longer than the review does not overrun
    for i in range(start, min(start + windowsize, len(sentences))):
        print(f'  {i+1} : {sentences[i]}')

def printsentiment(total, sentences, scores, posidx, negidx, poswin, negwin, windowsize):
    '''
    function
    > print the sentiment summary of one review

    variable
    > total : number, total sentiment score of the review
    > sentences : list of str, the sentences to display
    > scores : list of numbers, one score per sentence
    > posidx, negidx : int, index of the most positive / negative sentence
    > poswin, negwin : (start index, score) of the most positive / negative segment
    > windowsize : int, number of sentences per segment; fewer are printed when the
                   segment would run past the last sentence
    '''
    print(f'total sentiment score : {total}')
    print(f'most pos sentence ({scores[posidx]}) : {sentences[posidx]}')
    print(f'most neg sentence ({scores[negidx]}) : {sentences[negidx]}')

    _printsegment('pos', sentences, poswin, windowsize)
    _printsegment('neg', sentences, negwin, windowsize)

def sentimentscore(filepath, method, windowsize=3):
    '''
    function
    > score one review file sentence by sentence and print the summary
      (total, most pos/neg sentence, most pos/neg segment)

    variable
    > filepath : str, path of the review text file
    > method : 'afinn' (AFINN lexicon), 'vocab' (combined imdb.vocab scores)
               or 'sia' (SentimentIntensityAnalyzer compound score)
    > windowsize : int, sentences per segment (default 3); reduced to the number of
                   sentences when the review is shorter than that

    raise
    > ValueError if method is not one of the three names, or the file yields no sentence
    '''
    if method not in ('afinn', 'vocab', 'sia'):
        raise ValueError('method must be \'afinn\', \'vocab\', or \'sia\'')

    sentences, fullsentences = opentextfile(filepath)
    if not fullsentences:
        # without this, max() below fails with an unhelpful "empty sequence" message
        raise ValueError(f'no sentences found in {filepath!r}')

    # a review shorter than the window is treated as a single segment
    windowsize = min(windowsize, len(fullsentences))

    if method == 'sia':
        sia = SentimentIntensityAnalyzer()

        # Sentence-level scores
        scores = [sia.polarity_scores(s)['compound'] for s in fullsentences]

        # Sliding windows: each window is re-scored as one joined text, not summed
        window_scores = [
            (i, sia.polarity_scores(' '.join(fullsentences[i:i+windowsize]))['compound'])
            for i in range(len(fullsentences) - windowsize + 1)
        ]
        poswin = max(window_scores, key=lambda x: x[1])
        negwin = min(window_scores, key=lambda x: x[1])
    else:
        scoretype = afinn if method == 'afinn' else combined

        # Sentence-level scores
        scores, _ = sentiment_score(sentences, scoretype)

        # Sliding windows
        poswin, negwin = slidingwindow(scores, windowsize)

    total = sum(scores)

    # Sentence indices (first occurrence on ties)
    posidx, negidx = scores.index(max(scores)), scores.index(min(scores))

    printsentiment(total, fullsentences, scores, posidx, negidx, poswin, negwin, windowsize)


In [17]:
# Score one negative training review with the imdb.vocab scores ('vocab' method)
filepath = 'dataset/train/neg/12481_1.txt'
sentimentscore(filepath, method='vocab', windowsize=3)

[-1.6760280775194598, -0.6122235945879999, -4.26666250292146, -3.2194334881679003, -2.032291541066, -5.637296477377749, -0.08888936621250002, -1.91384963009] [[('story', 0.202447200898), ('appallingly', -2.22729553652), ('brutal', 0.865420662809), ('callous', 0.420866499319), ('hero', -0.00915072179286), ('vanquishing', 0.215171487084), ('evil', -0.0763100765738), ('king', 0.277583163506), ('worthless', -2.12555582265), ('almost', -0.0449922079508), ('every', 0.087875564798), ('detail', 0.737911709554)], [('acting', -0.443413788335), ('horrible', -2.14571785916), ('leads', 0.311509442892), ('supporting', 0.972938490902), ('roles', 0.692460119113)], [('leering', -0.347224742073), ('gloating', -0.982802552948), ('glee', 0.335634196639), ('director', -0.168637601385), ('shows', 0.721476098916), ('hero', -0.00915072179286), ('smearing', -0.457442739905), ('blood', -0.192841132259), ('around', -0.224191863485), ('absolutely', -0.312484509531), ('disgusting', -1.50710159992), ('redeemed', -0

In [18]:
# Same scoring on a local file, test.txt
# NOTE(review): test.txt is not under dataset/ - confirm it ships with the notebook
filepath = 'test.txt'
sentimentscore(filepath, method='vocab', windowsize=3)

[0.712009554686, 0.47354348986608, 5.82297235104, 3.41169903933, -0.487001620724, 0.9740855025009999, 3.420291153079, 3.763136070869, 1.5894042956210002, -1.3472051616426, 3.2050926006080003, 3.8702851291930003, 0.8368843551160999, 1.38389101887897, 0.9914952919142] [[('absolutely', -0.312484509531), ('loved', 1.40568694008), ('movie', -0.246354038618), ('!', -0.134838837245)], [('plot', -0.663771954468), ('thrilling', 0.778097454912), ('characters', -0.075112449321), ('well', 0.441646380701), ('developed', -0.00731594195792)], [('cinematography', 0.611919591895), ('stunning', 1.4907307838), ('soundtrack', 0.617625920554), ('perfectly', 1.55772515341), ('complemented', 1.65315215274), ('scenes', -0.108181251359)], [('would', -0.383385991768), ('highly', 1.20549070691), ('recommend', 0.67670059007), ('film', 0.147885815777), ('anyone', -0.213229825818), ('looking', -0.290385920301), ('engaging', 0.632461856895), ('emotional', 0.892561550036), ('experience', 0.743600257529)], [('five', -