# Sentinment Analysis via Trigrams
// Written on **June 13, 2017**<br/>
// Status: **Archived**

Goal: Use raw trigrams to analyze sentiment.

In [3]:
import nltk
nltk.download('subjectivity')

[nltk_data] Downloading package subjectivity to
[nltk_data]     /home/qwergram/nltk_data...
[nltk_data]   Unzipping corpora/subjectivity.zip.


True

In [4]:
# Steal data from nltk libraries
from nltk.sentiment.util import *
from nltk.corpus import subjectivity

n_instances = 10000
n_train_instances = int(.8 * n_instances)
subj_docs = [sent for sent in subjectivity.sents(categories='subj')[:n_instances]]
obj_docs = [sent for sent in subjectivity.sents(categories='obj')[:n_instances]]

subj_tests = subj_docs[:n_train_instances]
obj_tests = obj_docs[:n_train_instances]

subj_train = subj_docs[n_train_instances:]
obj_train = obj_docs[n_train_instances:]

In [5]:
# Adjustable NGram Structure
class Ngram(object):
    
    def __init__(self, size, clean=True):
        self.size = size
        self.clean = clean
        self.ngrams = {}
        
    def addIntent(self, intent):
        self.ngrams[intent] = {}
        
    def removeIntent(self, intent):
        del self.ngrams[intent]
        
    def cleanText(self, text):
        newtext = []
        for char in text:
            if char.isalnum() or char == ' ':
                newtext.append(char.lower())
                
        text = "".join(newtext)
        
        ignoreWords = []
        
#         ignoreWords = [
#             "is", "the", "a", "i", "via", "did", "like",
#             "to", "in", "and", "that", "till", "im", "am"
#         ]
        for word in ignoreWords:
            text = text.replace(word + " ", "")
        
        return text
        
        
    def train(self, intent, seeds):
        if self.clean:
            seeds = self.cleanText(seeds)
        
        if len(seeds.split()) < (self.size + 1):
            raise Exception("sample size too small")

        seeds = seeds.split()

        for i, _ in enumerate(seeds):
            try:
                result = seeds[i+self.size]
                key = tuple(seeds[i:i+self.size])
                if not "".join(key).isalnum():
                    continue
                self.ngrams[intent].setdefault(key, [])
                if not result in self.ngrams[intent][key]:
                    self.ngrams[intent][key].append(result)
            except IndexError:
                break
        
    def think(self, seeds):
        if self.clean:
            seeds = self.cleanText(seeds)
        
        if len(seeds.split()) < (self.size + 1):
            raise Exception("sample size too small")
        
        seeds = seeds.split()

        results = {}
        
        for intent, ngrams in self.ngrams.items():
            total_keys = matching_keys = 0
            
            for i, _ in enumerate(seeds):
                try:
                    result = seeds[i+self.size]
                    key = tuple(seeds[i:i+self.size])
                    if not "".join(key).isalnum():
                        continue
                    value_added = 0
                    if key in ngrams:
                        value_added = .35 * (1 / len(ngrams[key]))
                        matching_keys += value_added
                        if result in ngrams[key]:
                            matching_keys += 1 - value_added
                    total_keys += 1
                except IndexError:
                    break
            
            results[intent] = matching_keys / total_keys
                    
        return results

In [6]:
# Train data
trigram = Ngram(2)
trigram.addIntent("obj")
trigram.addIntent("subj")
for subj_doc in subj_tests:
    trigram.train("subj", " ".join(subj_doc))
for obj_doc in obj_tests:
    trigram.train("obj", " ".join(obj_doc))

In [7]:
# Test data
score = 0
total = len(obj_tests) + len(subj_tests)
for obj_doc in obj_tests:
    result = trigram.think(" ".join(obj_doc))
    if result['obj'] > result['subj']:
        score += 1

for subj_doc in subj_tests:
    result = trigram.think(" ".join(subj_doc))
    if result['obj'] < result['subj']:
        score += 1
print(score / total)

1.0


# Examples

In [8]:
trigram.think("President Trump’s budget would not add to economic growth or eliminate the deficit in coming years, the nonpartisan Congressional Budget Office said Thursday, casting doubt on a plan the White House has touted as central to achieving the president’s domestic agenda.")

{'obj': 0.0720982905982906, 'subj': 0.018121301775147928}

In [9]:
trigram.think("Difficult to understand how a person who is a sexual predator, pathological liar and totally without morals is supported by the political party that claims to be the Party of Family Values and Jesus. Course it's easy to understand why ignorant fundamentalists, like the Preacher who one ran for the Presidency, supports Trump. It's a case of \"Birds of a Feather....\". And as Your soul-probing caricature shows...the Son (actually Both Sons) are worthless rotten apples.")

{'obj': 0.13085393540121318, 'subj': 0.1194731846729153}

In [10]:
trigram.think("Outstanding in every respect. True art.")

{'obj': 0.0, 'subj': 0.014583333333333332}

# Conclusion
Seems like there's some hope, but raw trigrams are probably never going to be enough for sentiment analysis.