In [1]:
import spacy
import pickle

In [2]:
from spacy.lang.en import STOP_WORDS

In [3]:
import string

In [143]:
class PreProcessor:
    def __init__(self):
        self.nlp = spacy.load('en_core_web_sm')
        self.stop_words = set(STOP_WORDS)
        self.stop_words.update(string.punctuation)
        self.stop_words.remove('not')
        
        self.docs = []
        self.splits = []
        
    def split_into_sents(self, review):
        if not isinstance(review, spacy.tokens.doc.Doc):
            review = self.nlp(review)
        
        sents = []
        for sentence in review.sents:
            start = 0
            counter = 0
            print("Sentence: ", sentence)
            for token in sentence:
                # 89 -> Conjunctions,
                # 97 -> Punctuations
                if token.pos in [89, 97] or token.text.strip() == ',':
                    if counter > start: 
                        sents.append(sentence[start: counter])
                    start = counter + 1
                counter += 1
        return sents
    
    def feature_extraction(self, custom_sent):
        features = {}
        
        nouns = []
        verbs = []
        adj = []
        
        # 92 -> NOUN, 96 -> Proper Noun
        # 95 -> PRONOUN
        # 86 -> AdVerb
        # 84 -> Adjective
        # 100 -> VERB
        # 87 -> AUX. VERB
        # 94 -> Partition (mostly used alongside AUX. VERB)
        for token in custom_sent:
            if token.pos in [92, 96]:
                nouns.append(token.lemma_)
            elif token.pos in [84, 86]:
                adj.append(token.lemma_)
            elif token.pos in [100, 87, 94]:
#                 if token.pos == 94 and len(verbs) > 0:
#                     verbs[-1] += token.lemma_
#                 else:
                    verbs.append(token.lemma_)
        return { 
            "entity": ', '.join(nouns),
            "features": ' '.join(adj) if len(adj) > 0 else ' '.join(verbs)
        }
                

In [105]:
p = PreProcessor()
p.split_into_sents("I liked the food, but service was awful. Ambience was damn poor.")

Sentence:  I liked the food, but service was awful.
Sentence:  Ambience was damn poor.


[I liked the food, service was awful, Ambience was damn poor]

In [144]:
p = PreProcessor()
sents = p.split_into_sents("I liked the food, but didn't like the service. Ambience was damn poor.")
for sent in sents:
    print(p.feature_extraction(sent))

Sentence:  I liked the food, but didn't like the service.
Sentence:  Ambience was damn poor.
{'entity': 'food', 'features': 'like'}
{'entity': 'service', 'features': 'do not like'}
{'entity': 'Ambience', 'features': 'damn poor'}


In [13]:
type(STOP_WORDS)

set

In [128]:
nlp = spacy.load('en_core_web_sm')

In [131]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [147]:
print(sia.polarity_scores("like"))
print(sia.polarity_scores("do  like"))
print(sia.polarity_scores('damn poor'))

{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.3612}
{'neg': 0.0, 'neu': 0.286, 'pos': 0.714, 'compound': 0.3612}
{'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.7003}


In [140]:
doc = nlp("I didn't like the food")
print(type(doc))
for token in doc:
    print(token,token.lemma_, token.pos, token.pos_, spacy.explain(token.pos_), token.dep_)

<class 'spacy.tokens.doc.Doc'>
I -PRON- 95 PRON pronoun nsubj
did do 87 AUX auxiliary aux
n't not 94 PART particle neg
like like 100 VERB verb ROOT
the the 90 DET determiner det
food food 92 NOUN noun dobj


In [139]:
doc = nlp("I did not like the food")
for token in doc:
    print(token, token.lemma_, token.pos, token.pos_, spacy.explain(token.pos_), token.dep_)

I -PRON- 95 PRON pronoun nsubj
did do 87 AUX auxiliary aux
not not 94 PART particle neg
like like 100 VERB verb ROOT
the the 90 DET determiner det
food food 92 NOUN noun dobj


In [17]:
doc = nlp("food was pretty bad")
for token in doc:
    print(token, token.pos, token.pos_, spacy.explain(token.pos_), token.dep_)

food 92 NOUN noun nsubj
was 87 AUX auxiliary ROOT
pretty 86 ADV adverb advmod
bad 84 ADJ adjective acomp


In [120]:
doc = nlp("food was good, but Ambience was pretty awful.")
for token in doc:
    print(token, token.pos, token.pos_, spacy.explain(token.pos_), token.dep_)

food 92 NOUN noun nsubj
was 87 AUX auxiliary ROOT
good 84 ADJ adjective acomp
, 97 PUNCT punctuation punct
but 89 CCONJ coordinating conjunction cc
Ambience 96 PROPN proper noun nsubj
was 87 AUX auxiliary conj
pretty 86 ADV adverb advmod
awful 84 ADJ adjective acomp
. 97 PUNCT punctuation punct


In [36]:
p = PreProcessor()
p.split_into_sents("I didn't like the food")

[]