In [2]:
import spacy
import re
from spacy.matcher import Matcher

In [3]:
nlp = spacy.load ('en_core_web_sm')

In [24]:
#######################################################################
### Every Review will be parsed as per this Review class            ###
#######################################################################

class Review:
    def __init__ (self):
        self.abbreviations = set ( "i.e. e.g. am mr mrs dr. prof. kg lbs cm in m mm ft".split() )
        
    def remove_urls_hyperlinks (self, review):
        pattern = r"\s*(http[s]?:[/]{2}www[.])?([a-z0-9]+)[.]([a-z]{3})([/][A-Za-z0-9?='.]*)*\s*"
        
        """
        Pattern matched below urls and hyperlinks, i guess it works for now
            http://www.google.com 
            https://www.google.com
            www.google.com
            google.com 
            zomato.com
            zomato.com/
            zomato.com/items/this?dir=hello
            https://www.zomato.com/places/NewDelhi/bbqnation/12.html
        """
        return re.sub (pattern, '', review)
        
        
    def token_merge (self, doc):
        # patterns for can't, didn't shouldn't wouldn't wasn't
        verb_patterns = [
            [{"TEXT": {"REGEX": r"ca|did|should|would|was"}}, {"LOWER": "n't"}],
        ]
        matcher = Matcher (nlp.vocab)
        matcher.add ('neg_verbs', None, *verb_patterns)
        for _,start,end in matcher (doc):
            span = doc[start: end]
            span.merge()
        
        return doc
    
    def to_string (self, doc):
        """
            param:
                doc -> an nlp object ie. object of spacy.load() instance
        """
        if isinstance (doc, str):
            # Means, function got a string instead of a doc
            doc = nlp (doc)
        # doc.ents -> returns a tuple of spacy spans
        # spans -> returns tokens
        ne_indices = set([token.i for token in [span for span in doc.ents]])
        msg = ""
        for token in doc:
            if token.i not in ne_indices:
                msg += token.text
        return msg
                
        
    def split_into_sentences (self, review):
        """
            While Splitting a big chunk of review, we need to take care of a few things first
            1. split on . (fullstops)
            2. Don't split if the (.) is a part of any standard abbreviation
            3. Split in case you found a conjunction joining the words, cause most of the time
               that sentence will convey 2 different sentiments, we don't want that for now.
            4. Drop Punctuations
        """
        doc = nlp (review)
        doc = self.token_merge (doc)
        start = 0
        splits = []
        for token in doc:
            # print (token, end=' # ')
            ## Step 1:
            if token.text.strip() == '.':
                # Step 2: Check for previous token being an Abbreviation
                if (token.i-1) >= 0 and doc[token.i - 1].text.lower() in self.abbreviations:
                    pass
                else:
                    splits.append (doc[start: token.i])
                    start = token.i + 1
            # Step 3: Splitting on Conjunctions or ADP
            elif token.pos == 89:
                splits.append (doc[start: token.i])
                start = token.i + 1
                
        if len (doc[start: ]) > 0:
            splits.append (doc[start: ])
        return splits
    
    def pre_process (self, msg):
        msg = self.remove_urls_hyperlinks (msg)
        return self.split_into_sentences (msg)

In [25]:
r = Review ()

In [73]:
msg = "I would like 2.4kg of weight but I didn't like the food and i wouldn't have liked the service pretty much."
r.split_into_sentences(msg)

[I would like 2.4kg of weight,
 I didn't like the food,
 i wouldn't have liked the service pretty much]

In [4]:
r2 = Review ()
msg = "I wouldn't like you if you did that"
r2.split_into_sentences (msg)

NameError: name 'Review' is not defined

In [27]:
r.to_string ("Mr. Narendra Modi is the prime minister of India.")

AttributeError: 'spacy.tokens.span.Span' object has no attribute 'i'

In [29]:
for token in doc:
    print (token.text)

Mr.
Narendra
Modi
is
the
Prime
Minister
of
India
.


In [8]:
doc.ents

(Narendra Modi, India)

In [9]:
for i in doc.ents:
    print (i)

Narendra Modi
India


In [20]:
for i in doc.ents[0]:
    print (i, i.i)

Narendra 1
Modi 2


In [22]:
type (doc)

spacy.tokens.doc.Doc

In [23]:
type (doc.ents[0])

spacy.tokens.span.Span