## Imports

In [1]:
import pandas as pd
import numpy as np

In [2]:
import re
import spacy
nlp = spacy.load("en_core_web_sm")
from nltk.corpus import stopwords 
stop = set(stopwords.words('english'))

## Data

In [3]:
# Load the tab-separated review dump. Malformed rows (wrong field count) are
# skipped with a warning; `on_bad_lines="warn"` is the replacement for the
# `error_bad_lines=False` flag, which was deprecated in pandas 1.3 and removed in 2.0.
df = pd.read_csv("./amazon-mobileelectronics.tsv", sep="\t", on_bad_lines="warn")
df.head(1)

b'Skipping line 35246: expected 15 fields, saw 22\n'
b'Skipping line 87073: expected 15 fields, saw 22\n'


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,20422322,R8MEA6IGAHO0B,B00MC4CED8,217304173,BlackVue DR600GW-PMP,Mobile_Electronics,5.0,0.0,0.0,N,Y,Very Happy!,"As advertised. Everything works perfectly, I'm...",2015-08-31


In [4]:
# Keep only the product/review columns used below and drop any row that is
# missing one of them.
data = df[
    ['product_id', 'product_title', 'star_rating', 'review_headline', 'review_body']
].dropna()

In [21]:
# Pick one review body by row position (after the NaN drop) as a sample sentence.
sent = data['review_body'].iloc[47]
sent

'I am really enjoying this speaker. I was unsure if it was as good as some of the reviews gave it but I am very pleased with the sound quality and the value.'

## Definitions

In [6]:
#LINE PREPROCESSOR
# Raw strings keep the regex escapes (\[, \s, \-, \/) out of Python's own
# string-escape handling, which warns on unrecognised escape sequences.
# Characters removed outright: . ; : ! ' ? , " ( ) [ ]
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")
# Spans replaced by a single space: a doubled <br /> tag, a hyphen, a slash.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    """Lower-case each review and normalise its punctuation.

    Args:
        reviews: iterable of strings.

    Returns:
        A new list of strings, one per input line, where the line has been
        lower-cased, every character matched by REPLACE_NO_SPACE has been
        deleted, and every match of REPLACE_WITH_SPACE has then been replaced
        by one space.
    """
    # Deletion runs before the space substitution, so the "<br />" markers
    # (which contain none of the deleted characters) are still intact for it.
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    return reviews

In [7]:
#POS TAGGING
def tagSentence(sentence):
    """Group the lower-cased lemmas of a sentence by coarse part-of-speech tag.

    The sentence is parsed with the notebook-level `nlp` pipeline. The result
    is a dict with the keys NOUN, PROPN, PRON, VERB, ADJ, ADV, NUM, SYM and X
    (always all present, in that order), each mapping to the list of lemmas
    carrying that `pos_` tag in document order. Tokens with any other tag are
    ignored, and a VERB whose lemma is "be" is left out.
    """
    tracked_tags = ("NOUN", "PROPN", "PRON", "VERB", "ADJ", "ADV", "NUM", "SYM", "X")
    buckets = {tag: [] for tag in tracked_tags}
    for token in nlp(sentence):
        tag = token.pos_
        if tag not in buckets:
            continue
        lemma = token.lemma_.lower()
        # The copula is never reported as a verb.
        if tag == "VERB" and lemma == "be":
            continue
        buckets[tag].append(lemma)
    return buckets

## Testing

In [16]:
# Sample sentence with two clauses joined by "and", used to exercise chunk().
testsent = "the battery life is by far the best and the camera was bad"
# Disabled experiment: strip NLTK stop words (the `stop` set) before chunking.
# testsent = [i for i in testsent.lower().split() if i not in stop]
# testsent = " ".join(testsent)
# testsent

In [9]:
def chunk(sent):
    """Split a sentence into clause-like chunks, one string per chunk.

    The sentence is parsed with the notebook-level `nlp` pipeline and cut at
    "separator" tokens: anything tagged CC, any word in `conj`, the
    punctuation marks , ; . and "which"/"that" when not directly followed by
    a verb tag. When the text after a separator starts with a verb, the most
    recently seen subject is written in place of the separator so the new
    chunk carries its own subject (followed by " was " when the subject was
    marked as passive-like).

    Args:
        sent: the sentence text.

    Returns:
        A list of strings; each is the chunk's token texts joined by single
        spaces. Empty chunks are dropped.

    Each token is stored as [text, fine-grained tag, flag]; the flag is
        0  no role,
        1  nominal subject (dep_ == 'nsubj' and tag not starting with "W"),
        2  noun after a separator that directly follows a subject,
        3  last noun before a be-verb that is followed by a non-VBG verb tag.
    `tagged_list` always ends with an empty sentinel list, which is why every
    loop stops at len(tagged_list)-1; the sentinel itself must never be indexed.
    """
    # sent=resolve_co_reference(sent)
    conj = set(('and', 'or' ,'but','while','so','because','where','however','whereas'))
    beverbs=set(('is','was','are','were'))
    wdt=set(('which','that'))
    l=[1,2,3]
    tagged_list=[[]]
    i=0
    doc=nlp(sent)
    for token in doc:
        # if (token.lemma_=="be"):
        #   l[0]="be"
        # else:
        #   l[0]=token.text
        l[0]=token.text
        l[1]=token.tag_
        if(token.dep_=='nsubj' and token.tag_.startswith("W")==False):
            l[2]=1
        else:
            l[2]=0
        # Insert before the trailing sentinel so it stays last.
        tagged_list.insert(i,l)
        l=[1,2,3]
        i=i+1

    # Mark the noun preceding "<be-verb> <verb, not VBG>" with flag 3.
    noun=-1
    i=0
    while(i<len(tagged_list)-1):
        if(tagged_list[i][1].find("NN")!=-1):
            noun=i
        if(tagged_list[i][0]in beverbs):
            # noun!=-1: with no noun seen yet (e.g. a pronoun subject) index -1
            # would hit the empty sentinel and raise IndexError.
            if(noun!=-1 and i<len(tagged_list)-2 and tagged_list[i+1][1].startswith("V") and not tagged_list[i+1][1].startswith("VBG")):
                tagged_list[noun][2]=3
        i=i+1
    #print(tagged_list)

    n=[[]]
    ind=0
    ind2=-1
    i=0
    subj=""
    lis=[]
    flag=-1
    find=-1
    subj_type=-1
    # Resolve "which"/"that" + verb: either copy the token two positions back
    # (when a comma precedes) or flag the preceding token as a subject.
    while(i<len(tagged_list)-1):
        # i>0: a sentence-initial "that"/"which" has no preceding token; i-1
        # would address the empty sentinel and raise IndexError.
        if(i>0 and tagged_list[i][0] in wdt and i+1<len(tagged_list)-1 and tagged_list[i+1][1].find("VB")!=-1):
            if(i-2>=0 and tagged_list[i-1][0]==","):
                tagged_list[i][0]=tagged_list[i-2][0]
                tagged_list[i][2]=1
                tagged_list[i][1]=tagged_list[i-2][1]
            else:
                tagged_list[i-1][2]=1
            subj=tagged_list[i-1][0]
        i=i+1
    i=0
    # Main pass: walk the tokens, remember the latest subject, and emit a chunk
    # each time a separator is followed by a noun or a verb.
    while(i<len(tagged_list)-1):
        if(tagged_list[i][2]==1 or tagged_list[i][2]==3):
            subj=tagged_list[i][0]
            subj_type=tagged_list[i][2]
        if(tagged_list[i][1]=="CC"  or tagged_list[i][0] in conj or tagged_list[i][0]=="," or tagged_list[i][0]==";" or tagged_list[i][0]=="." or(tagged_list[i][0]in wdt and i+1<len(tagged_list)-1 and tagged_list[i+1][1].find("VB")==-1)):
            # j = first noun or verb after the separator (or the sentinel index).
            j=i+1
            while(j<len(tagged_list)-1 and tagged_list[j][1].find("NN")==-1 and tagged_list[j][1].find("VB")==-1):
                j=j+1
            if(j<len(tagged_list)-1 and tagged_list[j][1].find("NN")!=-1):

                if((tagged_list[j][2]==1 or tagged_list[j][2]==3)):
                    if(ind2!=-1 and ind2!=ind):
                        find=find+1
                        # NOTE(review): the wdt test below indexes with `i`, while the
                        # equivalent loop after the main pass uses `find` — confirm which is intended.
                        while(find<len(tagged_list)-1 and (tagged_list[find][1]!="CC"  and tagged_list[find][0] not in conj and tagged_list[find][0]!="," and tagged_list[find][0]!=";" and tagged_list[find][0]!="." and(tagged_list[i][0]not in wdt or (i+1<len(tagged_list)-1 and tagged_list[i+1][1].find("VB")!=-1)))):
                            find=find+1
                        n.append([tagged_list[x][0] for x in range(ind2,i) if(x not in range(ct,find+1))])
                        ind2=-1
                    else:
                        for x in range(ind,i):
                            if(tagged_list[x][1]=="CC"  or tagged_list[x][0] in conj or tagged_list[x][0]=="," or tagged_list[x][0]==";" or tagged_list[x][0]=="." or (tagged_list[x][0]in wdt and x+1<len(tagged_list)-1 and tagged_list[x+1][1].find("VB")==-1) ):
                                if(x>ind and x<i-1):
                                    if((tagged_list[x-1][2]== 1 or tagged_list[x-1][2]==2)):
                                        y=x+1
                                        while(y<len(tagged_list)-1 and tagged_list[y][1].find("NN")==-1 and tagged_list[y][1].find("VB")==-1):
                                            y=y+1
                                        # y may stop on the sentinel; never index it.
                                        if(y<len(tagged_list)-1 and (tagged_list[y][2]==1 or tagged_list[y][2]==2 or tagged_list[y][2]==3)):
                                            if(len(lis)==0):
                                                lis.append(x-1)
                                            lis.append(y)
                        # One chunk per coordinated subject collected in `lis`.
                        for l in range(len(lis)):
                            n.append([tagged_list[x][0] for x in range(ind,i) if(x == lis[l] or x>lis[len(lis)-1]) or (l==0 and x<lis[0]) or (l>0 and x>lis[l-1]) and x<=lis[l]])
                        if(len(lis)==0):
                            n.append([tagged_list[x][0] for x in range(ind,i)])
                    lis=[]
                    ind =i+1
                elif(i-1>=0 and (tagged_list[i-1][2]==1 or tagged_list[i-1][2]==2 or tagged_list[i-1][2]==3)):
                    # Separator sits between a subject and another noun: treat
                    # the noun as part of a compound subject.
                    tagged_list[j][2]=2
                    subj=subj+" "+tagged_list[i][0]+" "+tagged_list[j][0]

                else:
                    if(ind2==-1):
                        ind2=ind
                    ct=ind2
                    # NOTE(review): the inner "NN==-1 or VB==-1" test holds for every tag
                    # (no tag contains both), so ct always advances to i-1 — confirm intent.
                    while(ct<i-1 and ((tagged_list[ct][1].find("NN")==-1 or tagged_list[ct][1].find("VB")==-1) or (tagged_list[ct][2]==1 or tagged_list[ct][2]==2 or tagged_list[ct][2]==3 ))):
                        ct=ct+1
                    if(flag!=ind2):
                        n.append([tagged_list[x][0] for x in range(ind2,i)])
                        flag=ind2
                        find=ct
                    else:
                        find=find+1
                        while(find<len(tagged_list)-1 and (tagged_list[find][1]!="CC"  and tagged_list[find][0] not in conj and tagged_list[find][0]!="," and tagged_list[find][0]!=";" and tagged_list[find][0]!="." and(tagged_list[i][0]not in wdt or (i+1<len(tagged_list)-1 and tagged_list[i+1][1].find("VB")!=-1)))):
                            find=find+1
                        n.append([tagged_list[x][0] for x in range(ind2,i) if(x not in range(ct,find+1))])
                    ind=i+1 #ADDED NOW



            elif(j<len(tagged_list)-1 and tagged_list[j][1].find("VB")!=-1):
                if(ind2!=-1 and ind2!=ind):
                    find=find+1
                    while(find<len(tagged_list)-1 and (tagged_list[find][1]!="CC"  and tagged_list[find][0] not in conj and tagged_list[find][0]!="," and tagged_list[find][0]!=";" and tagged_list[find][0]!="." and(tagged_list[i][0]not in wdt or (i+1<len(tagged_list)-1 and tagged_list[i+1][1].find("VB")!=-1)))):
                        find=find+1
                    n.append([tagged_list[x][0] for x in range(ind2,i) if(x not in range(ct,find+1))])
                    ind2=-1
                else:
                    for x in range(ind,i): #TO SEPARATE SUBJECTS
                        if(tagged_list[x][1]=="CC"  or tagged_list[x][0] in conj or tagged_list[x][0]=="," or tagged_list[x][0]==";" or tagged_list[x][0]=="." or (tagged_list[x][0]in wdt and x+1<len(tagged_list)-1 and tagged_list[x+1][1].find("VB")==-1) ):
                            if(x>ind and x<i-1):
                                if((tagged_list[x-1][2]== 1 or tagged_list[x-1][2]==2)):
                                    y=x+1
                                    while(y<len(tagged_list)-1 and tagged_list[y][1].find("NN")==-1 and tagged_list[y][1].find("VB")==-1):
                                        y=y+1
                                    # y may stop on the sentinel; never index it.
                                    if(y<len(tagged_list)-1 and (tagged_list[y][2]==1 or tagged_list[y][2]==2 or tagged_list[y][2]==3)):
                                        if(len(lis)==0):
                                            lis.append(x-1)
                                        lis.append(y)
                    for l in range(len(lis)):
                        n.append([tagged_list[x][0] for x in range(ind,i) if(x == lis[l] or x>lis[len(lis)-1]) or (l==0 and x<lis[0]) or (l>0 and x>lis[l-1]) and x<=lis[l]])
                    if(len(lis)==0):
                        n.append([tagged_list[x][0] for x in range(ind,i)])
                # The next chunk starts with a verb: overwrite the separator
                # with the remembered subject unless a pronoun follows directly.
                if(i+1<len(tagged_list)-1 and tagged_list[i+1][1]!="PRP"):
                    if(subj_type==3):
                        tagged_list[i][0]=subj+" was "
                    else:
                        tagged_list[i][0]=subj
                    ind=i
                else:
                    ind=i+1
                lis=[]


        i=i+1
    # Flush whatever remains after the last separator (i == index of the sentinel).
    if(ind2!=-1 and ind2!=ind):
        find=find+1
        while(find<len(tagged_list)-1 and (tagged_list[find][1]!="CC"  and tagged_list[find][0] not in conj and tagged_list[find][0]!="," and tagged_list[find][0]!=";" and tagged_list[find][0]!="." and(tagged_list[find][0]not in wdt or (find+1<len(tagged_list)-1 and tagged_list[find+1][1].find("VB")!=-1)))):
            find=find+1
        n.append([tagged_list[x][0] for x in range(ind2,i) if(x not in range(ct,find+1))])
        ind2=-1
    else:
        for x in range(ind,i):
            if(tagged_list[x][1]=="CC"  or tagged_list[x][0] in conj or tagged_list[x][0]=="," or tagged_list[x][0]==";" or tagged_list[x][0]=="." or (tagged_list[x][0]in wdt and x+1<len(tagged_list)-1 and tagged_list[x+1][1].find("VB")==-1) ):
                if(x>ind and x<i-1):
                    if((tagged_list[x-1][2]== 1 or tagged_list[x-1][2]==2)):
                        y=x+1
                        while(y<len(tagged_list)-1 and tagged_list[y][1].find("NN")==-1 and tagged_list[y][1].find("VB")==-1):
                            y=y+1
                        # With no noun/verb left in the sentence y lands on the
                        # sentinel; indexing it used to raise IndexError.
                        if(y<len(tagged_list)-1 and (tagged_list[y][2]==1 or tagged_list[y][2]==2)):
                            if(len(lis)==0):
                                lis.append(x-1)
                            lis.append(y)
        for l in range(len(lis)):
            n.append([tagged_list[x][0] for x in range(ind,i) if(x == lis[l] or x>lis[len(lis)-1]) or (l==0 and x<lis[0]) or (l>0 and x>lis[l-1]) and x<=lis[l]])
        if(len(lis)==0):
            n.append([tagged_list[x][0] for x in range(ind,i)])
    # Join each non-empty token list into a single string.
    stringArr = []
    for arr in n:
        if(len(arr)>0):
            stringArr.append(' '.join(arr))
    return stringArr


In [19]:
# Split the sample sentence into chunk strings (chunk() returns a list of str).
a = chunk(testsent)

In [20]:
# Print the POS-bucketed lemmas for every chunk produced above.
# NOTE(review): the loop variable reuses the name `sent`, so the sample review
# stored in `sent` by the earlier cell is overwritten once this cell runs.
for sent in a:
    print(tagSentence(sent))

{'NOUN': ['battery', 'life'], 'PROPN': [], 'PRON': [], 'VERB': [], 'ADJ': ['good'], 'ADV': ['far'], 'NUM': [], 'SYM': [], 'X': []}
{'NOUN': ['camera'], 'PROPN': [], 'PRON': [], 'VERB': [], 'ADJ': ['bad'], 'ADV': [], 'NUM': [], 'SYM': [], 'X': []}
