In [31]:
from tqdm import tqdm

import spacy
import pandas as pd
import numpy as np
import os


In [7]:
# Load spaCy's small English pipeline with none of its components disabled.
model = spacy.load('en_core_web_sm')

In [2]:
def printTags(scentence):
    """Print each token of a sentence together with its POS tags.

    The text is run through the module-level spaCy pipeline ``model`` and one
    line is printed per token, formatted as ``text --- pos_ --- tag_``.
    """
    for tok in model(scentence):
        print(" --- ".join((tok.text, tok.pos_, tok.tag_)))

In [3]:
# Three short example review sentences used to inspect the tagger's output.
sent1 = "I loved the screen on this phone"
sent2 = "The battery life on this phone is great"
sent3 = "The speakers are pathetic"

In [5]:
# Tag every example sentence, printing a dashed rule between consecutive outputs.
for position, sentence in enumerate((sent1, sent2, sent3)):
    if position > 0:
        print('-------------------------------')
    printTags(sentence)

I --- PRON --- PRP
loved --- VERB --- VBD
the --- DET --- DT
screen --- NOUN --- NN
on --- ADP --- IN
this --- DET --- DT
phone --- NOUN --- NN
-------------------------------
The --- DET --- DT
battery --- NOUN --- NN
life --- NOUN --- NN
on --- ADP --- IN
this --- DET --- DT
phone --- NOUN --- NN
is --- AUX --- VBZ
great --- ADJ --- JJ
-------------------------------
The --- DET --- DT
speakers --- NOUN --- NNS
are --- AUX --- VBP
pathetic --- ADJ --- JJ


In [8]:
# Read the whole review corpus into one string. The context manager closes the
# file even when read() raises, which the manual open()/close() pair did not.
with open("Samsung.txt", 'r', encoding="utf-8") as con:
    samsung_reviews = con.read()

In [9]:
# Split the corpus on newline characters; each element is processed as one review below.
samsung_review_data = samsung_reviews.split('\n')

In [12]:
# Number of newline-separated entries obtained from the corpus.
len(samsung_review_data)

46355

In [15]:
# Running the pipeline over every review up front is left disabled:
# reviews = [model(r) for r in samsung_review_data]
# Instead, only the review at index 1 is processed so its tokens can be inspected.
review1 = model(samsung_review_data[1])

In [25]:
# Convert each token into its lemma and identify the PoS tags.
def getTagsLemma(sent):
    """Tabulate the surface text, lemma and POS tag of every token.

    Parameters
    ----------
    sent : iterable
        Tokens exposing ``text``, ``lemma_`` and ``pos_`` attributes.

    Returns
    -------
    pandas.DataFrame
        One row per token, with the columns ``text``, ``lemma`` and ``pos``.
    """
    # Materialise once so the input is only iterated a single time.
    tokens = list(sent)
    columns = {
        'text': [t.text for t in tokens],
        'lemma': [t.lemma_ for t in tokens],
        'pos': [t.pos_ for t in tokens],
    }
    return pd.DataFrame(columns)

In [26]:
df1 = getTagsLemma(review1)
df1.head()  # not the cell's last expression, so nothing is rendered for it

# Frequency of each noun lemma in this review, most frequent first.
print(df1.loc[df1['pos'] == 'NOUN', 'lemma'].value_counts())

phone      3
grade      1
pantach    1
android    1
size       1
surfing    1
medium     1
Name: lemma, dtype: int64


In [27]:
# Collect the lower-cased lemma of every NOUN token in the first 1000 reviews,
# using the full pipeline, then show the five most frequent ones.
nouns = []
for review in tqdm(samsung_review_data[:1000]):
    doc = model(review)
    nouns.extend(tok.lemma_.lower() for tok in doc if tok.pos_ == "NOUN")
pd.Series(nouns).value_counts().head(5)

100%|██████████| 1000/1000 [00:09<00:00, 110.41it/s]


phone      1212
battery      91
time         90
price        87
screen       87
dtype: int64

# Improving performance

In [32]:
# creating a new model with disabling parser and ner
# Load the same pipeline again with the 'parser' and 'ner' components disabled;
# the noun counting below only reads pos_ and lemma_.
nlp = spacy.load('en_core_web_sm', disable = ['parser','ner'])

In [33]:
# Same noun count over the first 1000 reviews, this time with the reduced
# pipeline `nlp`, so the progress-bar timings can be compared.
nouns2 = []
for review in tqdm(samsung_review_data[:1000]):
    doc = nlp(review)
    nouns2.extend(tok.lemma_.lower() for tok in doc if tok.pos_ == "NOUN")
pd.Series(nouns2).value_counts().head(5)

100%|██████████| 1000/1000 [00:04<00:00, 204.04it/s]


phone      1212
battery      91
time         90
price        87
screen       87
dtype: int64

In [34]:
# Rebuild nouns2 from scratch over every review with the reduced pipeline.
nouns2 = []
for review in tqdm(samsung_review_data):
    doc = nlp(review)
    nouns2.extend(tok.lemma_.lower() for tok in doc if tok.pos_ == "NOUN")


100%|██████████| 46355/46355 [02:38<00:00, 291.55it/s]


phone      43452
battery     4332
product     3907
screen      3838
time        3829
dtype: int64

In [35]:
# Wrap the collected noun lemmas in a pandas Series (despite its name, `df` is not a DataFrame).
df = pd.Series(nouns2)
# Five most frequent noun lemmas.
df.value_counts().head(5)

phone      43452
battery     4332
product     3907
screen      3838
time        3829
dtype: int64

In [36]:
# Ten most frequent noun lemmas.
df.value_counts().head(10)

phone      43452
battery     4332
product     3907
screen      3838
time        3829
card        3391
price       3153
problem     3129
camera      2903
app         2645
dtype: int64

## Extracting prefix and suffix features of the most commonly occurring nouns - getting context

In [42]:
import re
# Toy sentence used to try out the context-extraction regex below.
s1 = 'the battery life was good'

![alt text](regex.png "Regex")

In [43]:
# One word, the literal "battery", then one more word, each pair separated by a
# single whitespace character. The raw string keeps \w and \s as regex escapes
# rather than invalid Python string escapes (which trigger a warning).
pattern = re.compile(r"\w+\sbattery\s\w+")

In [44]:
# Every "<word> battery <word>" occurrence in the toy sentence.
print(pattern.findall(s1))

['the battery life']


In [45]:
# The first (here: only) match as a plain string.
pattern.findall(s1)[0]

'the battery life'

In [46]:
# Break the first match into its three words.
pattern.findall(s1)[0].split(" ")

['the', 'battery', 'life']

In [47]:
# Prefix: the word immediately before the keyword.
pattern.findall(s1)[0].split(" ")[0]

'the'

In [48]:
# Suffix: the word immediately after the keyword.
pattern.findall(s1)[0].split(" ")[-1]

'life'

### Extract all the prefixes and suffixes of battery

In [51]:
# Apply the pattern to the whole, unsplit corpus string.
prefixes_suffixes = pattern.findall(samsung_reviews)

In [52]:
# Peek at the first ten matches.
prefixes_suffixes[:10]

['that battery life',
 'The battery was',
 'great battery life',
 'removable battery or',
 'the battery in',
 'The battery was',
 'the battery is',
 'Excellent battery life',
 'the battery off',
 'the battery goes']

In [53]:
# Separate every "<word> battery <word>" match into its leading and trailing
# word, lower-cased. The pattern's \s also matches tabs and newlines, so split
# on any whitespace: splitting on " " alone would leave a match such as
# "great\nbattery life" with the bogus prefix "great\nbattery".
prefixes = []
suffixes = []
for match in prefixes_suffixes:
    words = match.split()
    prefixes.append(words[0].lower())
    suffixes.append(words[-1].lower())

In [54]:
# Five most frequent prefixes before any stop-word filtering.
pd.Series(prefixes).value_counts().head(5)

the      1396
good      122
great      90
and        82
long       60
dtype: int64

In [55]:
# Five most frequent suffixes before any stop-word filtering.
pd.Series(suffixes).value_counts().head(5)

life     1052
is        210
and       146
lasts      83
was        66
dtype: int64

In [57]:
# Lower-case English function words that are removed from the context words.
stop_words = [
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    "what", "which", "who", "whom", "this", "that", "these", "those",
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    "a", "an", "the", "and", "but", "if", "or", "because", "as",
    "until", "while", "of", "at", "by", "for", "with", "about",
    "against", "between", "into", "through", "during", "before", "after",
    "above", "below", "to", "from", "up", "down", "in", "out",
    "on", "off", "over", "under", "again", "further", "then", "once",
    "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other",
    "some", "such", "no", "nor", "not", "only", "own", "same",
    "so", "than", "too", "very", "s", "t", "can", "will",
    "just", "don", "should", "now",
]

In [58]:
# Drop stop words from both context lists; order and duplicates are preserved.
prefixes, suffixes = (
    [word for word in words if word not in stop_words]
    for words in (prefixes, suffixes)
)

In [59]:
# Five most frequent prefixes once stop words have been removed.
pd.Series(prefixes).value_counts().head(5)

good         122
great         90
long          60
new           51
removable     48
dtype: int64

In [60]:
# Five most frequent suffixes once stop words have been removed.
pd.Series(suffixes).value_counts().head(5)

life     1052
lasts      83
last       53
doesn      40
runs       31
dtype: int64

In [62]:
# Keep the five most frequent prefixes and suffixes. Both names are rebound
# here from lists of words to pandas Index objects.
prefixes = pd.Series(prefixes).value_counts().index[:5]
suffixes = pd.Series(suffixes).value_counts().index[:5]
# The two columns are ranked independently; a row is not a co-occurring pair.
pd.DataFrame({
    'prefixes': prefixes,
    'keyword': ['battery'] * len(prefixes),
    'suffixes': suffixes,
})

Unnamed: 0,prefixes,keyword,suffixes
0,good,battery,life
1,great,battery,lasts
2,long,battery,last
3,new,battery,doesn
4,removable,battery,runs


In [63]:
def _most_frequent(words, n):
    """Return the ``n`` most frequent items of ``words`` as a list, most frequent first."""
    # dtype=object keeps an empty input from being inferred as a numeric Series.
    return pd.Series(words, dtype=object).value_counts().head(n).index.tolist()


def get_context(reviews, keyword, top_n=5, stopwords=None):
    """Find the words that most often appear directly before and after a keyword.

    Every ``<word> <keyword> <word>`` occurrence in ``reviews`` is located with
    a regular expression. The surrounding words are lower-cased, stop words are
    dropped, and the most frequent prefixes and suffixes are tabulated.

    Parameters
    ----------
    reviews : str
        Text to search.
    keyword : str
        Term to look for. It is matched literally (regex metacharacters are
        escaped) and case-sensitively.
    top_n : int, default 5
        Maximum number of prefixes and of suffixes to report.
    stopwords : collection of str, optional
        Lower-case words to ignore. Defaults to the module-level ``stop_words``.

    Returns
    -------
    pandas.DataFrame
        Columns ``prefixes``, ``keyword`` and ``suffixes``. Prefixes and
        suffixes are ranked independently, so a row is not a co-occurring
        pair. If one side has fewer entries than the other, it is padded with
        ``None`` instead of raising a length-mismatch error.
    """
    if stopwords is None:
        stopwords = stop_words

    # Raw f-string so \w and \s reach the regex engine untouched. The capture
    # groups return the neighbouring words directly, which avoids splitting the
    # match on " " when \s actually matched a newline or tab.
    pattern = re.compile(rf"(\w+)\s{re.escape(keyword)}\s(\w+)")
    matches = pattern.findall(reviews)

    prefixes = [before.lower() for before, _ in matches]
    suffixes = [after.lower() for _, after in matches]
    prefixes = [word for word in prefixes if word not in stopwords]
    suffixes = [word for word in suffixes if word not in stopwords]

    top_prefixes = _most_frequent(prefixes, top_n)
    top_suffixes = _most_frequent(suffixes, top_n)

    # Pad the shorter column so the DataFrame constructor gets equal lengths.
    n_rows = max(len(top_prefixes), len(top_suffixes))
    top_prefixes += [None] * (n_rows - len(top_prefixes))
    top_suffixes += [None] * (n_rows - len(top_suffixes))

    return pd.DataFrame({
        'prefixes': top_prefixes,
        'keyword': [f'{keyword}'] * n_rows,
        'suffixes': top_suffixes,
    })

In [64]:
# Context words around "battery" in the full corpus.
get_context(samsung_reviews,"battery")

Unnamed: 0,prefixes,keyword,suffixes
0,good,battery,life
1,great,battery,lasts
2,long,battery,last
3,new,battery,doesn
4,removable,battery,runs


In [65]:
# Context words around "screen" in the full corpus.
get_context(samsung_reviews,"screen")

Unnamed: 0,prefixes,keyword,suffixes
0,touch,screen,protector
1,big,screen,size
2,great,screen,resolution
3,large,screen,protectors
4,home,screen,quality


# Summary:

- Simple heuristics are sometimes very useful
- Regex can be a lifesaver
- Don't forget to use simple text processing while trying to solve a non-trivial problem

