In [52]:
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [24]:
#load csv to dataframe
reviews = pd.read_csv('Musical_instruments_reviews.csv')
reviews.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [25]:
#retrieve the summary column
summary = reviews['summary']
summary

0                                                     good
1                                                     Jake
2                                     It Does The Job Well
3                            GOOD WINDSCREEN FOR THE MONEY
4                    No more pops when I record my vocals.
                               ...                        
10256                                           Five Stars
10257    Long life, and for some players, a good econom...
10258                                     Good for coated.
10259                                          Taylor Made
10260    These strings are really quite good, but I wou...
Name: summary, Length: 10261, dtype: object

In [32]:
#perform tokenization 
token_list = [word_tokenize(summ) for summ in summary]
#display first 10 tokenized reviews
token_list[:10]

[['good'],
 ['Jake'],
 ['It', 'Does', 'The', 'Job', 'Well'],
 ['GOOD', 'WINDSCREEN', 'FOR', 'THE', 'MONEY'],
 ['No', 'more', 'pops', 'when', 'I', 'record', 'my', 'vocals', '.'],
 ['The', 'Best', 'Cable'],
 ['Monster', 'Standard', '100', '-', '21', "'", 'Instrument', 'Cable'],
 ['Did', "n't", 'fit', 'my', '1996', 'Fender', 'Strat', '...'],
 ['Great', 'cable'],
 ['Best', 'Instrument', 'Cables', 'On', 'The', 'Market']]

In [55]:
#perform lemmatization
lemmatize = WordNetLemmatizer()

lemmatized_words = []

for t in token_list:
    summ = []
    for w in t:
        rootWord = lemmatize.lemmatize(w,pos='n')
        summ.append(rootWord)
    lemmatized_words.append(summ)
lemmatized_words[:10]


[['good'],
 ['Jake'],
 ['It', 'Does', 'The', 'Job', 'Well'],
 ['GOOD', 'WINDSCREEN', 'FOR', 'THE', 'MONEY'],
 ['No', 'more', 'pop', 'when', 'I', 'record', 'my', 'vocal', '.'],
 ['The', 'Best', 'Cable'],
 ['Monster', 'Standard', '100', '-', '21', "'", 'Instrument', 'Cable'],
 ['Did', "n't", 'fit', 'my', '1996', 'Fender', 'Strat', '...'],
 ['Great', 'cable'],
 ['Best', 'Instrument', 'Cables', 'On', 'The', 'Market']]

In [54]:
ps = PorterStemmer()

stemmed_words = []

for t in token_list:
    summ = []
    for w in t:
        rootWord = ps.stem(w)
        summ.append(rootWord)
    stemmed_words.append(summ)
stemmed_words[:10]

[['good'],
 ['jake'],
 ['it', 'doe', 'the', 'job', 'well'],
 ['good', 'windscreen', 'for', 'the', 'money'],
 ['no', 'more', 'pop', 'when', 'i', 'record', 'my', 'vocal', '.'],
 ['the', 'best', 'cabl'],
 ['monster', 'standard', '100', '-', '21', "'", 'instrument', 'cabl'],
 ['did', "n't", 'fit', 'my', '1996', 'fender', 'strat', '...'],
 ['great', 'cabl'],
 ['best', 'instrument', 'cabl', 'on', 'the', 'market']]

### Describe any issues and limitations of the basic NLP processing.

Some issues I could see with NLP processing is that sometimes lemmatization and stemming could change the meaning of a word, and therefore misidentify meanings. For example, stemming would change "fishing" to "fish." However, fishing is the act of catching fish, and fish is the animal. It's not the same as changing "running" to "run," because they still mean the same thing. Another issue with NLP is that it could struggle to identify figures of speech like irony and common phrases. If I say "something smells fishy," to convey that something is suspicious, the algorithm might think I literally mean that something smells like fish. If I say "we ran out of milk," I'm not saying that I ran and lost milk, but the NLP processor has no way to understand what that means unless I specify meanings for countless phrases. Sarcasm is another issue for NLP processors, as they might consider a sarcastic sentence as positive sentiment when it's really negative. Even humans struggle to understand sarcasm sometimes, so expecting a computer to do it is tough. Yet another issue is ambiguity, when a sentence could have multiple meanings.     