#                Natural Language Processing - Data Preprocessing using NLTK

In [185]:
import pandas as pd
import nltk
from nltk.tokenize import  sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import FreqDist
import string

In [186]:
## Load the Amazon baby-product reviews (training split) into a DataFrame
## NOTE(review): this is an absolute, machine-specific path - adjust it when
## running the notebook elsewhere.

reviews_csv = 'E:/amazonReview-master/amazonReview-master/amazon_baby_train.csv'
baby_product = pd.read_csv(reviews_csv)

In [187]:
## Data manipulation: converting reviews to lowercase and replacing some symbols with a space

## Step 1

# Each of these characters is replaced by a single space.
_SYMBOLS_TO_SPACE = str.maketrans({symbol: ' ' for symbol in '_-/,.'})


def clean_review(text):
    """Return a normalised copy of one review.

    The text is coerced to ``str``, lower-cased, has the characters
    ``_ - / , .`` replaced by spaces, and finally has every "n't" replaced by
    a space.

    NOTE(review): blanking "n't" turns e.g. "don't" into "do " and "can't"
    into "ca ", which discards the negation - confirm this is intended before
    using the result for sentiment analysis.
    """
    return str(text).lower().translate(_SYMBOLS_TO_SPACE).replace("n't", ' ')


# Missing reviews are NaN; str(NaN) would inject the bogus word "nan" into the
# corpus, so they are treated as empty text instead.
baby_product['review'] = baby_product['review'].fillna('').apply(clean_review)

In [188]:
## Preview the first rows after the Step 1 clean-up of the 'review' column
baby_product.head()

Unnamed: 0,name,review,rating
0,"Moby Wrap Original 100% Cotton Baby Carrier, Red",bought this for my daughter who is expecting h...,5
1,Child to Cherish Handprints Tower Of Time Kit ...,a perfect baby or first birthday gift i recei...,5
2,CM Star Candy Heart Crossbody Shoulder Bag Sat...,it is very cute and i got a lot of compliment...,4
3,"JJ Cole Collections System Diaper Bag, Black D...",i have had 3 of these bags i love it so much...,3
4,Recaro Signo Convertible Car Seat Midnight Desert,i have used this with both my kids it is the ...,5


In [189]:
## Step 2 : Tokenizing each review into a list of words

def _tokenize_review(text):
    """Split one review into NLTK word tokens (the input is coerced to str first)."""
    return word_tokenize(str(text))

baby_product['tokenized_words'] = baby_product['review'].apply(_tokenize_review)

In [190]:
## After tokenization: preview the first rows, now including the
## 'tokenized_words' column

baby_product.head()

Unnamed: 0,name,review,rating,tokenized_words
0,"Moby Wrap Original 100% Cotton Baby Carrier, Red",bought this for my daughter who is expecting h...,5,"[bought, this, for, my, daughter, who, is, exp..."
1,Child to Cherish Handprints Tower Of Time Kit ...,a perfect baby or first birthday gift i recei...,5,"[a, perfect, baby, or, first, birthday, gift, ..."
2,CM Star Candy Heart Crossbody Shoulder Bag Sat...,it is very cute and i got a lot of compliment...,4,"[it, is, very, cute, and, i, got, a, lot, of, ..."
3,"JJ Cole Collections System Diaper Bag, Black D...",i have had 3 of these bags i love it so much...,3,"[i, have, had, 3, of, these, bags, i, love, it..."
4,Recaro Signo Convertible Car Seat Midnight Desert,i have used this with both my kids it is the ...,5,"[i, have, used, this, with, both, my, kids, it..."


In [191]:
## Step 3 : Creating a stop words collection and a function to filter tokenized words

# English stop words plus every ASCII punctuation character.
# Stored as a set: the filter performs one membership test per token, and a
# set lookup is constant-time whereas a list is scanned linearly every time.
stop_word = set(stopwords.words('english')) | set(string.punctuation)
def stop_words(x, exclude=None):
    """Return the tokens of `x` that are worth keeping, in their original order.

    A token is kept when it is longer than one character and is not a member
    of the exclusion collection.

    Parameters
    ----------
    x : iterable of str
        Tokens of a single review.
    exclude : container of str, optional
        Tokens to drop. When omitted, the module-level `stop_word`
        collection is used.

    Returns
    -------
    list of str
        The surviving tokens.
    """
    excluded = stop_word if exclude is None else exclude
    # The cheap length check runs first so one-character tokens never reach
    # the membership test.
    return [token for token in x if len(token) > 1 and token not in excluded]
# Filter every review's token list through stop_words and store the result
# in a new 'stopped_words' column.
tokens_per_review = baby_product['tokenized_words']
baby_product['stopped_words'] = tokens_per_review.apply(stop_words)

In [192]:
## Preview the first rows, now including the 'stopped_words' column
baby_product.head()

Unnamed: 0,name,review,rating,tokenized_words,stopped_words
0,"Moby Wrap Original 100% Cotton Baby Carrier, Red",bought this for my daughter who is expecting h...,5,"[bought, this, for, my, daughter, who, is, exp...","[bought, daughter, expecting, first, baby, sai..."
1,Child to Cherish Handprints Tower Of Time Kit ...,a perfect baby or first birthday gift i recei...,5,"[a, perfect, baby, or, first, birthday, gift, ...","[perfect, baby, first, birthday, gift, receive..."
2,CM Star Candy Heart Crossbody Shoulder Bag Sat...,it is very cute and i got a lot of compliment...,4,"[it, is, very, cute, and, i, got, a, lot, of, ...","[cute, got, lot, compliments, pros, really, cu..."
3,"JJ Cole Collections System Diaper Bag, Black D...",i have had 3 of these bags i love it so much...,3,"[i, have, had, 3, of, these, bags, i, love, it...","[bags, love, much, one, children, holds, every..."
4,Recaro Signo Convertible Car Seat Midnight Desert,i have used this with both my kids it is the ...,5,"[i, have, used, this, with, both, my, kids, it...","[used, kids, best, car, seat, market, lasts, d..."


In [193]:
## Flattening the per-review token lists (after the tokenization and stop-word
## steps) into a single corpus-wide list of words

all_words = [
    word
    for review_tokens in baby_product['stopped_words']
    for word in review_tokens
]

In [194]:
## Step 4 : Tagging every word in all_words with its part of speech (POS)
## NOTE(review): all_words is one flat list built after stop-word removal, so
## the tagger presumably sees no review or sentence boundaries - confirm that
## the resulting tags are accurate enough for the filtering done downstream.

tagged_words=nltk.pos_tag(all_words)

# List of tags 

In [195]:
## Step 5 : Keeping only the words whose POS tag is listed in `tag`
## (the JJ* adjective tags and RB* adverb tags, e.g. for sentiment analysis)

tag = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
filtered_word = [word for word, pos in tagged_words if pos in tag]

In [197]:
## The 20 most frequent words among the POS-filtered words, with their counts

word_counts = FreqDist(filtered_word)
word_counts.most_common(20)

[('great', 47729),
 ('little', 32759),
 ('old', 31603),
 ('easy', 31439),
 ('well', 30114),
 ('really', 28061),
 ('also', 27812),
 ('much', 23670),
 ('good', 23123),
 ('even', 20023),
 ('first', 19184),
 ('still', 18890),
 ('back', 17323),
 ('nice', 14341),
 ('enough', 13742),
 ('small', 13266),
 ('better', 12046),
 ('big', 11717),
 ('soft', 11453),
 ('easily', 11190)]