In [1]:
# importing the required libraries
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
filepath = 'women_clothing_review.csv'
df = pd.read_csv(filepath)
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [3]:
df_wcr = df[['Clothing ID', 'Review Text']]
df_wcr.head()

Unnamed: 0,Clothing ID,Review Text
0,767,Absolutely wonderful - silky and sexy and comf...
1,1080,Love this dress! it's sooo pretty. i happene...
2,1077,I had such high hopes for this dress and reall...
3,1049,"I love, love, love this jumpsuit. it's fun, fl..."
4,847,This shirt is very flattering to all due to th...


In [4]:
# checking the number of rows x columns in the dataframe
df_wcr.shape

(23486, 2)

In [5]:
# checking for missing values in the 'Review Text' column
df_wcr['Review Text'].isnull().sum()

845

In [6]:
# creating dataframe 'wcr' by dropping the rows with missing values
wcr = df_wcr.dropna()

In [7]:
# checking the rows x columns after dropping the rows with missing values
wcr.shape

(22641, 2)

In [8]:
# verifying once again for the missing values in the 'Review Text' column
wcr['Review Text'].isnull().sum()

0

In [9]:
# checking characters in punctuation
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
# getting thelist of english stopwords
eng_stopwords = stopwords.words('english')
print(eng_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
# initilizing function to do sentiment analysis
sid = SentimentIntensityAnalyzer()

In [12]:
# creating a function to clean up each review, and analyzing and assigning a sentiment polarity
def reviewSentiment(review_text):
    
    review_text = review_text.lower()           # to make texts as lowercase
    
    tknz_review = word_tokenize(review_text)    # to tokenize the review and keeping them in a list
    
    for token in tknz_review:                   # to remove puntuations
        if token in punctuation:
            tknz_review.remove(token)
    
    clean_tokens = []                           # creating an empty list to hold "cleaned" tokens
    
    for token in tknz_review:                   # to remove filler words
        if token not in eng_stopwords:
            clean_tokens.append(token)
            
    clean_review = ' '.join(clean_tokens)       # to put sentence back together with remaining clean words
    
    sid_rev = sid.polarity_scores(clean_review) # to get the polarity scores dictionary
    
    r_comp = sid_rev['compound']                # to get sentiment polarity from the "compound" key in the sid_rev dictionary
    
    return r_comp                               # return the sentiment value

In [13]:
# creating a new column ('review_sentiment') to hold sentiment value from function ('reviewSentiment')
wcr['Sentiment Review Score'] = wcr['Review Text'].apply(reviewSentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
# verifying sentiment values in new column
wcr.head()

Unnamed: 0,Clothing ID,Review Text,Sentiment Review Score
0,767,Absolutely wonderful - silky and sexy and comf...,0.8991
1,1080,Love this dress! it's sooo pretty. i happene...,0.971
2,1077,I had such high hopes for this dress and reall...,0.9062
3,1049,"I love, love, love this jumpsuit. it's fun, fl...",0.9464
4,847,This shirt is very flattering to all due to th...,0.9117


In [15]:
# checking the type of data in wcr
wcr.dtypes

Clothing ID                 int64
Review Text                object
Sentiment Review Score    float64
dtype: object

In [16]:
# creating a function to assign a polarity category to the sentiment
def sentimentCategory(sent_num):
    if sent_num >= 0.2:
        return "positive"
    elif sent_num <= -0.2:
        return "negative"
    else:
        return "neutral"

In [17]:
# create a new column to hold sentiment category
wcr['Sentiment Category'] = wcr['Sentiment Review Score'].apply(sentimentCategory)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
# checking the first five rows of the new dataframe
wcr.head()

Unnamed: 0,Clothing ID,Review Text,Sentiment Review Score,Sentiment Category
0,767,Absolutely wonderful - silky and sexy and comf...,0.8991,positive
1,1080,Love this dress! it's sooo pretty. i happene...,0.971,positive
2,1077,I had such high hopes for this dress and reall...,0.9062,positive
3,1049,"I love, love, love this jumpsuit. it's fun, fl...",0.9464,positive
4,847,This shirt is very flattering to all due to th...,0.9117,positive


In [19]:
# getting the counts of positive, negative, and neutral reviews
wcr['Sentiment Category'].value_counts()

positive    21380
neutral       717
negative      544
Name: Sentiment Category, dtype: int64

In [20]:
# checking the review text in the fourth row whose sentiment review score is '0.9464'
wcr['Review Text'].iloc[3]

"I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time i wear it, i get nothing but great compliments!"

In [21]:
# creating a dataframe for review texts (rows) with negative comments
wcr_negative = wcr.loc[wcr['Sentiment Category'] == 'negative']

In [22]:
# checking the first five rows
wcr_negative.head()

Unnamed: 0,Clothing ID,Review Text,Sentiment Review Score,Sentiment Category
77,850,The zipper broke on this piece the first time ...,-0.2263,negative
103,822,The fabric felt cheap and i didn't find it to ...,-0.3724,negative
110,861,This is so thin and poor quality. especially f...,-0.3892,negative
191,895,I was minimally torn over whether to return th...,-0.5456,negative
195,895,"Finally a ""swing top"" that doesn't look like a...",-0.4556,negative


In [23]:
# verifying the number of rows
wcr_negative.shape

(544, 4)

In [24]:
# checking the review text in the third row whose sentiment review score is '0.5456'
wcr_negative['Review Text'].iloc[3]

"I was minimally torn over whether to return this but ultimately it's going back because the knit is just too thin. i thought it would be cozy and be of normal sweater weight but it's not. and because it's so light, the swing effect doesn't really come off. nothing special."

In [25]:
# creating a dataframe for review texts (rows) with negative comments
wcr_neutral = wcr.loc[wcr['Sentiment Category'] == 'neutral']

In [26]:
# checking the first five rows
wcr_neutral.head()

Unnamed: 0,Clothing ID,Review Text,Sentiment Review Score,Sentiment Category
22,1077,"First of all, this is not pullover styling. th...",0.1027,neutral
100,861,At first i wasn't sure about it. the neckline ...,0.1106,neutral
104,863,"Runs big and looked unflattering. i am petite,...",0.0,neutral
203,895,Nice weight sweater that allows one to wear le...,0.1999,neutral
204,828,I loved this top; it reminded me of one i have...,0.0258,neutral


In [27]:
# verifying the number of rows
wcr_neutral.shape

(717, 4)

In [28]:
# checking the review text in the second row whose sentiment review score is '0.00'
wcr_neutral['Review Text'].iloc[2]

'Runs big and looked unflattering. i am petite, might work on someone taller.'