# UNDERSTANDING SENTIMENTS IN EPINIONS

Data Mining Project

## Libraries

In [180]:
# Load required libraries 

import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
import string
from string import digits
import re
from bs4 import BeautifulSoup

## Data Loading

In [2]:
# Set path of the epinions file

# Pass each path component separately so os.path.join inserts the separator of
# the host OS; a hard-coded backslash only resolves on Windows and '\D' is an
# invalid string escape.
filepath = os.path.join('..', 'Datasets', 'D1.txt')

In [3]:
# Load data from file

# Read the whole file into memory as a single string; the with-block closes
# the file handle as soon as the read is done.
with open(filepath) as myfile:
    data = myfile.read()

In [4]:
# Parse the raw file contents into a BeautifulSoup tree using the 'lxml' parser
soup = BeautifulSoup(data, 'lxml')

In [5]:
# Sanity check: show the type of the parsed object.
# Parenthesised single-argument print gives the same output on Python 2 and
# is also valid Python 3.
print(type(soup))

<class 'bs4.BeautifulSoup'>


In [6]:
# Preview the first 199 characters of the pretty-printed tree to inspect the
# tag structure of a document.
# Parenthesised single-argument print gives the same output on Python 2 and
# is also valid Python 3.
print(soup.prettify()[:199])

<html>
 <body>
  <doc>
   <docid>
    36
   </docid>
   <product>
    Digital_Cameras/Casio_QV_4000_4_0_Megapixel_Digital_Camera_EX_Plus_Outfit__Camera_QV4000EXK
   </product>
   <label>
    Yes
    


In [7]:
# Collect every <doc> element found in the parsed tree
documents = soup.find_all('doc')

In [8]:
len(documents)

12000

In [9]:
# Start from an empty DataFrame; `tab` will hold the table of parsed reviews
tab = pd.DataFrame()

In [10]:
# for idx, doc in enumerate(documents[:5]):
#     print str(doc.docid.contents[0]).strip('\n').strip(' ')
#     print str(doc.product.contents[0]).strip('\n').strip(' ')
#     print  str(doc.label.contents[0]).strip('\n').strip(' ')
#     print  str(doc.rating.contents[0]).strip('\n').strip(' ')
#     print  str(doc.url.contents[0]).strip('\n').strip(' ')
#     print  str(doc.find('class').contents[0]).strip('\n').strip(' ')
#     print  doc.find('text').contents[0].strip('\n').strip(' ')

In [11]:
# Build the review table in one shot. Collecting plain dicts and constructing
# the DataFrame once avoids growing the frame cell by cell with .loc, which
# re-allocates the frame on every enlargement and is very slow for thousands
# of documents.
rows = []
for doc in documents:
    rows.append({
        # Each field is the tag's first child with surrounding newlines, then
        # surrounding spaces, removed.
        'DOCID': str(doc.docid.contents[0]).strip('\n').strip(' '),
        'PRODUCT': str(doc.product.contents[0]).strip('\n').strip(' '),
        'LABEL': str(doc.label.contents[0]).strip('\n').strip(' '),
        'RATING': str(doc.rating.contents[0]).strip('\n').strip(' '),
        'URL': str(doc.url.contents[0]).strip('\n').strip(' '),
        # 'class' and 'text' are looked up with find() rather than attribute
        # access, as in the original cell.
        'CLASS': str(doc.find('class').contents[0]).strip('\n').strip(' '),
        # The review body is deliberately not passed through str().
        'TEXT': doc.find('text').contents[0].strip('\n').strip(' '),
    })

# Explicit column list keeps the original column order, with TEXT last.
tab = pd.DataFrame(
    rows,
    columns=['DOCID', 'PRODUCT', 'LABEL', 'RATING', 'URL', 'CLASS', 'TEXT'],
)

In [12]:
tab.tail()

Unnamed: 0,DOCID,PRODUCT,LABEL,RATING,URL,CLASS,TEXT
11995,36432,auto_Make/auto_Make-1994_Ford_Probe,Yes,5.0,http://www.epinions.com/content_110045859460,Auto,"I just traded-in my Ford Probe GT, auto. Coupe..."
11996,36437,auto_Make/auto_Make-1987_Dodge_Diplomat,Yes,5.0,http://www.epinions.com/content_38862753412,Auto,In my younger days.driving a cool car was a ...
11997,36442,auto_Make/auto_Make-2002_Nissan_Altima,Yes,4.0,http://www.epinions.com/content_46086590084,Auto,"I bought the 2.5S w/floor mats, in-cabin micro..."
11998,36447,auto_Make/auto_Make-2002_Nissan_Altima,Yes,5.0,http://www.epinions.com/content_63460839044,Auto,I got this car about two weeks ago. I have the...
11999,36452,auto_Make/auto_Make-2002_Nissan_Altima,No,4.0,http://www.epinions.com/content_88014163588,Auto,"The contestants:\n2003 Mazda 6s, 5-Speed Manua..."


In [13]:
set(tab.CLASS)

{'Auto', 'Camera'}

In [14]:
set(tab.RATING)

{'1.0', '2.0', '3.0', '4.0', '5.0'}

In [15]:
set(tab.LABEL)

{'No', 'Yes'}

In [16]:
# Keep only the rows whose CLASS column equals 'Auto'
auto = tab.loc[tab['CLASS'] == 'Auto']

In [17]:
# Keep only the rows whose CLASS column equals 'Camera'
camera = tab.loc[tab['CLASS'] == 'Camera']

In [18]:
len(auto[auto.LABEL == 'Yes'])

5105

In [19]:
len(auto[auto.LABEL == 'No'])

895

In [20]:
len(camera[camera.LABEL == 'Yes'])

5274

In [21]:
len(camera[camera.LABEL == 'No'])

726

In [22]:
tab.TEXT[0]

u"I stumbled upon this camera after much research.  It's hard to understand why this camera is so hard to find; when every pro review site has given it very high marks.  I was considering the Minolta Dimage S404, Canon G2, Canon S40, Olympus D-40, Pentax Optio 430 before I discovered the Casio.  All of these camera's received good reviews, but only the Canon G2 was higher than the Casio.  However, the casio uses the exact same lens as the Canon G2.  The Casio is a great camera.  When you first take it out of the box, you realize that the camera is built very well.  Everything is laid out nicely.  It comes with a neck strap that is very comfortable.  I have used a 2.1 mgpxl camera for the last year and a half.  It was a good camera, but seemed to always be in the camera bag when I wanted to take a picture.  On our last trip to Disneyland, I missed a lot of good shots.  This camera has an excellent lens (all glass) and quality CCD (Sony).  Together they produce great pictures.  I have th

## Data Pre-processing

The following function cleans and tokenizes raw review text using the nltk package. It removes numbers, punctuation and English stopwords, converts the text to lower case, and lemmatizes the remaining words.

In [226]:
def process(text, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """
    Clean, tokenize and lemmatize the text of one review.

    Digits are blanked out, the text is lower-cased, the 0x92 character,
    apostrophes and all string.punctuation characters are removed, and the
    result is split with nltk.word_tokenize. Tokens found in nltk's English
    stopword list are dropped and the remaining ones are lemmatized.

    input:
        text: string of one review
        lemmatizer: object exposing lemmatize(word); default is
                    nltk.stem.wordnet.WordNetLemmatizer()
    output:
        result: list of lemmatized, non-stopword tokens in their original order
    """

    # Remove numbers (every digit becomes a space)
    text = re.sub(r'\d', ' ', text)

    # Normalizes case
    text = text.lower()

    # remove bad characters
    text = re.sub(r'[\x92]', " ", text)

    # remove apostrophes, delete 's
    text = text.replace('\'s', '')
    text = text.replace('\'', '')

    # Remove punctuations
    for char in string.punctuation:
        text = text.replace(char, ' ')

    # Fetch the stopword list once per call and keep it as a set, instead of
    # calling stopwords.words() and scanning the resulting list for every token.
    stop_words = set(stopwords.words('english'))

    lemmatized = []

    # Tokenize text
    for word in nltk.word_tokenize(text):
        # Remove stopwords
        if word in stop_words:
            continue
        try:
            # lemmatize word
            lemmatized.append(lemmatizer.lemmatize(word))
        except Exception:
            # Deliberate best-effort: a token the lemmatizer cannot handle is
            # skipped rather than aborting the whole review.
            continue
    return lemmatized

In [227]:
# Quick sanity check of process() on a short sample sentence.
text = "This is a sample test input for 's processing loves."
# Parenthesised single-argument print gives the same output on Python 2 and
# is also valid Python 3.
print(process(text))

['sample', 'test', 'input', 'processing', u'love']


In [228]:
def process_all(df, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """
    Apply process() to the review text of every row of a dataframe.

    Inputs
        df: pd.DataFrame: dataframe whose last column holds the raw review text
        lemmatizer: forwarded to process(); the default argument is of type
                    nltk.stem.wordnet.WordNetLemmatizer
    Outputs
        pd.DataFrame: copy of df in which the 'TEXT' column holds, for each row,
                        the list(str) returned by process() for that row's text.
                        Other columns are unaffected and df itself is not modified.
    """
    df_copy = df.copy()

    # The raw text is read positionally from the last column. The caller's
    # lemmatizer is forwarded so a non-default one is actually used.
    temp = [process(text, lemmatizer) for text in df_copy.iloc[:, -1]]

    # Wrap the token lists in an object Series aligned on the frame's own index
    # so that each row receives exactly one list, whatever the list lengths are.
    df_copy['TEXT'] = pd.Series(temp, index=df_copy.index, dtype=object)

    return df_copy

In [229]:
# Pre-process only the first 100 camera reviews
processed = process_all(camera.head(100))

In [230]:
processed.TEXT[2]

[u'ah',
 u'finally',
 u'great',
 u'little',
 u'camera',
 u'beginner',
 u'easy',
 u'swallow',
 u'price',
 u'one',
 u'many',
 u'digital',
 u'camera',
 u'series',
 u'reviewing',
 u'april',
 u'feel',
 u'highly',
 u'qualified',
 u'review',
 u'digital',
 u'camera',
 u'worked',
 u'three',
 u'year',
 u'use',
 u'daily',
 u'basis',
 u'need',
 u'info',
 u'specific',
 u'camera',
 u'feel',
 u'free',
 u'email',
 u'help',
 u'best']