In [None]:
import pandas as pd
import nltk
import gzip
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
import random 
from pylab import rcParams

# Default figure size (width, height) in inches for the plots below.
rcParams['figure.figsize'] = 15, 10
# Seed NumPy's global RNG so that calls relying on it (e.g. sampling without an
# explicit random_state) are repeatable when the notebook is run top to bottom.
np.random.seed(42)

# Introduction

In this notebook, we will explore opinion mining and sentiment analysis through the use of natural language processing. Our dataset consists of reviews of electronic products scraped from Amazon and available here: http://jmcauley.ucsd.edu/data/amazon/

The objective of this analysis is to use the reviews of a product to extract meaningful concepts about it. Such concepts can then be used to help the user make faster and more informed decisions when shopping. The extracted concepts consist of pairs of words that describe a property of the product. Those pairs are either of the type adjective-noun or adverb-past participle.

For example:

* Adjective - Noun pairs such as (good, quality), (low, price), (best, deal)
* Adverb - Past Participle pairs such as (well, made), (poorly, assembled)

The concepts can then be categorised into positive and negative opinions. 

## Table of Contents

<ol>
<li> <a href="#Data-Overview">Data Overview</a>
    <ol>
        <li><a href="#Reading-the-data">Reading the data</a></li>
        <li><a href="#Formats">Formats</a></li>
        <li><a href="#Missing-values">Missing values</a></li>
        <li><a href="#Distributions">Distributions</a></li>
    </ol>
</li>
<p></p>
<li> <a href="#Characteristic-Exctraction">Characteristic extraction</a>
<ol>
        <li><a href="#Data-Overview">Data Overview</a></li>
    </ol>
</li>
<p></p>
<li><a href="#Sentiment-Learning">Sentiment Analysis</a>
<ol>
        <li><a href="#SentiWordNet">SentiWordNet</a></li>
        <li><a href="#Machine-Learning">Machine Learning</a></li>
    </ol>
</li>
<p></p>
<li><a href="#Application-to-the-products">Applications</a>
<ol>
        <li><a href="#Brand-Rating">Brand Rating</a></li>
    </ol>
</li>
</ol>

# Reading the data 
**<a href="#Table-of-Contents">Table of contents</a>**

# Data Overview

In [None]:
def parse(path):
    """Lazily yield one record per line of a gzip-compressed text file.

    Every line is expected to hold a single Python literal (in this notebook,
    a dict describing one review or one product).

    Parameters
    ----------
    path : str or path-like
        Location of the gzip file.

    Yields
    ------
    object
        The value of each line, parsed as a Python literal.

    Raises
    ------
    ValueError, SyntaxError
        If a line is not a valid Python literal.
    """
    import ast  # local import so the function does not rely on another cell

    # SECURITY: lines used to be passed to eval(), which executes whatever
    # expression the file contains. literal_eval accepts only literals
    # (dicts, lists, strings, numbers, ...) and rejects everything else.
    # The context manager closes the file even when the consumer stops
    # iterating early or a line fails to parse.
    with gzip.open(path, 'rt', encoding='utf-8') as g:
        for l in g:
            yield ast.literal_eval(l)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in the order they are yielded, and
    those numbers become the row index of the returned frame.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

def readSerialized(path):
    """Return the object stored in the pickle file at ``path``.

    Thin wrapper around ``pd.read_pickle``. Unpickling can execute code
    embedded in the file, so only use it on files from a trusted source.
    """
    loaded = pd.read_pickle(path)
    return loaded

In [None]:
# Load the pickled reviews table (path is relative to the working directory).
# Later cells read its 'asin', 'reviewText', 'reviewTime' and 'overall' columns.
df_elec = readSerialized('serialized_electronics')

In [None]:
# Load the pickled product metadata table (path is relative to the working directory).
# Later cells read its 'asin', 'brand', 'title' and 'description' columns.
df_meta = readSerialized('metadata_electronics_serialized')

### Formats

In [None]:
# Convert the review dates from strings in 'month day, year' form
# (format '%m %d, %Y') to pandas datetimes.
df_elec['reviewTime']  = pd.to_datetime(df_elec['reviewTime'],format='%m %d, %Y')
# Display the column dtypes after the conversion.
df_elec.dtypes

In [None]:
# Keep only the products whose brand is known.
df_meta = df_meta.dropna(subset=['brand'])
# Display the column dtypes of the remaining metadata.
df_meta.dtypes

### Missing values

In [None]:
# Attach the product metadata to every review. The inner join keeps only
# the ASINs that appear in both tables.
df_merged = pd.merge(df_meta, df_elec, on='asin', how='inner')
df_merged['reviewLength'] = df_merged['reviewText'].str.len()

# Frame used for plotting: rows where both price and review length are known.
df_plot = df_merged.dropna(subset=['price', 'reviewLength'])

### Distributions

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots (histograms on the diagonal) of price, review length
# and rating, drawn on a fixed-seed sample of 100000 rows.
# NOTE: sample() raises if df_plot holds fewer than 100000 rows.
scatter_matrix(df_plot[['price','reviewLength','overall']].sample(100000,random_state = 0), alpha=0.2, figsize=(15, 15), diagonal='hist')
plt.show()

In [None]:
# Review length against price, with a 5th-degree polynomial fit (order=5)
# drawn in red over small scatter markers.
sns.regplot(y='reviewLength', x='price', data=df_plot,scatter_kws={'s':5},line_kws={'color':'r'},order=5)
plt.show()

In [None]:
# Horizontal box plots of review length, one box per 'overall' value;
# outliers are hidden (showfliers=False).
sns.boxplot(y='overall', x='reviewLength', data=df_plot, orient='h', showfliers=False)
plt.show()

In [None]:
# 2x2 grid: one row per variable, box plot on the left and kernel density
# estimate on the right.
f, axes = plt.subplots(2, 2, figsize=(15, 15))
for row, column in enumerate(['price', 'reviewLength']):
    sns.boxplot(data=df_plot[[column]], orient='h', showfliers=False, ax=axes[row, 0])
    df_plot[[column]].plot.kde(ax=axes[row, 1])

plt.show()

In [None]:
import scipy
import scipy.stats

# Overlay a hand-tuned exponential density on the kernel density estimate
# of the product prices.
d = df_plot[['price']]
ax = d.plot.kde()

# Read the KDE curve back through the public Line2D accessors instead of the
# private _x/_y attributes.
kde_line = ax.get_lines()[0]
x = kde_line.get_xdata()
y = kde_line.get_ydata()

print(d.mean())

scale = 70   # scale of the exponential (its mean, i.e. 1 / rate)
loc = -500   # horizontal shift: the density starts at x = loc

pdf_fitted = scipy.stats.expon.pdf(x - loc, scale=scale)
# Plot against x so that the fitted curve shares the KDE's horizontal axis.
# Without x, matplotlib would draw it against the array index 0..len(x)-1.
ax.plot(x, pdf_fitted, label=('exp scale = ' + str(scale)))
ax.legend(loc='best')
plt.show()

In [None]:
import scipy
import scipy.stats

# Overlay a hand-tuned Pareto density on the kernel density estimate of the
# review lengths. The KDE is computed on a fixed-seed sample, capped at the
# number of available rows so that small frames do not make sample() fail.
d = df_plot[['reviewLength']].sample(min(100000, len(df_plot)), random_state=0)
ax = d.plot.kde()

# Read the KDE curve back through the public Line2D accessors instead of the
# private _x/_y attributes.
kde_line = ax.get_lines()[0]
x = kde_line.get_xdata()
y = kde_line.get_ydata()

scale = 1000   # Pareto scale parameter
loc = -500     # horizontal shift of the distribution

pdf_fitted = scipy.stats.pareto.pdf(x,loc=loc,b = 2, scale=scale)
# Plot against x so that the fitted curve shares the KDE's horizontal axis.
# Without x, matplotlib would draw it against the array index 0..len(x)-1.
ax.plot(x, pdf_fitted, label='pareto (b = 2)')
ax.legend(loc='best')
plt.show()

### Correlations

In [None]:
# Pairwise correlation matrix (DataFrame.corr with its default method) of
# price, review length and the 'overall' rating.
df_merged[['price','reviewLength','overall']].corr()

# Characteristic Exctraction

In [None]:
def filterTags(w1,w2):
    """Tell whether (w1, w2) is an adjective-noun or adverb-participle pair.

    Each word is tokenised and POS-tagged on its own, and the tag of its
    first token is used. The pair is accepted when the tags are
    ('JJ', 'NN') or ('RB', 'VBN').

    Parameters
    ----------
    w1, w2 : str
        First and second word of the bigram.

    Returns
    -------
    bool
        True for an accepted tag pair, False otherwise.
    """
    def first_tag(word):
        # POS tag of the first token NLTK produces for `word`.
        return nltk.pos_tag(nltk.word_tokenize(word))[0][1]

    # Tag each word at most once. The tagger is the expensive part, and the
    # earlier one-expression form could tag w1 twice and w2 twice.
    # w2 is still only tagged when w1's tag makes a match possible.
    tag1 = first_tag(w1)
    if tag1 == 'JJ':
        return first_tag(w2) == 'NN'
    if tag1 == 'RB':
        return first_tag(w2) == 'VBN'
    return False

def getBest(text):
    """Return the most frequent bigram of `text` if it is an opinion pair.

    Bigrams containing a stop word or a token shorter than three characters
    are discarded. The single most frequent remaining bigram is kept only if
    ``filterTags`` accepts its two words.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    list or float
        ``[((word1, word2), count)]`` when the top bigram is accepted,
        otherwise ``np.nan``.
    """
    # A set gives constant-time stop-word lookups; the filter below runs for
    # every token of every bigram.
    ignored_words = set(nltk.corpus.stopwords.words('english'))
    word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

    tokens = nltk.word_tokenize(text)

    finder = nltk.BigramCollocationFinder.from_words(tokens)
    finder.apply_word_filter(word_filter)
    # A minimum frequency of 1 removes nothing; raise it to require
    # bigrams that occur several times.
    finder.apply_freq_filter(1)
    # Only the single most frequent surviving bigram is considered.
    res = finder.ngram_fd.most_common(1)

    res = [x for x in res if filterTags(x[0][0],x[0][1])]

    if res:
        return res
    return np.nan

In [None]:
from timeit import default_timer as timer

# Build one document per product from its distinct reviews.
# - Reviews are joined with a space so that the last word of one review and
#   the first word of the next are not fused into a single token.
# - dropna() skips missing reviews, which str.join cannot handle.
# - unique() keeps first-appearance order, so the document is the same on
#   every run (iterating a set of strings is not).
df_product = (
    df_elec.groupby(["asin"])['reviewText']
    .agg(lambda x: ' '.join(x.dropna().astype(str).unique()))
    .reset_index()
)
# Work on at most 10000 products; the cap keeps sample() from raising on
# smaller inputs and the fixed seed makes the cell repeatable.
df_product = df_product.sample(n=min(10000, len(df_product)), random_state=0)

# Replace each document by its extracted bigram (or NaN) and time the step.
start = timer()
df_product["reviewText"] = df_product["reviewText"].apply(getBest)
end = timer()
print(end - start)

In [None]:
# Drop the rows that contain any missing value, which removes the products
# for which getBest returned NaN.
df_product = df_product.dropna(how = 'any')
# Display the remaining products.
df_product

# Sentiment Learning

In [None]:
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from nltk.sentiment.util import mark_negation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.base import TransformerMixin

# Training data: reviews shorter than 100 characters. Missing reviews have a
# NaN length, which compares False, so they are excluded here as well.
df_ML = df_elec[(df_elec["reviewText"].str.len()<100)].copy()
# Label from the filtered frame itself (ratings below 3 are negative, all
# others positive) rather than computing labels for every row of df_elec
# and relying on index alignment to discard most of them.
df_ML['score'] = np.where(df_ML["overall"] < 3, -1, 1)
df_ML["reviewText"] = df_ML["reviewText"].str.lower()

df_0 = (df_ML[df_ML["score"] == -1])
df_1 = (df_ML[df_ML["score"] == 1])

# Balance the classes by down-sampling the larger one. Explicit seeds make
# this cell give the same data every time it is run, not only on the first
# run after np.random.seed.
if df_0.shape[0] > df_1.shape[0]:
    df_0 = df_0.sample(df_1.shape[0], random_state=42)
else:
    df_1 = df_1.sample(df_0.shape[0], random_state=42)

sentiment_data = pd.concat([df_0, df_1])
X = sentiment_data["reviewText"]
y = sentiment_data["score"]

# Hold out 10% of the balanced data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1, random_state=42)


print(X_train.shape)
print(X_test.shape)

In [None]:
# Bag of unigrams and bigrams over NLTK word tokens. The custom preprocessor
# only replaces HTML line breaks with a space.
# Alternative, negation-aware tokenizer that was tried:
#   tokenizer=lambda text: mark_negation(word_tokenize(text))
vectorizer = CountVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    tokenizer=word_tokenize,
    preprocessor=lambda text: text.replace("<br />", " "),
)
# Multi-layer perceptron with default settings; verbose prints training progress.
classifier = MLPClassifier(verbose=True)

bigram_clf = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', classifier),
])
bigram_clf.fit(X_train, y_train)
# Score on the held-out reviews, shown as the cell output.
bigram_clf.score(X_test, y_test)

In [None]:
# Hand-labelled phrases used as a quick sanity check of the classifier:
# 1 marks a positive phrase, -1 a negative one.
labelled_phrases = [
    ("low price", 1),
    ("expensive", -1),
    ("cheap", -1),
    ("high quality", 1),
    ("low quality", -1),
    ("well made", 1),
    ("poorly made", -1),
    ("good product", 1),
]
X = [phrase for phrase, _label in labelled_phrases]
y = [label for _phrase, label in labelled_phrases]

# Score on the phrases, followed by the individual predictions.
print(bigram_clf.score(X,y))
print(bigram_clf.predict(X))

# Application to the products

In [None]:
def isInDescription(text,description,title):
    """Keep the bigrams found in neither the description nor the title.

    Matching is case-insensitive on the phrase "word1 word2"; `description`
    and `title` are converted with ``str()`` first.

    Parameters
    ----------
    text : iterable of ((str, str), count)
        Bigram entries to check.
    description, title : object
        Product description and title.

    Returns
    -------
    list or float
        The entries that were kept, or ``np.nan`` when none remain.
    """
    description_lc = str(description).lower()
    title_lc = str(title).lower()

    def is_absent(entry):
        (first, second), _count = entry
        phrase = (first + ' ' + second).lower()
        return phrase not in description_lc and phrase not in title_lc

    kept = [entry for entry in text if is_absent(entry)]
    return kept if kept else np.nan
    
def getScore(text):
    """Sum the predicted sentiment of the bigrams, weighted by their counts.

    Parameters
    ----------
    text : sequence of ((str, str), count)
        Bigram entries; each bigram is scored as the lower-cased phrase
        "word1 word2" by ``bigram_clf``.

    Returns
    -------
    int or float
        Sum of ``prediction * count`` over the entries as a plain int, or
        ``np.nan`` when `text` is empty.
    """
    if len(text) == 0:
        return np.nan

    phrases = [(a + ' ' + b).lower() for ((a, b), num) in text]
    counts = [int(num) for ((a, b), num) in text]

    # One batched predict call instead of one call per bigram.
    predictions = bigram_clf.predict(phrases)

    # Return a scalar. Adding predict()'s array to the running total used to
    # turn the result into a 1-element ndarray, which is awkward to store in
    # a column, sum per brand and sort.
    return int(sum(int(p) * c for p, c in zip(predictions, counts)))

def getText(text):
    """Render the bigrams as one lower-cased, comma-separated string.

    Parameters
    ----------
    text : iterable of ((str, str), count)
        Bigram entries.

    Returns
    -------
    str
        "word1 word2" for each entry, joined with ", ". A single entry gives
        just its phrase and an empty input gives "".
    """
    # Join with a separator: plain concatenation fused consecutive phrases
    # ("good qualitylow price").
    return ', '.join((a + ' ' + b).lower() for ((a, b), num) in text)

In [None]:
# Attach the product metadata (brand, title, description, ...) to the
# extracted bigrams.
df_final = df_product.merge(df_meta, on='asin', how='inner')

# Drop the bigrams that merely repeat the product's own title or description.
df_final['reviewText'] = df_final.apply(lambda x : isInDescription(x['reviewText'],x['description'],x['title']), axis=1)
# .copy() makes the filtered frame independent, so the column assignments
# below do not write to a slice (SettingWithCopyWarning).
df_final = df_final[pd.notnull(df_final['reviewText'])].copy()

# Both helpers only need the 'reviewText' value, so apply them to that
# column directly instead of row by row.
df_final["word"] = df_final['reviewText'].apply(getText)
df_final['score'] = df_final['reviewText'].apply(getScore)

In [None]:
# Show the per-product result: extracted bigrams, their text form and score.
display(df_final)

In [None]:
# Aggregate per brand: total score and the distinct phrases of its products.
# Each column is aggregated explicitly and then assembled, so the result
# does not depend on the name pandas gives to a lambda aggregation
# ('<lambda>') or on MultiIndex column handling.
brand_groups = df_final[['brand','word','score']].groupby('brand')
df_brand = pd.DataFrame(
    {
        'score': brand_groups['score'].sum(),
        # sorted() makes the phrase order the same on every run; iterating
        # a set of strings is not.
        'word': brand_groups['word'].agg(lambda x: ' / '.join(sorted(set(x)))),
    },
    columns=['score', 'word'],
).reset_index()

In [None]:
# Show the brands ordered from highest to lowest total score.
display(df_brand.sort_values('score',ascending=False))