In [None]:
import pandas as pd
import nltk
import gzip
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
sns.set()

# Data heavy preprocessing

This section processes the raw data and writes it in a format that can be loaded directly into memory.

In [None]:
def parse(path):
    """
    Lazily yield one record per line of the gzip-compressed file at `path`.

    Every line is evaluated as a Python expression and the resulting object
    is yielded. The file handle is released as soon as the generator is
    exhausted or closed.
    """
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY: eval() runs arbitrary code found in the file. Only use
            # this on trusted dumps; ast.literal_eval is a safer option if the
            # lines are plain literals.
            yield eval(l)
        
def parseUncompressed(path):
    """
    Lazily yield one record per line of the plain (uncompressed) file at `path`.

    Every line is evaluated as a Python expression and the resulting object
    is yielded. The file handle is released as soon as the generator is
    exhausted or closed.
    """
    with open(path, 'rb') as g:
        for l in g:
            # SECURITY: eval() runs arbitrary code found in the file. Only use
            # this on trusted dumps; ast.literal_eval is a safer option if the
            # lines are plain literals.
            yield eval(l)

def getDF(path):
    """
    Load every record produced by parse(path) into a DataFrame.

    Records are numbered 0, 1, 2, ... in the order they are read, and that
    number becomes the row index.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

def metadata_for_category(path, category):
    """
    Read the metadata file and keep only the records of the given category.

    A record is kept when at least one of the lists stored under its
    'categories' key starts with `category`. Each kept record produces
    exactly one row, numbered 0, 1, 2, ... in file order.
    """
    selected = {}
    for record in parseUncompressed(path):
        if 'categories' not in record:
            continue
        # any() stops at the first hit, so a record listed under several
        # paths of the same category is stored once instead of once per path
        # (duplicate rows would multiply reviews in a later merge on asin).
        # The truthiness test skips empty path lists instead of raising.
        if any(path_list and path_list[0] == category
               for path_list in record['categories']):
            selected[len(selected)] = record
    return pd.DataFrame.from_dict(selected, orient='index')

def readSerialized(path):
    """
    Load and return the pickled pandas object stored at `path`.

    NOTE: unpickling can execute arbitrary code; only open trusted files.
    """
    unpickled = pd.read_pickle(path)
    return unpickled


In [None]:
#df = metadata_for_category('data/metadata.json', 'Electronics')
#df.to_pickle('data/metadata_electronics_serialized')

# Data (light) preprocessing

* df stores the raw review data for electronics
* metadata stores the metadata for electronics
* data stores the result of left-merging metadata into df on asin

In [None]:
# Load the pickled review frame and the pickled Electronics metadata frame
df = readSerialized('data/serialized_electronics')
metadata = readSerialized('data/metadata_electronics_serialized')

In [None]:
# Keep only metadata for known brands
metadata = metadata[metadata.brand.notnull()]

# Remove columns which are not needed for our analysis.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
metadata = metadata.drop(['related', 'categories'], axis=1, errors='ignore')

In [None]:
# Preview the first rows of the metadata frame
metadata.head()

In [None]:
# Preview the first rows of the review frame
df.head()

In [None]:
# Left join: every review row is kept, metadata columns are filled in where
# a matching asin exists.
data = pd.merge(df, metadata, on='asin', how='left')

In [None]:
# Total merged rows, then the rows whose title column was filled by the merge
print("Number of items: ", data.shape[0])
print("Number of items with metadata: ", data['title'].notnull().sum())

# Opinion extraction

In [None]:
def getScore(x):
    """
    Ratio x[0] / x[1] of a two-element sequence, or 0 when it does not qualify.

    The ratio is only computed when `x` has exactly two entries and x[1] is
    strictly greater than 2; every other input yields 0.
    (Presumably x is [helpful votes, total votes]; confirm against the data.)
    """
    qualifies = len(x) == 2 and x[1] > 2
    return x[0] / x[1] if qualifies else 0

In [None]:
# Per-review score and text length. Only the needed columns are built:
# the score is computed column-wise on 'helpful' instead of row-wise over the
# whole frame, and the full review frame is no longer copied.
df2 = df[['asin']].assign(
    score=df['helpful'].apply(getScore),
    reviewLength=df['reviewText'].str.len(),
)[['asin', 'score', 'reviewLength']]

In [None]:
# Rank by score, ties broken by review length (both descending), then preview
df2 = df2.sort_values(by=['score', 'reviewLength'], ascending=[False, False])
display(df2.head())

In [None]:
try:
    from pandas.plotting import bootstrap_plot  # location since pandas 0.20
except ImportError:
    from pandas.tools.plotting import bootstrap_plot  # older pandas releases

# Plot a random subset only. The seed makes the figure reproducible and the
# cap keeps the cell working when fewer than 50000 rows are available.
dfplot = df2[['score','reviewLength']].sample(n=min(50000, len(df2)), random_state=0)

dfplot.plot.scatter(y = 'score',x ='reviewLength',style=['o', 'rx'], s=1)
plt.show()

In [None]:
# One text per product, made of its distinct reviews.
# Reviews are joined with a space so the last word of one review and the
# first word of the next are not fused into a single token, and pd.unique
# (unlike set) keeps them in order of appearance, so the text is the same
# on every run.
df2 = df.groupby(["asin"])['reviewText'].agg(lambda x: ' '.join(pd.unique(x))).reset_index()
df2.head()

In [None]:
def filterTags(w1,w2):
    """
    Tell whether the word pair (w1, w2) matches one of the accepted POS patterns.

    Each word is tokenized and tagged on its own with nltk.pos_tag, and the
    tag of its first token is used. Accepted (tag1, tag2) patterns:
      * JJ* followed by NN*   (e.g. "good quality")
      * RB* followed by VBN*  (e.g. "well made")
      * VB* followed by JJ*   (e.g. "work well")
    """
    _, tag1 = nltk.pos_tag(nltk.word_tokenize(w1))[0]
    _, tag2 = nltk.pos_tag(nltk.word_tokenize(w2))[0]

    # Parenthesised instead of backslash-continued: a comment placed after a
    # `\` line continuation is a SyntaxError.
    return (
        (tag1.startswith('JJ') and tag2.startswith('NN'))        # Good quality
        or (tag1.startswith('RB') and tag2.startswith('VBN'))    # Well made
        or (tag1.startswith('VB') and tag2.startswith("JJ"))     # Work well
    )
    
def getBest(text):
    """
    Extract the most frequent opinion-like bigrams of a text.

    The text is tokenized; bigrams containing an English stopword or a word
    shorter than 3 characters are discarded, as are bigrams seen fewer than
    3 times. Of the 3 most common remaining bigrams, only those accepted by
    filterTags are kept.

    Returns a non-empty list of ((word1, word2), count) pairs, or np.nan when
    no bigram survives.
    """
    # A set gives constant-time membership tests; the filter below probes the
    # stopwords once per token.
    ignored_words = set(nltk.corpus.stopwords.words('english'))
    word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

    tokens = nltk.word_tokenize(text)

    finder = nltk.BigramCollocationFinder.from_words(tokens)
    finder.apply_word_filter(word_filter)
    finder.apply_freq_filter(3)
    res = finder.ngram_fd.most_common(3)

    res = [x for x in res if filterTags(x[0][0],x[0][1])]

    return res if res else np.nan

In [None]:
from timeit import default_timer as timer

# One text per product, made of its distinct reviews. A space separator keeps
# words of adjacent reviews from being fused, and pd.unique (unlike set)
# keeps the reviews in order of appearance.
dfProduct = df.groupby(["asin"])['reviewText'].agg(lambda x: ' '.join(pd.unique(x))).reset_index()
# Seeded, size-capped sample: reproducible and safe with fewer than 2000 products
dfProduct = dfProduct.sample(n=min(2000, len(dfProduct)), random_state=0)

# Time the bigram extraction over the sample
start = timer()
dfProduct["reviewText"] = dfProduct["reviewText"].apply(getBest)
end = timer()
print(end - start)

In [None]:
# Drop every row that has a missing value in any column, then display the frame
dfProduct = dfProduct.dropna(how = 'any')
dfProduct

In [None]:
#df2['Best'] = df2['reviewText'].apply(lambda x: x[0][0])
#df2['Second'] = df2['reviewText'].apply(lambda x: x[1][0])
#df2['Third'] = df2['reviewText'].apply(lambda x: x[2][0])

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk import word_tokenize

In [None]:
class SentimentAnalyser:
    """
    Scores raw sentences by summing per-word SentiWordNet scores.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def _penn_to_wn(self, tag):
        """
        Map a Penn Treebank tag to the matching WordNet POS constant.
        Returns None when the tag starts with none of J, N, R or V.
        """
        prefix_to_pos = (
            ('J', wn.ADJ),
            ('N', wn.NOUN),
            ('R', wn.ADV),
            ('V', wn.VERB),
        )
        for prefix, wordnet_pos in prefix_to_pos:
            if tag.startswith(prefix):
                return wordnet_pos
        return None

    def sentiment_for_tagged_word(self, tagged_word):
        """
        Score a single (word, Penn Treebank tag) pair.
        Only nouns, adjectives and adverbs are scored, using the first WordNet
        synset of the lemmatized word.
        Returns a tuple (positive score, negative score), or None when the
        word is skipped or has no lemma / synset.
        """
        token, penn_tag = tagged_word
        pos = self._penn_to_wn(penn_tag)

        if pos not in (wn.NOUN, wn.ADJ, wn.ADV):
            return None

        base_form = self.lemmatizer.lemmatize(token, pos=pos)
        candidates = wn.synsets(base_form, pos=pos) if base_form else None
        if not candidates:
            return None

        # The first synset returned is the one that gets scored
        senti = swn.senti_synset(candidates[0].name())
        return senti.pos_score(), senti.neg_score()

    def sentiment_score_for_raw_sentence(self, raw_sentence):
        """
        Tokenize and POS-tag the sentence, then add up (positive - negative)
        over every word that receives a score.
        """
        tagged = nltk.pos_tag(word_tokenize(raw_sentence))
        total = 0

        for scores in map(self.sentiment_for_tagged_word, tagged):
            if scores is not None:
                positive, negative = scores
                total += (positive - negative)

        return total

In [None]:
# Quick check of the analyser on a short phrase; the bare `a` displays the score
s = SentimentAnalyser()
a = s.sentiment_score_for_raw_sentence("low price")
a

In [None]:
raw_sentence = "Dankest object in the whole fucking world"
tagged_sentence = nltk.pos_tag(word_tokenize(raw_sentence))

# The tag conversion and the lemmatizer only exist on SentimentAnalyser;
# the bare names penn_to_wn / lemmatizer raise NameError on a fresh kernel,
# so go through an instance.
analyser = SentimentAnalyser()

for word, tag in tagged_sentence:
    wn_tag = analyser._penn_to_wn(tag)
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        continue

    lemma = analyser.lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        continue

    synsets = wn.synsets(lemma, pos=wn_tag)
    if not synsets:
        continue

    # Take the first sense, the most common
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    print(swn_synset)
    print(swn_synset.pos_score())
    print(swn_synset.neg_score())


In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# Print the VADER polarity scores of two sample phrases
sentences = [
    "low price",
    "the option to pass keyword arguments to the underlying matplotlib plotting method.",
]
sid = SIA()
for sentence in sentences:
    ss = sid.polarity_scores(sentence)
    print(ss)

In [None]:
import random

# Pair every review text with its `overall` value and shuffle with a fixed
# seed so that the split below (and any score computed from it) is reproducible.
sentiment_data = list(zip(df["reviewText"], df["overall"]))
random.Random(42).shuffle(sentiment_data)

l = 200000

# The first l shuffled reviews are used for training
train_X, train_y = zip(*sentiment_data[:l])

# The next l/10 reviews are used for testing
# (the test set is 10% of the training-set size, not a 80/20 split)
test_X, test_y = zip(*sentiment_data[l:l + l // 10])

In [None]:
from nltk import word_tokenize
from nltk.sentiment.util import mark_negation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.base import TransformerMixin

# Bag-of-bigrams features fed to a linear SVM.
# NOTE(review): passing a custom `preprocessor` replaces CountVectorizer's
# built-in preprocessing, so the text is presumably not lowercased here;
# confirm this is intended.
bigram_clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer(
        analyzer="word",
        ngram_range=(2, 2),
        tokenizer=word_tokenize,
        # tokenizer=lambda text: mark_negation(word_tokenize(text)),
        preprocessor=lambda text: text.replace("<br />", " "),
    )),
    ('classifier', LinearSVC(verbose=True)),
])
bigram_clf.fit(train_X, train_y)
bigram_clf.score(test_X, test_y)
# with mark_negation 0.86760000000000004
# without mark_negation 0.87119999999999997

In [None]:
# Predicted label for a short unseen phrase
bigram_clf.predict(["bad laptop"])[0]