In [None]:
import ast
import gzip
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, HTML
from nltk.corpus import wordnet as wn
sns.set()

In [None]:
def parse(path):
    """
    Lazily yield one record per non-blank line of a gzip-compressed dump.

    Each line is expected to hold a single Python-literal or JSON object.

    path: location of the ``.gz`` file to read.
    Yields: the decoded object of each line, in file order.
    Raises: ValueError if a line is neither a Python literal nor valid JSON.
    """
    # Text mode gives `str` lines; the context manager closes the file even
    # when the generator is abandoned or a line fails to parse.
    with gzip.open(path, 'rt', encoding='utf-8') as g:
        for l in g:
            text = l.strip()
            if not text:
                continue
            try:
                # literal_eval only accepts literals, so a crafted line cannot
                # run code the way eval() would.
                record = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                # Strict-JSON tokens (true / false / null) are not Python literals
                record = json.loads(text)
            yield record
            
def getDF(path):
    """
    Load every record produced by ``parse(path)`` into a DataFrame.

    Rows are labelled 0, 1, 2, ... in the order the records are yielded.
    """
    # Number the records as they stream in: {row position: record}
    records_by_row = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_row, orient='index')

In [None]:
# Parse the gzipped review dump into a DataFrame (one row per parsed record)
df = getDF('data/reviews_Electronics_5.json.gz')

In [None]:
# Cache the parsed reviews so later sessions can skip the slow parse.
# This must be the same path that pd.read_pickle loads below.
df.to_pickle("data/serialized_electronics")

In [None]:
# Reload the cached DataFrame from disk.
# NOTE(review): unpickling can execute arbitrary code; only load files you produced yourself.
df = pd.read_pickle("data/serialized_electronics")

In [None]:
# Preview the first rows to check which columns were loaded
df.head()

## Long reviews

In [None]:
# Compute the length of the review
df['length'] = df.reviewText.str.len()

# Split the two-element `helpful` list into two separate columns.
# Guarded so that re-running this cell after `helpful` has been dropped is a
# no-op, and index-aligned so the values land on the right rows even if `df`
# does not carry a default 0..n-1 index.
if 'helpful' in df.columns:
    df[['helpful_yes', 'helpful_total']] = pd.DataFrame(
        df.helpful.values.tolist(), index=df.index
    )
    df = df.drop('helpful', axis=1)

# Compute explicitly the number of "not helpful" votes
df['helpful_no'] = df.helpful_total - df.helpful_yes

df.head()

### Length as a function of overall rating

In [None]:
# Horizontal box plot of review length per overall rating; rows whose length
# is exactly 0 are left out and outlier markers are hidden.
has_length = df['length'] != 0
sns.boxplot(
    data=df[has_length],
    x='length',
    y='overall',
    orient='h',
    showfliers=False,
)
plt.show()

### Helpfulness as a function of length

In [None]:
# Row(s) holding the highest `helpful_yes` count in the data set
top_helpful_yes = df['helpful_yes'].max()
df[df['helpful_yes'] == top_helpful_yes]

In [None]:
# Keep only reviews with more than 10000 total votes, then plot
# "not helpful" votes (x) against "helpful" votes (y).
many_votes = df['helpful_total'] > 10000
df_rated = df[many_votes]
plt.scatter(df_rated['helpful_no'], df_rated['helpful_yes'])
plt.show()

In [None]:
# Pairwise correlation between the numeric columns.
# The frame also holds text columns (reviewText, asin, ...); DataFrame.corr()
# raises on those in pandas >= 2.0, so select the numeric columns explicitly.
df.select_dtypes(include=['number', 'bool']).corr()

## Group by product

In [None]:
# value_counts() sorts by descending frequency, so the first label is the
# product id (asin) with the most reviews.
reviews_per_product = df['asin'].value_counts()
most_reviewed_id = reviews_per_product.index[0]

# All reviews written for that product
sample = df[df['asin'] == most_reviewed_id]
sample

In [None]:
# Distribution of review lengths, ignoring empty reviews
lengths = df['reviewText'].str.len()
lengths = lengths[lengths > 0]
print(lengths.describe())

# Pass the data by keyword: the meaning of the first positional argument of
# violinplot differs between seaborn versions.
sns.violinplot(x=lengths)
plt.xlabel('Review length')
plt.show()

# Survival-style curve: the i-th shortest review (0-based) gets
# y = (n - 1 - i) / n, i.e. the share of reviews ranked after it.
lengths_sorted = lengths.sort_values()
y = np.arange(len(lengths_sorted))[::-1] / len(lengths_sorted)
plt.plot(lengths_sorted, y)
plt.xlabel('Review length')
plt.ylabel('Share of reviews that are longer')
plt.show()

# Natural language processing

In [2]:
import nltk


In [None]:
from nltk.book import *

In [None]:
# Whitespace-split every review of the sampled product, then flatten the
# per-review token lists into one long list of tokens.
reviewText = sample['reviewText'].copy().str.split()
raw = [token for tokens in reviewText for token in tokens]

In [None]:
# NOTE(review): wildcard import; TrigramCollocationFinder below comes from it.
from nltk.collocations import *
# Association-measure helpers (bigram_measures is not used in this cell)
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
# Build trigram statistics over the flattened token list `raw`
finder = TrigramCollocationFinder.from_words(raw)
# Discard trigrams seen fewer than 7 times
finder.apply_freq_filter(7)
# Up to 1000 best trigrams, ranked by the PMI association measure
finder.nbest(trigram_measures.pmi, 1000)

## Tagging words

In [1]:
from nltk import word_tokenize
# Tokenize a sample sentence, then tag every token with its part of speech
text = word_tokenize("And now for something completely different")
nltk.pos_tag(text)

NameError: name 'nltk' is not defined

In [5]:
# Tag two single-word inputs: a nonsense token and an adverb
print(nltk.pos_tag(word_tokenize("fffff")))
print(nltk.pos_tag(word_tokenize("poorly")))

[('fffff', 'NN')]
[('poorly', 'RB')]


In [6]:
# Print NLTK's description and examples for the 'RB' tag
nltk.help.upenn_tagset('RB')

RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...


In [None]:
# Fetch the synset named 'good.n.01' and list the pertainyms of its first lemma
good = wn.synset('good.n.01')
good.lemmas()[0].pertainyms()

In [None]:
# For every synset of 'great', print its lemma names followed by its hyponyms
for synset in wn.synsets('great'):
    print(synset.lemma_names())
    print(synset.hyponyms())

In [None]:
# Path similarity between the synsets 'good.a.01' and 'best.a.01'
tmp = wn.synset('good.a.01')
print(tmp.path_similarity(wn.synset('best.a.01')))

## Sentiment analysis

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn

In [None]:
class SentimentAnalyser:
    """
    Score the sentiment of raw text with WordNet and SentiWordNet.

    Only nouns, adjectives and adverbs contribute: each one is lemmatized,
    mapped to its first WordNet synset and looked up in SentiWordNet.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def _penn_to_wn(self, tag):
        """
        Convert between the PennTreebank tags to simple Wordnet tags
        Return None when the tag is not an adjective, noun, adverb or verb tag
        """
        if tag.startswith('J'):
            return wn.ADJ
        elif tag.startswith('N'):
            return wn.NOUN
        elif tag.startswith('R'):
            return wn.ADV
        elif tag.startswith('V'):
            return wn.VERB
        return None

    def sentiment_for_tagged_word(self, tagged_word):
        """
        Compute the score for a given tagged word.
        The word is assumed to be tagged using the Penn Treebank Project's tags
        Return None for irrelevant words, a tuple (positive score, negative score) otherwise

        A word is irrelevant when it is not a noun, adjective or adverb (verbs
        are skipped too), when it has no lemma or no WordNet synset, or when
        its synset has no SentiWordNet entry.
        """
        word, tag = tagged_word

        wn_tag = self._penn_to_wn(tag)

        if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
            return None

        lemma = self.lemmatizer.lemmatize(word, pos=wn_tag)

        if not lemma:
            return None

        synsets = wn.synsets(lemma, pos=wn_tag)

        if not synsets:
            return None

        # Only the first synset returned by WordNet is scored
        synset = synsets[0]
        swn_synset = swn.senti_synset(synset.name())

        # senti_synset returns None when SentiWordNet has no entry for the
        # synset; treat the word as irrelevant instead of crashing.
        if swn_synset is None:
            return None

        return swn_synset.pos_score(), swn_synset.neg_score()

    def sentiment_score_for_raw_sentence(self, raw_sentence):
        """
        Compute the sum of the differences in sentiment score for each word in the sentence

        The text is tokenized and POS-tagged first; words for which
        sentiment_for_tagged_word returns None add nothing to the sum.
        Return 0 when no word contributes.
        """
        tagged_sentence = nltk.pos_tag(word_tokenize(raw_sentence))
        sum_deltas = 0

        for tagged_word in tagged_sentence:
            scores = self.sentiment_for_tagged_word(tagged_word)

            if scores is None:
                continue

            pos_score, neg_score = scores
            sum_deltas += (pos_score - neg_score)

        return sum_deltas
        


In [None]:
# Score one example sentence: sum of (positive - negative) over its scored words
s = SentimentAnalyser()
a = s.sentiment_score_for_raw_sentence("Certainly the best book I have ever read")
a

In [None]:
# Step-by-step trace of the SentiWordNet lookup for one sentence: prints the
# SentiWordNet entry and both scores of every noun, adjective and adverb.
raw_sentence = "Dankest object in the whole fucking world"
tagged_sentence = nltk.pos_tag(word_tokenize(raw_sentence))

# The tag converter and the lemmatizer only exist as members of
# SentimentAnalyser, so go through an instance instead of bare names.
analyser = SentimentAnalyser()

for word, tag in tagged_sentence:
    wn_tag = analyser._penn_to_wn(tag)
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        continue

    lemma = analyser.lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        continue

    synsets = wn.synsets(lemma, pos=wn_tag)
    if not synsets:
        continue

    # Take the first sense, the most common
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    # senti_synset returns None when SentiWordNet has no entry for the synset
    if swn_synset is None:
        continue

    print(swn_synset)
    print(swn_synset.pos_score())
    print(swn_synset.neg_score())
