In [1]:
!pip install ndjson altair

Collecting ndjson
  Downloading https://files.pythonhosted.org/packages/70/c9/04ba0056011ba96a58163ebfd666d8385300bd12da1afe661a5a147758d7/ndjson-0.3.1-py2.py3-none-any.whl
Installing collected packages: ndjson
Successfully installed ndjson-0.3.1


In [2]:
# Mount the google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
import pandas as pd
import altair as alt
import numpy as np
from vega_datasets import data
from nltk.sentiment.util import mark_negation



In [4]:
workingdir = "/content/drive/MyDrive/Manning - Liveprojects/Sentiment Analysis/dataset"
small_corpus_data = f"{workingdir}/small_corpus.csv"

In [6]:
df = pd.read_csv(small_corpus_data, sep='\t')

In [7]:
print(df.columns)
print(df.head())

Index(['ratings', 'reviews'], dtype='object')
   ratings                                            reviews
0      1.0  Made in china... either a single bad luck of t...
1      1.0  Having played all of the other games, I was so...
2      1.0  Macro buttons in the way (obviously?), can't l...
3      1.0  Just received my "Cronusmax Plus" and after so...
4      1.0  Not worth the learning curve, the nunchuck+wii...


### Step1: word tokenization
Tokenize the sentences and words of the reviews with the tokenize module of NLTK
Use **word_tokenize** and **sent_tokenize** functions of the **nltk.tokenize** module


In [22]:
from tqdm.notebook import tqdm, trange

import nltk
from nltk.tokenize import TreebankWordTokenizer # contains rules for englisg contractions (dont > "do", "not")
from nltk.tokenize.casual import casual_tokenize # short, informal, emoticon-laced texts from social networks where grammar and spelling conventions vary widely
from nltk.util import ngrams
# should be used
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import opinion_lexicon


In [23]:
nltk.download('opinion_lexicon') # doesnt contain uppercase words, nor punctuation marks!!
nltk.download('punkt') # used by sent_tokenize

[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Unzipping corpora/opinion_lexicon.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [18]:
# tokenizer = TreebankWordTokenizer()

In [8]:
# review = df.iloc[0]['reviews']
# tokens_treebank = tokenizer.tokenize(sentence)
# tokens_casual = casual_tokenize(sentence)

In [45]:
# tokens_casual
# tokens_treebank

In [46]:
# list(ngrams( tokens_treebank, 2))

## Step2: sentence tokenization

use word_tokenize and sent_tokenize from nltk.tokenize

In [27]:
words_positive = set(opinion_lexicon.positive())
words_negative = set(opinion_lexicon.negative())

words_positive_with_negation = words_positive.union({wd + "_NEG" for wd in words_negative})
words_negative_with_negation = words_negative.union({wd + "_NEG" for wd in words_positive})

In [29]:
def tokenize_review(review, negation=False):
  sentences = sent_tokenize(review)
  allwords = []
  for sent in sentences:
    words = word_tokenize(sent)
    if (negation):
      words = mark_negation(words)
    for w in words:
      allwords.append(w)
  return allwords

In [28]:
def calc_sentiment(review, negation=False):
  words = tokenize_review(review, negation)
  if len(words) == 0: return 0
  if (negation):
    np = len([w for w in words if w.lower() in words_positive_with_negation])
    nn = len([w for w in words if w.lower() in words_negative_with_negation])
  else:
    np = len([w for w in words if w.lower() in words_positive])
    nn = len([w for w in words if w.lower() in words_negative])
  return (np - nn) / len(words)

In [13]:
# tokens = tokenize_review(review)
# word_tokenize(sentences[0])
# tokens

In [13]:
review = df.iloc[0]['reviews']
s = calc_sentiment(review)
print(s)

0.0


In [34]:
for i in range(10):
  review = df.iloc[i]['reviews']
  sentiment = calc_sentiment(review)
  if (sentiment != 0):
    print(sentiment)

-0.11538461538461539
-0.021739130434782608
-0.011560693641618497
0.047619047619047616
0.030303030303030304
-0.05389221556886228
-0.09090909090909091
0.027888446215139442
-0.07142857142857142


## Step3: scoring of the reviews

In [15]:
review_sentiments = []
# for index, row in df.iterrows():
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    sentiment = calc_sentiment(row['reviews'])
    review_sentiments.append(sentiment)

HBox(children=(FloatProgress(value=0.0, max=4500.0), HTML(value='')))




In [31]:
def exportcsv(dataframe, fname):
  with open(fname, "w") as outfile:
    outfile.write(dataframe.to_csv( index=False, sep="\t", encoding="utf-8"))

In [19]:
df2 = pd.DataFrame(
    {
        "ratings": df['ratings'],
        "reviews": df['reviews'],
        "sentiments": review_sentiments,
    }
)

In [None]:
# exportcsv(df2, export_small_corpus_sentiment_csv)

In [None]:
t

In [33]:
df_neg.head()

Unnamed: 0,ratings,reviews,sentiments
0,1.0,Made in china... either a single bad luck of t...,0.0
1,1.0,"Having played all of the other games, I was so...",-0.076923
2,1.0,"Macro buttons in the way (obviously?), can't l...",0.0
3,1.0,"Just received my ""Cronusmax Plus"" and after so...",-0.00578
4,1.0,"Not worth the learning curve, the nunchuck+wii...",0.047619


## Step4: comparison of the scores with the reviews in plots
Compare the scores of the product reviews with the product ratings

In [7]:
# load previous save data from Step3
workingdir = "/content/drive/MyDrive/Manning - Liveprojects/Sentiment Analysis/dataset"
export_small_corpus_sentiment_csv = f"{workingdir}/small_corpus_sentiment.csv"
df = pd.read_csv(export_small_corpus_sentiment_csv, sep='\t')

In [8]:
# plot ratings (again)
alt.Chart(df).mark_bar().encode(
    x=alt.X("ratings"), 
    y="count()")

Output hidden; open in https://colab.research.google.com to view.

In [9]:
# plot sentiments
alt.Chart(df).mark_bar().encode(
    x=alt.X("sentiments"), 
    y="count()")


Output hidden; open in https://colab.research.google.com to view.

In [16]:
# Ratings and sentiments plot
alt.Chart(df).mark_circle().encode(
    alt.X('ratings', bin=False),
    alt.Y('sentiments', bin=False),
    size='count()'
)

Output hidden; open in https://colab.research.google.com to view.

In [14]:
alt.Chart(df).mark_circle(size=50).encode(
        x="ratings", 
        y="sentiments", 
        color="ratings", 
        tooltip=["ratings", "sentiments"]
    ).interactive()

Output hidden; open in https://colab.research.google.com to view.

In [35]:
from scipy.stats import pearsonr, spearmanr

def print_corr(dataframe):
  pearsonr_corr, _ = pearsonr(dataframe['ratings'], dataframe['sentiments'])
  print(pearsonr_corr)

  spearmanr_corr, _ = spearmanr(dataframe['ratings'], dataframe['sentiments'])
  print(spearmanr_corr)

print_corr(df)

0.3644289826596749
0.5379736959381071
0.38231738534029064
0.5960469143868878


## Step5: measuring the distribution

## Step6: handling negation

In [32]:
workingdir = "/content/drive/MyDrive/Manning - Liveprojects/Sentiment Analysis/dataset"
export_small_corpus_sentiment_neg_csv = f"{workingdir}/small_corpus_sentiment_withnegation.csv"
review_sentiments2 = []
# for index, row in df.iterrows():
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    sentiment = calc_sentiment(row['reviews'], True)
    review_sentiments2.append(sentiment)

df_neg = pd.DataFrame(
    {
        "ratings": df['ratings'],
        "reviews": df['reviews'],
        "sentiments": review_sentiments2,
    }
)
exportcsv(df_neg, export_small_corpus_sentiment_neg_csv)

HBox(children=(FloatProgress(value=0.0, max=4500.0), HTML(value='')))




In [34]:
alt.Chart(df_neg).mark_circle(size=50).encode(
        x="ratings", 
        y="sentiments", 
        color="ratings", 
        tooltip=["ratings", "sentiments"]
    ).interactive()

Output hidden; open in https://colab.research.google.com to view.

In [36]:
print_corr(df_neg)

0.38231738534029064
0.5960469143868878


## Step7: adjusting your dictionary-based sentiment analyzer

## Step8: checking your results