# Scattertext for Descriptive Text Analytics and Visualization Using Hotel Review Data
---

* You can find the data here: https://www.kaggle.com/rtatman/deceptive-opinion-spam-corpus/data
* This corpus consists of truthful and deceptive hotel reviews of 20 Chicago hotels. The data is described in two papers according to the sentiment of the review. In particular, we discuss positive sentiment reviews in [1] and negative sentiment reviews in [2]. 

In [1]:
import scattertext as st
import spacy
from pprint import pprint
import en_core_web_sm

#CSV
import csv
from collections import Counter

#pandas
import pandas as pd

#Matplotlib
import matplotlib.pyplot as plt
% matplotlib inline

#numpy
import numpy as np

# nltk
import nltk
# stopwords, FreqDist, word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist, word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

#regular expression
import re

#seaborn
import seaborn as sns

#import packages for scatter text
import scattertext as st
import spacy
from pprint import pprint
import en_core_web_sm

#SKlearn packages
import sklearn
from lightning.classification import CDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# feature engineering (words to vectors)
from sklearn.feature_extraction.text import TfidfVectorizer
# classification algorithms (or classifiers)
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
# build a pipeline
from sklearn.pipeline import Pipeline
# model evaluation, validation
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, GridSearchCV 
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
#pip install scikit-plot 
import scikitplot as skplt

##### Import data

In [2]:
# Load the review corpus from a relative path, decoding it as ISO-8859-1.
df = pd.read_csv("data/deceptive-opinion.csv", encoding = 'iso-8859-1')

In [3]:
# Remove the columns this analysis does not use (mutates df in place).
df.drop(columns=['hotel', 'source'], inplace=True)

In [4]:
# Count the non-null values of each remaining column per (deceptive, polarity) group.
df.groupby(['deceptive','polarity']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
deceptive,polarity,Unnamed: 2_level_1
deceptive,negative,400
deceptive,positive,400
truthful,negative,400
truthful,positive,400


#### Visualizing topics of truthful reviews and deceptive reviews

In [5]:
# Load the small English spaCy pipeline (reused later to parse the reviews).
nlp = en_core_web_sm.load()

# Build a corpus whose features are Empath topics only, with documents
# categorised by the 'deceptive' column.
empath_feats = st.FeatsFromOnlyEmpath()
empath_corpus = st.CorpusFromParsedDocuments(
    df,
    category_col='deceptive',
    parsed_col='text',
    feats_from_spacy_doc=empath_feats,
).build()

In [6]:
# Render the interactive explorer: deceptive reviews vs. truthful reviews,
# plotting the corpus's non-text features and labelling each document with
# its polarity.
html = st.produce_scattertext_explorer(empath_corpus, category= 'deceptive', category_name='Deceptive Reviews', 
                                       not_category_name='Truthful Reviews', width_in_pixels=1000, 
                                       metadata=df['polarity'], use_non_text_features=True, use_full_doc=True)

# Write inside a context manager so the file handle is closed deterministically.
with open("deceptive_truthful.html", 'wb') as html_file:
    bytes_written = html_file.write(html.encode('utf-8'))
# Keep showing the number of bytes written as the cell's output.
bytes_written

2510993

In [7]:
# Import everything from the public IPython.display module; importing
# display/HTML from IPython.core.display is deprecated.
from IPython.display import HTML, IFrame, display

file_name = 'deceptive_truthful.html'
# (Re)write the page so this cell can be run on its own, closing the file
# before the IFrame tries to load it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
# Embed the saved page in the notebook.
IFrame(src=file_name, width = 1300, height=800)

### Visualizing topics of positive reviews and negative reviews

In [8]:
# Load the small English spaCy pipeline.
nlp = en_core_web_sm.load()

# Build an Empath-topic-only corpus, this time with documents categorised
# by the 'polarity' column.
empath_feats = st.FeatsFromOnlyEmpath()
empath_corpus = st.CorpusFromParsedDocuments(
    df,
    category_col='polarity',
    parsed_col='text',
    feats_from_spacy_doc=empath_feats,
).build()

In [9]:
# Render the interactive explorer: positive reviews vs. negative reviews,
# plotting the corpus's non-text features and labelling each document with
# its deceptive/truthful flag.
html = st.produce_scattertext_explorer(empath_corpus, category= 'positive', category_name='Positive Reviews', 
                                       not_category_name='Negative Reviews', width_in_pixels=1000, 
                                       metadata=df['deceptive'], use_non_text_features=True, use_full_doc=True)

# Write inside a context manager so the file handle is closed deterministically.
with open("pos_neg.html", 'wb') as html_file:
    bytes_written = html_file.write(html.encode('utf-8'))
# Keep showing the number of bytes written as the cell's output.
bytes_written

2511766

In [10]:
# Import everything from the public IPython.display module; importing
# display/HTML from IPython.core.display is deprecated.
from IPython.display import HTML, IFrame, display

file_name = 'pos_neg.html'
# (Re)write the page so this cell can be run on its own, closing the file
# before the IFrame tries to load it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
# Embed the saved page in the notebook.
IFrame(src=file_name, width = 1300, height=800)

In [11]:
# Count the non-null values of each remaining column per (deceptive, polarity) group.
df.groupby(['deceptive','polarity']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
deceptive,polarity,Unnamed: 2_level_1
deceptive,negative,400
deceptive,positive,400
truthful,negative,400
truthful,positive,400


# Lexicalized Semiotic Squares
We will use semiotic squares to visualize the sentiment or polarity of the reviews (positive and negative) and the deceptiveness of the reviews. 
* This technique will visualize the topics of the data in four quadrants
* The Y Axis will show deceptive vs truthful
* The X axis will show positive vs negative
---

##### First we need to create a category column that combines the deceptive and polarity columns. 
> This will give us four categories
* truthful positive
* truthful negative
* deceptive positive
* deceptive negative

In [13]:
# Combine the two labels into one space-separated category,
# e.g. "truthful positive".
df['category'] = df['deceptive'].str.cat(df['polarity'], sep=' ')

In [14]:
# Preview the first two rows to check the new 'category' column.
df.head(2)

Unnamed: 0,deceptive,polarity,text,category
0,truthful,positive,We stayed for a one night getaway with family ...,truthful positive
1,truthful,positive,Triple A rate with upgrade to view room was le...,truthful positive


##### Tokenize the text column using NLP

In [15]:
# Run every review through the spaCy pipeline and keep the result in 'parse'.
df['parse'] = df['text'].map(nlp)

In [16]:
# Preview the first row to check the new 'parse' column.
df.head(1)

Unnamed: 0,deceptive,polarity,text,category,parse
0,truthful,positive,We stayed for a one night getaway with family ...,truthful positive,"(We, stayed, for, a, one, night, getaway, with..."


#### This is a function that will let us get data from the dataframe, and pass it into the scattertext functions

In [17]:
def get_metadata_from_corpus(corpus):
    """Build a per-document text label from a corpus's dataframe.

    Args:
        corpus: Object exposing ``get_df()``, which returns a dataframe
            with string columns ``deceptive`` and ``polarity``.

    Returns:
        A pandas Series of strings of the form ``"<deceptive>, <polarity>"``,
        one per row of the corpus dataframe.
    """
    frame = corpus.get_df()
    # Join the two fields with a single separator; nothing is appended after
    # the last field, so the label does not end in a dangling ", ".
    return frame['deceptive'] + ', ' + frame['polarity']

### Build a corpus for the semiotic square
* category_col is set to category
* parsed_col is set to parse
* The minimum_term_count and term_count settings remove redundant and infrequent terms

In [19]:
# Create corpus and filter terms
# to remove stopwords: 
    # from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
    # put this after .build()
        # .remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True)
# NOTE: the old sklearn.feature_extraction.stop_words module has been removed
# from scikit-learn; the .text module exports the same ENGLISH_STOP_WORDS set
# in both old and current releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Build the corpus from the spaCy-parsed reviews, categorised by the combined
# 'category' column, then apply the two term compactors to shrink the
# vocabulary.
four_square_corpus = (st.CorpusFromParsedDocuments(df, 
                                                   category_col = 'category', 
                                                   parsed_col = 'parse')
                      .build()
                      .compact(st.CompactTerms(minimum_term_count=2, slack=5))
                      .compact(st.ClassPercentageCompactor(term_count=2)))

### Set up chart structure
**Categories**
* category_a_list=['truthful positive']
* category_b_list=['deceptive positive']
* not_category_a_list=['deceptive negative']
* not_category_b_list=['truthful negative']

**Labels**<br>
There are a total of 8 labels; these will make much more sense once you look at the visualization.
* a: 'Truthful and Positive'
* b: 'Deceptive and Positive'
* not_a_and_not_b: 'Negative'
* a_and_b: 'Positive'
* a_and_not_b: 'Truthful'
* b_and_not_a: 'Deceptive'
* not_a: 'Deceptive and Negative'
* not_b: 'Truthful and Negative'

In [20]:
# Display labels for the four corners and four sides of the square.
quadrant_labels = {
    'a': 'Truthful and Positive',
    'b': 'Deceptive and Positive',
    'not_a_and_not_b': 'Negative',
    'a_and_b': 'Positive',
    'a_and_not_b': 'Truthful',
    'b_and_not_a': 'Deceptive',
    'not_a': 'Deceptive and Negative',
    'not_b': 'Truthful and Negative',
}

# Assign one combined category to each corner and score terms by rank
# difference.
four_square = st.FourSquare(
    four_square_corpus,
    category_a_list=['truthful positive'],
    category_b_list=['deceptive positive'],
    not_category_a_list=['deceptive negative'],
    not_category_b_list=['truthful negative'],
    scorer=st.RankDifference(),
    labels=quadrant_labels,
)

In [21]:
# Per-document labels shown alongside each review in the explorer.
doc_metadata = get_metadata_from_corpus(four_square_corpus)

# Render the semiotic-square explorer as an HTML string.
html = st.produce_four_square_explorer(
    four_square=four_square,
    x_label='Truthful - Deceptive',
    y_label='Positive - Negative',
    use_full_doc=True,
    pmi_threshold_coefficient=0,
    metadata=doc_metadata,
)

In [22]:
file_name = 'semiotic_axes.html'
# Heading placed in front of the generated explorer markup.
page_title = '<center><h2>The Semiotics of Deceptive Hotel Reviews: Deceptive vs. Truthful, Positive vs. Negative</h2></center>'
# Write inside a context manager so the file is closed before the IFrame
# tries to load it.
with open(file_name, 'wb') as html_file:
    html_file.write(page_title.encode('utf-8') + html.encode('utf-8'))
# Embed the saved page in the notebook.
IFrame(src=file_name, width = 1600, height=900)