In [2]:
# Import libraries and packages
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk import pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords, wordnet, sentiwordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import emoji

# Fetch every NLTK resource requested by this notebook, in a fixed order,
# before any object that needs one of them is constructed.
for _resource in (
    'stopwords',
    'averaged_perceptron_tagger',
    'wordnet',
    'vader_lexicon',  # vader sentiment
    'sentiwordnet',   # sentiwordnet sentiment
    'punkt',
):
    nltk.download(_resource)

# Shared, module-level NLP helpers used by the cells below
stemmer = PorterStemmer()
sentiment_analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package stopwords to /Users/miao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/miao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/miao/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/miao/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


In [3]:
# data overview
def df_overview(df):
    """Print a short summary of ``df`` and return its number of rows.

    The summary lists the row and column counts, the column names, the total
    number of missing cells across the whole frame, and the number of distinct
    values per column.

    Args:
        df: the pandas DataFrame to describe.

    Returns:
        The row count (``df.shape[0]``), so callers can compare sizes before
        and after a cleaning step.
    """
    n_rows, n_cols = df.shape
    print('Rows     : ', n_rows)
    print('Columns  : ', n_cols)
    print('\nFeatures : ', df.columns.tolist())
    # Per-column null counts, summed into one grand total
    missing_total = df.isnull().sum().values.sum()
    print('\nMissing values :  ', missing_total)
    print('\nUnique values :  \n', df.nunique())
    return n_rows

In [4]:
# Import Dataset
# Read the tab-separated review dump; rows are terminated by a bare '\n'
raw_df = pd.read_csv(
    'review-info.dat',
    sep='\t',
    lineterminator='\n',
)
# Replace every missing cell with an empty string so later text handling never sees NaN
df = raw_df.fillna(value="")

In [5]:
# Keep only the eight columns used below and give each one a short name
df = df.loc[:, [
    'reviewAuthor',
    'visitDate',
    'reviewDate',
    'reviewStars',
    'reviewTotalReviews',
    'reviewAuthorAddress',
    'reviewTitle',
    'translated_reviewText',
]].rename(columns={
    'reviewAuthor': 'author',
    'visitDate': 'exp_date',
    'reviewDate': 'review_date',
    'reviewStars': 'rating',
    'reviewTotalReviews': 'review_num',
    'reviewAuthorAddress': 'address',
    'reviewTitle': 'title',
    'translated_reviewText': 'review',
})
# Remember the row count so the de-duplication step can report how many rows it removed
curr_rows = df_overview(df)

Rows     :  2715
Columns  :  8

Features :  ['author', 'exp_date', 'review_date', 'rating', 'review_num', 'address', 'title', 'review']

Missing values :   0

Unique values :  
 author         2620
exp_date         48
review_date      41
rating            5
review_num      514
address        1341
title          2469
review         2700
dtype: int64


In [9]:
# clean data 
# remove duplicates on `same author and same review text`
# A row is a duplicate when both author and review text repeat; the last occurrence is kept
df = df.drop_duplicates(keep='last', subset=['author', 'review'])
print("\n")
print("\n======================")
# Print the overview of the de-duplicated frame first, then report the difference
rows_after = df_overview(df)
print("Cleaned {} duplicate rows!".format(curr_rows - rows_after))




Rows     :  2620
Columns  :  8

Features :  ['author', 'exp_date', 'review_date', 'rating', 'review_num', 'address', 'title', 'review']

Missing values :   0

Unique values :  
 author         2620
exp_date         47
review_date      41
rating            5
review_num      511
address        1331
title          2394
review         2620
dtype: int64
Cleaned 95 duplicate rows!


In [11]:
def get_wordnet_pos(pos_tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    Only the leading letter of the tag is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.

    Args:
        pos_tag: the tag string to translate.

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV``,
        or ``None`` when the tag starts with none of the four letters.
    """
    # (tag prefix, name of the wordnet attribute); the attribute is looked up
    # only for the prefix that actually matches
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_table:
        if pos_tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return None

In [12]:
def get_word_sentiment(word, tag):
    """Return the SentiWordNet scores of the first WordNet sense of ``word``.

    A synset is WordNet's grouping of synonymous words that express the same
    concept; a word may belong to one synset or to several. Only the first
    synset returned by the lookup (treated here as the most common sense) is
    scored.

    Args:
        word: the token to look up.
        tag: WordNet part-of-speech constant used to restrict the lookup
            (passed through as ``pos=``).

    Returns:
        ``[synset_name, pos_score, neg_score, obj_score]``. When WordNet has
        no synset for ``word`` under ``tag`` the placeholder
        ``['', 0, 0, 0]`` is returned instead.
    """
    candidates = wordnet.synsets(word, pos=tag)

    if candidates:
        # Take the first sense, the most common
        sense_name = candidates[0].name()
        scores = sentiwordnet.senti_synset(sense_name)
        return [sense_name, scores.pos_score(), scores.neg_score(), scores.obj_score()]

    return ['', 0, 0, 0]

In [13]:
def get_sentiment(text):
    """Average the SentiWordNet scores of every whitespace-separated token.

    Each token is POS-tagged, its tag is translated with ``get_wordnet_pos``
    and its scores are fetched with ``get_word_sentiment``. Tokens without a
    WordNet entry contribute zeros but still count towards the average.

    Args:
        text: the (already cleaned) review text.

    Returns:
        ``[positive, negative, objective]`` mean scores. For text that
        contains no tokens (empty or whitespace-only) the result is
        ``[0.0, 0.0, 0.0]`` rather than a ``ZeroDivisionError``.
    """
    tokens = text.split()
    if not tokens:
        # Nothing to score: cleaning can strip a short review down to "",
        # and dividing by a zero word count would abort the whole apply().
        return [0.0, 0.0, 0.0]

    tagged = pos_tag(tokens)
    pos_score, neg_score, obj_score = 0, 0, 0
    for word, tag in tagged:
        # Element 0 is the synset name, which is not needed for the average
        _, pos, neg, obj = get_word_sentiment(word, get_wordnet_pos(tag))
        pos_score += pos
        neg_score += neg
        obj_score += obj

    word_count = len(tagged)
    return [pos_score/word_count, neg_score/word_count, obj_score/word_count]

In [14]:
def clean_text(text):
    """Normalise a raw review into a space-joined string of lemmatised tokens.

    Steps, in order:
      1. lower-case the text and strip hyperlinks;
      2. apply the ad-hoc abbreviation / HTML-entity / punctuation fixes;
      3. turn emoji into ``:name:`` codes and delete those codes;
      4. expand common contractions;
      5. drop bracketed spans, punctuation and any token containing a digit;
      6. keep tokens of 3 to 25 characters that are not English stop words;
      7. POS-tag the remaining tokens and lemmatise every token whose tag
         maps to a WordNet part of speech (adjective, verb, noun, adverb);
         tokens with any other tag are dropped.

    Noun/adjective-only filtering and Porter stemming are deliberately not
    applied. Code that computed them here had its result overwritten by the
    lemmatisation step, so they never influenced the returned text; this
    version yields the same output without the wasted work.
    NOTE(review): the sentiment cells pass this text to VADER and to WordNet
    lookups, which presumably need dictionary words rather than stems -
    confirm before reintroducing either stage.

    Args:
        text: the raw review text (a ``str``).

    Returns:
        The cleaned review as a single space-separated string; ``""`` when no
        token survives the filters.
    """
    # lower the text
    text = text.lower()

    # remove hyperlink
    text = re.sub(r'http.?://[^\s]+[\s]?', '', text)

    # ad-hoc remove (patterns are lower-case because the text already is)
    text = text.replace("cr ", "costa rica ")\
    .replace("manual ", "manuel ")\
    .replace("manuel antonio", "").replace("esp ", "especially ").replace("parc ", "park ")\
    .replace("&quot;", "").replace("&#39;", "'").replace("；", ";")\
    .replace("。", ".").replace("，", ",").replace("’", "'").replace("‘", "'").replace("`", "'")

    # converting emoji to :name: codes, then removing the codes
    text = emoji.demojize(text)
    text = re.sub(r':[a-z_&]+:', '', text)

    # expand contractions, including all types of negations: no, n't, never
    # ("won't"/"can't" must be handled before the generic "n't" rule)
    text = text.replace("he's", "he is").replace("i'm", "i am").replace("'re", " are").replace("ain't", "are not")\
    .replace("'ve", " have").replace("'ll", " will").replace("won't", "will not").replace("can't", "can not")\
    .replace("n't", " not").replace("'d", " would")

    # remove all symbols: [bracketed] spans, punctuation, tokens containing digits
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    # tokenize on whitespace, dropping short (< 3 symbols) and long (> 25 symbols) tokens
    tokens = [word for word in text.split() if 3 <= len(word) <= 25]

    # filtering English stopwords, and remove digits
    # (a set makes each membership test constant-time)
    stop = set(stopwords.words('english'))
    tokens = [
        word for word in tokens
        if word not in stop and not any(c.isdigit() for c in word)
    ]

    # Part-Of-Speech (POS) tagging, then lemmatisation: transform every word
    # into its root form (e.g. rooms -> room, slept -> sleep). Tokens whose
    # tag has no WordNet equivalent are discarded.
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, tag in pos_tag(tokens):
        wn_pos = get_wordnet_pos(tag)
        if wn_pos is not None:
            lemmas.append(lemmatizer.lemmatize(word, wn_pos))

    # TODO: transformation of words with variant spellings
    # (e.g., terracotta and terra cotta) is not implemented

    # join the token
    return ' '.join(lemmas)

In [15]:
# Attach the normalised review text as a new `review_clean` column
df = df.assign(review_clean=df["review"].apply(clean_text))
df.head(5)

Unnamed: 0,author,exp_date,review_date,rating,review_num,address,title,review,lat,lng,country_code,country_name,review_clean
0,Viajador,2020-09-01,2020-09-01,3.0,6,,Beau mais cher,"Beautiful park with beautiful beaches, but man...",,,,,beautiful park beautiful reach many trail clos...
1,673oscar998,2020-09-01,2020-09-01,4.0,1121,"Santa Ana, Costa Rica",Muy buena opción,A park very organized in the sanitary measures...,-84.1821,-84.1821,CR,Costa Rica,park organize sanitary measure due anaemic flo...
2,around the world and back,2019-10-01,2020-09-01,5.0,18,"Denver, Colorado",Best with a guide,"Without a guide, it's a nice walk and you can'...",-104.953,-104.953,US,United States,guide nice walk miss monkey bite aggressive gu...
3,Phillip Dettleff,2019-11-01,2020-09-01,5.0,2372,"Santiago, Chile","Fauna silvestre, naturaleza, paisajes hermosos...",Manuel Antonio park is a must if you are in Co...,-70.6153,-70.6153,CL,Chile,park costa rich wellmaintained preserve park w...
4,Yuka O,2019-10-01,2020-09-01,5.0,2,"Whistler, Canada",Travel and birding,I took a guided tour. Robert Umaña is a guide ...,-122.957,-122.957,CA,Canada,take guide tour robert omaha guide park friend...


In [16]:
# (VADER) add sentiment analysis columns
# polarity_scores returns one dict per review; expanding each dict with
# pd.Series turns its keys into columns, which are appended to the frame.
df = pd.concat(
    [
        df,
        df["review_clean"].apply(sentiment_analyzer.polarity_scores).apply(pd.Series),
    ],
    axis=1,
)
df.head(5)

Unnamed: 0,author,exp_date,review_date,rating,review_num,address,title,review,lat,lng,country_code,country_name,review_clean,neg,neu,pos,compound
0,Viajador,2020-09-01,2020-09-01,3.0,6,,Beau mais cher,"Beautiful park with beautiful beaches, but man...",,,,,beautiful park beautiful reach many trail clos...,0.276,0.455,0.27,-0.0516
1,673oscar998,2020-09-01,2020-09-01,4.0,1121,"Santa Ana, Costa Rica",Muy buena opción,A park very organized in the sanitary measures...,-84.1821,-84.1821,CR,Costa Rica,park organize sanitary measure due anaemic flo...,0.0,0.846,0.154,0.743
2,around the world and back,2019-10-01,2020-09-01,5.0,18,"Denver, Colorado",Best with a guide,"Without a guide, it's a nice walk and you can'...",-104.953,-104.953,US,United States,guide nice walk miss monkey bite aggressive gu...,0.151,0.613,0.236,0.4201
3,Phillip Dettleff,2019-11-01,2020-09-01,5.0,2372,"Santiago, Chile","Fauna silvestre, naturaleza, paisajes hermosos...",Manuel Antonio park is a must if you are in Co...,-70.6153,-70.6153,CL,Chile,park costa rich wellmaintained preserve park w...,0.0,0.659,0.341,0.9774
4,Yuka O,2019-10-01,2020-09-01,5.0,2,"Whistler, Canada",Travel and birding,I took a guided tour. Robert Umaña is a guide ...,-122.957,-122.957,CA,Canada,take guide tour robert omaha guide park friend...,0.027,0.554,0.419,0.9661


In [17]:
# sentiwordnet sentiment
# get_sentiment returns [pos, neg, obj] per review; expanding each list with
# pd.Series yields three columns labelled 0, 1 and 2.
#
# Columns are renamed by name rather than by assigning a positional list to
# df.columns: a positional list must match the frame's width exactly, and the
# 20-name list used previously does not match the 16 columns this notebook
# builds (it also named lat / lng / country_code / country_name, which are
# never selected). rename() skips keys that are absent, so this works for
# either layout.
df = pd.concat(
    [
        df,
        df["review_clean"].apply(get_sentiment).apply(pd.Series),
    ],
    axis=1,
).rename(columns={
    'review': 'raw_review',
    'review_clean': 'cleaned_review',
    'neg': 'vader_neg',
    'neu': 'vader_neu',
    'pos': 'vader_pos',
    'compound': 'vader_compound',
    0: 'swn_pos',
    1: 'swn_neg',
    2: 'swn_obj',
})
df.head(5)

Unnamed: 0,author,exp_date,review_date,rating,review_num,address,title,raw_review,lat,lng,country_code,country_name,cleaned_review,vader_neg,vader_neu,vader_pos,vader_compound,swn_pos,swn_neg,swn_obj
0,Viajador,2020-09-01,2020-09-01,3.0,6,,Beau mais cher,"Beautiful park with beautiful beaches, but man...",,,,,beautiful park beautiful reach many trail clos...,0.276,0.455,0.27,-0.0516,0.10119,0.029762,0.72619
1,673oscar998,2020-09-01,2020-09-01,4.0,1121,"Santa Ana, Costa Rica",Muy buena opción,A park very organized in the sanitary measures...,-84.1821,-84.1821,CR,Costa Rica,park organize sanitary measure due anaemic flo...,0.0,0.846,0.154,0.743,0.046512,0.069767,0.697674
2,around the world and back,2019-10-01,2020-09-01,5.0,18,"Denver, Colorado",Best with a guide,"Without a guide, it's a nice walk and you can'...",-104.953,-104.953,US,United States,guide nice walk miss monkey bite aggressive gu...,0.151,0.613,0.236,0.4201,0.125,0.007353,0.75
3,Phillip Dettleff,2019-11-01,2020-09-01,5.0,2372,"Santiago, Chile","Fauna silvestre, naturaleza, paisajes hermosos...",Manuel Antonio park is a must if you are in Co...,-70.6153,-70.6153,CL,Chile,park costa rich wellmaintained preserve park w...,0.0,0.659,0.341,0.9774,0.05819,0.015086,0.788793
4,Yuka O,2019-10-01,2020-09-01,5.0,2,"Whistler, Canada",Travel and birding,I took a guided tour. Robert Umaña is a guide ...,-122.957,-122.957,CA,Canada,take guide tour robert omaha guide park friend...,0.027,0.554,0.419,0.9661,0.101351,0.030405,0.652027


In [18]:
# Raise every index label by one, then write the frame (index included) to CSV
df.index = df.index + 1
df.to_csv(
    'sentiment-analysis.csv',
    encoding='utf-8-sig',
)