In [1]:
# Mount Google Drive at /content/drive; the CSV inputs read later in this
# notebook live under '/content/drive/My Drive/'. This only works on Colab.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import nltk
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

!pip install vaderSentiment
nltk.download('opinion_lexicon')

[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


True

# 1. Setting up two dataframes for the Opinion Lexicon and Vader Lexicon

In [0]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import opinion_lexicon
from nltk.tokenize import TweetTokenizer

# Raw hotel reviews: the text scored by the Opinion Lexicon and the source of
# the ground-truth 'Reviewer_Score' labels.
hotelDataOp = pd.read_csv('/content/drive/My Drive/Feature_generated_sets/raw/Hotel_reviews_features_selected.csv')
# Review text fed to VADER (the file name suggests a tokenised / stop-word-removed /
# stemmed variant of the same reviews -- TODO confirm row order matches the raw file).
hotelDataVader = pd.read_csv('/content/drive/My Drive/tokenTrue_remStpwrdsTrue_stemmTrue_lemmatizeFalse_nGramFalse_nGram_length2.csv')
# hotelData comes from the very same CSV as hotelDataOp, so copy the parsed
# frame instead of parsing the large file a second time.
hotelData = hotelDataOp.copy()

# Ground-truth labels, cast to str so they compare equal to the string
# ratings ("0"/"1"/"2") produced by the lexicon rating functions.
review_HotelData = hotelData['Reviewer_Score'].astype(str)

review_OpinionLexicon = hotelDataOp["Review"]
review_VaderLexicon = hotelDataVader["Review"]

# Working copies of the review text; astype(str) guarantees every entry is a
# string before it is handed to a tokenizer / sentiment analyzer.
reviewOpinionLexicon = review_OpinionLexicon.astype(str)
reviewVaderLexicon = review_VaderLexicon.astype(str)

# 2. Opinion Lexicon
Filling two lists, one with positive and one with negative words.
Run the opinion lexicon on the review dataset.
Define the opinion lexicon result as 0 = "good", 1 = "ok" or 2 = "bad".

In [0]:
# Materialise the positive and negative opinion words as sets so that the
# per-token membership tests in scoreOpinionLexicon are fast.
positive_list=set(opinion_lexicon.positive())
negative_list=set(opinion_lexicon.negative())

# Single tokenizer instance, reused by scoreOpinionLexicon for every review.
tokenizer = TweetTokenizer()

In [0]:
# Net count of positive versus negative opinion words in a review.
def scoreOpinionLexicon(review):
    """Return the number of positive minus the number of negative words.

    The review string is split with the shared ``tokenizer`` and each token is
    lower-cased before lookup. A token found in ``positive_list`` adds 1; a
    token that is not positive but is in ``negative_list`` subtracts 1; any
    other token is ignored.
    """
    lowered_tokens = [token.lower() for token in tokenizer.tokenize(review)]

    return sum(
        1 if token in positive_list else -1 if token in negative_list else 0
        for token in lowered_tokens
    )

In [0]:
# Replace each review text with its integer lexicon score.
# NOTE: this rebinds reviewOpinionLexicon from text to scores, so running this
# cell a second time would pass integers to the tokenizer; re-run from the
# data-loading cell instead.
reviewOpinionLexicon = reviewOpinionLexicon.apply(scoreOpinionLexicon)

In [0]:
# Rates a review as positive, neutral or negative based on its net word score.
def rateOpinionLexicon(review):
    """Map a numeric lexicon score to a string class label.

    Returns "0" for a score above zero, "2" for a score below zero and "1"
    for everything else (including a score of exactly zero).
    """
    if review > 0:
        return "0"
    if review < 0:
        return "2"
    return "1"

In [0]:
# Convert each integer score into its string class label via rateOpinionLexicon
# ("0" for positive, "1" for zero, "2" for negative scores).
reviewOpinionLexicon = reviewOpinionLexicon.apply(rateOpinionLexicon)

In [0]:
# Write the predicted labels to a one-column CSV ('Rating'), without the index,
# in the current working directory.
reviewOpinionLexicon.to_csv("NLTK_Opinion_Lexicon.csv", header = ['Rating'], index = False)

# 3. Vader Lexicon
Run the Vader lexicon on the review dataset.
Define the Vader lexicon compound result as 0 = "good", 1 = "ok" or 2 = "bad".

In [0]:
def rateVaderLexicon(review, analyzer=None, good_threshold=0.725, bad_threshold=0.55):
    """Rate a review as "0" (good), "1" (ok) or "2" (bad) from its VADER compound score.

    Parameters
    ----------
    review : str
        Review text to score.
    analyzer : object, optional
        Anything exposing ``polarity_scores(text)`` that returns a mapping with
        a ``'compound'`` entry. When omitted, one shared
        ``SentimentIntensityAnalyzer`` is created on first use and reused for
        all later calls instead of being rebuilt for every review.
    good_threshold : float, optional
        Compound scores greater than or equal to this value are rated "0".
    bad_threshold : float, optional
        Compound scores less than or equal to this value (and below
        ``good_threshold``) are rated "2".

    Returns
    -------
    str
        "0", "2", or "1" for scores strictly between the two thresholds.
    """
    if analyzer is None:
        # Constructing the analyzer is loop-invariant work, so build it once
        # and cache it on the function object for subsequent calls.
        analyzer = getattr(rateVaderLexicon, "_analyzer", None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            rateVaderLexicon._analyzer = analyzer

    # polarity_scores yields positive, negative, neutral and compound scores;
    # only the compound value drives the rating.
    compound = analyzer.polarity_scores(review)['compound']

    if compound >= good_threshold:
        return "0"
    if compound <= bad_threshold:
        return "2"
    return "1"

In [0]:
# Rate every review with VADER. Series.apply visits each element regardless of
# the index labels, avoids per-element label-based assignment in a Python loop,
# and mirrors how the Opinion Lexicon ratings are produced.
reviewVaderLexicon = reviewVaderLexicon.apply(rateVaderLexicon)

In [0]:
# Write the VADER-based labels to a one-column CSV ('Rating'), without the
# index, in the current working directory.
reviewVaderLexicon.to_csv("Vader_Lexicon.csv", header = ['Rating'], index = False)

# 4. Create Classification Report

In [13]:
# Compare the ground-truth labels with the Opinion Lexicon predictions and
# keep the per-class metrics as a nested dict.
valid = review_HotelData
predictionOpinionLexicon = reviewOpinionLexicon

reportOpinionLexicon = classification_report(
    y_true=valid,
    y_pred=predictionOpinionLexicon,
    output_dict=True,
)
reportOpinionLexicon

{'0': {'f1-score': 0.7390001205949744,
  'precision': 0.6509166088064914,
  'recall': 0.8546538129222312,
  'support': 293974},
 '1': {'f1-score': 0.23471682301729557,
  'precision': 0.31556545905210187,
  'recall': 0.18684633801042153,
  'support': 134913},
 '2': {'f1-score': 0.40022235387912436,
  'precision': 0.5486283789203498,
  'recall': 0.3150107655640119,
  'support': 86851},
 'accuracy': 0.5890839922596357,
 'macro avg': {'f1-score': 0.4579797658304648,
  'precision': 0.5050368155929811,
  'recall': 0.4521703054988882,
  'support': 515738},
 'weighted avg': {'f1-score': 0.5500329311632559,
  'precision': 0.545965907635247,
  'recall': 0.5890839922596357,
  'support': 515738}}

In [14]:
# Compare the ground-truth labels with the VADER predictions and keep the
# per-class metrics as a nested dict.
valid = review_HotelData
predictionVaderLexicon = reviewVaderLexicon

reportVaderLexicon = classification_report(
    y_true=valid,
    y_pred=predictionVaderLexicon,
    output_dict=True,
)
reportVaderLexicon

{'0': {'f1-score': 0.5588439977194843,
  'precision': 0.6924136613423028,
  'recall': 0.4684734024097369,
  'support': 293974},
 '1': {'f1-score': 0.20961373390557944,
  'precision': 0.26839200805210733,
  'recall': 0.17195526005648085,
  'support': 134913},
 '2': {'f1-score': 0.37690816535594396,
  'precision': 0.2594920227079391,
  'recall': 0.6883973702087484,
  'support': 86851},
 'accuracy': 0.42794209462944366,
 'macro avg': {'f1-score': 0.38178863232700255,
  'precision': 0.4067658973674497,
  'recall': 0.44294201089165536,
  'support': 515738},
 'weighted avg': {'f1-score': 0.43684986201776915,
  'precision': 0.5085883264874477,
  'recall': 0.42794209462944366,
  'support': 515738}}