In [None]:
## Libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.lm.preprocessing import flatten
from nltk.util import ngrams
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud
import unicodedata
import stop_words
import spacy
from spacy.lang.en import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
# Load the wine data and clean/preprocess the free-text tasting notes.
# Imported by its explicit name so this cell can be re-run: the set of stop words
# no longer has to be read off a module name that the cell itself overwrites.
from spacy.lang.en.stop_words import STOP_WORDS

# NOTE(review): the file name is spelled 'wine-raitngs.csv' — confirm it matches the file on disk.
df = pd.read_csv('wine-raitngs.csv')

# Drop rows with any missing value, then renumber the rows 0..n-1 so that
# label-based and positional access agree in the cells that follow.
df = df.dropna().reset_index(drop=True)

notes_clean = (
    df['notes']
    # Decompose accented characters and discard whatever is not plain ASCII.
    .apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore'))
    .str.lower()
    # Strip punctuation (anything that is neither a word character nor whitespace).
    .str.replace(r'[^\w\s]', '', regex=True)
    # Strip digits; raw string so '\d' is not treated as a string escape.
    .str.replace(r'\d+', '', regex=True)
)

# Keep the name `stop_words` bound to the stop-word set, as the original cell did.
stop_words = STOP_WORDS
df['notes'] = notes_clean.apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)
)

df.head(5)

In [None]:
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Score every note once, then attach each score as a whole column.
# Column assignment is positional, so it stays correct when index labels have
# gaps or repeat, and it avoids one DataFrame write per row and per score.
vader_scores = [analyzer.polarity_scores(note) for note in df['notes']]
for key in ('pos', 'neg', 'neu', 'compound'):
    df[f'vader_{key}'] = [scores[key] for scores in vader_scores]

df

In [None]:
# TextBlob polarity and subjectivity for every note.
# The import lives in this cell (like the other sentiment libraries in this
# notebook); without it the cell raises NameError on a fresh kernel.
from textblob import TextBlob

# Analyse each note once and write the two scores as whole columns
# (positional assignment, independent of the index labels).
textblob_sentiments = [TextBlob(note).sentiment for note in df['notes']]
df['textblob_polarity'] = [sentiment.polarity for sentiment in textblob_sentiments]
df['textblob_subjectivity'] = [sentiment.subjectivity for sentiment in textblob_sentiments]

df

In [None]:
from flair.data import Sentence
from flair.nn import Classifier

tagger = Classifier.load('sentiment-fast')

# Collect exactly one result per note, in row order, so a failed prediction
# can never shift the results of the notes that follow it.
flair_sentiment = []
flair_score = []
for note in df['notes']:
    try:
        sentence = Sentence(note)
        tagger.predict(sentence)
        tag, score = sentence.tag, sentence.score
    except Exception:
        # Best-effort, as before: a note that cannot be tagged is left missing
        # and removed by the dropna below.
        # NOTE(review): presumably this is hit by notes that are empty after
        # stop-word removal — confirm.
        tag, score = None, float('nan')
    flair_sentiment.append(tag)
    flair_score.append(score)

df['flair_sentiment'] = flair_sentiment
df['flair_score'] = flair_score

# Drop rows with any missing value (including notes Flair could not tag) and
# renumber the rows 0..n-1 so later cells can index them without gaps.
df = df.dropna().reset_index(drop=True)
df

In [None]:
import transformers
from transformers import pipeline

# Emotion classifier; loaded once (the cell previously loaded the model and ran
# the whole analysis twice, the first pass failing with KeyError on a gappy index).
specific_model = pipeline(model="bhadresh-savani/distilbert-base-uncased-emotion")

# Each call returns a list of {'label', 'score'} dicts for the note; the first
# entry is stored so both columns hold plain scalars rather than one-element lists.
# NOTE(review): assumes the pipeline returns a single prediction per note with
# its default settings — confirm if top_k is ever configured.
bert_predictions = [specific_model(note)[0] for note in df['notes']]
df['bert_emotion'] = [prediction['label'] for prediction in bert_predictions]
df['bert_score'] = [prediction['score'] for prediction in bert_predictions]

df

In [None]:
def _most_common(values):
    """Return the most frequent value in ``values``, or NaN if it has no values."""
    counts = values.value_counts()
    return counts.index[0] if len(counts) else float('nan')


def _rounded_mean(values):
    """Return the mean of ``values`` rounded to two decimals."""
    return round(values.mean(), 2)


# Per-variety summary: the most frequent label for the categorical outputs and
# the rounded mean for every numeric score. The grouping is built once and reused.
by_variety = df.groupby('variety')

most_frequent_sentiments = pd.DataFrame({
    'Flair Sentiment': by_variety['flair_sentiment'].apply(_most_common),
    'Flair Score': by_variety['flair_score'].apply(_rounded_mean),
    'BERT Emotion': by_variety['bert_emotion'].apply(_most_common),
    'BERT Score': by_variety['bert_score'].apply(_rounded_mean),
    'Vader Compound': by_variety['vader_compound'].apply(_rounded_mean),
    'Vader Positive': by_variety['vader_pos'].apply(_rounded_mean),
    'Vader Negative': by_variety['vader_neg'].apply(_rounded_mean),
    'Vader Neutral': by_variety['vader_neu'].apply(_rounded_mean),
    'TextBlob Polarity': by_variety['textblob_polarity'].apply(_rounded_mean),
    'TextBlob Subjectivity': by_variety['textblob_subjectivity'].apply(_rounded_mean),
})

print(most_frequent_sentiments)