In [None]:
import re
import nltk
import spacy
import pandas as pd
import seaborn as sns
import pyLDAvis.gensim
import matplotlib.pyplot as plt
from textblob import TextBlob
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.corpora import Dictionary
from gensim.models import Word2Vec, KeyedVectors, LdaModel
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF

# Exploratory Data Analysis (EDA)

In [None]:
# Read the raw review CSV into df using the ISO-8859-1 encoding and preview the first rows.
file_path = 'data/DisneylandReviews.csv'
df = pd.read_csv(file_path, encoding='ISO-8859-1')
print(df.head())

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
df.info()

In [None]:
# Summary statistics as returned by DataFrame.describe()
print(df.describe())

In [None]:
# Number of null values in each column
print(df.isnull().sum())

In [None]:
# Build a per-column table of null counts and percentages, restricted to
# columns that actually contain nulls.
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
missing_percentage = (missing_values / len(df)) * 100
missing_info = pd.DataFrame({'Missing Values': missing_values, 'Missing Percentage': missing_percentage})
# The table was previously computed but never shown.
print(missing_info)

In [None]:
# Visualization 1: number of reviews for each value of the Rating column
plt.figure(figsize=(8, 5))
sns.countplot(x='Rating', hue='Rating', data=df, legend=False, dodge=False)
plt.xlabel('Rating')
plt.ylabel('Count')
plt.title('Rating Distribution')
plt.tight_layout()
plt.savefig('images/rating_distribution.png')
plt.show()

In [None]:
# Visualization 2: number of reviews per Branch, bars ordered by review count
branch_order = df['Branch'].value_counts().index
plt.figure(figsize=(10, 6))
sns.countplot(y='Branch', hue='Branch', data=df, order=branch_order, legend=False, dodge=False)
plt.xlabel('Count')
plt.ylabel('Branch')
plt.title('Reviews per Disneyland Branch')
plt.tight_layout()
plt.savefig('images/branch_distribution.png')
plt.show()

In [None]:
# Visualization 3: distribution of review length (character count of Review_Text)
review_lengths = df['Review_Text'].apply(len)
df['Review_Length'] = review_lengths
plt.figure(figsize=(8, 5))
sns.histplot(df['Review_Length'], color='blue', kde=True, bins=50)
plt.xlabel('Length of Review')
plt.ylabel('Frequency')
plt.title('Review Length Distribution')
plt.tight_layout()
plt.savefig('images/review_length.png')
plt.show()

In [None]:
# Visualization 4: number of reviews per year
def _extract_year(year_month):
    """Return the text before the first '-' of a Year_Month value, or 'Unknown' for 'missing'."""
    if year_month == 'missing':
        return 'Unknown'
    return year_month.split('-')[0]


df['Year'] = df['Year_Month'].apply(_extract_year)
year_order = sorted(df['Year'].unique())
plt.figure(figsize=(10, 6))
sns.countplot(x='Year', hue='Year', data=df, order=year_order, legend=False, dodge=False)
plt.xlabel('Year')
plt.ylabel('Count')
plt.title('Reviews per Year')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('images/year_distribution.png')
plt.show()

In [None]:
# Visualization 5: the ten Reviewer_Location values with the most reviews
top_locations = df['Reviewer_Location'].value_counts().head(10)
location_names = top_locations.index
plt.figure(figsize=(10, 6))
sns.barplot(x=top_locations, y=location_names, hue=location_names, legend=False, dodge=False)
plt.xlabel('Number of Reviews')
plt.ylabel('Reviewer Location')
plt.title('Top 10 Reviewer Locations')
plt.tight_layout()
plt.savefig('images/top_locations.png')
plt.show()

In [None]:
# Visualization 6: mean Rating for every (Year, Branch) pair, Year as rows and Branch as columns
mean_rating = df.groupby(['Year', 'Branch'])['Rating'].mean()
rating_per_year_branch = mean_rating.unstack()
plt.figure(figsize=(14, 8))
sns.heatmap(rating_per_year_branch, cmap='coolwarm', annot=True, fmt='.2f')
plt.title('Rating Over Years by Branch')
plt.tight_layout()
plt.savefig('images/rating_over_years_by_branch.png')
plt.show()

In [None]:
# Visualization 7: word cloud built from all reviews rated 4 or 5
positive_reviews = df[df['Rating'] >= 4]
positive_reviews_text = ' '.join(positive_reviews['Review_Text'])
positive_cloud = WordCloud(
    width=800,
    height=400,
    random_state=21,
    max_font_size=110,
    background_color='white',
    max_words=100,
)
# generate() returns the WordCloud instance itself
wordcloud = positive_cloud.generate(positive_reviews_text)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('Wordcloud for Positive Reviews')
plt.tight_layout()
plt.savefig('images/wordcloud_positive_reviews.png')
plt.show()

In [None]:
# Visualization 8: word cloud built from all reviews rated 3 or lower
negative_reviews = df[df['Rating'] <= 3]
negative_reviews_text = ' '.join(negative_reviews['Review_Text'])
negative_cloud = WordCloud(
    width=800,
    height=400,
    random_state=21,
    max_font_size=110,
    background_color='white',
    max_words=100,
)
# generate() returns the WordCloud instance itself
wordcloud = negative_cloud.generate(negative_reviews_text)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('Wordcloud for Negative Reviews')
plt.tight_layout()
plt.savefig('images/wordcloud_negative_reviews.png')
plt.show()

# Data preprocessing

In [None]:
# Ensure the NLTK resources used below are available.
# 'punkt_tab' is the tokenizer data that recent NLTK releases look up for
# word_tokenize; 'punkt' is kept for older releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the small English spaCy pipeline for lemmatization.
# The parser and NER components are disabled because only lemmas are used.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [None]:
# Custom stopwords: domain words, fillers, generic verbs, pronouns, function
# words and time expressions that carry no topic information.
# Every entry needs its own trailing comma: adjacent string literals are
# concatenated by Python, which previously turned 'pm' 'park' into 'pmpark'
# and left both words unfiltered.
custom_stopwords = {
    # domain-specific
    'disney', 'land', 'disneyland', 'rides', 'ride', 'park', 'kid',
    # intensifiers and fillers
    'good', 'really', 'very', 'quite', 'pretty', 'especially', 'actually', 'probably',
    'maybe', 'sure',
    # generic nouns
    'time', 'day', 'year', 'thing', 'world', 'point', 'bit', 'number', 'week', 'one',
    # generic verbs
    'make', 'say', 'come', 'go', 'know', 'take', 'see', 'get', 'want', 'think', 'look',
    'tell', 'try', 'use', 'need', 'feel',
    # pronouns and determiners
    'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them', 'my',
    'your', 'his', 'its', 'our', 'their', 'a', 'an', 'the',
    # prepositions and conjunctions
    'in', 'on', 'at', 'from', 'with', 'about', 'between', 'into', 'through', 'during',
    'before', 'after', 'above', 'below', 'over', 'under', 'to', 'of', 'for', 'by', 'and',
    'but', 'or', 'so', 'yet', 'because', 'as', 'until', 'than',
    # durations and clock times
    '10', '20', '30', '45', '15', 'minute', 'second', 'hour', 'pm',
}

# Full stop-word set: NLTK's English list merged with the custom entries
stop_words = set(stopwords.words('english')) | custom_stopwords

In [None]:
# Keep only reviews whose Rating is at or below the threshold.
low_rating_threshold = 3
# .copy() makes clean_df an independent frame: later cells add columns to it,
# which on a bare boolean-filtered slice triggers SettingWithCopyWarning.
clean_df = df[df['Rating'] <= low_rating_threshold].copy()

In [None]:
# Negation word followed by whitespace and an alphabetic word.
# The apostrophe in contractions is optional because preprocess_text strips
# punctuation before calling handle_negations ("don't" arrives as "dont").
# \s+ (rather than a single \s) also covers the double spaces that removed
# punctuation can leave behind.
_NEGATION_PATTERN = re.compile(
    r"\b(not|no|never|none|cannot|can'?t|couldn'?t|shouldn'?t|won'?t|wouldn'?t"
    r"|don'?t|doesn'?t|didn'?t|isn'?t|aren'?t|ain'?t)\s+([a-z]+)\b",
    re.IGNORECASE,
)


def handle_negations(text):
    """Join each negation word to the word that follows it with an underscore.

    Example: "not good" -> "not_good", "dont like" -> "dont_like".
    Matching is case-insensitive and the original casing is preserved.

    Args:
        text: Input string.

    Returns:
        The string with every "<negation> <word>" pair rewritten as
        "<negation>_<word>"; text without such pairs is returned unchanged.
    """
    return _NEGATION_PATTERN.sub(r'\1_\2', text)


# Function to preprocess text
def preprocess_text(text):
    """Normalise one review into a space-separated string of lemmas.

    Steps: lowercase, drop every character that is not a letter, digit or
    whitespace, join negations via handle_negations, tokenise with NLTK,
    remove stop words, lemmatise with spaCy, then remove stop words again.

    Args:
        text: Raw review text (str).

    Returns:
        The cleaned, lemmatised text as a single string.
    """
    text = text.lower()  # Normalize text to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation
    text = handle_negations(text)  # e.g. "not good" -> "not_good"
    # First pass: drop stop words before lemmatization to keep the spaCy input short
    tokens = [token for token in word_tokenize(text) if token not in stop_words]
    lemmas = [token.lemma_ for token in nlp(' '.join(tokens))]
    # Second pass: inflected forms such as "kids" or "went" only become the
    # listed stop words ("kid", "go") after lemmatization, so filter again.
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)


# Run preprocess_text over every Review_Text value and store the result as Clean_Text
clean_df['Clean_Text'] = [preprocess_text(review) for review in clean_df['Review_Text']]

In [None]:
# Preview the raw and cleaned text side by side for the first rows
preview_columns = ['Review_Text', 'Clean_Text']
print(clean_df[preview_columns].head())

# Write the cleaned frame to disk without the index column
clean_df.to_csv('data/cleaned_reviews.csv', index=False)

# Bag of Words (BoW) model + LDA, LSA, NMF

In [None]:
# Count 2- and 3-grams in the cleaned text; min_df/max_df discard n-grams that are
# too rare (fewer than 5 documents) or too common (more than half of the documents).
count_vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(2, 3),
    min_df=5,
    max_df=0.5,
)
count_vectors = count_vectorizer.fit_transform(clean_df['Clean_Text'])

In [None]:
# Total count of every vocabulary entry across all documents, highest first
sum_words = count_vectors.sum(axis=0)
words_freq = [(term, sum_words[0, column]) for term, column in count_vectorizer.vocabulary_.items()]
sorted_words_freq = list(words_freq)
sorted_words_freq.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
# Print the top_n most frequent n-grams as an aligned two-column listing
top_n = 30
separator = "-" * 40
print("\nTop {} most frequent words/ngrams:".format(top_n))
print(separator)
for ngram, count in sorted_words_freq[:top_n]:
    print("{:<20} : {}".format(ngram, count))
print(separator)

In [None]:
# Horizontal bar chart of the top_n most frequent n-grams
top_n = 30
words, freqs = zip(*sorted_words_freq[:top_n])
bar_positions = range(len(words))
plt.figure(figsize=(10, 8))
plt.barh(bar_positions, freqs, align='center')
plt.yticks(bar_positions, words)
plt.gca().invert_yaxis()  # most frequent entry at the top
plt.title('Top {} Words/N-grams Frequency'.format(top_n))
plt.xlabel('Frequency')
plt.show()

In [None]:
# Number of topics and top words to display
n_topics = 4
no_top_words = 10

# Fit LDA, LSA (truncated SVD) and NMF on the bag-of-words matrix.
# All three get the same fixed seed: TruncatedSVD was previously unseeded,
# so its topics could differ between runs.
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42).fit(count_vectors)
lsa = TruncatedSVD(n_components=n_topics, random_state=42).fit(count_vectors)
nmf = NMF(n_components=n_topics, random_state=42).fit(count_vectors)

In [None]:
# Function to display topics
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing components_, one weight row per topic.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: Number of terms to print per topic.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # Indices sorted by descending weight, truncated to the requested count
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[position] for position in ranked]
        print("\nTopic {}:".format(topic_number))
        print(", ".join(top_terms))

# Print the top terms of each fitted model under its own heading
bow_model_headings = (
    ("\nLDA Model Topics:", lda),
    ("\nLSA Model Topics:", lsa),
    ("\nNMF Model Topics:", nmf),
)
for heading, fitted_model in bow_model_headings:
    print(heading)
    display_topics(fitted_model, count_vectorizer.get_feature_names_out(), no_top_words)

In [None]:
# Function to generate a word cloud
def generate_word_cloud(topic, feature_names, no_top_words):
    """Show a word cloud of one topic's highest-weighted terms.

    Args:
        topic: Array of term weights for a single topic.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: Number of top terms to include in the cloud.
    """
    top_indices = topic.argsort()[:-no_top_words - 1:-1]
    word_freqs = {}
    for index in top_indices:
        word_freqs[feature_names[index]] = topic[index]
    cloud = WordCloud(width=800, height=400, background_color='white')
    wordcloud = cloud.generate_from_frequencies(word_freqs)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

# One word cloud per LDA topic, numbered from 1
topic_idx = 0
for topic in lda.components_:
    print("Word Cloud for LDA Topic {}:".format(topic_idx + 1))
    generate_word_cloud(topic, count_vectorizer.get_feature_names_out(), no_top_words)
    topic_idx += 1
# Leave topic_idx at the last topic's index, as enumerate() did
topic_idx -= 1 if len(lda.components_) else 0

In [None]:
# t-SNE Visualization for LDA
def tsne_visualization(model, data):
    """Project the model's document-topic weights to 2-D with t-SNE and plot them.

    Args:
        model: Fitted topic model exposing transform().
        data: Document-term matrix passed to model.transform().
    """
    print("\nPerforming t-SNE Visualization...")
    topic_weights = model.transform(data)
    embedding = TSNE(n_components=2, verbose=0, random_state=0, angle=.99, init='pca').fit_transform(topic_weights)
    xs, ys = embedding[:, 0], embedding[:, 1]

    # Scatter plot of the two t-SNE components
    plt.figure(figsize=(10, 5))
    plt.scatter(xs, ys, alpha=0.5)
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.title('t-SNE Visualization of LDA Topics')
    plt.show()

tsne_visualization(lda, count_vectors)

# TF-IDF model + LDA, LSA, NMF

In [None]:
# TF-IDF over unigrams and bigrams of the cleaned text, with the same
# min_df/max_df document-frequency limits as the count vectorizer
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.5,
)
tfidf_vectors = tfidf_vectorizer.fit_transform(clean_df['Clean_Text'])

In [None]:
# Total TF-IDF weight of every vocabulary entry across all documents, highest first
sum_tfidf = tfidf_vectors.sum(axis=0)
words_tfidf = [(term, sum_tfidf[0, column]) for term, column in tfidf_vectorizer.vocabulary_.items()]
sorted_words_tfidf = list(words_tfidf)
sorted_words_tfidf.sort(key=lambda pair: pair[1], reverse=True)

# Print the top_n entries as an aligned two-column listing
top_n = 30
separator = "-" * 40
print("\nTop {} words with the highest TF-IDF scores:".format(top_n))
print(separator)
for term, total in sorted_words_tfidf[:top_n]:
    print("{:<20} : {}".format(term, total))
print(separator)

In [None]:
# Horizontal bar chart of the top_n terms by summed TF-IDF score
tfidf_words, tfidf_scores = zip(*sorted_words_tfidf[:top_n])

plt.figure(figsize=(10, 8))
plt.barh(range(len(tfidf_words)), tfidf_scores, align='center')
plt.yticks(range(len(tfidf_words)), tfidf_words)
plt.gca().invert_yaxis()  # Invert y-axis to have the highest score on top
# The bars are summed TF-IDF weights, not raw counts, so label them as such
plt.xlabel('Summed TF-IDF score')
plt.title('Top {} Words with Highest TF-IDF Scores'.format(top_n))
# Fit the term labels inside the canvas before saving, as the other saved figures do
plt.tight_layout()
plt.savefig('images/word_frequencies.png')
plt.show()

In [None]:
# Number of topics and top words to display
n_topics = 4
no_top_words = 10

# Fit LDA, LSA (truncated SVD) and NMF on the TF-IDF matrix.
# Note: this rebinds lda, lsa and nmf, replacing the bag-of-words models.
# TruncatedSVD now gets the same fixed seed as the other two models; it was
# previously unseeded, so its topics could differ between runs.
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42).fit(tfidf_vectors)
lsa = TruncatedSVD(n_components=n_topics, random_state=42).fit(tfidf_vectors)
nmf = NMF(n_components=n_topics, random_state=42).fit(tfidf_vectors)

In [None]:
# Function to display topics (same behaviour as the definition in the BoW section)
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing components_, one weight row per topic.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: Number of terms to print per topic.
    """
    topic_number = 0
    for weights in model.components_:
        topic_number += 1
        descending = weights.argsort()[::-1]
        top_terms = [feature_names[position] for position in descending[:no_top_words]]
        print("\nTopic {}:".format(topic_number))
        print(", ".join(top_terms))

# Print the top terms of each TF-IDF model under its own heading
tfidf_model_headings = (
    ("\nLDA Model Topics:", lda),
    ("\nLSA Model Topics:", lsa),
    ("\nNMF Model Topics:", nmf),
)
for heading, fitted_model in tfidf_model_headings:
    print(heading)
    display_topics(fitted_model, tfidf_vectorizer.get_feature_names_out(), no_top_words)

In [None]:
# Function to generate a word cloud
def generate_word_cloud(topic, feature_names, no_top_words, save_path='images/tfidf_word_cloud.png'):
    """Show and save a word cloud of one topic's highest-weighted terms.

    Args:
        topic: Array of term weights for a single topic.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: Number of top terms to include in the cloud.
        save_path: Where the figure is written. The default keeps the original
            file name; pass a distinct path per topic to avoid overwriting.
    """
    top_indices = topic.argsort()[:-no_top_words - 1:-1]
    word_freqs = {feature_names[i]: topic[i] for i in top_indices}
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freqs)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(save_path)
    plt.show()

# Generate word clouds for LDA topics.
# Each topic gets its own file; previously every iteration wrote to the same
# path, so only the last topic's cloud was left on disk.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print("Word Cloud for LDA Topic {}:".format(topic_idx + 1))
    generate_word_cloud(topic, tfidf_feature_names, no_top_words,
                        save_path='images/tfidf_word_cloud_topic_{}.png'.format(topic_idx + 1))

In [None]:
# t-SNE Visualization for LDA
def tsne_visualization(model, data):
    """Return a 2-D t-SNE embedding of the model's document-topic weights.

    Args:
        model: Fitted topic model exposing transform().
        data: Document-term matrix passed to model.transform().

    Returns:
        The array produced by TSNE.fit_transform on the topic weights.
    """
    print("\nPerforming t-SNE Visualization...")
    reducer = TSNE(n_components=2, verbose=0, random_state=0, angle=.99, init='pca')
    return reducer.fit_transform(model.transform(data))

# Plot t-SNE
def plot_tsne(tsne_results, title):
    """Scatter-plot a 2-D embedding, save it to images/tfidf_tsne_lda.png and show it.

    Args:
        tsne_results: 2-D array; column 0 is plotted on x and column 1 on y.
        title: Title placed above the plot.
    """
    xs = tsne_results[:, 0]
    ys = tsne_results[:, 1]
    plt.figure(figsize=(12, 8))
    plt.scatter(xs, ys, alpha=0.7)
    plt.title(title)
    plt.xlabel('t-SNE feature 1')
    plt.ylabel('t-SNE feature 2')
    plt.savefig('images/tfidf_tsne_lda.png')
    plt.show()

# Embed the TF-IDF LDA topic weights, then plot and save the result
tsne_lda = tsne_visualization(model=lda, data=tfidf_vectors)
plot_tsne(tsne_results=tsne_lda, title='t-SNE Visualization of LDA Topics')

# Word2Vec model + LDA (gensim)

In [None]:
# Tokenize every cleaned review into a list of words
preprocessed_reviews = [word_tokenize(cleaned) for cleaned in clean_df['Clean_Text']]

# Train Word2Vec on the tokenized reviews
model = Word2Vec(preprocessed_reviews, window=5, min_count=1, workers=4)

# Look up the embedding of every token occurrence.
# Note: this is one flat list of token vectors across all reviews,
# not one aggregated vector per review.
review_vectors = []
for tokens in preprocessed_reviews:
    review_vectors.extend(model.wv[token] for token in tokens)

In [None]:
# Build the gensim dictionary and bag-of-words corpus, then fit LDA.
# random_state pins the initialisation so the printed topics are the same on
# every run (the model was previously unseeded).
dictionary = Dictionary(preprocessed_reviews)
corpus = [dictionary.doc2bow(review) for review in preprocessed_reviews]
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, random_state=42)

# Save the Word2Vec keyed vectors (model.wv) for the cells that reload them
model.wv.save('models/word2vec.model')

# Print the topics
for topic_id, topic_terms in lda_model.print_topics():
    print(f"Topic {topic_id}: {topic_terms}")
    print("\n")

In [None]:
# Load the keyed vectors from the path they were saved to.
# The save call writes 'models/word2vec.model'; loading the bare
# 'word2vec.model' pointed at a different file.
word2vec_model = KeyedVectors.load('models/word2vec.model')

# Use the Word2Vec model to find similar words
similar_words = word2vec_model.similar_by_word('line')
print(f"The words similar to 'line' are: {similar_words}")
print("\n")

# Use the Word2Vec model to find the similarity between two words
similarity = word2vec_model.similarity('queue', 'long')
print(f"The similarity between 'queue' and 'long' is {similarity}")

In [None]:
# Interactive pyLDAvis view of the gensim LDA model; `vis` must remain the
# last expression of the cell so the notebook renders it.
# NOTE(review): newer pyLDAvis releases appear to expose this adapter as
# pyLDAvis.gensim_models — confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

In [None]:
# Visualize the Word2Vec vocabulary in 2-D with t-SNE
# Matrix of all word vectors held by the loaded KeyedVectors
word_vectors = word2vec_model.vectors

# Reduce the word vectors to two dimensions
tsne = TSNE(n_components=2, random_state=42)
word_vectors_2d = tsne.fit_transform(word_vectors)

# Scatter plot of the reduced vectors
dim_1 = word_vectors_2d[:, 0]
dim_2 = word_vectors_2d[:, 1]
plt.figure(figsize=(10, 10))
plt.scatter(dim_1, dim_2, marker='o')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.title('t-SNE visualization of Word2Vec model')
plt.tight_layout()
plt.savefig('images/word2vec_tsne.png')
plt.show()

# Sentiment Analysis using TextBlob and VADER

In [None]:
# TextBlob polarity of each cleaned review
def _textblob_polarity(text):
    """Return TextBlob's sentiment polarity for text."""
    return TextBlob(text).sentiment.polarity


clean_df['sentiment'] = clean_df['Clean_Text'].apply(_textblob_polarity)

# Label strictly positive polarity as 'positive', everything else as 'negative'
def _polarity_label(score):
    """Map a polarity score to 'positive' (score > 0) or 'negative'."""
    if score > 0:
        return 'positive'
    return 'negative'


clean_df['sentiment_cat'] = clean_df['sentiment'].apply(_polarity_label)

In [None]:
# Plot the TextBlob polarity distribution.
# The 'sentiment' column is created on clean_df, not df; reading it from df
# raised KeyError when the notebook was run top to bottom.
plt.figure(figsize=(10, 6))
sns.histplot(clean_df['sentiment'], kde=True, color='skyblue')
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig('images/sentiment_distribution.png')
plt.show()

In [None]:
# Plot the TextBlob sentiment category counts.
# 'sentiment_cat' is created on clean_df, not df; reading it from df
# raised an error when the notebook was run top to bottom.
plt.figure(figsize=(10, 6))
sns.countplot(data=clean_df, x='sentiment_cat', hue='sentiment_cat', dodge=False, palette='pastel')
plt.title('Sentiment Distribution by Category')
plt.xlabel('Sentiment Category')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig('images/sentiment_category_distribution.png')
plt.show()

In [None]:
# Sentiment Analysis using VADER
# Shared analyzer instance used by get_sentiment_score
analyzer = SentimentIntensityAnalyzer()

# Scoring helper applied to each review
def get_sentiment_score(text):
    """Return the 'compound' entry of VADER's polarity scores for text."""
    scores = analyzer.polarity_scores(text)
    return scores['compound']

# VADER compound score of each cleaned review
vader_scores = clean_df['Clean_Text'].apply(get_sentiment_score)
clean_df['sentiment_vader'] = vader_scores

# Label strictly positive scores as 'positive', everything else as 'negative'
def _vader_label(score):
    """Map a compound score to 'positive' (score > 0) or 'negative'."""
    if score > 0:
        return 'positive'
    return 'negative'


clean_df['sentiment_cat_vader'] = clean_df['sentiment_vader'].apply(_vader_label)

In [None]:
# Compare TextBlob and VADER on the low-rating reviews.
# All four sentiment columns are created on clean_df, not df; reading them
# from df raised KeyError when the notebook was run top to bottom.

# Mean score per library
print("The average sentiment score from TextBlob is:")
print(clean_df['sentiment'].mean())
print("The average sentiment score from vaderSentiment is:")
print(clean_df['sentiment_vader'].mean())

# Positive/negative counts per library
print("The sentiment category distribution from TextBlob is:")
print(clean_df['sentiment_cat'].value_counts())
print("The sentiment category distribution from vaderSentiment is:")
print(clean_df['sentiment_cat_vader'].value_counts())

# Spread of the scores per library
print("The standard deviation of the sentiment scores from TextBlob is:")
print(clean_df['sentiment'].std())
print("The standard deviation of the sentiment scores from vaderSentiment is:")
print(clean_df['sentiment_vader'].std())

In [None]:
# Plot the VADER compound score distribution.
# 'sentiment_vader' is created on clean_df, not df; reading it from df
# raised KeyError when the notebook was run top to bottom.
plt.figure(figsize=(10, 6))
sns.histplot(clean_df['sentiment_vader'], kde=True, color='skyblue')
plt.title('Sentiment Distribution (vaderSentiment)')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig('images/sentiment_distribution_vader.png')
plt.show()


In [None]:
# Plot the VADER sentiment category counts.
# 'sentiment_cat_vader' is created on clean_df, not df; reading it from df
# raised an error when the notebook was run top to bottom.
plt.figure(figsize=(10, 6))
sns.countplot(data=clean_df, x='sentiment_cat_vader', hue='sentiment_cat_vader', dodge=False, palette='pastel')
plt.title('Sentiment Distribution by Category (vaderSentiment)')
plt.xlabel('Sentiment Category')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig('images/sentiment_category_distribution_vader.png')
plt.show()

In [None]:
# Save the sentiment analysis results for clean_df (the low-rating subset).
# NOTE(review): the last cell of this notebook writes the full-dataset results
# to this same path, so this file is overwritten on a full run — confirm that
# is intended.
clean_df.to_csv('data/sentiment_analysis.csv', index=False)

In [None]:
# Reload the raw review CSV. This rebinds df, so columns added to the earlier
# df (e.g. Review_Length, Year) are no longer present afterwards.
df = pd.read_csv('data/DisneylandReviews.csv', encoding='ISO-8859-1')

# TextBlob polarity of each raw review
def _raw_textblob_polarity(text):
    """Return TextBlob's sentiment polarity for text."""
    return TextBlob(text).sentiment.polarity


df['sentiment'] = df['Review_Text'].apply(_raw_textblob_polarity)

# Label strictly positive polarity as 'positive', everything else as 'negative'
def _raw_polarity_label(score):
    """Map a polarity score to 'positive' (score > 0) or 'negative'."""
    if score > 0:
        return 'positive'
    return 'negative'


df['sentiment_cat'] = df['sentiment'].apply(_raw_polarity_label)

# Write the intermediate results (TextBlob columns only) to disk
df.to_csv('data/sentiment_analysis.csv', index=False)

# Histogram (with KDE) of the TextBlob polarity over the full dataset
plt.figure(figsize=(10, 6))
sns.histplot(df['sentiment'], color='skyblue', kde=True)
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.title('Sentiment Distribution')
plt.tight_layout()
plt.savefig('images/sentiment_distribution.png')
plt.show()

# Count of reviews per TextBlob sentiment category
plt.figure(figsize=(10, 6))
sns.countplot(x='sentiment_cat', hue='sentiment_cat', data=df, palette='pastel', dodge=False)
plt.xlabel('Sentiment Category')
plt.ylabel('Count')
plt.title('Sentiment Distribution by Category')
plt.tight_layout()
plt.savefig('images/sentiment_category_distribution.png')
plt.show()

# Sentiment Analysis using VADER on the full dataset
# Shared analyzer instance used by get_sentiment_score
analyzer = SentimentIntensityAnalyzer()

# Scoring helper (redefines the function from the earlier VADER cell with the same behaviour)
def get_sentiment_score(text):
    """Return the 'compound' entry of VADER's polarity scores for text."""
    scores = analyzer.polarity_scores(text)
    return scores['compound']

# VADER compound score of each raw review
raw_vader_scores = df['Review_Text'].apply(get_sentiment_score)
df['sentiment_vader'] = raw_vader_scores

# Label strictly positive scores as 'positive', everything else as 'negative'
def _raw_vader_label(score):
    """Map a compound score to 'positive' (score > 0) or 'negative'."""
    if score > 0:
        return 'positive'
    return 'negative'


df['sentiment_cat_vader'] = df['sentiment_vader'].apply(_raw_vader_label)

# Side-by-side comparison of TextBlob and VADER on the full dataset
textblob_scores = df['sentiment']
vader_scores_full = df['sentiment_vader']

# Mean score per library
print("The average sentiment score from TextBlob is:")
print(textblob_scores.mean())
print("The average sentiment score from vaderSentiment is:")
print(vader_scores_full.mean())

# Positive/negative counts per library
print("The sentiment category distribution from TextBlob is:")
print(df['sentiment_cat'].value_counts())
print("The sentiment category distribution from vaderSentiment is:")
print(df['sentiment_cat_vader'].value_counts())

# Spread of the scores per library
print("The standard deviation of the sentiment scores from TextBlob is:")
print(textblob_scores.std())
print("The standard deviation of the sentiment scores from vaderSentiment is:")
print(vader_scores_full.std())

# Histogram (with KDE) of the VADER compound score over the full dataset
plt.figure(figsize=(10, 6))
sns.histplot(df['sentiment_vader'], color='skyblue', kde=True)
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.title('Sentiment Distribution (vaderSentiment)')
plt.tight_layout()
plt.savefig('images/sentiment_distribution_vader.png')
plt.show()

# Count of reviews per VADER sentiment category
plt.figure(figsize=(10, 6))
sns.countplot(x='sentiment_cat_vader', hue='sentiment_cat_vader', data=df, palette='pastel', dodge=False)
plt.xlabel('Sentiment Category')
plt.ylabel('Count')
plt.title('Sentiment Distribution by Category (vaderSentiment)')
plt.tight_layout()
plt.savefig('images/sentiment_category_distribution_vader.png')
plt.show()


# Save the full-dataset sentiment results (TextBlob and VADER columns).
# NOTE(review): this is the same path the clean_df results were written to in
# an earlier cell, so that file is replaced here — confirm that is intended.
df.to_csv('data/sentiment_analysis.csv', index=False)