In [175]:
import pandas as pd
import nltk
import json
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

In [176]:
# Fetch the NLTK resources this notebook relies on.
nltk.download('vader_lexicon')  # lexicon used by SentimentIntensityAnalyzer
nltk.download('punkt')          # tokenizer models for word_tokenize (older NLTK releases)
# Recent NLTK releases load word_tokenize's models from 'punkt_tab' rather than
# 'punkt'; fetching both lets the tokenization cell run on either.
nltk.download('punkt_tab')
nltk.download('stopwords')      # kept last so the cell's displayed result is unchanged

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/felixvu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/felixvu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felixvu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [177]:
# Load the preprocessed data; later cells read its 'Descriptions_clean' and
# 'Title' columns.
df = pd.read_csv('../data/preprocessed_data.csv')

In [178]:
# Perform sentiment analysis: store VADER's polarity-score dict for each row.
sia = SentimentIntensityAnalyzer()
# A description that was empty when the CSV was written is read back as NaN
# (a float), which is not text the analyzer can score. Treat missing values as
# empty text for scoring only; the 'Descriptions_clean' column is left as-is.
df['Sentiment'] = (
    df['Descriptions_clean']
    .fillna('')
    .astype(str)
    .apply(sia.polarity_scores)
)

In [179]:
# Categorize sentiment scores
def categorize_scores(sentiment_scores):
    """Convert a polarity-score dict into a category label and three scores.

    The sign of the compound score picks the category, its magnitude becomes
    the score of that category, and the remainder up to 1 is the neutral
    score. For a compound score in [-1, 1] the three scores therefore sum
    to 1.

    Args:
        sentiment_scores: Mapping with a numeric 'compound' entry.

    Returns:
        Tuple ``(category, negative_score, positive_score, neutral_score)``
        where ``category`` is 'Negative', 'Positive' or 'Neutral'.

    Raises:
        KeyError: If 'compound' is missing from ``sentiment_scores``.
    """
    compound_score = sentiment_scores['compound']
    # Use the magnitude for both polarities. Subtracting a negative compound
    # from 1 would push the neutral score above 1 and over-weight negative
    # rows when scores are later normalized per title.
    strength = abs(compound_score)
    if compound_score < 0:
        return 'Negative', strength, 0, 1 - strength
    if compound_score > 0:
        return 'Positive', 0, strength, 1 - strength
    return 'Neutral', 0, 0, 1

# Expand the (category, negative, positive, neutral) tuples into four columns.
# Building one DataFrame from the list of tuples avoids constructing a
# separate Series for every row.
score_columns = ['Sentiment_Category', 'Negative_Score', 'Positive_Score', 'Neutral_Score']
df[score_columns] = pd.DataFrame(
    df['Sentiment'].apply(categorize_scores).tolist(),
    index=df.index,  # keep rows aligned with df
    columns=score_columns,
)


In [180]:
# Per-title share of each score: sum the three score columns within every
# title, divide each row by its own total, and round to 4 decimals.
grouped_scores = (
    df.groupby('Title')[['Negative_Score', 'Positive_Score', 'Neutral_Score']]
    .sum()
    .pipe(lambda summed: summed.div(summed.sum(axis=1), axis=0))
    .round(4)
)

In [181]:
# Write the normalized scores to disk as JSON, keyed by title
# ({title: {score column: value}}).
sentiment_dict = grouped_scores.to_dict(orient='index')
with open('../data/sentiment_scores.json', 'w') as f:
    f.write(json.dumps(sentiment_dict))

In [182]:
# Extract the distinct words for each sentiment category, grouped by title.
# Result shape: {title: {'Negative': [...], 'Positive': [...], 'Neutral': [...]}}
word_categories = {}
for title, group in df.groupby('Title'):
    words_by_category = {}
    for category in ('Negative', 'Positive', 'Neutral'):
        descriptions = group.loc[group['Sentiment_Category'] == category, 'Descriptions_clean']
        # Skip missing descriptions: ' '.join only accepts strings.
        tokens = word_tokenize(' '.join(descriptions.dropna().astype(str)))
        # Sort the unique tokens so the exported file is identical across
        # runs; plain set iteration order is not stable between sessions.
        words_by_category[category] = sorted(set(tokens))
    word_categories[title] = words_by_category

In [183]:
# Write the per-title word lists to disk as JSON.
with open('../data/word_categories.json', 'w') as f:
    f.write(json.dumps(word_categories))