In [51]:
import pandas as pd
import nltk
import json
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from collections import Counter

In [52]:
# Fetch the NLTK resources this notebook relies on. Each call is a no-op when
# the package is already up to date.
#   vader_lexicon - lexicon used by SentimentIntensityAnalyzer
#   punkt         - tokenizer models used by word_tokenize on older NLTK
#   punkt_tab     - tokenizer tables that newer NLTK releases load instead of
#                   the pickled punkt models; without them word_tokenize
#                   raises LookupError on those releases
#   stopwords     - corpus behind nltk.corpus.stopwords
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/felixvu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/felixvu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felixvu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [53]:
# Load the preprocessed postings; later cells read the 'Title' and
# 'Descriptions_clean' columns from this frame.
df = pd.read_csv(filepath_or_buffer='../data/preprocessed_data.csv')

In [54]:
# Score every cleaned description with VADER. Each row of 'Sentiment' holds
# the dict returned by polarity_scores (read later via its 'neg', 'pos',
# 'neu' and 'compound' keys).
sia = SentimentIntensityAnalyzer()
df['Sentiment'] = df['Descriptions_clean'].apply(sia.polarity_scores)

In [55]:
# Split the per-row sentiment dict into one column per component
# (negative, positive, neutral), created in that order.
column_to_vader_key = {
    'Negative Score': 'neg',
    'Positive Score': 'pos',
    'Neutral Score': 'neu',
}
for column_name, vader_key in column_to_vader_key.items():
    # Bind the key as a default argument so each lambda reads its own key.
    df[column_name] = df['Sentiment'].apply(lambda scores, key=vader_key: scores[key])

In [56]:
# Show each posting's title next to its three sentiment components
display_columns = ['Title', 'Negative Score', 'Positive Score', 'Neutral Score']
print(df[display_columns])

                         Title  Negative Score  Positive Score  Neutral Score
0                       Others           0.006           0.138          0.856
1               Data Scientist           0.033           0.100          0.867
2               Data Scientist           0.033           0.100          0.867
3             Business Analyst           0.025           0.132          0.843
4                Data Engineer           0.000           0.143          0.857
..                         ...             ...             ...            ...
325           Business Analyst           0.000           0.068          0.932
326             Data Scientist           0.016           0.166          0.818
327             Data Scientist           0.050           0.234          0.717
328  Machine Learning Engineer           0.009           0.176          0.815
329  Machine Learning Engineer           0.009           0.176          0.815

[330 rows x 4 columns]


In [57]:
# Average the three sentiment components per job title, turn the title back
# into a regular column, and round the means to 4 decimal places.
grouped_df = (
    df.groupby('Title')[['Negative Score', 'Positive Score', 'Neutral Score']]
    .mean()
    .reset_index()
    .round(4)
)


In [58]:
# Keep every title except the catch-all 'Others' category
is_named_title = grouped_df['Title'].ne('Others')
grouped_df = grouped_df.loc[is_named_title]

In [59]:
# Order the per-title averages alphabetically by title
grouped_df = grouped_df.sort_values(by='Title', ascending=True)

In [60]:
# Mapping of job title -> categorized word lists, filled in by the cells below
word_data = dict()

In [61]:
# Common function words that are dropped before categorizing tokens.
# Compared against the lower-cased token, so entries must be lower case.
excluded_words = [
    'a', 'an', 'and', 'are', 'as', 'at',
    'be', 'by',
    'for', 'from',
    'has', 'he',
    'in', 'is', 'it', 'its',
    'of', 'on', 'or',
    'that', 'the', 'to',
    'was', 'were', 'will', 'with',
]

In [62]:
# Categorize words by title
#
# Every cleaned description is tokenized and each remaining token is placed
# in one of three buckets according to the sign of its VADER compound score
# (< 0 negative, > 0 positive, otherwise neutral). Tokens are accumulated per
# job title in word_data, in the order they are encountered.

# Start from an empty mapping so that re-running this cell does not append
# the same tokens to lists left over from an earlier run.
word_data = {}

# The score of a token depends only on the token text, so each distinct token
# is scored once and its bucket is reused for every later occurrence.
bucket_by_token = {}

for title, description in zip(df['Title'], df['Descriptions_clean']):
    # Create the three buckets the first time a title is seen.
    title_words = word_data.setdefault(
        title,
        {'Negative Words': [], 'Positive Words': [], 'Neutral Words': []},
    )

    for token in word_tokenize(description):
        # Skip single-letter words
        if len(token) == 1:
            continue
        # Skip excluded words (case-insensitive match)
        if token.lower() in excluded_words:
            continue

        bucket = bucket_by_token.get(token)
        if bucket is None:
            score = sia.polarity_scores(token)['compound']
            if score < 0:
                bucket = 'Negative Words'
            elif score > 0:
                bucket = 'Positive Words'
            else:
                bucket = 'Neutral Words'
            bucket_by_token[token] = bucket

        title_words[bucket].append(token)


In [63]:
# # Sort words by counts in descending order
# for title in word_data:
#     negative_word_counts = Counter(word_data[title]['Negative Words'])
#     positive_word_counts = Counter(word_data[title]['Positive Words'])
#     neutral_word_counts = Counter(word_data[title]['Neutral Words'])
    
#     word_data[title]['Negative Words'] = sorted(negative_word_counts.keys(), key=lambda x: negative_word_counts[x], reverse=True)
#     word_data[title]['Positive Words'] = sorted(positive_word_counts.keys(), key=lambda x: positive_word_counts[x], reverse=True)
#     word_data[title]['Neutral Words'] = sorted(neutral_word_counts.keys(), key=lambda x: neutral_word_counts[x], reverse=True)

In [64]:
# Replace each title's word lists with their 15 most frequent distinct words,
# ordered from most to least frequent; words with equal counts keep the order
# in which they were first seen.
for title_words in word_data.values():
    for category in ('Negative Words', 'Positive Words', 'Neutral Words'):
        frequency = Counter(title_words[category])
        title_words[category] = [word for word, _count in frequency.most_common(15)]


In [65]:
# # Save words to JSON
# with open('../data/word_categories_descending.json', 'w') as file:
#     json.dump(word_data, file)

In [66]:
# Persist the per-title top-15 word lists as a JSON document
with open('../data/word_categories_descending_15.json', 'w') as json_file:
    json_file.write(json.dumps(word_data))

In [67]:
# Serialize the per-title averages to a JSON string with one object per row.
# path_or_buf=None makes to_json return the string instead of writing a file.
result = grouped_df.to_json(path_or_buf=None, orient='records')

In [68]:
# Write the per-title averages to disk. The JSON string is parsed back into
# Python objects and re-serialized with the json module.
with open('../data/sentiment_scores.json', 'w') as json_file:
    sentiment_records = json.loads(result)
    json.dump(sentiment_records, json_file)