In [208]:
import pandas as pd
import nltk
import json
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

In [209]:
# Fetch the NLTK resources used by the cells below. Each call is cheap when the
# package is already present locally (NLTK reports it as up-to-date).
nltk.download('vader_lexicon')  # lexicon loaded by SentimentIntensityAnalyzer
nltk.download('punkt')          # tokenizer models read by word_tokenize on older NLTK releases
# Newer NLTK releases make word_tokenize load 'punkt_tab' instead of 'punkt';
# without it the tokenizing cell fails with a LookupError on a fresh machine.
nltk.download('punkt_tab')
nltk.download('stopwords')      # backs the nltk.corpus.stopwords import

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/felixvu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/felixvu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felixvu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [210]:
# Load the preprocessed dataset; later cells read its 'Title' and
# 'Descriptions_clean' columns.
df = pd.read_csv(filepath_or_buffer='../data/preprocessed_data.csv')

In [211]:
# Run the sentiment analyzer over every cleaned description. Each entry of the
# new 'Sentiment' column is the dict returned by polarity_scores for that row.
sia = SentimentIntensityAnalyzer()
df['Sentiment'] = df['Descriptions_clean'].apply(sia.polarity_scores)

In [212]:
# Unpack the per-row sentiment dict into one column per polarity score.
# Each pair is (destination column, key inside the sentiment dict).
for score_column, score_key in (
    ('Negative Score', 'neg'),
    ('Positive Score', 'pos'),
    ('Neutral Score', 'neu'),
):
    # Bind the key as a default argument so every lambda reads its own key.
    df[score_column] = df['Sentiment'].apply(lambda scores, key=score_key: scores[key])

In [213]:
# Print each row's title alongside its three sentiment scores
columns_to_show = ['Title', 'Negative Score', 'Positive Score', 'Neutral Score']
print(df[columns_to_show])

                         Title  Negative Score  Positive Score  Neutral Score
0                       Others           0.006           0.138          0.856
1               Data Scientist           0.033           0.100          0.867
2               Data Scientist           0.033           0.100          0.867
3             Business Analyst           0.025           0.132          0.843
4                Data Engineer           0.000           0.143          0.857
..                         ...             ...             ...            ...
325           Business Analyst           0.000           0.068          0.932
326             Data Scientist           0.016           0.166          0.818
327             Data Scientist           0.050           0.234          0.717
328  Machine Learning Engineer           0.009           0.176          0.815
329  Machine Learning Engineer           0.009           0.176          0.815

[330 rows x 4 columns]


In [214]:
# Average the three sentiment scores per title, turn 'Title' back into a
# regular column, and round the result to 4 decimal places.
grouped_df = (
    df.groupby('Title')[['Negative Score', 'Positive Score', 'Neutral Score']]
    .mean()
    .reset_index()
    .round(4)
)


In [215]:
# Keep only the rows whose Title is not 'Others'
is_named_title = grouped_df['Title'].ne('Others')
grouped_df = grouped_df.loc[is_named_title]

In [216]:
# Order the per-title averages by ascending Title
grouped_df = grouped_df.sort_values(by='Title', ascending=True)

In [217]:
# Empty mapping that the next cell fills in:
# title -> {'Negative Words': [...], 'Positive Words': [...]}
word_data = dict()

In [218]:
# Categorize words by title.
#
# For every description, tokenize it and bucket each token by the sign of its
# compound polarity score (negative < 0, positive > 0, zero is ignored). The
# buckets are accumulated per title and stored in word_data as
#   {title: {'Negative Words': [...], 'Positive Words': [...]}}
# with duplicate-free, sorted lists so the saved JSON is identical across runs
# (iterating a set of strings has no stable order between interpreter sessions).

# Compound score per distinct token; the same token recurs across many
# descriptions, so each one is scored only once.
token_polarity = {}

# title -> {'Negative Words': set, 'Positive Words': set}, in first-seen title order
words_by_title = {}

for title, description in zip(df['Title'], df['Descriptions_clean']):
    buckets = words_by_title.setdefault(
        title, {'Negative Words': set(), 'Positive Words': set()}
    )
    for token in word_tokenize(description):
        if token not in token_polarity:
            token_polarity[token] = sia.polarity_scores(token)['compound']
        compound = token_polarity[token]
        if compound < 0:
            buckets['Negative Words'].add(token)
        elif compound > 0:
            buckets['Positive Words'].add(token)

# Merge into word_data (keeping any words already recorded for a title) and
# convert the sets to sorted lists so the structure stays JSON-serializable.
for title, buckets in words_by_title.items():
    entry = word_data.setdefault(title, {'Negative Words': [], 'Positive Words': []})
    for label, words in buckets.items():
        entry[label] = sorted(words.union(entry[label]))

In [219]:
# Serialize the per-title word lists into words.json in the working directory
with open('words.json', mode='w') as words_file:
    json.dump(obj=word_data, fp=words_file)

In [220]:
# Render the per-title averages as a JSON string holding one object per row
result = grouped_df.to_json(path_or_buf=None, orient='records')

In [221]:
# Parse the JSON string back into Python objects and write them to scores.json
with open('scores.json', mode='w') as scores_file:
    score_records = json.loads(result)
    json.dump(score_records, scores_file)