In [15]:
import pandas as pd
import json
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load the input CSV into a DataFrame; the path is relative to the notebook's
# working directory.
csv_path = '../data/preprocessed_data.csv'
df = pd.read_csv(csv_path)

In [16]:
# Remove English stopwords from every description.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` with English stopwords removed.

    The text is tokenized with ``word_tokenize``; tokens whose lowercase form
    is in ``stop_words`` are dropped and the remaining tokens are re-joined
    with single spaces.

    Non-string values (for example the float NaN that ``read_csv`` produces
    for an empty cell) are mapped to an empty string instead of being passed
    to the tokenizer, which only accepts strings.
    """
    if not isinstance(text, str):
        return ''
    kept_tokens = [
        token for token in word_tokenize(text)
        if token.lower() not in stop_words
    ]
    return ' '.join(kept_tokens)


df['Descriptions_clean'] = df['Descriptions_clean'].apply(remove_stopwords)

In [17]:
# For each Title, concatenate all of its cleaned descriptions and count how
# often every token occurs.
#
# * ``str.split()`` (no argument) is used instead of ``split(" ")`` so that
#   empty descriptions, which leave consecutive spaces after the join, do not
#   produce an empty-string "word" in the counts.
# * ``Series.value_counts()`` replaces the deprecated top-level
#   ``pd.value_counts``; ``dtype=object`` keeps an empty token list well-typed.
#
# After ``reset_index()`` the frame has three columns: the Title, the token
# (taken from the inner index level) and its count.
word_counts = (
    df.groupby('Title')['Descriptions_clean']
    .apply(lambda texts: pd.Series(' '.join(texts).split(), dtype=object).value_counts())
    .reset_index()
)

In [18]:
# Give the three columns of the counts table their final names, assigned
# positionally. Note that 'size' is also a DataFrame attribute, so the column
# must be accessed with bracket syntax (word_counts['size']).
renamed_columns = ['title', 'word', 'size']
word_counts.columns = renamed_columns

In [19]:
# Up-weight the word sizes of one title by a constant factor.
#
# NOTE(review): the earlier comment on this cell said "by 5" while the code
# multiplied by 6. The executed value (6) is kept so results do not change;
# confirm which factor is actually intended.
#
# This cell modifies `word_counts` in place and is NOT idempotent: running it
# more than once multiplies the sizes again. Re-run the cells that build
# `word_counts` before re-running this one.
BOOSTED_TITLE = 'Data Analyst'
BOOST_FACTOR = 6
word_counts.loc[word_counts['title'] == BOOSTED_TITLE, 'size'] *= BOOST_FACTOR

In [21]:
# Build a {title: [{'word': ..., 'size': ...}, ...]} mapping that keeps only
# the words whose size is greater than 10, then write it to disk as JSON.
# Titles with no word above the threshold do not appear in the output.
frequent_words = word_counts[word_counts['size'] > 10]

output_data = {}
for title, rows in frequent_words.groupby('title'):
    records = rows[['word', 'size']].to_dict('records')
    output_data[title] = records

with open('../data/word_counts.json', 'w') as out_file:
    json.dump(output_data, out_file)