# In [8]:
import pandas as pd
import altair as alt
import numpy as np

# Load the track table; only the 'track_name' column is used below.
data = pd.read_csv('ds4200_spotify_data.csv')

# One lowercase token per row: split each non-null title on whitespace and
# flatten with explode() instead of building a wide split(expand=True) frame.
# Titles that contain no tokens explode to NaN, hence the second dropna().
words = data['track_name'].dropna().str.split().explode().dropna().str.lower()

# Whitespace splitting leaves punctuation glued to tokens ("(feat.", "remix)"),
# which would let them slip past the stop-word filter and split the counts of
# one word across several spellings. Strip punctuation from both ends only, so
# inner apostrophes such as "it's" are kept, then drop tokens that were
# nothing but punctuation.
words = words.str.strip('()[]{}"\'.,:;!?-')
words = words[words != ''].reset_index(drop=True)

# Filler words and title boilerplate that should not appear in the cloud.
stop_words = [
    'the', 'a', "it's", 'and', 'of', 'in', 'to', 'for', 'is', 'it',
    'you', 'that', 'with', 'on', 'as', 'this', 'by', 'at',
    'be', 'from', 'not', 'or', 'but', 'all', 'any', 'so',
    'if', 'my', 'your', 'we', 'me', 'they', 'her', 'his',
    'feat', 'featuring', '-', '(', ')', 'version', 'edit', 'remix'
]
filtered_words = words[~words.isin(stop_words)]

# Frequency table with explicit column names, independent of how the
# installed pandas version labels the output of value_counts().
word_counts = filtered_words.value_counts().rename_axis('word').reset_index(name='count')

# Keep only the most frequent words; fewer rows remain if the vocabulary is
# smaller than top_k.
top_k = 150
word_counts = word_counts.nlargest(top_k, 'count')

# Scatter the words at pseudo-random positions; the fixed seed keeps the
# layout identical between runs.
np.random.seed(42)
# Size the coordinate arrays from the table itself rather than from top_k:
# when fewer than top_k distinct words exist, a top_k-long array would not
# match the frame's length and the column assignment would raise ValueError.
word_counts['x'] = np.random.uniform(50, 750, size=len(word_counts))
word_counts['y'] = np.random.uniform(50, 750, size=len(word_counts))

# Centered chart title.
chart_title = alt.TitleParams(
    text="Most Common Words in Song Titles",
    fontSize=24,
    font='Helvetica',
    anchor='middle'
)

# Both font size and colour are driven by the word's count.
size_by_count = alt.Size('count', scale=alt.Scale(range=[20, 140]))
color_by_count = alt.Color('count', scale=alt.Scale(scheme='blues'), legend=None)

# One text mark per row of word_counts, with a thin white outline.
text_marks = alt.Chart(word_counts).mark_text(stroke='white', strokeWidth=0.5, opacity=0.9)

# Place each word at its precomputed (x, y) position and attach a tooltip.
positioned_words = text_marks.encode(
    text='word',
    size=size_by_count,
    color=color_by_count,
    x='x:Q',
    y='y:Q',
    tooltip=['word', 'count']
)

# Square 800x800 canvas, no grid lines, no border around the view.
word_cloud = (
    positioned_words
    .properties(width=800, height=800, title=chart_title)
    .configure_axis(grid=False)
    .configure_view(stroke=None)
)

# Save as HTML, asking the embedded viewer to render with SVG.
word_cloud.save('/Users/carissa/Desktop/cv/word_cloud.html', embed_options={'renderer': 'svg'})

# Leave the chart as the final expression: by default a notebook cell only
# displays the value of its last expression, so a bare `word_cloud` placed
# before save() (which returns None) would never be rendered.
word_cloud