# Music Trends (1921-2020) - Exploratory Data Analysis

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

In [None]:
# Folder holding the raw CSV files, given relative to the notebook's working directory.
DATA_DIR = Path("../data/raw")

# Load both CSV files into DataFrames.
df_year = pd.read_csv(DATA_DIR.joinpath("data_by_year.csv"))
df_tracks = pd.read_csv(DATA_DIR.joinpath("data.csv"))

# Preview the first rows of each frame (shown as the cell's output tuple).
df_year.head(), df_tracks.head()

In [None]:
# Print the (rows, columns) shape of each frame, one per line.
print(df_year.shape, df_tracks.shape, sep="\n")

# Missing-value count per column of df_year, largest first, top 10 only.
(
    df_year.isna()
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# List the column labels of the frame loaded from data_by_year.csv.
df_year.columns

In [None]:
# Order rows by year and renumber the index from 0 so that the line plots
# drawn from df_year follow the years left to right.
df_year = (
    df_year
    .sort_values(by="year")
    .reset_index(drop=True)
)

# Quick check of the ordering: first five values of the year column.
df_year[["year"]].head(5)

In [None]:
# Line chart of the loudness column against year, on a fresh figure.
fig, ax = plt.subplots()
ax.plot(df_year["year"], df_year["loudness"])
ax.set(
    title="Average Loudness Over Time (1921–2020)",
    xlabel="Year",
    ylabel="Loudness (dB)",
)
plt.show()

In [None]:
# Draw danceability and energy on one set of axes; each line is labelled
# with its column name so the legend can tell them apart.
fig, ax = plt.subplots()
for feature in ("danceability", "energy"):
    ax.plot(df_year["year"], df_year[feature], label=feature)

ax.set_title("Danceability and Energy Over Time (1921–2020)")
ax.set_xlabel("Year")
ax.set_ylabel("Average value")
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

fig, ax1 = plt.subplots()

# Left-hand y-axis: tempo. The axis label and ticks reuse the line colour
# so the reader can tell which scale belongs to which series.
tempo_color = "tab:blue"
tempo_line = ax1.plot(
    df_year["year"], df_year["tempo"], color=tempo_color, label="tempo"
)
ax1.set_xlabel("Year")
ax1.set_ylabel("Tempo (BPM)", color=tempo_color)
ax1.tick_params(axis="y", colors=tempo_color)

# Right-hand y-axis: valence, drawn on a twin of ax1 with its own y scale.
valence_color = "tab:orange"
ax2 = ax1.twinx()
valence_line = ax2.plot(
    df_year["year"], df_year["valence"], color=valence_color, label="valence"
)
ax2.set_ylabel("Valence (0–1)", color=valence_color)
ax2.tick_params(axis="y", colors=valence_color)

plt.title("Tempo and Valence Over Time (1921–2020)")

# Each axes only knows its own line, so gather the handles from both and
# build a single legend on ax1.
handles = [*tempo_line, *valence_line]
ax1.legend(handles, [handle.get_label() for handle in handles], loc="best")

plt.tight_layout()
plt.show()

In [None]:
# Number of rows in df_tracks for each value of the year column,
# ordered by year rather than by frequency.
year_counts = (
    df_tracks["year"]
    .value_counts()
    .sort_index()
)
print(year_counts)

In [None]:
# Rows per year in df_tracks; the resulting Series is indexed by year.
tracks_per_year = df_tracks.groupby("year").size()

# Plot the per-year row counts to show how evenly the dataset covers the period.
fig, ax = plt.subplots()
ax.plot(tracks_per_year.index, tracks_per_year.values)
ax.set(
    title="Track Count by Year (dataset coverage)",
    xlabel="Year",
    ylabel="Number of tracks",
)
plt.show()


In [None]:
# Show the 15 years with the fewest tracks (ascending sort, then the first 15 rows).
tracks_per_year.sort_values().head(15)


In [None]:
# Work on a copy of df_tracks with an added decade column, so the original
# frame is left untouched. Floor-dividing by 10 maps e.g. 1987 -> 1980.
tmp = df_tracks.assign(decade=(df_tracks["year"] // 10) * 10)

# Rows per decade, returned as a Series named "track_count".
(
    tmp.groupby("decade")
    .size()
    .rename("track_count")
)

In [None]:
# Re-draw the loudness-over-time chart and export it as a PNG.
figure_path = Path("../outputs/figures/loudness_over_time.png")
# savefig does not create missing folders; make sure the target directory
# exists so the export does not fail on a fresh checkout.
figure_path.parent.mkdir(parents=True, exist_ok=True)

plt.figure()
plt.plot(df_year["year"], df_year["loudness"])
plt.title("Average Loudness Over Time (1921–2020)")
plt.xlabel("Year")
plt.ylabel("Loudness (dB)")
# Tighten the layout before saving so labels are not clipped in the file.
plt.tight_layout()
# Save before show(): the PNG is written at 200 dpi to the path above.
plt.savefig(figure_path, dpi=200)
plt.show()

In [None]:
# Recompute rows per year here so this cell can run on its own.
tracks_per_year = df_tracks.groupby("year").size()

# Coverage chart: number of rows in df_tracks for each year.
fig, ax = plt.subplots()
ax.plot(tracks_per_year.index, tracks_per_year.values)
ax.set_title("Dataset Coverage: Track Count by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Tracks")
plt.show()