## Pronouns and Perspective


In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

ALBUM_YEARS = {
    # Lana Del Rey albums
    "aka_lizzy_grant": 2010,
    "born_to_die": 2012,
    "ultraviolence": 2014,
    "honeymoon": 2015,
    "lust_for_life": 2017,
    "nfr": 2019,
    "chemtrails_over_the_country_club": 2021,
    "blue_banisters": 2021,
    "did_you_know_ocean_blvd": 2023,

    # Non-Lana Albums
    "rumours": 1977,
    "say_you_will": 2003,
    "street_angel": 1994,
    "badlands": 2015,
    "hopeless_fountain_kingdom": 2017,

    # Poetry
    "ariel": 1965,
    "violet_bent_backwards_over_the_grass": 2020,
    "ocean_vuong_poetry": 2019,
    "my_poetry": 2025,
    "the_colossus": 1960
}


In [9]:
import pandas as pd

# 1) Load the features table (adjust path if needed)
df = pd.read_csv("../data/features/song_features.csv")

# 2) Define which "albums" are actually poetry collections
POETRY_ALBUMS = {
    "ariel",
    "violet_bent_backwards_over_the_grass",
    "my_poetry",
    "ocean_vuong_poetry",
    "the_colossus",
}

LANA_ALBUMS = {
    "aka_lizzy_grant",
    "born_to_die",
    "ultraviolence",
    "honeymoon",
    "lust_for_life",
    "nfr",
    "chemtrails_over_the_country_club",
    "blue_banisters",
    "did_you_know_ocean_blvd",
}

NON_LANA_ALBUMS = {
    "rumours",
    "say_you_will",
    "street_angel",
    "badlands",
    "hopeless_fountain_kingdom",
}



# 3) Ensure a text_type column exists (no KeyError ever again)
df["text_type"] = df["album"].apply(lambda a: "poem" if a in POETRY_ALBUMS else "song")



def classify_artist(album):
    if album in LANA_ALBUMS:
        return "lana"
    elif album in NON_LANA_ALBUMS:
        return "non_lana"
    else:
        return "other"

df["artist_group"] = df["album"].apply(classify_artist)

df["year"] = df["album"].map(ALBUM_YEARS)
df[["album", "year"]].drop_duplicates().sort_values("year")


# 4) Clean splits you can reuse everywhere
df_songs = df[df["text_type"] == "song"].copy()
df_poems = df[df["text_type"] == "poem"].copy()

df_lana = df_songs[df_songs["artist_group"] == "lana"]
df_non_lana = df_songs[df_songs["artist_group"] == "non_lana"]



# 5) Quick sanity checks (so you can trust it)
print(df["text_type"].value_counts())
print("Song albums:", sorted(df_songs["album"].unique()))
print("Poetry collections:", sorted(df_poems["album"].unique()))


text_type
song    203
poem     71
Name: count, dtype: int64
Song albums: ['aka_lizzy_grant', 'badlands', 'blue_banisters', 'born_to_die', 'chemtrails_over_the_country_club', 'did_you_know_ocean_blvd', 'honeymoon', 'hopeless_fountain_kingdom', 'lust_for_life', 'nfr', 'rumours', 'say_you_will', 'street_angel', 'ultraviolence']
Poetry collections: ['ariel', 'my_poetry', 'ocean_vuong_poetry', 'the_colossus', 'violet_bent_backwards_over_the_grass']


In [10]:
FIRST_PERSON = {"i", "me", "my", "mine", "i'm", "i’ve", "i'll"}
SECOND_PERSON = {"you", "your", "yours", "you’re", "you've", "you’ll"}
THIRD_PERSON = {"he", "she", "they", "him", "her", "them", "his", "hers", "their"}


In [11]:
import re

def pronoun_rates(text):
    tokens = re.findall(r"\b\w+\b", text.lower())
    total = len(tokens) if len(tokens) > 0 else 1

    return {
        "first_person_rate": sum(t in FIRST_PERSON for t in tokens) / total,
        "second_person_rate": sum(t in SECOND_PERSON for t in tokens) / total,
        "third_person_rate": sum(t in THIRD_PERSON for t in tokens) / total,
    }


In [13]:
df.columns

Index(['lexical_entropy', 'second_person_rate', 'repetition_rate',
       'phonetic_entropy', 'phonetic_repetition_rate', 'album',
       'first_person_rate', 'third_person_rate', 'song', 'line_end_similarity',
       'text_type', 'artist_group', 'year'],
      dtype='object')

In [12]:
pronoun_df = df.copy()

rates = pronoun_df["lyrics"].apply(pronoun_rates).apply(pd.Series)
pronoun_df = pd.concat([pronoun_df, rates], axis=1)


KeyError: 'lyrics'

In [None]:
album_pronouns = (
    pronoun_df
    .groupby(["artist_group", "album"], as_index=False)
    .agg({
        "first_person_rate": "mean",
        "second_person_rate": "mean",
        "third_person_rate": "mean"
    })
)


In [None]:
plt.figure(figsize=(6.5, 6))

plt.scatter(
    album_pronouns["first_person_rate"],
    album_pronouns["second_person_rate"],
    alpha=0.7
)

for _, row in album_pronouns.iterrows():
    if row["artist_group"] == "lana":
        plt.text(
            row["first_person_rate"],
            row["second_person_rate"],
            row["album"],
            fontsize=9,
            weight="bold"
        )

plt.xlabel("First-Person Rate (I / me)")
plt.ylabel("Second-Person Rate (you)")
plt.title("Perspective Space: Self vs Address")

plt.tight_layout()
plt.show()


In [None]:
lana_pronouns = (
    album_pronouns[album_pronouns["artist_group"] == "lana"]
    .merge(df[["album", "year"]].drop_duplicates(), on="album")
    .sort_values("year")
)

for i in range(len(lana_pronouns) - 1):
    x1, y1 = lana_pronouns.iloc[i][["first_person_rate", "second_person_rate"]]
    x2, y2 = lana_pronouns.iloc[i + 1][["first_person_rate", "second_person_rate"]]

    plt.annotate(
        "",
        xy=(x2, y2),
        xytext=(x1, y1),
        arrowprops=dict(arrowstyle="->", linewidth=2)
    )
