### Packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import nltk
import numpy as np

# nltk.download('punkt')
# nltk.download('vader_lexicon')


### Data Loading

In [None]:
# Load the lyrics table, parse the release date column into datetimes and
# derive a numeric release-year column from it.
df = pd.read_csv("data/Lyrics_Noah_Kahan.csv").assign(
    release_date=lambda frame: pd.to_datetime(frame["release_date"]),
    year=lambda frame: frame["release_date"].dt.year,
)
df.head()


### TextBlob Analysis

In [None]:
def get_sentiment(text):
    """Return the TextBlob polarity score of ``text``.

    Args:
        text: Lyrics to score. Any non-string value (for example the NaN
            pandas stores for an empty cell, or ``None``) is treated as
            missing lyrics.

    Returns:
        ``TextBlob(text).sentiment.polarity`` for string input, or NaN when
        ``text`` is not a string.
    """
    # TextBlob only accepts strings; a song without lyrics would otherwise
    # abort the whole column-wise apply. Report "no score" for it instead.
    if not isinstance(text, str):
        return float("nan")
    return TextBlob(text).sentiment.polarity

# Score every song's cleaned lyrics and store the result next to it.
df["sentiment"] = df["cleaned_lyrics"].map(get_sentiment)
df.head()

In [None]:
def _year_label(year):
    """Turn a release year into a legend label ("2019", or "Unreleased" if missing)."""
    if isinstance(year, str):
        # Already a label, e.g. when this cell is re-run.
        return year
    if pd.isna(year):
        return "Unreleased"
    # Missing values make the year column float, so a plain str() would
    # render "2019.0"; go through int to get "2019".
    return str(int(year))


df["year"] = df["year"].map(_year_label)

# Group songs by year, and order them by sentiment within each year.
df = df.sort_values(by=["year", "sentiment"], ascending=[True, True])

# Increase figure size
plt.figure(figsize=(15, 12))

palette = sns.color_palette("husl", n_colors=df["year"].nunique())

# One point per song: x is the sentiment score, y is simply the row position
# in the sorted frame, so songs stack vertically in sorted order.
sns.scatterplot(
    x=df["sentiment"],
    y=np.arange(len(df)),
    hue=df["year"],
    palette=palette,
    s=100,
)

# Write each song title to the left of its point.
for row, (score, title) in enumerate(zip(df["sentiment"], df["title"])):
    plt.text(score, row, str(title), fontsize=8, ha="right", va="center")

# Titles and labels
plt.xlabel("Sentiment Score", fontsize=8)
plt.ylabel("Songs", fontsize=8)
plt.title("Sentiment Analysis of Noah Kahan's Songs by Year", fontsize=14)

# Dashed reference line at a score of zero.
plt.axvline(0, color="gray", linestyle="dashed")

plt.grid(axis="x", linestyle="--", alpha=0.6)
plt.yticks([])  # y is only a row position, so tick values carry no meaning
plt.legend(title="Year", loc="best")

plt.show()


### BERT Analysis

In [None]:
# BERT could not be run locally, so it was run on Google Colab. The
# commented-out lines in this cell are the code that was used there; they are
# kept for reference only. The CSV loaded below presumably holds the output of
# that run (it is expected to provide a "sentiment_score" column) - confirm.

# import torch
# from transformers import pipeline

df_bert = pd.read_csv("data/Lyrics_Noah_Kahan_Sentiment_BERT.csv")
df_bert.head()

# sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# def get_bert_sentiment(text):
#     if isinstance(text, str):  # Ensure text is valid
#         result = sentiment_pipeline(text[:512])[0]  # NOTE: keeps the first 512 *characters* (not tokens) as a guard for BERT's 512-token limit
#         score = result['score'] if result['label'] == "POSITIVE" else -result['score']  # signed score: negative for a NEGATIVE label
#         return score
#     return None  # Return None for missing lyrics

# # Apply sentiment analysis to the dataset
# df["sentiment_score"] = df["lyrics"].apply(get_bert_sentiment)


In [None]:
# Derive a release-year label for every song. Missing release dates make the
# extracted year a float column, so format through int to get "2019" rather
# than "2019.0", and label missing years "Unreleased".
df_bert["release_date"] = pd.to_datetime(df_bert["release_date"])
df_bert["year"] = [
    "Unreleased" if pd.isna(year) else str(int(year))
    for year in df_bert["release_date"].dt.year
]

# Group songs by year, and order them by score within each year.
df_bert = df_bert.sort_values(by=["year", "sentiment_score"], ascending=[True, True])

# Increase figure size
plt.figure(figsize=(15, 12))

# Pin one colour to each year using the years of the *whole* dataset. A plain
# colour list would be assigned in the order of the years present in the
# plotted subset, so a year could get a different colour here than in other
# plots of the same data; this plot has no legend of its own to reveal that.
year_levels = sorted(df_bert["year"].unique())
palette = dict(zip(year_levels, sns.color_palette("husl", n_colors=len(year_levels))))

# Only songs with a negative score are shown in this figure.
df_bert_negative = df_bert[df_bert["sentiment_score"] < 0]
sns.scatterplot(
    x=df_bert_negative["sentiment_score"],
    y=np.arange(len(df_bert_negative)),  # row position, used only to stack the songs
    hue=df_bert_negative["year"],
    palette=palette,
    s=100,
    legend=False,
)

# Write each song title to the left of its point.
for row, (score, title) in enumerate(
    zip(df_bert_negative["sentiment_score"], df_bert_negative["title"])
):
    plt.text(score, row, str(title), fontsize=8, ha="right", va="center")

# Titles and labels
plt.xlabel("Sentiment Score", fontsize=8)
plt.ylabel("Songs", fontsize=8)
plt.title("Negative Sentiments", fontsize=14)
plt.xlim([-1.1, -0.6])

# Zero reference line (0 lies outside the x-range chosen above).
plt.axvline(0, color="gray", linestyle="dashed")
plt.grid(axis="x", linestyle="--", alpha=0.6)
plt.yticks([])  # y is only a row position, so tick values carry no meaning

plt.show()


In [None]:
plt.figure(figsize=(15, 12))

# Pin one colour to each year using the years of the *whole* dataset. A plain
# colour list would be assigned in the order of the years present in the
# plotted subset, so the legend drawn here could disagree with the colours
# used in other plots of the same data.
year_levels = sorted(df_bert["year"].unique())
palette = dict(zip(year_levels, sns.color_palette("husl", n_colors=len(year_levels))))

# Only songs with a positive score are shown in this figure.
df_bert_positive = df_bert[df_bert["sentiment_score"] > 0]
sns.scatterplot(
    x=df_bert_positive["sentiment_score"],
    y=np.arange(len(df_bert_positive)),  # row position, used only to stack the songs
    hue=df_bert_positive["year"],
    palette=palette,
    s=100,
)

# Write each song title to the left of its point.
for row, (score, title) in enumerate(
    zip(df_bert_positive["sentiment_score"], df_bert_positive["title"])
):
    plt.text(score, row, str(title), fontsize=8, ha="right", va="center")

# Titles and labels
plt.xlabel("Sentiment Score", fontsize=8)
plt.ylabel("Songs", fontsize=8)
# Titled to parallel the "Negative Sentiments" figure; this plot shows only
# the positively scored songs, not the full catalogue.
plt.title("Positive Sentiments", fontsize=14)
plt.xlim([0.6, 1])

# Zero reference line (0 lies outside the x-range chosen above).
plt.axvline(0, color="gray", linestyle="dashed")
plt.grid(axis="x", linestyle="--", alpha=0.6)
plt.yticks([])  # y is only a row position, so tick values carry no meaning
plt.legend(title="Year", loc="best")

plt.show()

### Compare results of BERT and TextBlob

In [None]:
# Give both result tables a binary label: "Positive" when the score is
# strictly greater than zero, "Negative" otherwise. Note that a score of
# exactly zero, and a missing (NaN) score, both fail the `> 0` test and are
# therefore labelled "Negative".
for frame, score_column in ((df_bert, "sentiment_score"), (df, "sentiment")):
    scored_positive = frame[score_column] > 0
    frame["sentiment_class"] = np.where(scored_positive, "Positive", "Negative")

In [None]:
# Pair up the two result tables by song title. Columns present in both
# tables (including "sentiment_class") get the "_textblob" / "_bert" suffix.
df_comparison = pd.merge(df, df_bert, on="title", suffixes=("_textblob", "_bert"))

# Keep the songs on which the two methods disagree about the label.
labels_differ = df_comparison["sentiment_class_textblob"].ne(
    df_comparison["sentiment_class_bert"]
)
df_mismatched = df_comparison.loc[labels_differ]

report_columns = ["title", "sentiment_class_textblob", "sentiment_class_bert"]
print(df_mismatched[report_columns])