In [None]:
# --- STEP 1: Import Libraries ---
# Load necessary libraries for data handling, text processing, visualization, and machine learning.
import pandas as pd
import numpy as np
import re
import sqlite3
import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:

# --- STEP 2: Data Acquisition ---
# Resolve every file location used by this step up front.
data_file_path = os.path.join("resources", "spotify_millsongdata.csv")
original_csv_backup = os.path.join("resources", "original_spotify_millsongdata.csv")
db_path = os.path.join("resources", "music_lyrics.db")

# Read the dataset. `df` keeps the rows exactly as loaded, while every
# transformation is applied to the independent copy `df_processed`.
df = pd.read_csv(data_file_path)
df_processed = df.copy(deep=True)

# Write a snapshot of the freshly loaded data as a backup CSV.
# (Not strictly required as long as the source file is never modified,
# but kept as a safety net.)
df.to_csv(path_or_buf=original_csv_backup, index=False)

# Open the SQLite database and store the raw rows, replacing any existing
# "raw_lyrics" table. `conn` is left open on purpose for the following steps.
conn = sqlite3.connect(db_path)
df_processed.to_sql(name="raw_lyrics", con=conn, if_exists="replace", index=False)

In [None]:
# --- STEP 3: Text Preprocessing ---
# Define function to clean text
def clean_text(text):
    """Normalize raw lyrics into lowercase, single-spaced plain text.

    Args:
        text: Raw lyrics of one song. Any non-string value (e.g. NaN coming
            from an empty CSV field) is treated as "no lyrics".

    Returns:
        str: The lowercased text in which every character other than the
        ASCII letters a-z, the apostrophe and whitespace has been replaced
        by a space, with whitespace runs (including line breaks) collapsed
        to a single space and the ends trimmed. Returns "" for non-string
        input, so the resulting column always contains strings.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    # Apostrophes are kept so contractions such as "don't" stay one token.
    letters_only = re.sub(r"[^a-z'\s]", " ", lowered)
    return re.sub(r"\s+", " ", letters_only).strip()


# Apply text cleaning function to the 'text' column
# note: the lyrics column in this dataset is named 'text', not 'lyrics'
df_processed["clean_text"] = df_processed["text"].apply(clean_text)

In [None]:
# --- STEP 4: Sentiment Analysis using TextBlob ---
# Define function to compute sentiment scores
def get_sentiment(text):
    """Compute the TextBlob sentiment polarity of a piece of text.

    Args:
        text: Text to score. Normally a string; missing values such as None
            or NaN and empty/whitespace-only strings are also accepted.

    Returns:
        float: TextBlob polarity (-1 negative to +1 positive), or 0.0
        (neutral) when there is no text to analyze.
    """
    # TextBlob only accepts strings; a single NaN/None in the column would
    # otherwise abort the whole .apply() call. Treat "no text" as neutral.
    if not isinstance(text, str) or not text.strip():
        return 0.0
    return TextBlob(text).sentiment.polarity

# Score the cleaned lyrics of every song and attach the result as a new column.
sentiment_scores = df_processed["clean_text"].apply(get_sentiment)
df_processed["sentiment_score"] = sentiment_scores

# Persist the enriched table twice: as a CSV file and as the SQLite table
# "processed_lyrics" (replaced if it already exists).
processed_csv_path = os.path.join("resources", "processed_spotify_millsongdata.csv")
df_processed.to_csv(path_or_buf=processed_csv_path, index=False)
df_processed.to_sql(name="processed_lyrics", con=conn, if_exists="replace", index=False)



In [None]:
# --- STEP 5: Sentiment Visualization ---
# TODO: the sentiment-score distribution plot planned for this step is not
# implemented yet; first decide what the visualizations should show.

# Word cloud built from all cleaned lyrics.
# Kept for fun because it works; it adds nothing to the analysis of this project.
all_text = " ".join(df_processed["clean_text"].dropna())
cloud_builder = WordCloud(width=800, height=400, background_color="white")
wordcloud = cloud_builder.generate(all_text)

# Draw the cloud on an explicit figure/axes pair without axis decorations.
fig, ax = plt.subplots(figsize=(15, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
ax.set_title("Word Cloud of Song Lyrics")

# Write the image to disk, then display it.
wordcloud_output_path = os.path.join("resources", "wordcloud.png")
fig.savefig(wordcloud_output_path)
plt.show()



In [None]:
# --- STEP 6: Sentiment Prediction Using Machine Learning ---
# Binary target derived from the sentiment score:
# 1 = positive/neutral (score >= 0), 0 = negative.
df_processed["sentiment_label"] = df_processed["sentiment_score"].ge(0).astype("int64")

# Features: TF-IDF over the cleaned lyrics, English stop words removed,
# vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
X = tfidf.fit_transform(df_processed["clean_text"])
y = df_processed["sentiment_label"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the logistic-regression classifier on the training portion.
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)

# Predict on the held-out rows and report the standard metrics.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test set:", test_accuracy)
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion)

# NOTE: this cell has not been tested yet, but it is expected to work.


In [None]:

# --- STEP 7: Close Database Connection ---
# Release the SQLite connection held in `conn` and confirm it on stdout.
conn.close()
print("SQL connection closed.")