# Install Requests

In [None]:
%%bash
pip install requests

# SANITY CHECK - Search for a song

In [2]:
import requests

# Token from Genius API
access_token = "ULcds9ErDkTAQR14A6TzBFdaTfP-TGTZKtqGp1Bzk0Bam7IoftmGEysS_xm3sXYF"

headers = {
    "Authorization": f"Bearer {access_token}"
}

def search_song(query):
    base_url = "https://api.genius.com"
    search_url = f"{base_url}/search"
    params = {"q": query}
    response = requests.get(search_url, params=params, headers=headers)
    return response.json()

# Search for a song
results = search_song("Welcome to New York Taylor Swift")

# Print first few results
for hit in results["response"]["hits"][:3]:
    print("Title:", hit["result"]["full_title"])
    print("Genius URL:", hit["result"]["url"])
    print()

Title: Welcome to New York by Taylor Swift
Genius URL: https://genius.com/Taylor-swift-welcome-to-new-york-lyrics

Title: ​portorosso’s DISCONTINUED 2023 Listening Log by still-life starlet
Genius URL: https://genius.com/Still-life-starlet-portorossos-discontinued-2023-listening-log-annotated

Title: ​youth group - background music by Josiah Botting
Genius URL: https://genius.com/Josiah-botting-youth-group-background-music-annotated



# Set up Lambda Function Connection

In [3]:
import boto3
import json

# SQS setup
sqs = boto3.client('sqs')
queue_name = 'genius-queue'

In [4]:
# create queue
try:
    response = sqs.create_queue(
        QueueName=queue_name,
        Attributes={
            'VisibilityTimeout': '60'
        }
    )
    queue_url = response['QueueUrl']
    print(f"SQS queue created: {queue_url}")
except sqs.exceptions.QueueNameExists:
    queue_url = [url for url in sqs.list_queues()['QueueUrls'] if queue_name in url][0]
    print(f"SQS queue already exists: {queue_url}")

SQS queue created: https://sqs.us-east-1.amazonaws.com/654654514107/genius-queue


In [6]:
# deploy and connect lambda function
lambda_client = boto3.client('lambda')
iam = boto3.client('iam')
sqs = boto3.client('sqs')

function_name = 'genius_lambda'
zip_path = 'genius_lambda.zip'
sqs_queue_url = queue_url
role = iam.get_role(RoleName='LabRole')['Role']['Arn']

with open(zip_path, 'rb') as f:
    zipped_code = f.read()

In [10]:
# create lambda function
try:
    response = lambda_client.create_function(
        FunctionName=function_name,
        Runtime='python3.9',
        Role=role,
        Handler='lambda_function.lambda_handler',
        Code={'ZipFile': zipped_code},
        Timeout=10
    )
    print("lambda function created!")
except lambda_client.exceptions.ResourceConflictException:
    print("lambda function already exists")
    response = lambda_client.update_function_code(
        FunctionName=function_name,
        ZipFile=zipped_code
    )

lambda function created!


In [7]:
# get ARN for trigger
sqs_attrs = sqs.get_queue_attributes(
    QueueUrl=sqs_queue_url,
    AttributeNames=['QueueArn']
)
sqs_arn = sqs_attrs['Attributes']['QueueArn']

In [11]:
import boto3

lambda_client = boto3.client('lambda', region_name='us-east-1')  # Make sure you're in the correct region
functions = lambda_client.list_functions()

for f in functions['Functions']:
    print(f['FunctionName'])

RedshiftEventSubscription
RoleCreationFunction
genius_lambda
RedshiftOverwatch
MainMonitoringFunction
ModLabRole
q1_lambda_function
MACS30123_0424
a2_1_lambda


In [12]:
# create SQS trigger
try:
    lambda_client.create_event_source_mapping(
        EventSourceArn=sqs_arn,
        FunctionName=function_name,
        Enabled=True,
        BatchSize=10
    )
    print("trigger creation successful")
except lambda_client.exceptions.ResourceConflictException:
    print("trigger already exists")

trigger already exists


## Set Up Lambda Environment Variable

In [13]:
import boto3

lambda_client = boto3.client('lambda')
function_name = 'genius_lambda'  # Replace with your actual Lambda function name

response = lambda_client.update_function_configuration(
    FunctionName=function_name,
    Environment={
        'Variables': {
            'GENIUS_API_TOKEN': 'WQcICxVxX0LlqnCDFstnowelGCDLUDkHiThnFMHzY7RBybx6S2hu2jnsELsg9RPP'
        }
    }
)

print("Environment variable updated!")


Environment variable updated!


In [30]:
import boto3
import json

sqs = boto3.client('sqs')
queue_url = 'https://sqs.us-east-1.amazonaws.com/654654514107/genius-queue'

message_body = {
    "track_id": "123",
    "title": "Welcome to New York",
    "artist": "Taylor Swift"
}

response = sqs.send_message(
    QueueUrl=queue_url,
    MessageBody=json.dumps(message_body)
)

print("Message sent to SQS:", response['MessageId'])

Message sent to SQS: 43842ffd-151c-4c68-8435-8ee8f9698768


In [33]:
import boto3

s3 = boto3.client('s3')
bucket_name = 'genius-bucket-654654514107'

response = s3.list_objects_v2(Bucket='genius-bucket-654654514107')
for obj in response.get('Contents', []):
    print(obj['Key'])

lyrics/123.json


This confirms my entire pipeline is working end-to-end:
- Lambda is being triggered by SQS
- Lyrics are being scraped and stored in S3

# Use Dask to clean and match data

1. Load all lyrics into a Dask dataframe

In [2]:
storage_options = {
'client_kwargs': {
        'region_name': 'us-east-1'
    }
}


In [3]:
import dask.dataframe as dd

df = dd.read_json(
    's3://genius-bucket-654654514107/lyrics/*.json',
)



Unnamed: 0,track_id,title,artist,url,lyrics
0,123,Welcome to New York,Taylor Swift,https://genius.com/Taylor-swift-welcome-to-new...,Lyrics not found.


In [4]:
from dask import delayed
import pandas as pd
import dask.dataframe as dd
import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

@delayed
def clean_lyrics(doc):
    text = re.sub(r'[^\w\s]', '', doc.lower())
    tokens = text.split()
    filtered = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered)

# apply to a Dask DataFrame
df['clean_lyrics'] = df['lyrics'].apply(clean_lyrics, meta=('clean_lyrics', 'str'))


In [5]:
csv_df = dd.read_csv('final_joined_table.csv')

In [10]:
print(df.columns)
print(csv_df.columns)

Index(['track_id', 'title', 'artist', 'url', 'lyrics', 'clean_lyrics'], dtype='object')
Index(['track_id', 'spotify_id', 'emotion', 'title', 'artist', 'id',
       'acousticness', 'analysis_url', 'danceability', 'duration', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')


In [12]:
csv_subset = csv_df[['track_id', 'title', 'artist']]
print(csv_subset.columns)

Index(['track_id', 'title', 'artist'], dtype='object')


In [13]:
merged = df.merge(csv_subset, on=['title', 'artist'], how='inner')
print(merged.columns)  # <- check here if 'track_id' is present

Index(['track_id_x', 'title', 'artist', 'url', 'lyrics', 'clean_lyrics',
       'track_id_y'],
      dtype='object')


In [14]:
# Rename 'track_id_y' (Spotify's ID) to just 'track_id'
merged = merged.rename(columns={'track_id_y': 'track_id'})

# Select the columns you want to retain
merged = merged[['track_id', 'title', 'artist', 'clean_lyrics']]

In [15]:
merged.head()

TypeError: Truth of Delayed objects is not supported

In [None]:
edit
edit 