# Install Requests

In [None]:
%%bash
pip install requests

# SANITY CHECK - Search for a song

In [2]:
import requests

import os
import warnings

# Token from Genius API.
# It is read from the GENIUS_API_TOKEN environment variable so the secret is never
# stored in the notebook. The token that used to be written here should be treated
# as leaked and revoked.
access_token = os.environ.get("GENIUS_API_TOKEN", "")
if not access_token:
    warnings.warn("GENIUS_API_TOKEN is not set; Genius API requests will carry an empty bearer token.")

# Authorization header sent with every Genius API request
headers = {
    "Authorization": f"Bearer {access_token}"
}

def search_song(query, timeout=10):
    """Search the Genius API and return the decoded JSON response.

    Args:
        query: Free-text search string, sent as the ``q`` query parameter.
        timeout: Seconds to wait for the server before giving up (default 10),
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        The parsed JSON body of the ``/search`` response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status
            (for example when the bearer token is missing or invalid).
    """
    base_url = "https://api.genius.com"
    search_url = f"{base_url}/search"
    params = {"q": query}
    response = requests.get(search_url, params=params, headers=headers, timeout=timeout)
    # Fail here with a clear HTTP error instead of a confusing KeyError further down
    response.raise_for_status()
    return response.json()

# Look up one well-known track to confirm the token and endpoint work
results = search_song("Welcome to New York Taylor Swift")

# Show the title and Genius page of the top three hits
top_hits = results["response"]["hits"][:3]
for hit in top_hits:
    song = hit["result"]
    print("Title:", song["full_title"])
    print("Genius URL:", song["url"])
    print()

Title: Welcome to New York by Taylor Swift
Genius URL: https://genius.com/Taylor-swift-welcome-to-new-york-lyrics

Title: ​portorosso’s DISCONTINUED 2023 Listening Log by still-life starlet
Genius URL: https://genius.com/Still-life-starlet-portorossos-discontinued-2023-listening-log-annotated

Title: ​youth group - background music by Josiah Botting
Genius URL: https://genius.com/Josiah-botting-youth-group-background-music-annotated



# Set Up Table and Bucket

In [4]:
import boto3

# DDB table
dynamodb = boto3.resource('dynamodb')
table_name = 'genius-table'

try:
    table = dynamodb.create_table(
        TableName=table_name,
        # Single string partition key, no sort key
        KeySchema=[{'AttributeName': 'user_id', 'KeyType': 'HASH'}],
        AttributeDefinitions=[{'AttributeName': 'user_id', 'AttributeType': 'S'}],
        ProvisionedThroughput={'ReadCapacityUnits': 1, 'WriteCapacityUnits': 1},
    )
    # Table creation is asynchronous; block until the table can actually be used
    table.wait_until_exists()
    print(f"DDB table '{table_name}' created!")
except dynamodb.meta.client.exceptions.ResourceInUseException:
    # Re-running the cell: reuse the existing table instead of failing
    table = dynamodb.Table(table_name)
    print(f"DDB table '{table_name}' already exists")

DDB table 'genius-table' created!


In [8]:
# S3 bucket
s3 = boto3.client('s3')
bucket_name = 'genius-bucket-sowder'
# NOTE(review): later cells list and read 'genius-bucket-654654514107' instead —
# confirm which bucket the Lambda actually writes to.

# Outside us-east-1 the target region has to be named explicitly when creating a bucket
region = s3.meta.region_name
if region in (None, 'us-east-1'):
    s3.create_bucket(Bucket=bucket_name)
else:
    s3.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration={'LocationConstraint': region},
    )

print(f"S3 bucket '{bucket_name}' created!")

S3 bucket 'genius-bucket-sowder' created!


# Set Up Lambda Function Connection

In [9]:
import boto3
import json

# SQS setup: client and the queue name used by the cells below
sqs = boto3.client('sqs')
queue_name = 'genius-queue'

In [10]:
# create queue (or reuse it when a queue with this name already exists)
try:
    response = sqs.create_queue(
        QueueName=queue_name,
        Attributes={
            'VisibilityTimeout': '60'
        }
    )
    queue_url = response['QueueUrl']
    print(f"SQS queue created: {queue_url}")
except sqs.exceptions.QueueNameExists:
    # Ask SQS for this exact queue; a substring search over list_queues() could pick
    # a different queue whose name merely contains queue_name, or fail when the
    # listing has no 'QueueUrls' key.
    queue_url = sqs.get_queue_url(QueueName=queue_name)['QueueUrl']
    print(f"SQS queue already exists: {queue_url}")

SQS queue created: https://sqs.us-east-1.amazonaws.com/573410794262/genius-queue


In [13]:
# deploy and connect lambda function
function_name = 'genius_lambda'
zip_path = 'genius_lambda_package/genius_lambda.zip'

# AWS clients used by the deployment cells
lambda_client = boto3.client('lambda')
iam = boto3.client('iam')
sqs = boto3.client('sqs')

# Queue created earlier and the IAM role the function will run under
sqs_queue_url = queue_url
role = iam.get_role(RoleName='LabRole')['Role']['Arn']

# Read the packaged deployment archive into memory
with open(zip_path, 'rb') as package_file:
    zipped_code = package_file.read()

In [14]:
# create lambda function, or push new code when it already exists
try:
    response = lambda_client.create_function(
        FunctionName=function_name,
        Runtime='python3.9',
        Role=role,
        Handler='lambda_function.lambda_handler',
        Code={'ZipFile': zipped_code},
        Timeout=10
    )
    # Wait until the new function is Active so the following cells
    # (trigger creation, configuration update) do not hit a pending function
    lambda_client.get_waiter('function_active').wait(FunctionName=function_name)
    print("lambda function created!")
except lambda_client.exceptions.ResourceConflictException:
    print("lambda function already exists")
    response = lambda_client.update_function_code(
        FunctionName=function_name,
        ZipFile=zipped_code
    )
    # Wait for the code update to finish before anything else modifies the function
    lambda_client.get_waiter('function_updated').wait(FunctionName=function_name)

lambda function created!


In [15]:
# Look up the queue's ARN, which the Lambda event source mapping needs
arn_request = {'QueueUrl': sqs_queue_url, 'AttributeNames': ['QueueArn']}
sqs_attrs = sqs.get_queue_attributes(**arn_request)
sqs_arn = sqs_attrs['Attributes']['QueueArn']

In [16]:
import boto3

lambda_client = boto3.client('lambda', region_name='us-east-1')

# list_functions is paginated: walk every page so no function is left out
function_pages = lambda_client.get_paginator('list_functions').paginate()
for page in function_pages:
    for fn in page['Functions']:
        print(fn['FunctionName'])

MainMonitoringFunction
genius_lambda
ModLabRole
RedshiftOverwatch
RoleCreationFunction
RedshiftEventSubscription


In [17]:
# Wire the queue to the function: messages are delivered in batches of up to 10
trigger_config = {
    'EventSourceArn': sqs_arn,
    'FunctionName': function_name,
    'Enabled': True,
    'BatchSize': 10,
}
try:
    lambda_client.create_event_source_mapping(**trigger_config)
except lambda_client.exceptions.ResourceConflictException:
    print("trigger already exists")
else:
    print("trigger creation successful")

trigger creation successful


## Set Up Lambda Environment Variable

In [18]:
import os

import boto3

lambda_client = boto3.client('lambda')
function_name = 'genius_lambda'  # Replace with your actual Lambda function name

# Take the secret from the local environment instead of storing it in the notebook.
# Raises KeyError when GENIUS_API_TOKEN is unset, so an empty token is never pushed.
# The token that used to be written here should be treated as leaked and revoked.
genius_api_token = os.environ['GENIUS_API_TOKEN']

response = lambda_client.update_function_configuration(
    FunctionName=function_name,
    Environment={
        'Variables': {
            'GENIUS_API_TOKEN': genius_api_token
        }
    }
)

print("Environment variable updated!")


Environment variable updated!


### Sanity Check

In [20]:
import boto3
import json

# Client and queue used for the manual smoke test
sqs = boto3.client('sqs')
queue_url = 'https://sqs.us-east-1.amazonaws.com/573410794262/genius-queue'

# One hand-written track with the same three fields the bulk loader sends
message_body = dict(track_id="123", title="Welcome to New York", artist="Taylor Swift")

response = sqs.send_message(QueueUrl=queue_url, MessageBody=json.dumps(message_body))

print("Message sent to SQS:", response['MessageId'])

Message sent to SQS: bc10547b-6ab9-4774-a74c-e7d33c6e4574


In [10]:
import boto3

s3 = boto3.client('s3')
bucket_name = 'genius-bucket-654654514107'

# list_objects_v2 returns its results in pages; iterate all of them so the
# listing stays complete once the bucket holds many lyric files
object_pages = s3.get_paginator('list_objects_v2').paginate(Bucket=bucket_name)
for page in object_pages:
    for obj in page.get('Contents', []):
        print(obj['Key'])

lyrics/123.json


This confirms my entire pipeline is working end-to-end:
- Lambda is being triggered by SQS
- Lyrics are being scraped and stored in S3

# Use Dask to clean and match data

1. Load all lyrics into a Dask dataframe

In [21]:
# Passed as storage_options to dd.read_json: pins the S3 client to us-east-1
storage_options = dict(client_kwargs=dict(region_name='us-east-1'))

In [23]:
import dask.dataframe as dd
import boto3
import json

# Load your CSV file and keep one row per distinct (title, artist) pair
df = dd.read_csv('final_joined_table.csv')
df = df[['track_id', 'title', 'artist']].dropna().drop_duplicates(subset=['title', 'artist'])

# Upper bound on how many tracks are queued — adjust as needed
max_rows = 6000

# head() only looks at the first partition by default and returns fewer rows
# when that partition is short; npartitions=-1 draws from all partitions.
rows_to_send = df.head(max_rows, npartitions=-1)

# NOTE: relies on `sqs` and `queue_url` defined by an earlier cell
n = 0
for n, row in enumerate(rows_to_send.itertuples(index=False), start=1):
    message = {
        "track_id": row.track_id,
        "title": row.title,
        "artist": row.artist
    }

    response = sqs.send_message(
        QueueUrl=queue_url,
        MessageBody=json.dumps(message)
    )

    print(f"No: {n} Sent to SQS: {message['title']} by {message['artist']} — MsgID: {response['MessageId']}")

No: 1 Sent to SQS: Collapsing New People by Fad Gadget — MsgID: c06b7688-1863-4dec-a468-a90f08e51300
No: 2 Sent to SQS: We Rule The Fucking Land by Zimmers Hole — MsgID: 6e13db35-b6c3-427c-a708-5c5566655880
No: 3 Sent to SQS: Blood by Candiria — MsgID: a12a8c6b-f88e-46fc-95c8-1f3bd5fb41be
No: 4 Sent to SQS: Deathly by Aimee Mann — MsgID: dc3c68ae-2474-40b0-842d-3ac32f837108
No: 5 Sent to SQS: My Definition Of A Boombastic Jazz Style by Dream Warriors — MsgID: f8cf60a4-aeac-469b-9e55-38b130d922f9
No: 6 Sent to SQS: Take A Load Off by Bubba Sparxxx — MsgID: a155ae08-b15a-488c-b977-8ce996a50b79
No: 7 Sent to SQS: Rapper's Delight by The Sugarhill Gang — MsgID: 28c68b10-d19a-47fa-8550-414b22830def
No: 8 Sent to SQS: So I Thought by Flyleaf — MsgID: d9636d97-46f8-4ecf-a42e-b6df4eefba1a
No: 9 Sent to SQS: Twentytwofourteen by The Album Leaf — MsgID: 2c0fe370-4675-426a-96cc-fc7d78b3056a
No: 10 Sent to SQS: 21st Century Boy by Willy Mason — MsgID: d068b207-73ce-4d41-9cdb-17c02ca35920
No: 11 Se

In [24]:
import dask.dataframe as dd

# NOTE(review): the recorded run of this cell failed with "Must supply at least one
# delayed object"; presumably the glob matched no files in this bucket (other cells
# find lyrics under 'genius-bucket-654654514107') — confirm the bucket name.
lyrics_glob = 's3://genius-bucket-sowder/lyrics/*.json'
df = dd.read_json(lyrics_glob, storage_options=storage_options)

df.head()

TypeError: An error occurred while calling the read_json method registered to the pandas backend.
Original Message: Must supply at least one delayed object

In [1]:
import dask.dataframe as dd
import pandas as pd
import re
from nltk.corpus import stopwords

# Load stopwords once; fetch the NLTK corpus first if it is not installed locally
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    import nltk

    nltk.download("stopwords", quiet=True)
    stop_words = set(stopwords.words("english"))

# Reload the data cleanly (assumes you've already confirmed this loads correctly)
df = dd.read_json(
    "s3://genius-bucket-654654514107/lyrics/*.json",
    storage_options={"anon": False},
    dtype={"track_id": "object", "title": "object", "artist": "object", "url": "object", "lyrics": "object"}
)

# Define a safe cleaning function
def clean_lyrics_partition(pdf, stopword_set=None):
    """Return a copy of `pdf` with an added `clean_lyrics` column.

    Each value of the `lyrics` column is lower-cased, stripped of every character
    that is neither a word character nor whitespace, split on whitespace, and
    re-joined with single spaces after dropping stop words. Non-string values
    (e.g. missing lyrics) become an empty string.

    Args:
        pdf: pandas DataFrame (one Dask partition) containing a `lyrics` column.
        stopword_set: Optional collection of words to drop. When omitted, the
            notebook-level `stop_words` set is used.

    Returns:
        A new DataFrame; the input partition is left unmodified.
    """
    words_to_drop = stop_words if stopword_set is None else stopword_set
    # Compile once per partition rather than re-parsing the pattern for every row
    punctuation = re.compile(r"[^\w\s]")

    def clean(text):
        if not isinstance(text, str):
            return ""
        tokens = punctuation.sub("", text.lower()).split()
        return " ".join(word for word in tokens if word not in words_to_drop)

    # assign() builds a new frame, so the partition handed in by Dask is not mutated
    return pdf.assign(clean_lyrics=pdf["lyrics"].apply(clean))

# Describe the output schema: one real row plus the new string column
meta = df.head(1).assign(clean_lyrics="")  # Use actual meta schema
df_cleaned = df.map_partitions(clean_lyrics_partition, meta=meta)

# Materialise only the first partition and preview the cleaned text
result = df_cleaned.get_partition(0).compute()
preview_columns = ["title", "artist", "clean_lyrics"]
print(result[preview_columns].head())


                 title        artist  clean_lyrics
0  Welcome to New York  Taylor Swift  lyrics found


## Match Lyrics to Spotify 

In [4]:
# Join the cleaned lyrics to the CSV table on the (title, artist) pair
csv_df = dd.read_csv("final_joined_table.csv")
join_keys = ["title", "artist"]
merged = df_cleaned.merge(csv_df, on=join_keys, how="inner")
# Both inputs carry track_id; keep the CSV side's value under the plain name
merged = merged.drop(columns=["track_id_x"]).rename(columns={"track_id_y": "track_id"})


In [3]:
# Redo the inner join and inspect which columns it produces
merged = df_cleaned.merge(csv_df, how="inner", on=["title", "artist"])

# The suffixed track_id_x / track_id_y columns show up in this listing
print(merged.columns)

Index(['track_id_x', 'title', 'artist', 'url', 'lyrics', 'clean_lyrics',
       'track_id_y', 'spotify_id', 'emotion', 'id', 'acousticness',
       'analysis_url', 'danceability', 'duration', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')


# Load Dask DataFrame from CSV File

In [5]:
import dask.dataframe as dd
import boto3
import json

# Read the joined table and keep one row per distinct (title, artist) pair
columns_needed = ['track_id', 'title', 'artist']
df = dd.read_csv('final_joined_table.csv')
df = df[columns_needed].dropna().drop_duplicates(subset=['title', 'artist'])

# SQS client and the queue that feeds the Lambda
sqs = boto3.client('sqs')
queue_url = 'https://sqs.us-east-1.amazonaws.com/654654514107/genius-queue'



In [8]:
# Loop through the first N rows — adjust max_rows as needed
max_rows = 6000

# head() only looks at the first partition by default and returns fewer rows
# when that partition is short; npartitions=-1 draws from all partitions.
rows_to_send = df.head(max_rows, npartitions=-1)

n = 0
for n, row in enumerate(rows_to_send.itertuples(index=False), start=1):
    message = {
        "track_id": getattr(row, "track_id", ""),
        "title": getattr(row, "title", ""),
        "artist": getattr(row, "artist", "")
    }

    response = sqs.send_message(
        QueueUrl=queue_url,
        MessageBody=json.dumps(message)
    )

    print(f"No: {n} Sent to SQS: {message['title']} by {message['artist']} — MsgID: {response['MessageId']}")


No: 1 Sent to SQS: Collapsing New People by Fad Gadget — MsgID: a757855e-1879-4057-9652-f9a7fe5b5f73
No: 2 Sent to SQS: We Rule The Fucking Land by Zimmers Hole — MsgID: baea63d0-56ce-48bf-a948-23c8147a7d5f
No: 3 Sent to SQS: Blood by Candiria — MsgID: 632e555c-0beb-4664-beb0-be2fc6476fb3
No: 4 Sent to SQS: Deathly by Aimee Mann — MsgID: 7f2e1cab-8841-4f4f-9c60-d661ef0922ac
No: 5 Sent to SQS: My Definition Of A Boombastic Jazz Style by Dream Warriors — MsgID: c6d86a00-0d6e-4794-97ad-a99aefdb8a16
No: 6 Sent to SQS: Take A Load Off by Bubba Sparxxx — MsgID: 2c5f4010-9286-4665-b7ff-2acaf5650c10
No: 7 Sent to SQS: Rapper's Delight by The Sugarhill Gang — MsgID: f6fa069f-4870-42b8-b400-0289e3ef58ed
No: 8 Sent to SQS: So I Thought by Flyleaf — MsgID: 763e2e96-af77-484e-85c6-26e1e0a86448
No: 9 Sent to SQS: Twentytwofourteen by The Album Leaf — MsgID: d3d9e860-be76-4f92-8ec5-4e0f5317c224
No: 10 Sent to SQS: 21st Century Boy by Willy Mason — MsgID: 2e51a1b5-3e4c-4612-a855-a5d8e0bc58e5
No: 11 Se

In [1]:
import boto3

s3 = boto3.client('s3')
bucket = 'genius-bucket-654654514107'

# list_objects_v2 returns its results in pages; iterate all of them so every
# key under lyrics/ is printed, not just the first page
lyrics_pages = s3.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix='lyrics/')
for page in lyrics_pages:
    for obj in page.get('Contents', []):
        print(obj['Key'])


NoSuchBucket: An error occurred (NoSuchBucket) when calling the ListObjectsV2 operation: The specified bucket does not exist

In [12]:
import boto3

s3 = boto3.client('s3')
bucket = 'genius-bucket-654654514107'

# list_objects_v2 returns its results in pages; iterate all of them so every
# key under lyrics/ is printed, not just the first page
lyrics_pages = s3.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix='lyrics/')
for page in lyrics_pages:
    for obj in page.get('Contents', []):
        print(obj['Key'])

lyrics/123.json
