# Install Requests

In [None]:
%%bash
pip install requests

# SANITY CHECK - Search for a song

In [1]:
import os
import warnings

import requests

# Token from Genius API.
# It is read from the GENIUS_ACCESS_TOKEN environment variable so the secret
# is never stored in the notebook. Any token that was previously saved in a
# notebook should be treated as leaked and regenerated.
access_token = os.environ.get("GENIUS_ACCESS_TOKEN", "")
if not access_token:
    warnings.warn(
        "GENIUS_ACCESS_TOKEN is not set; requests to the Genius API "
        "will be sent without a valid bearer token."
    )

# Authorization header sent with every Genius API request.
headers = {
    "Authorization": f"Bearer {access_token}"
}

def search_song(query, timeout=10):
    """Search the Genius API for songs matching ``query``.

    Args:
        query: Free-text search string, sent as the ``q`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the search response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    base_url = "https://api.genius.com"
    search_url = f"{base_url}/search"
    params = {"q": query}
    response = requests.get(
        search_url, params=params, headers=headers, timeout=timeout
    )
    # Surface HTTP errors here with their status code instead of handing an
    # error payload to callers that index into the expected result structure.
    response.raise_for_status()
    return response.json()

# Run one search for a known track
results = search_song("Welcome to New York Taylor Swift")

# Show the title and Genius page of the first three hits
top_hits = results["response"]["hits"][:3]
for hit in top_hits:
    song = hit["result"]
    print("Title:", song["full_title"])
    print("Genius URL:", song["url"])
    print()

Title: Welcome to New York by Taylor Swift
Genius URL: https://genius.com/Taylor-swift-welcome-to-new-york-lyrics

Title: ​portorosso’s DISCONTINUED 2023 Listening Log by still-life starlet
Genius URL: https://genius.com/Still-life-starlet-portorossos-discontinued-2023-listening-log-annotated

Title: ​youth group - background music by Josiah Botting
Genius URL: https://genius.com/Josiah-botting-youth-group-background-music-annotated



# Set up Lambda Function Connection

In [3]:
import boto3
import json

# SQS setup: the queue name and the boto3 client used by the cells below
queue_name = 'genius-queue'
sqs = boto3.client('sqs')

In [4]:
# Create the queue, or look up its URL if a queue with this name already exists
try:
    response = sqs.create_queue(
        QueueName=queue_name,
        Attributes={
            'VisibilityTimeout': '60'
        }
    )
    queue_url = response['QueueUrl']
    print(f"SQS queue created: {queue_url}")
except sqs.exceptions.QueueNameExists:
    # Resolve the URL by exact queue name. Filtering list_queues() output with
    # a substring test could select a different queue whose URL merely
    # contains this name (e.g. one sharing the same prefix).
    queue_url = sqs.get_queue_url(QueueName=queue_name)['QueueUrl']
    print(f"SQS queue already exists: {queue_url}")

SQS queue created: https://sqs.us-east-1.amazonaws.com/654654514107/genius-queue


In [6]:
# Clients for the services involved in deploying and wiring up the function
iam = boto3.client('iam')
lambda_client = boto3.client('lambda')
sqs = boto3.client('sqs')

# Deployment settings
zip_path = 'genius_lambda.py.zip'
function_name = 'genius_lambda'
sqs_queue_url = queue_url

# ARN of the execution role the function will run under
lab_role = iam.get_role(RoleName='LabRole')
role = lab_role['Role']['Arn']

# Read the packaged code as raw bytes for upload
with open(zip_path, 'rb') as zip_file:
    zipped_code = zip_file.read()

In [7]:
# Create the Lambda function, or upload fresh code if it already exists.
# In both branches, block until Lambda reports the function as ready so the
# cells that follow do not act on a function that is still being provisioned.
try:
    response = lambda_client.create_function(
        FunctionName=function_name,
        Runtime='python3.9',
        Role=role,
        # NOTE(review): the handler module is 'lambda_function' while the
        # uploaded archive is named 'genius_lambda.py.zip' -- confirm the zip
        # actually contains lambda_function.py with a lambda_handler function.
        Handler='lambda_function.lambda_handler',
        Code={'ZipFile': zipped_code},
        Timeout=10
    )
    # Wait for the newly created function to reach the Active state.
    lambda_client.get_waiter('function_active').wait(FunctionName=function_name)
    print("lambda function created!")
except lambda_client.exceptions.ResourceConflictException:
    print("lambda function already exists")
    response = lambda_client.update_function_code(
        FunctionName=function_name,
        ZipFile=zipped_code
    )
    # Wait for the code update to finish before the function is used again.
    lambda_client.get_waiter('function_updated').wait(FunctionName=function_name)

lambda function created!


In [8]:
# Fetch the queue's ARN, which identifies it as the trigger's event source
arn_request = {
    'QueueUrl': sqs_queue_url,
    'AttributeNames': ['QueueArn'],
}
sqs_attrs = sqs.get_queue_attributes(**arn_request)
sqs_arn = sqs_attrs['Attributes']['QueueArn']

In [9]:
# Connect the queue to the function as an event source (the SQS trigger)
trigger_settings = {
    'EventSourceArn': sqs_arn,
    'FunctionName': function_name,
    'Enabled': True,
    'BatchSize': 10,
}
try:
    lambda_client.create_event_source_mapping(**trigger_settings)
except lambda_client.exceptions.ResourceConflictException:
    # Lambda reports a conflict for this request; treated as "already set up".
    print("trigger already exists")
else:
    print("trigger creation successful")

trigger creation successful


# 2. Use Dask to clean and match data

In [2]:
import dask.dataframe as dd
import pandas as pd
import boto3
import json
import re
from nltk.corpus import stopwords

# English stop words from the NLTK corpus, held in a set for the per-word
# membership tests done while cleaning.
stop_words = set(stopwords.words('english'))

def clean_lyrics(df):
    """Return ``df`` with an added ``cleaned_lyrics`` column.

    Each value of the ``lyrics`` column is lower-cased, stripped of every
    character that is not an ASCII letter or whitespace, split on whitespace,
    and re-joined with single spaces after dropping words found in
    ``stop_words``. Values that are not strings (e.g. missing lyrics) become
    an empty string instead of raising when the result is computed.

    The input frame is left untouched; the result is a new frame built with
    ``assign``.

    Args:
        df: Frame with a ``lyrics`` column whose ``apply`` accepts a ``meta``
            argument (this notebook passes a Dask DataFrame).

    Returns:
        A new frame with all original columns plus ``cleaned_lyrics``.
    """
    def _clean_text(text):
        """Normalize one lyrics value to space-separated, stop-word-free words."""
        if not isinstance(text, str):
            return ''
        letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        return ' '.join(
            word for word in letters_only.split()
            if word not in stop_words
        )

    cleaned = df['lyrics'].apply(_clean_text, meta=('lyrics', 'str'))
    return df.assign(cleaned_lyrics=cleaned)

# Read every lyrics JSON file under the bucket prefix, then add the
# cleaned-lyrics column.
# NOTE(review): the saved output of this cell is a TypeError raised from
# ClientArgsCreator.compute_endpoint_resolver_builtin_defaults() while
# read_json was called -- presumably the installed botocore and the async S3
# client used for s3:// paths are incompatible versions; confirm and align
# them before re-running.
lyrics_path = 's3://genius-bucket-654654514107/lyrics/*.json'
lyrics_df = dd.read_json(lyrics_path)
cleaned_df = clean_lyrics(lyrics_df)


TypeError: An error occurred while calling the read_json method registered to the pandas backend.
Original Message: An error occurred while calling the read_json method registered to the pandas backend.
Original Message: ClientArgsCreator.compute_endpoint_resolver_builtin_defaults() missing 2 required positional arguments: 'credentials' and 'account_id_endpoint_mode'

In [None]:
# Load the song metadata table and keep only the rows whose title and artist
# both match a row of the cleaned lyrics (inner join).
join_keys = ['title', 'artist']
metadata_df = dd.read_csv("final_joined_table.csv")
joined = cleaned_df.merge(metadata_df, how='inner', on=join_keys)
