# **Set-up:**

These are the additional packages to be installed on the terminal:

```bash
pip install scikit-learn
pip install rpy2
pip install spotipy
pip install nltk
pip install wordcloud
```

Run this code to import the required packages:

In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from plotnine import *
import plotnine as p9
import re
from scrapy import Selector
import requests as requests
import json
import statsmodels.api as sm
import spotipy
import base64
from requests import post
from spotipy.oauth2 import SpotifyClientCredentials
from datetime import datetime
from sklearn import *
from base64 import *

# Expand Dataframe from merged YouTube Data

In [None]:
# Load the merged YouTube data from a CSV path relative to the notebook.
merged_df = pd.read_csv('../data/merged_new.csv')

In [None]:
# Preview the first rows of the loaded data.
merged_df.head()

### Function to extract artist and song from 'title'.

In [None]:
def extract_artist_and_song(string):
    '''
    Returns the artist and song parsed from a title of the form 'Artist - Song'

        Parameters:
            string (str): The title, e.g. 'Artist - Song (Official Video) [HD]'

        Returns:
            artist_and_song (list): [artist, song] with surrounding whitespace removed,
                or ['None', 'None'] when the title is not a string or contains no '-'
    '''

    # Always return a two-element list so that callers can index [0] and [1].
    # Returning the bare string 'None' made [0] and [1] evaluate to 'N' and 'o'.
    not_found = ['None', 'None']

    # e.g. NaN from a missing value, which re.sub cannot process
    if not isinstance(string, str):
        return not_found

    # drop annotations in square brackets, then in parentheses
    string = re.sub(r'\[.*?\]', '', string)
    string = re.sub(r'\(.*?\)', '', string)

    if '-' not in string:
        return not_found

    # split on the first hyphen only, so hyphens inside the song title are kept
    artist, song = string.split('-', 1)

    return [artist.strip(), song.strip()]

In [None]:
# NOTE: new_merge is an alias of merged_df (not a copy), so the columns added
# below also appear on merged_df.
new_merge = merged_df

# Parse each title once and reuse the result for both columns
artist_and_song = new_merge['title'].apply(extract_artist_and_song)
new_merge['Artist'] = artist_and_song.apply(lambda pair: pair[0])
new_merge['Song'] = artist_and_song.apply(lambda pair: pair[1])

new_merge.head()

### Function to generate the song URL _(this will be used for web scraping later on)_

In [None]:
def _genius_slug(text):
    # Normalise one URL component: '&' becomes 'and', text is lower-cased,
    # punctuation is removed, and whitespace/hyphen runs collapse to one hyphen.
    text = text.replace('&', 'and').lower()
    text = re.sub(r'[^\w\s-]', '', text)
    text = '-'.join(text.split())
    return re.sub(r'-{2,}', '-', text).strip('-')


def generate_song_url(song_artist, song_title):
    '''
    Returns a string of the URL for the Genius page of the song

        Parameters:
            song_artist (str): The artist of the song
            song_title (str): The title of the song

        Returns:
            song_url (str): The URL for the Genius page of the song
    '''

    base_url = 'https://genius.com/'

    # Apply the same normalisation to both parts: previously '&' was only
    # replaced in the artist, and punctuation or repeated spaces (e.g. left
    # behind after removing parenthesised text) leaked into the URL.
    # NOTE(review): this approximates the Genius URL format; confirm the rules
    # for characters such as '/' and accented letters against real pages.
    formatted_artist = _genius_slug(song_artist)
    formatted_title = _genius_slug(song_title)

    # generate the song URL by concatenating strings according to Genius formatting
    song_url = f'{base_url}{formatted_artist}-{formatted_title}-lyrics'

    return song_url

In [None]:
# Build the Genius URL for each row from its parsed 'Artist' and 'Song' values.
new_merge['Genius_URL'] = new_merge.apply(lambda row: generate_song_url(row['Artist'], row['Song']), axis=1)

new_merge.head()

### Function to scrape the lyrics based on the URL

In [None]:
def scrape_lyrics(session, song_url):
    '''
    Returns the lyrics scraped from a Genius page as one whitespace-normalised string

        Parameters:
            session (variable): The session that has been initialised for requesting from the Genius website
            song_url (str): The URL of the Genius page for the song

        Returns:
            lyrics (str): The lyrics of the song with text in square brackets removed
                and all whitespace collapsed to single spaces ('' if no lyrics
                container is found); None if the request fails or the response
                status is not 200
    '''

    # use initialised session to enhance performance
    try:
        response = session.get(song_url, timeout=10)
    except requests.RequestException:
        # a timeout or connection error for one song should not abort a whole
        # DataFrame.apply run; treat it like any other unavailable page
        return None

    # check the status before spending time parsing the page
    if response.status_code != 200:
        return None

    sel = Selector(text=response.text)

    # scrape lyrics into one large string
    # NOTE(review): this CSS class looks like a generated name and may change
    # when Genius redeploys its front end; confirm it still matches.
    raw_lyrics = ' '.join(sel.css('div.Lyrics__Container-sc-1ynbvzw-1.kUgSbL ::text').getall())

    # clean lyrics using regular expression to remove words in square brackets
    lyrics_without_tags = re.sub(r'\[.*?\]', '', raw_lyrics)

    # collapse every run of whitespace (including newlines) to a single space
    lyrics = ' '.join(lyrics_without_tags.split())

    return lyrics

In [None]:
# Path (relative to the notebook) of the JSON file holding the API credentials.
credentials_file_path = "../credentials.json"

# Parse the file; its 'client_id' and 'client_secret' entries are read later on.
with open(credentials_file_path, "r") as f:
    credentials = json.load(f)

In [None]:
# One shared HTTP session, passed to scrape_lyrics for every request.
my_session = requests.Session()

### Under 'wikipedia_categories', there are separate links for different potential genres

We have noticed that:
* Each link is separated by a comma ','
* Every song has at least one category – 'music'
* Some songs are in multiple categories: the majority have only one, some have two, and songs with three or more categories are extremely rare

Therefore, we can count the number of commas to determine the number of categories, with the function as such:

In [None]:
def get_category_number(x):
    '''
    Returns the number of comma-separated categories in x

        Parameters:
            x: The raw 'wikipedia_categories' value; converted to a string before counting

        Returns:
            category_number (int): Number of categories (commas + 1), or 0 for
                a missing or empty value
    '''

    # missing values (None, or NaN which is the only float not equal to itself)
    if x is None or (isinstance(x, float) and x != x):
        return 0

    # NOTE(review): surrounding '[' / ']' are stripped on the assumption that
    # the value may be stored as a list-like string; confirm against the data.
    string = str(x).strip().strip('[]').strip()
    if not string:
        return 0

    # n categories are separated by n - 1 commas, so counting the commas
    # alone under-reports every song by one
    return string.count(',') + 1

In [None]:
# Store the per-row result of get_category_number in a new column.
new_merge['category_number'] = new_merge['wikipedia_categories'].apply(lambda x: get_category_number(x))

In [None]:
# Preview the data with the new column.
new_merge.head()

In [None]:
# Work on the first 150 rows only. The explicit copy makes new_merge2 an
# independent frame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
new_merge2 = new_merge.head(150).copy()

In [None]:
# Scrape lyrics for the first 200 rows (one HTTP request per row). The explicit
# copy avoids pandas' SettingWithCopyWarning when the new column is assigned.
big_merge = new_merge.head(200).copy()
big_merge['lyrics'] = big_merge.apply(lambda row: scrape_lyrics(my_session, row['Genius_URL']), axis=1)

In [None]:
# Scrape lyrics for every row of new_merge2 (one HTTP request per row).
new_merge2['lyrics'] = new_merge2.apply(lambda row: scrape_lyrics(my_session, row['Genius_URL']), axis=1)

new_merge2.head()

# Clean and Analyse Data

### We create a new dataframe with the necessary headers only, removing missing ('None') values and duplicates

In [None]:
# Drop every row that has a missing value in any column
new_merge3 = new_merge2.dropna()

# Keep only the columns needed for the analysis, one row per song title
analysis_columns = ['Artist', 'Song', 'like_count', 'view_count', 'comment_count', 'lyrics', 'category_number']
df = new_merge3[analysis_columns].dropna().drop_duplicates(subset = ['Song'])

# scrape_lyrics gives '' when a page has no matching lyrics container.
# The explicit copy makes df an independent frame, so the columns added to it
# later do not trigger pandas' SettingWithCopyWarning.
df = df[df['lyrics'] != ''].copy()

df.head()

### Imported package to analyse sentiments

We create a function and apply it to the dataframe.

In [None]:
# Build the analyser once; if the VADER lexicon has not been downloaded yet,
# the constructor raises LookupError, so fetch it and retry.
try:
    sid = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sid = SentimentIntensityAnalyzer()

def get_sentiment_score(lyric):
    '''
    Returns the sentiment scores of the lyrics from the shared analyser `sid`

        Parameters:
            lyric (str): The lyrics of the song

        Returns:
            sentiment_scores (list): The scores in the order
                [negative, neutral, positive, compound]
    '''
    scores = sid.polarity_scores(lyric)

    # return the list directly instead of binding it to a name that shadows
    # the builtin `list`
    return [scores['neg'], scores['neu'], scores['pos'], scores['compound']]

In [None]:
# Score each lyric once and reuse the result for all four columns
# (list order from get_sentiment_score: [negative, neutral, positive, compound])
sentiment_scores = df['lyrics'].apply(get_sentiment_score)

df['sentiment_positive'] = sentiment_scores.apply(lambda scores: scores[2])
df['sentiment_neutral'] = sentiment_scores.apply(lambda scores: scores[1])
df['sentiment_negative'] = sentiment_scores.apply(lambda scores: scores[0])
df['sentiment_compound'] = sentiment_scores.apply(lambda scores: scores[3])

df.head()

### We define lexical richness as the proportion of unique words to total words used, a quantitative way to analyse the richness of vocabulary used in a song. Using function below:

In [None]:
def get_lexical_richness(lyric):
    '''
    Returns the percentage of unique words among all words of the lyrics

        Parameters:
            lyric (str): The lyrics of the song

        Returns:
            lexical_richness (int): unique words / total words * 100, rounded
                to the nearest integer; 0 when the lyrics contain no words
    '''
    # split once; words are compared exactly as written (case-sensitive)
    words = lyric.split()

    # empty or whitespace-only lyrics would otherwise divide by zero
    if not words:
        return 0

    lexical_richness = len(set(words))/len(words)*100
    return round(lexical_richness)

In [None]:
# Store the lexical richness of each song's lyrics in a new column.
df['lexical_richness'] = df['lyrics'].apply(lambda x: get_lexical_richness(x))

df.head()

### Find song length as well

In [None]:
# 'song_length' is the number of whitespace-separated words in the lyrics
# (a word count, not a duration).
df['song_length'] = df['lyrics'].apply(lambda x: len(x.split()))

df.head()

In [None]:
# Absolute value of the compound score, i.e. its magnitude regardless of sign.
df['sentiment_compound_absolute'] = df['sentiment_compound'].abs()

df.head()

# Integrate Spotify API

Lastly, we integrate the Spotify API to obtain additional features for each song

In [None]:
# Spotify application credentials read from the loaded credentials file.
client_id = credentials['client_id']
client_secret = credentials['client_secret']

# Base64-encode "client_id:client_secret" for the Basic authorization header.
client_creds = f"{client_id}:{client_secret}"
base64_client_creds = b64encode(client_creds.encode()).decode()

auth_url = 'https://accounts.spotify.com/api/token'
headers = {
    'Authorization': f'Basic {base64_client_creds}'
}
payload = {
    'grant_type': 'client_credentials'
}

# Request a token using the client-credentials grant.
response = requests.post(auth_url, headers=headers, data=payload)

# NOTE(review): as the last expression of the cell this displays the parsed
# response in the notebook output; presumably it includes the access token,
# so consider clearing the output before sharing the notebook.
response.json()


### Using 'spotipy' package and the search() function, we are able to get data in the json file such as release date, a popularity score, whether the song is explicit, and the number of markets that the song is in during its initial release

In [None]:
def get_release_date(song):
    '''
    Returns the album release date of the first Spotify search result for the song

        Parameters:
            song (str): The search query (the song title)

        Returns:
            release_date: The 'release_date' value of the first result's album,
                or None when the search returns no tracks
    '''
    client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    result = sp.search(song)

    # an empty result list would otherwise raise IndexError and abort the
    # whole DataFrame.apply run
    tracks = result['tracks']['items']
    if not tracks:
        return None

    release_date = tracks[0]['album']['release_date']
    return release_date

In [None]:
def get_popularity(song):
    '''
    Returns the popularity score of the first Spotify search result for the song

        Parameters:
            song (str): The search query (the song title)

        Returns:
            popularity: The 'popularity' value of the first result,
                or None when the search returns no tracks
    '''
    client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    result = sp.search(song)

    # an empty result list would otherwise raise IndexError and abort the
    # whole DataFrame.apply run
    tracks = result['tracks']['items']
    if not tracks:
        return None

    popularity = tracks[0]['popularity']
    return popularity

In [None]:
def get_explicitness(song):
    '''
    Returns the explicit flag of the first Spotify search result for the song

        Parameters:
            song (str): The search query (the song title)

        Returns:
            explicitness: The 'explicit' value of the first result,
                or None when the search returns no tracks
    '''
    client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    result = sp.search(song)

    # an empty result list would otherwise raise IndexError and abort the
    # whole DataFrame.apply run
    tracks = result['tracks']['items']
    if not tracks:
        return None

    explicitness = tracks[0]['explicit']
    return explicitness

In [None]:
def get_market_number(song):
    '''
    Returns the number of available markets of the first Spotify search result for the song

        Parameters:
            song (str): The search query (the song title)

        Returns:
            market_number (int): Length of the first result's 'available_markets'
                list, or 0 when the search returns no tracks
    '''
    client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    result = sp.search(song)

    # an empty result list would otherwise raise IndexError and abort the
    # whole DataFrame.apply run; a song that cannot be found is counted as
    # available in no market, which keeps the return value an int
    tracks = result['tracks']['items']
    if not tracks:
        return 0

    available_markets = tracks[0]['available_markets']
    return len(available_markets)

### Integrating these into our existing dataframe:

In [None]:
# Add the Spotify metadata, looked up by song title.
# Each helper runs its own Spotify search, so every song is searched four times.
df['release_date'] = df['Song'].apply(lambda x: get_release_date(x))
df['popularity'] = df['Song'].apply(lambda x: get_popularity(x))
df['explicitness'] = df['Song'].apply(lambda x: get_explicitness(x))
df['markets'] = df['Song'].apply(lambda x: get_market_number(x))

### We want to convert our date to datetime format for ease of plotting later on

In [None]:
def convert_date(x):
    '''
    Returns x converted with pd.to_datetime, or None if it cannot be parsed

        Parameters:
            x: The value to convert, e.g. a date string

        Returns:
            date: The result of pd.to_datetime(x), or None when the conversion fails
    '''
    try:
        # parse once and return the result directly
        return pd.to_datetime(x)
    except (ValueError, TypeError, OverflowError):
        # only parsing failures are treated as "no date"; anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed by a bare except
        return None

In [None]:
# Convert element by element so that a value that cannot be parsed becomes
# missing instead of raising. No .dropna() on the right-hand side: column
# assignment aligns on the index, so dropped entries would simply come back
# as missing values and the call had no effect.
df['release_date'] = df['release_date'].apply(convert_date)
df.head()

### For the number of markets of song release, we found some interesting facts:

For an initial release of song, it is in either:
* all 184 markets in the world
* slightly fewer than 184 markets (a sign that there is some censorship in certain countries, a hint that the song may be culturally inappropriate/politically sensitive)
* or very few markets (<50) (a sign that the song is deliberately released only in some markets, targeting niche categories)

Hence justifying the below function, categorising them into high, medium, or low level of outreach

In [None]:
def market_availability_category(x, full_coverage=184, low_threshold=50):
    '''
    Returns the level of outreach for a number of markets

        Parameters:
            x: The number of markets (anything accepted by int())
            full_coverage (int): Market count treated as a worldwide release
            low_threshold (int): Counts up to and including this value are 'Low'

        Returns:
            category (str): 'High', 'Medium' or 'Low'
    '''
    number = int(x)

    # '>=' rather than '==': a count above full_coverage used to fall through
    # to 'Low'
    if number >= full_coverage:
        return 'High'
    if number > low_threshold:
        return 'Medium'
    return 'Low'

In [None]:
# Replace the numeric market count with its 'High' / 'Medium' / 'Low' label.
# The column is overwritten in place, so re-running this cell on the labelled
# column fails in int().
df['markets'] = df['markets'].apply(lambda x: market_availability_category(x))
df.head()

### Similarly, for song categories:

We initially attempted to obtain song genres via YouTube, Genius or Spotify. However, we faced significant difficulties due to the fact that:
* The data is not explicitly available – these platforms offer limited data to the public due to privacy reasons
* It is very difficult to get the genre via the API itself

Therefore, we turned to Wikipedia, an open source, to find out the song genre/category. However, because Wikipedia offers only a limited number of categorisations, we focus on the number of categories instead, i.e. the number of Wikipedia pages a song is linked to.
* Most songs do not belong to any specific category on Wikipedia; they are simply categorised as "music".
* Most of the remaining songs belong to two Wikipedia categories: "music" and something else, such as "electro".
* The rest are a very small minority that belong to three or more Wikipedia categories.

Hence justifying our rationale to have broad categories. Songs that are not relevant enough to have more than one genre are categorised as "Low" in terms of category popularity; two as "Medium", three or more as "High". The function below:

In [None]:
def category_popularity(x):
    '''
    Returns the category popularity level for a number of categories

        Parameters:
            x: The number of categories (anything accepted by int())

        Returns:
            popularity (str): 'Low' for at most one category, 'Medium' for
                exactly two, 'High' for three or more
    '''
    number = int(x)

    # '<=' rather than '==': a count of 0 used to fall through to 'High'
    if number <= 1:
        return 'Low'
    elif number == 2:
        return 'Medium'
    else:
        return 'High'

In [None]:
# Replace the numeric category count with its 'Low' / 'Medium' / 'High' label.
# The column is overwritten in place, so re-running this cell on the labelled
# column fails in int().
df['category_number'] = df['category_number'].apply(lambda x: category_popularity(x))
df.head()

In [None]:
# Word cloud of the lyrics in the first row of df. The column is selected by
# name: the positional index 5 only pointed at 'lyrics' for as long as the
# column order of df stayed unchanged.
wordcloud = WordCloud().generate(df['lyrics'].iloc[0])

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Pairwise correlations between the numeric engagement, sentiment and lyric features
corr_columns = [
    'like_count',
    'view_count',
    'comment_count',
    'sentiment_positive',
    'sentiment_neutral',
    'sentiment_negative',
    'sentiment_compound_absolute',
    'lexical_richness',
    'song_length',
    'popularity',
]
corr_df = df[corr_columns].corr()

In [None]:
# Reshape the matrix to long format: one row per (index, variable) pair with
# its correlation in 'value'
corr_df2 = corr_df.melt(ignore_index=False).reset_index()

# Two-decimal version of the coefficient for the tile labels
corr_df2['rounded_value'] = corr_df2['value'].round(2)

In [None]:
# Heatmap of the long-format correlations, labelled with the rounded coefficients
g = (
    p9.ggplot(
        data=corr_df2,
        mapping=p9.aes('index', 'variable', fill='value'),
    )
    + p9.geom_tile()
    + p9.geom_label(p9.aes(label='rounded_value'), fill='white', size=8)
    + p9.scale_fill_distiller()
    + p9.theme_minimal()
    + p9.labs(title='Correlation Matrix', x='', y='')
    + p9.theme(axis_text_x=p9.element_text(angle=90))
)

g

In [None]:
# Histogram of the compound sentiment score in bins of width 0.05
hist = (
    p9.ggplot(data=df, mapping=p9.aes(x='sentiment_compound'))
    + p9.geom_histogram(binwidth=0.05)
)

hist

In [None]:
# Box plot of popularity for each value of the explicit flag
boxplot = ggplot(df, aes(x='explicitness', y='popularity')) + geom_boxplot()

boxplot

In [None]:
# Lyric word count against release date, coloured by the explicit flag, with
# a linear fit per colour group; the x axis is limited to 2000-2024
date_limits = (datetime(2000, 1, 1), datetime(2024, 1, 1))

line = (
    ggplot(df, aes(x='release_date', y='song_length', colour='explicitness'))
    + geom_point(alpha=0.5)
    + geom_smooth(method='lm')
    + scale_x_datetime(limits=date_limits)
)

line

In [None]:
# 2D binned counts of popularity against lyric word count
contour = (
    ggplot(df, aes(x='popularity', y='song_length'))
    + geom_bin2d()
    + theme_classic()
)

contour

In [None]:
# Write the analysis dataframe to a JSON file, at a path relative to the notebook.
df.to_json("../data/json_for_plot.json")

In [None]:
# Appears intended to draw filled contours of popularity over lexical richness
# and compound sentiment. This rebinds the name `contour` used by an earlier plot.
# NOTE(review): the closing parenthesis of geom_contour_filled( looks misplaced:
# as written, geom_contour(...) is added to aes(fill = 'level') inside the
# call rather than to the plot.
# NOTE(review): confirm that the installed plotnine version provides
# geom_contour_filled and geom_contour at all.
contour = (
    ggplot(df) +
    aes(x = 'lexical_richness', y = 'sentiment_compound', z = 'popularity') +
    geom_contour_filled(aes(fill = 'level') +
    geom_contour(colour = 'black'))
)

contour

In [None]:
# Density of popularity, drawn separately for each category-popularity label
distribution = (
    ggplot(df, aes(x='popularity', colour='category_number', fill='category_number'))
    + geom_density(alpha=0.2)
)

distribution

In [None]:
# Load the rpy2 IPython extension, which provides the %%R cell magic used below.
%load_ext rpy2.ipython

In [None]:
%%R
# Install the IRkernel package into the R installation used by rpy2.
# NOTE(review): install.packages() without a `repos` argument may ask for a
# CRAN mirror; confirm that this runs unattended.
install.packages("IRkernel")

In [None]:
%%R
# NOTE(review): IRkernel is installed again here, as in the previous cell;
# presumably redundant, confirm.
install.packages("IRkernel")
# NOTE(review): the expression below follows plotnine (Python) conventions:
# quoted column names inside aes(), '..level..' as a string, scale_fill_cmap.
# This cell neither loads ggplot2 with library() nor receives `df` from Python
# (e.g. via `%%R -i df`), so it looks unlikely to run as R as written.
# Confirm the intent.
plot = (
    ggplot(df, aes(x='lexical_richness', y='sentiment_compound', z='popularity')) +
    geom_contour_filled(aes(fill='..level..')) +
    geom_contour(color='black') +
    scale_fill_cmap(name='viridis')
)