## AE1 - Working With Text

#### Library Imports

In [2]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import defaultdict
import random
from nltk.stem import WordNetLemmatizer
import numpy as np

#### Download Stopwords

In [3]:
# fetch every NLTK resource the notebook relies on
for nltk_resource in ('stopwords', 'punkt', 'vader_lexicon', 'wordnet'):
    nltk.download(nltk_resource)

# English stop-words with their apostrophes removed
stop_words = [stop_word.replace("'", "") for stop_word in stopwords.words('english')]

# hand-picked stop-words added on top of the NLTK list
my_stop_words = [
    'im', 'like', 'get', 'got', 'aint',
    'dont', 'oh', 'yeah', 'cause', 'verse',
    'chorus', 'know', 'na', 'right', 'thats',
    'cant', 'never', 'see', 'say', 'back', 'go',
    'tell', 'make', 'need', 'take', 'let', 'youre',
    'want', 'ya', 'hook', 'wrong', 'look', 'come',
    'thought', 'wan', 'way', 'ill', 'keep', 'feel',
    'could', 'even', 'gon', 'em', 'still', 'think',
    'every',
]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/owensharpe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/owensharpe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/owensharpe/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/owensharpe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


DISCLAIMER: I wrote all of these functions and additions in the order of the prompt, and then created a 'main' section where the user can input two artists and see the desired outputs for each artist.

#### 1. Loading and Pre-Processing Text

In [None]:
def clean_text(text):
    """
    Normalise raw song lyrics into a lowercase string without punctuation or digits.

    :param text: a specific song text
    :return: the cleaned text as a single string (words separated by one space,
             no leading/trailing whitespace); an empty string if text is not a string
    """

    # NOTE(review): presumably missing lyrics arrive as NaN/None from the CSV;
    # treat any non-string as "no text" instead of crashing on .lower()
    if not isinstance(text, str):
        return ""

    # turn to lowercase
    cleaned_text = text.lower()

    # remove special characters and digits, then collapse runs of whitespace
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
    cleaned_text = re.sub(r'\d', '', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

    # strip last: removing punctuation/digits can expose new leading/trailing spaces
    return cleaned_text.strip()

In [4]:
def get_cleaned_words(text):
    """
    Tokenise one song's lyrics and drop every stop-word.

    :param text: the raw text from a cell of a dataframe
    :return: list of tokens that appear in neither stop_words nor my_stop_words
    """

    # clean first, then split into tokens
    tokens = word_tokenize(clean_text(text))

    # keep only tokens that are absent from both stop-word lists
    return [token for token in tokens
            if not (token in stop_words or token in my_stop_words)]

In [None]:
# load the lyrics dataset
spotify_df = pd.read_csv("spotify_millsongdata.csv")

# the link column is not needed, so drop it
spotify_df = spotify_df.drop(columns=['link'])

# add a column holding the cleaned word list of every row/song
spotify_df['cleaned_words'] = spotify_df['text'].apply(get_cleaned_words)

#### 2. Analysing Text

##### i) Get Word Frequencies

In [None]:
def get_word_frequencies(artist_dict, word_list):
    """
    Add one song's words to an artist's running word counts.

    :param artist_dict: a dictionary mapping word -> count for an artist (updated in place)
    :param word_list: a list of words from a specific song
    :return: the same artist_dict object, now including the song's words
    """

    for token in word_list:
        # unseen words start from zero
        artist_dict[token] = artist_dict.get(token, 0) + 1

    return artist_dict

In [None]:
# one [artist, word-frequency dict] entry per artist
artist_words = []

# every distinct artist in the dataframe, in order of first appearance
artists = spotify_df['artist'].unique()

for artist in artists:
    # cleaned word lists of all songs belonging to this artist
    songs_by_artist = spotify_df.loc[spotify_df['artist'] == artist, 'cleaned_words']

    # fold every song into a single frequency dictionary
    temp_word_freq = {}
    for song_words in songs_by_artist:
        temp_word_freq = get_word_frequencies(temp_word_freq, song_words)

    artist_words.append([artist, temp_word_freq])

##### ii) Determining the Word Richness

In [None]:
def calc_word_richness(all_artist_info):
    """
    Rank artists by unique-word count and append a richness score to each entry.

    :param all_artist_info: list of artist entries; index 0 is the artist name and
                            index 2 is the artist's unique word count
    :return: nothing; a score from 0 up to (but not including) 10 is appended
             to every entry in place
    """

    total_artists = len(all_artist_info)

    # most unique words first; ties keep their original relative order
    by_unique_words = sorted(all_artist_info, key=lambda info: info[2], reverse=True)

    # rank 1 goes to the artist with the most unique words
    rank_of = {}
    for position, info in enumerate(by_unique_words, start=1):
        rank_of[info[0]] = position

    # convert each rank to a score: the lowest-ranked artist receives 0
    for info in all_artist_info:
        info.append((total_artists - rank_of[info[0]]) / total_artists * 10)

In [None]:
# get unique word counts for each artist
for artist in artist_words:

    # the number of distinct words is the size of the frequency dictionary
    unique_word_count = len(artist[1])

    # slice-assign instead of append: this writes the count at index 2 and discards
    # anything left over from a previous run, so re-running the cell does not keep
    # growing each artist entry
    artist[2:] = [unique_word_count]

# creating a richness score from 0-10; I will rank each artist on unique words then formulate a score
# (calc_word_richness appends the score, which therefore always lands at index 3)
calc_word_richness(artist_words)

##### iii) Getting the Sentiment Score

In [None]:
def get_sentiment(word_list):
    """
    Average the VADER compound score over the words of one song.

    :param word_list: a list of words from a specific song
    :return: the mean per-word compound sentiment score of the song;
             0.0 when the song has no words left after cleaning
    """

    # a song made up only of stop-words would otherwise divide by zero below
    if not word_list:
        return 0.0

    # build the analyzer once and reuse it on later calls (cached on the function)
    # instead of constructing a new one for every song
    sia = getattr(get_sentiment, "_analyzer", None)
    if sia is None:
        sia = SentimentIntensityAnalyzer()
        get_sentiment._analyzer = sia

    # aggregate the compound score of every individual word
    total_sent_score = sum(sia.polarity_scores(word)['compound'] for word in word_list)

    # average over the number of words in the song
    return total_sent_score / len(word_list)

In [None]:
# score every song's sentiment (ranging from -1 to 1) from its cleaned word list
spotify_df['sentiment_score'] = spotify_df['cleaned_words'].map(get_sentiment)

##### iv) Identifying any Common Words between Artists

For this, I am getting the top 25 most common words between artists, but you could pick any number 'n' to be used

In [None]:
def find_common_words(both_artist_info, top_n=25):
    """
    Find the words both artists use, ranked by their combined frequency.

    :param both_artist_info: two artist entries; index 1 of each is its word-frequency dict
    :param top_n: how many shared words to return
    :return: list of (word, combined count) tuples, highest count first, at most top_n long
    """

    first_freqs = both_artist_info[0][1]
    second_freqs = both_artist_info[1][1]

    # total occurrences of every word that appears for both artists
    shared_totals = {}
    for token in set(first_freqs.keys()) & set(second_freqs.keys()):
        shared_totals[token] = first_freqs[token] + second_freqs[token]

    # most frequent shared words first
    ranked = sorted(shared_totals.items(), key=lambda pair: pair[1], reverse=True)

    return ranked[:top_n]

#### 3. Visualizing Text

##### i) Generating a Word Cloud for each Artist

In [None]:
def generate_word_cloud(artist_info):
    """
    Draw a word cloud built from one artist's word frequencies.

    :param artist_info: an artist entry; index 0 is the name, index 1 the word-frequency dict
    :return: null (plotting)
    """

    word_frequencies = artist_info[1]

    # build the cloud image from the frequency dictionary
    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    artist_wordcloud = cloud_builder.generate_from_frequencies(word_frequencies)

    # render the image on its own axes, without ticks or frame
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(artist_wordcloud, interpolation='bilinear')
    ax.set_title(artist_info[0] + "'s Word Cloud")
    ax.axis('off')
    plt.show()

##### ii) Plotting Sentiment Scores in a Scatter Plot

In [None]:
def plot_sentiment(dataframe, artist_name):
    """
    Scatter-plot the sentiment score of every song by one artist.

    :param dataframe: song information with 'artist', 'song' and 'sentiment_score' columns
    :param artist_name: the artist's name
    :return: null (plotting)
    """

    # filter using the dataframe that was passed in (not the notebook-level global),
    # so the function also works on a subset or a copy
    spec_artist_df = dataframe.loc[dataframe['artist'] == artist_name]

    # make scatter plot: one labelled point per song, x is the song's position
    plt.figure(figsize=(10,5))
    for position, (_, curr_row) in enumerate(spec_artist_df.iterrows()):
        plt.scatter(position, curr_row['sentiment_score'])
        plt.text(position, curr_row['sentiment_score'], curr_row['song'], fontsize=5, ha='right')

    # give plot additions
    plt.title(f"{artist_name}'s Song Sentiment Scores")
    plt.xlabel('Index')
    plt.ylabel('Sentiment Score')
    plt.grid(True)

    # show the figure explicitly, like the other plotting helpers in this notebook
    plt.show()

##### iii) Creating a Heat Map to Show Word Overlap Between Artists

In [None]:
def get_top_words(word_dict, top_n=25):
    """
    Pick an artist's most frequent words.

    :param word_dict: an artist's word frequency dictionary
    :param top_n: how many of the most frequent words to return
    :return: list of (word, count) tuples, highest count first, at most top_n long
    """

    ranked_pairs = list(word_dict.items())

    # stable sort: words with equal counts keep their dictionary order
    ranked_pairs.sort(key=lambda pair: pair[1], reverse=True)

    return ranked_pairs[:top_n]

In [None]:
def create_heatmap(both_artist_info):
    """
    Plot a heatmap comparing the two artists' most frequent words.

    Every word from either artist's top list gets exactly one row; the two columns
    hold that word's frequency within each artist's top list (0 when the word is
    not in that artist's top list).

    :param both_artist_info: two artist entries; index 0 is the name, index 1 the
                             word-frequency dict
    :return: null (plotting)
    :raises ValueError: if neither artist has any words to plot
    """

    # top words of each artist as word -> frequency lookups
    art1_freqs = dict(get_top_words(both_artist_info[0][1]))
    art2_freqs = dict(get_top_words(both_artist_info[1][1]))

    # union of both top lists, first-seen order, without duplicates:
    # a word shared by both artists must appear on a single row, not two
    all_words = list(dict.fromkeys([*art1_freqs, *art2_freqs]))
    if not all_words:
        raise ValueError("neither artist has any words to plot")

    # one row per word, one column per artist
    heatmap_data = np.array([[art1_freqs.get(word, 0), art2_freqs.get(word, 0)]
                             for word in all_words])

    # get artist names
    artist_names = [spec_artist[0] for spec_artist in both_artist_info]

    # plot heatmap
    plt.figure(figsize=(10, 6))
    plt.imshow(heatmap_data, cmap='viridis', aspect='auto')

    # additional plot adjustments
    plt.colorbar(label='Frequency')
    plt.yticks(range(len(all_words)), all_words, fontsize=8, rotation=0)
    plt.xticks(range(2), [artist_names[0], artist_names[1]])
    plt.title('Common Words Heatmap for Both Artists')

    # show plot
    plt.tight_layout()
    plt.show()

#### 4) Generating Text

In [None]:
def get_artists():
    """
    Ask the user for the two artists to compare.

    :return: specified artist1 and artist2, with surrounding whitespace removed
    """

    # strip the answers so a stray space or tab does not break the exact-match
    # lookup against the artist names
    art1 = input("Who is the first artist you want?: ").strip()
    art2 = input("Who is the second artist you want?: ").strip()

    return art1, art2

In [None]:
def get_all_song_texts(spec_artist, dataframe):
    """
    Concatenate the cleaned lyrics of every song by one artist.

    :param spec_artist: a specified artist
    :param dataframe: song information with 'artist' and 'text' columns
    :return: one string holding all of the artist's cleaned lyrics, each song
             preceded by a single space (empty string if the artist has no songs)
    """

    # raw lyrics of the artist's songs only
    artist_texts = dataframe.loc[dataframe['artist'] == spec_artist, 'text']

    # clean each song with clean_text and glue the results together
    return "".join(" " + clean_text(song_text) for song_text in artist_texts)

In [None]:
def create_ngrams(text, n):
    """
    Build an n-gram lookup: each run of n consecutive words maps to the words seen after it.

    :param text: text containing all song lyrics from the artist
    :param n: number for n-grams (i.e. two-grams, three-grams, four-grams)
    :return: defaultdict mapping each n-word tuple to the list of words that followed it
    """

    tokens = text.split()
    followers = defaultdict(list)

    # slide an n-word window over the text; the word right after the window is its follower
    for start in range(len(tokens) - n):
        end = start + n
        window = tuple(tokens[start:end])
        followers[window].append(tokens[end])

    return followers

In [None]:
def generate_song(ngrams, n_words=150):
    """
    Generate text by repeatedly sampling the next word from an n-gram lookup.

    :param ngrams: n-grams created for an artist (n-word tuple -> list of following words)
    :param n_words: number of words to generate after the starting n-gram, so the
                    result holds n + n_words words in total
    :return: the generated song text; an empty string if ngrams has no usable entries
    """

    # only n-grams that have at least one recorded follower can start or continue a song
    seeds = [key for key, followers in ngrams.items() if followers]
    if not seeds:
        return ""

    # get a random starter n-gram
    starter_word = random.choice(seeds)
    window = len(starter_word)

    # initialize text
    generated_song = list(starter_word)
    target_length = window + n_words

    # loop until number of words is hit, adding words through the n-grams along the way
    while len(generated_song) < target_length:

        # the last `window` words are the context used to predict the next word
        context = tuple(generated_song[len(generated_song) - window:])

        # .get() avoids inserting empty entries into a defaultdict on a miss
        followers = ngrams.get(context)

        if followers:
            generated_song.append(random.choice(followers))
        else:
            # dead end (e.g. the final n-gram of the lyrics has no follower):
            # restart from a fresh random n-gram instead of raising IndexError
            generated_song.extend(random.choice(seeds))

    # a restart may overshoot by a few words, so trim to the promised length
    return ' '.join(generated_song[:target_length])

##### User Prompting/Getting Visualizations and Outputs

In [None]:
def get_spec_artist_info(artist_info, artist_name):
    """
    Look up one artist's entry by name.

    :param artist_info: list of artist entries; index 0 of each entry is the artist name
    :param artist_name: the name of a specific artist
    :return: the entry for that specific artist, or None if no entry has that name
    """

    for spec_artist in artist_info:
        # compare against the name field only; a membership test on the whole entry
        # would also match the word count or the richness score
        if spec_artist[0] == artist_name:
            return spec_artist

    return None

In [None]:
# prompt user for n_grams; keep asking until a whole number of at least 1 is given
n_gms = None
while n_gms is None:
    raw_n_gms = input("What is the number of n-grams you want for lyric generation?: ")
    try:
        n_gms = int(raw_n_gms)
    except ValueError:
        print(f"{raw_n_gms!r} is not a whole number!")
        continue

    if n_gms < 1:
        print("The number of n-grams must be at least 1!")
        n_gms = None

# get artists
artist1, artist2 = get_artists()

# check if artists exist in the available artists
available_artists = set(spotify_df['artist'])
while artist1 not in available_artists or artist2 not in available_artists:

    # report every missing artist, not just the first one
    if artist1 not in available_artists:
        print(f"{artist1} is not in the available list!")
    if artist2 not in available_artists:
        print(f"{artist2} is not in the available list!")

    artist1, artist2 = get_artists()

# get the main information for each artist
artist1_info = get_spec_artist_info(artist_words, artist1)
artist2_info = get_spec_artist_info(artist_words, artist2)

In [None]:
# run to print the word frequencies for each artist, most frequent word first
art1_sorted_dict, art2_sorted_dict = (
    dict(sorted(info[1].items(), key=lambda pair: pair[1], reverse=True))
    for info in (artist1_info, artist2_info)
)

print(f"{artist1_info[0]}'s Word Frequencies: \n\n {art1_sorted_dict} \n\n")
print(f"{artist2_info[0]}'s Word Frequencies: \n\n {art2_sorted_dict}\n\n")

In [None]:
# run to print the amount of unique words and the word richness scores of each artist
for _artist_info in (artist1_info, artist2_info):
    print(f"{_artist_info[0]}'s Number of Unique Words: {_artist_info[2]}\n"
          f"{_artist_info[0]}'s Word Richness Score: {_artist_info[3]}\n\n")

In [None]:
# run to print the top 25 words shared by artist 1 and artist 2
both_artist_information = [artist1_info, artist2_info]
common_words = find_common_words(both_artist_information)
print(
    f"The top 25 most common words between {artist1_info[0]} and {artist2_info[0]}: "
    f"{common_words}"
)

In [None]:
# run to generate a wordcloud for each artist
for _artist_info in (artist1_info, artist2_info):
    generate_word_cloud(_artist_info)

In [None]:
# run to make song sentiment score graphs for each artist
for _artist_name in (artist1_info[0], artist2_info[0]):
    plot_sentiment(spotify_df, _artist_name)

In [None]:
# run to make a heat map showing the common word overlap between the two artists
create_heatmap([artist1_info, artist2_info])

In [None]:
# run to create a generated song for each artist

# gather all song lyrics of both artists
artist_lyrics_1, artist_lyrics_2 = (
    get_all_song_texts(name, spotify_df) for name in (artist1, artist2)
)

# build the n-gram lookup for each artist
artist1_ngrams, artist2_ngrams = (
    create_ngrams(lyrics, n_gms) for lyrics in (artist_lyrics_1, artist_lyrics_2)
)

# generate a new song per artist (artist 1 first, then artist 2)
new_song_artist1 = generate_song(artist1_ngrams)
new_song_artist2 = generate_song(artist2_ngrams)

print(f"{artist1} generated song: \n\n {new_song_artist1}\n")
print(f"{artist2} generated song: \n\n {new_song_artist2}\n")

#### 5) Reflecting on the process

I enjoyed most of the elements of this assignment, especially the creation of some unique functions. I would say everything up to the text generation was pretty straightforward. The only thing that I could really change was determining word richness because things like cleaning text, getting word frequencies, and getting sentiment scores are usually done in a specific way that everyone follows. 

I liked making a personal word richness score because it allowed me to add my own little flair to the code. I would say that the idea of 'meaningless words' is somewhat subjective and could differ from person to person, but I went to the internet for the best definition and settled on the stop-word list from the nltk library. 

My visualizations/plots were very straightforward, but I had the greatest amount of trouble in that section because I was initially confused by the prompt on how much I was plotting, and graphs can come out gross if you don't plot correctly. I will definitely work on teaching myself better plotting strategies to prevent these issues from happening next time. I had to rewrite certain aspects of those functions many times over.
 
I rewrote my original n-gram code from the 'hambot' assignment because I wanted to see if I could do it differently. The rewrite was somewhat difficult, but I feel I've adapted it well for this current assignment. Generating the lyrics was definitely my favorite part of the assignment, so I'm glad I could create some unique code for that. 

For improvements, I could definitely optimize my code in certain areas next time, such as the plotting or the text cleaning in general. Maybe it was due to the immense number of data cells, but the run time for some functions is quite long. I also have some gripes with the way the prompt is written. Certain sentences were very vague, so I frequently had to consult with professors on whether what I was doing was the correct implementation. 

Overall, I enjoyed this assignment, even when I became frustrated with the prompt and what I was supposed to do at times.

Additional comment about things I did within the assignment:

For the 'artist_words' list I had, within each artist element, it was ordered as such:
    Element 1: Artist name
    Element 2: Dictionary with the keys as the artist's unique words and the values as their word counts
    Element 3: The count of unique words in the artist's lyrics
    Element 4: The artist's word richness score which I created

#### 6) Going above and beyond

I am adding more, but I would say doing a word richness score should count for some part of this extra credit :)

In [None]:
# clean text but don't split into words: lowercase it and strip punctuation
# (raw-string pattern with regex=True, so the pattern is applied as a regular
# expression rather than searched for as a literal string)
spotify_df['cleaned_text'] = spotify_df['text'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

I added a 'cleaned_text' column which just contains the cleaned text without splitting anything

#### Finding Artist's Emotion Distribution from Their Songs

In [None]:
def preprocess_text(text):
    """
    Lemmatize a cleaned text and drop non-alphabetic tokens and stop-words.

    :param text: cleaned text
    :return: the remaining lemmatized tokens joined by single spaces
    """

    kept_tokens = []
    for token in word_tokenize(text):

        # skip anything that is not purely alphabetic (e.g. words with numbers)
        if not token.isalpha():
            continue

        # skip stop-words
        if token in stop_words:
            continue

        kept_tokens.append(lemmatizer.lemmatize(token))

    return " ".join(kept_tokens)

In [None]:
# I'm going to attempt to extract certain emotions based upon analyzing song lyrics.
# Briefly: lemmatizing takes the variants of the same word and assigns them to a
# single word (i.e. 'changing', 'changed', and 'change' all become 'change').

# lemmatizer used by preprocess_text
lemmatizer = WordNetLemmatizer()

# lemmatize the cleaned text of every song
spotify_df['lemmatized_text'] = spotify_df['cleaned_text'].map(preprocess_text)

In [None]:
# create an emotion lexicon: each emotion maps to the words that count towards it
emotion_lexicon = {
    'happy': ['joy', 'happy', 'excited', 'cheerful', 'content', 'gleeful', 'wonderful'],
    'sad': ['sad', 'unhappy', 'melancholy', 'gloomy', 'depressed', 'woeful'],
    'angry': ['angry', 'rage', 'irritated', 'frustrated', 'fuming', 'livid'],
    'love': ['affection', 'adoration', 'romance', 'caring', 'passion'],
    'fear': ['fear', 'anxiety', 'dread', 'terror', 'panic', 'fright', 'horror'],
    'envy': ['envy', 'jealousy', 'grudge', 'bitterness', 'spite', 'malice', 'pettiness'],
    'boredom': ['boredom', 'tedium', 'monotony', 'dullness', 'lethargy', 'fatigue', 'staleness'],
    'indifference': ['detachment', 'unconcern', 'nonchalance', 'disinterest', 'aloofness'],
    # 'dislike' and 'displeasure' are two separate entries; without the comma between
    # them Python joins the literals into the single word 'dislikedispleasure'
    'disgust': ['disgust', 'revulsion', 'repulsion', 'nausea', 'loathing', 'hatred', 'dislike', 'displeasure'],
    'surprise': ['surprise', 'astonishment', 'amazement', 'shock', 'startle', 'awe', 'stun', 'confusion'],
    'guilt': ['guilt', 'remorse', 'regret', 'shame', 'blame', 'sorrow', 'apology', 'guiltiness'],
    'pride': ['pride', 'dignity', 'honor', 'vanity', 'arrogance', 'ego', 'superiority', 'swagger'],
}

In [None]:
# calculate emotion scores
def calculate_emotion_score(text):
    """
    Count how many words of a song belong to each emotion in emotion_lexicon.

    :param text: cleaned song text
    :return: a dictionary mapping emotion -> number of matching words
             (emotions without any match are left out)
    """

    tally = defaultdict(int)

    for token in word_tokenize(text):
        # every emotion whose word list contains this token gets one hit
        matched = [label for label, vocabulary in emotion_lexicon.items()
                   if token in vocabulary]
        for label in matched:
            tally[label] += 1

    return dict(tally)

In [None]:
# get emotion scores for each song
spotify_df['emotion_scores'] = spotify_df['lemmatized_text'].map(calculate_emotion_score)

# add up the per-song emotion scores to get a distribution for each artist
artist_emotion_distribution = {}
for artist_name, artist_songs in spotify_df.groupby('artist'):
    emotion_totals = defaultdict(int)

    for song_scores in artist_songs['emotion_scores']:
        for emotion_label, hits in song_scores.items():
            emotion_totals[emotion_label] += hits

    artist_emotion_distribution[artist_name] = dict(emotion_totals)

In [None]:
def plot_emotion_distribution(artist_name, artist_emotion_scores):
    """
    Draw a pie chart of one artist's emotion scores.

    :param artist_name: the artist's name
    :param artist_emotion_scores: dictionary mapping emotion -> score for that artist
    :return: null (plotting)
    """

    slice_labels = list(artist_emotion_scores.keys())
    slice_sizes = list(artist_emotion_scores.values())

    # pie chart of the emotion score distribution, one slice per emotion
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.pie(slice_sizes, labels=slice_labels,
           autopct='%1.1f%%', startangle=140,
           colors=plt.cm.tab10.colors)

    # equal axis scaling keeps the pie circular
    ax.axis('equal')
    ax.set_title(f"Emotion Distribution in {artist_name}'s Lyrics", pad=20)
    plt.tight_layout()
    plt.show()

In [None]:
# plot distribution
# (i.e. plot_emotion_distribution('Kanye West', artist_emotion_distribution['Kanye West'])
# plot_emotion_distribution('Kanye West', artist_emotion_distribution['Kanye West'])