In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import re
import time

In [2]:
# Load the NLP dataset from CSV and discard the non_alpha_words column
nlp_df = pd.read_csv('../../Data/nlp_df.csv').drop(columns=['non_alpha_words'])
nlp_df.head(3)

Unnamed: 0,song,song_id,artist,artist_id,category,category_id,popularity,genres,audio_ft_danceability,audio_ft_energy,...,audio_ft_acousticness,audio_ft_instrumentalness,audio_ft_liveness,audio_ft_valence,audio_ft_tempo,audio_ft_duration_ms,audio_ft_time_signature,lyrics,filtered,language
0,willow,0lx2cLdOt3piJbcaXIV74f,Taylor Swift,06HL4z0CvFAxyc27GXpf02,pop,8,93,"['dance', 'pop']",0.392,0.574,...,0.833,0.00179,0.145,0.529,81.112,214707.0,4.0,Im like the water when your ship rolled in th...,"['', 'im', 'like', 'water', 'ship', 'rolled', ...","(1, 'en')"
1,Stay Next To Me (with Chelsea Cutler),6SGG5AxHShqSYiV9fCWpZz,Quinn XCII,3ApUX1o6oSz321MMECyIYd,pop,8,78,"['indie', 'pop', 'electropop']",0.581,0.584,...,0.0805,0.0,0.366,0.756,179.954,206046.0,4.0,Didnt even wanna go out whyd you call me ? Iv...,"['', 'didnt', 'even', 'wanna', 'go', 'whyd', '...","(1, 'en')"
2,WITHOUT YOU,27OeeYzk6klgBh83TSvGMA,The Kid LAROI,2tIP7SsRs7vjIcLrU85W8J,pop,8,95,['australian'],0.662,0.413,...,0.213,0.0,0.134,0.467,93.005,161385.0,4.0,You cut out a piece of me and now I bleed int...,"['', 'cut', 'piece', 'bleed', 'internally', 'l...","(1, 'en')"


In [3]:
# Create a list of all words, word counts, unique word counts, and filtered words.
#
# Each `filtered` cell holds the text form of a Python list of tokens
# (e.g. "['', 'im', 'like', ...]"), so list punctuation is stripped and
# unicode whitespace shows up as escaped text such as the six characters
# backslash-u-2-0-0-a rather than as the real character.

# Ordered (old, new) substitutions applied to every `filtered` string.
_FILTERED_REPLACEMENTS = [
    (',', ''), ("'", ''),
    ('[', ''), (']', ''),
    ('#', ''), ('&nbsp', ''),
    ('?', '? '),
    ('/', ' '),
    ('\\u200a', ''), ('\\u200b', ''),
    ('\\u2063', ''),
    # U+202F is removed both as escaped text (like every other pattern here)
    # and as the real character (the only form previously handled).
    ('\\u202f', ''), ('\u202f', ''),
    ('\\u2028', ' '), ('\\u2008', ' '),
]


def _clean_filtered(text):
    """Flatten one stringified token list into a space-separated string.

    Parameters
    ----------
    text : str
        Raw value of the `filtered` column for one song.

    Returns
    -------
    str
        The text with list punctuation and the listed markup/escape
        sequences removed or replaced by spaces.
    """
    for old, new in _FILTERED_REPLACEMENTS:
        text = text.replace(old, new)
    # Deleting one marker can join the text around it into a new marker,
    # so keep going until neither marker is present.
    while ('\\u200e' in text) or ('\\xa0' in text):
        text = text.replace('\\u200e', '')
        text = text.replace('\\xa0', '')
    return text


t0 = time.time()
words_list = []
word_counts = []
unique_word_counts = []
filtered_words_list = []
# Only the `filtered` column is needed, so iterate it directly instead of
# materialising every row with iterrows().
for raw_filtered in nlp_df['filtered']:
    filtered_words = _clean_filtered(raw_filtered)
    filtered_words_list.append(filtered_words)
    # split(' ') yields an empty string for every extra consecutive space, so
    # the counts below can include '' as a word. Kept as-is because a later
    # cell filters on exact count values.
    words = filtered_words.strip().split(' ')
    unique_words = set(words)
    word_counts.append(len(words))
    unique_word_counts.append(len(unique_words))
    words_list.extend(unique_words)
# Vocabulary across all songs (order is arbitrary because it comes from a set)
word_columns = list(set(words_list))
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
len(word_columns)

Run time: 1.7382009029388428 seconds


36234

In [4]:
# Attach the per-song word totals and overwrite `filtered` with the cleaned strings
nlp_df = nlp_df.assign(
    word_count=word_counts,
    unique_word_count=unique_word_counts,
    filtered=filtered_words_list,
)
nlp_df.head(1)

Unnamed: 0,song,song_id,artist,artist_id,category,category_id,popularity,genres,audio_ft_danceability,audio_ft_energy,...,audio_ft_liveness,audio_ft_valence,audio_ft_tempo,audio_ft_duration_ms,audio_ft_time_signature,lyrics,filtered,language,word_count,unique_word_count
0,willow,0lx2cLdOt3piJbcaXIV74f,Taylor Swift,06HL4z0CvFAxyc27GXpf02,pop,8,93,"['dance', 'pop']",0.392,0.574,...,0.145,0.529,81.112,214707.0,4.0,Im like the water when your ship rolled in th...,im like water ship rolled night rough surface...,"(1, 'en')",227,87


In [5]:
# Keep only songs that have at least 25 unique words
has_enough_unique_words = nlp_df['unique_word_count'].ge(25)
nlp_df = nlp_df.loc[has_enough_unique_words]
len(nlp_df)

8041

# Examine popularities by genre

In [6]:
# Discard the language column
nlp_df = nlp_df.drop(columns='language')

In [7]:
# Drop songs with incorrect lyrics from the DataFrame.
# Rows are matched by their word statistics, not by song_id.
# NOTE(review): any other song that happens to share one of these counts is
# dropped as well, and the values stop matching if the word counting changes.
bad_unique_word_counts = [3878, 1153, 1000, 880, 842, 607, 569]
has_incorrect_lyrics = (
    nlp_df['unique_word_count'].isin(bad_unique_word_counts)
    | nlp_df['word_count'].eq(1383)
)
nlp_df = nlp_df[~has_incorrect_lyrics]
len(nlp_df)

8031

In [8]:
# Describe the distribution of popularity scores for each category.
# `popularities` maps a category name to the describe() Series
# (count, mean, std, min, 25%, 50%, 75%, max) of that category's
# `popularity` column; a single genre is available as popularities['blues'].
category_names = [
    'blues', 'classical', 'country', 'funk', 'hiphop', 'indie_alt', 'jazz',
    'metal', 'pop', 'punk', 'rnb', 'rock', 'romance', 'soul',
]
popularities = {
    name: nlp_df.loc[nlp_df['category'] == name, 'popularity'].describe()
    for name in category_names
}
len(popularities)

14

In [9]:
# Display the popularity summary statistics (describe() output) for each genre
popularities

{'blues': count    424.000000
 mean      31.233491
 std       18.104365
 min        0.000000
 25%       22.000000
 50%       36.000000
 75%       44.000000
 max       73.000000
 Name: popularity, dtype: float64,
 'classical': count    93.000000
 mean     33.387097
 std      22.941339
 min       0.000000
 25%       0.000000
 50%      39.000000
 75%      54.000000
 max      71.000000
 Name: popularity, dtype: float64,
 'country': count    1257.000000
 mean       51.346858
 std        20.552922
 min         0.000000
 25%        45.000000
 50%        56.000000
 75%        65.000000
 max        86.000000
 Name: popularity, dtype: float64,
 'funk': count    282.000000
 mean      37.429078
 std       20.763118
 min        0.000000
 25%       27.000000
 50%       40.000000
 75%       53.000000
 max       85.000000
 Name: popularity, dtype: float64,
 'hiphop': count    680.000000
 mean      56.620588
 std       17.282378
 min        0.000000
 25%       49.750000
 50%       58.000000
 75%       

In [10]:
# Map each genre to its mean popularity, ordered from highest to lowest mean
categories = [*popularities]
means = [popularities[name]['mean'] for name in categories]
mean_popularities = dict(
    sorted(zip(categories, means), key=lambda pair: pair[1], reverse=True)
)
mean_popularities

{'pop': 57.61616161616162,
 'hiphop': 56.620588235294115,
 'rnb': 54.530386740331494,
 'romance': 54.06810035842294,
 'country': 51.34685759745425,
 'rock': 49.66285714285714,
 'metal': 44.6380846325167,
 'indie_alt': 42.9560606060606,
 'soul': 41.647201946472016,
 'funk': 37.4290780141844,
 'jazz': 37.2979797979798,
 'punk': 35.85940246045694,
 'classical': 33.38709677419355,
 'blues': 31.233490566037737}

In [11]:
# Create DataFrame of popularity distributions: one row per genre holding the
# describe() statistics followed by the range (max - min)
columns = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'range']
ranges = [stats['max'] - stats['min'] for stats in popularities.values()]
df_rows = [
    [*stats, spread]
    for stats, spread in zip(popularities.values(), ranges)
]
popularity_df = pd.DataFrame(df_rows, index=[*popularities], columns=columns)
popularity_df

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,range
blues,424.0,31.233491,18.104365,0.0,22.0,36.0,44.0,73.0,73.0
classical,93.0,33.387097,22.941339,0.0,0.0,39.0,54.0,71.0,71.0
country,1257.0,51.346858,20.552922,0.0,45.0,56.0,65.0,86.0,86.0
funk,282.0,37.429078,20.763118,0.0,27.0,40.0,53.0,85.0,85.0
hiphop,680.0,56.620588,17.282378,0.0,49.75,58.0,66.0,99.0,99.0
indie_alt,660.0,42.956061,24.893452,0.0,34.0,51.0,61.0,87.0,87.0
jazz,198.0,37.29798,18.824519,0.0,29.0,41.0,52.0,73.0,73.0
metal,898.0,44.638085,19.20236,0.0,35.0,47.0,58.0,85.0,85.0
pop,693.0,57.616162,20.537499,0.0,46.0,58.0,73.0,99.0,99.0
punk,569.0,35.859402,19.578784,0.0,25.0,39.0,50.0,89.0,89.0


In [13]:
# Write the per-genre popularity summary to CSV (genre labels are written as the first column)
popularity_df.to_csv('../../Data/popularity.csv', index=True)

# Conclusions about popularity distributions by genre
- The pop genre has the highest mean popularity, at 57.6
- The blues genre has the lowest mean popularity, at 31.2
- Genres with the highest average popularities:
    - Pop: 57.6
    - Hiphop: 56.6 
    - R&B: 54.5
- Genres with the lowest average popularities:
    - Blues: 31.2
    - Classical: 33.4 
    - Punk: 35.9