# Summary: Visualize how words in lyrics affect Age-Ratings.

#  Load Lyrics and Preprocess

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spotipy
import os
%matplotlib inline

In [2]:
df = pd.read_csv('../data/all.csv')
df = df.dropna(subset=['lyrics'])
df = df.reset_index(drop=True)

import re

def remove_disclaimer(string):
    s = re.sub(r'(\*)+ This Lyrics is NOT for Commercial use .*\r\n\([0-9]*\)',"", string)
    return s

df['lyrics'] = df['lyrics'].apply(remove_disclaimer)

In [3]:
import spacy
import re

nlp = spacy.load("en_core_web_sm",  disable=['parser','ner', 'textcat'])


def my_lemmatizer(doc):
    doc_cleaned = ' '.join(re.findall(r'\b\w\w\w+\b', doc)) 
    return [ w.lemma_.lower() for w in nlp(doc_cleaned) 
                      if len(w.lemma_)>2 ]

def process_text(text, stop_words=set()):
    if not text:
        return []

    text = ' '.join(re.findall(r'\b\w\w\w+\b', text))   #only keep word with at least 3 chars
    doc = nlp(text)
    result=  [token.lemma_.lower() for token in doc if token.lemma_.lower() not in stop_words]
    result = " ".join(result)
    return result



stopwords = spacy.lang.en.STOP_WORDS.union(['-pron-', 'oh','ooh','la'])
stopwords = set(my_lemmatizer(' '.join(list(stopwords))))



In [4]:
df['processed_lyrics'] = df['lyrics'].apply(process_text, args=(stopwords,))

df['processed_lyrics'].head(3)

0    old transylvania be lad castle be poor be sad ...
1    saw monster mirror wake today monster mirror d...
2    big red car roll street people meet like hello...
Name: processed_lyrics, dtype: object

## Divide data into two age groups:  Young (Age=2~8), Old (Age=12-18)

In [25]:
#df_age = df_age.drop(columns='group')
df_age = df_age.dropna(subset=['processed_lyrics'])

In [26]:
df_age = df[(df['Age']<9) | (df['Age']>11)].copy()
df_age['group'] = df_age['Age'].apply(lambda x: 'young' if x<9 else 'old' )
df_age = df_age[['processed_lyrics','group']]
df_age.sample(5)

Unnamed: 0,processed_lyrics,group
752,come boy sail away lady sail away tennessee sa...,young
12994,yeah yeah walk girl look like damn fly pimp be...,old
11133,head nigga walle play jump boo cudi funk hit h...,old
679,got to jump spin pick bale cotton got to jump ...,young
11078,ohh yeah waaaahhh yeah ciara lead floor bring ...,old


## Find the most polarized 50 words ( 25 young , 25 old)

In [27]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tfidfVectorizer = TfidfVectorizer()
tfidfVectorizer.fit_transform(df_age['processed_lyrics'])

<11586x22361 sparse matrix of type '<class 'numpy.float64'>'
	with 393594 stored elements in Compressed Sparse Row format>

In [28]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
X_train = tfidfVectorizer.transform(df_age['processed_lyrics'])
clf.fit(X_train, df_age['group'])

MultinomialNB()

In [37]:
from pprint import pprint
matrix = clf.feature_log_prob_ 
_, ncols = matrix.shape
polarity = [(i,0) for i in range(ncols)]
for j in range(ncols):
    polarity[j] = (j, matrix[0][j] - matrix[1][j])

polarity.sort(key=lambda x: x[1])

polar_words = [tfidfVectorizer.get_feature_names_out()[x[0]] for x in polarity[0:25]]+ [tfidfVectorizer.get_feature_names()[x[0]] for x in polarity[-25::]]

print (polar_words[0:25])

print (polar_words[25:50])

['mistletoe', 'christmas', 'santa', 'wiggles', 'twinkle', 'carly', 'roast', 'pirate', 'sleigh', 'reindeer', 'claus', 'quack', 'kryptonite', 'carol', 'murray', 'freddie', 'turkey', 'choo', 'aglow', 'yuletide', 'magical', 'muppet', 'henry', 'wubba', 'gary']
['verse', 'drug', 'fucking', 'gun', 'hood', 'niggaz', 'lil', 'club', 'pussy', 'hell', 'homie', 'dick', 'wit', 'damn', 'hoe', 'motherfucker', 'fuckin', 'sex', 'ass', 'sexy', 'niggas', 'nigga', 'bitch', 'fuck', 'shit']


## Plot the word using conditional probabilities as the (x,y) coordinates, and frequency as the marker size.

In [None]:
-- TODO