In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import ast

# Load the cleaned lyrics table; the first CSV column is used as the row index.
df = pd.read_csv(
    '../data/processed/cleaned_lyrics.csv',
    index_col=0,
)

# The 'tokens' column comes back from the CSV as text, so parse each cell
# back into a Python list with a safe literal evaluator.
df['tokens'] = df['tokens'].map(ast.literal_eval)

## Finding rap-specific words

I got this idea from another project I saw online. If we compare a word's relative frequency in rap songs with its relative frequency in general English, we can find words that are specific to rap (likely to be slang). We can do this easily by using the Brown corpus, available through NLTK, as the general-English reference.

In [39]:
from collections import Counter

# Flatten the per-song token lists into one corpus-wide list of tokens.
all_tokens = [tok for song_tokens in df["tokens"] for tok in song_tokens]

# Number of occurrences of each distinct token across every song.
rap_counts = Counter(all_tokens)

# Corpus size in tokens; equal to the sum of all counts because every token
# in the flat list is counted exactly once.
N_rap = len(all_tokens)

In [40]:
from nltk.corpus import brown

# Lower-case every Brown corpus word so lookups are case-insensitive.
brown_words = list(map(str.lower, brown.words()))

# Occurrences of each lower-cased word in the reference corpus.
brown_counts = Counter(brown_words)

# Reference corpus size in words; equal to the sum of all counts.
N_brown = len(brown_words)


In [41]:
import math

def uniqueness_score(word):
    """Log-ratio of a word's relative frequency in rap lyrics vs. the Brown corpus.

    A positive score means the word is relatively more frequent in the lyrics;
    a negative score means it is relatively more frequent in Brown. A word
    missing from either count table is treated as if it occurred once there,
    so the logarithm never receives zero.
    """
    rap_share = rap_counts.get(word, 1) / N_rap
    brown_share = brown_counts.get(word, 1) / N_brown
    return math.log(rap_share) - math.log(brown_share)

# Score every word of the rap vocabulary, then show the ten words with the
# highest score (most over-represented in the lyrics).
scores = {term: uniqueness_score(term) for term in rap_counts}
top_unique = sorted(
    scores.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]

for word, score in top_unique:
    print(f"{word} {score}")


nigga 8.890797605639927
niggas 8.57214953392354
shit 7.823774837792608
gon 7.633154478183958
bitches 7.370200111582892
ayy 7.297027505693232
tryna 7.227401558877951
fuck 7.124006104122295
lil 6.976457158353349
fuckin 6.946890031992808


In [42]:
# Ten lowest-scoring words: the ones most under-represented in the lyrics.
# Ascending order is sorted()'s default, so no `reverse` argument is needed.
bottom_unique = sorted(scores.items(), key=lambda pair: pair[1])[:10]

for word, score in bottom_unique:
    print(f"{word} {score}")

af -7.423942537399532
development -6.331336272791635
particularly -6.196949082083217
industrial -6.176187090634787
aj -5.984027084840545
facilities -5.81851264636297
significant -5.667689756628388
proposed -5.644159259218194
requirements -5.632183068171477
providence -5.595369095048762


In [45]:
from nltk import pos_tag

# Tag each song's token list with NLTK; every cell of the new column holds
# the tagger's result for that song.
df['pos_tags'] = df['tokens'].map(pos_tag)


In [None]:
# Penn Treebank verb tags: base form, past tense, gerund/present participle,
# past participle, non-3rd-person present, 3rd-person present.
verb_tags = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}

def extract_verbs(df):
    """Return the verb tokens of a single song.

    Parameters
    ----------
    df : mapping-like (e.g. one DataFrame row)
        Must expose a 'pos_tags' entry holding an iterable of (token, tag)
        pairs. Despite the parameter name, this is one row, not the whole
        frame.

    Returns
    -------
    list
        Tokens whose tag is in ``verb_tags``, in their original order.
    """
    pos_list = df['pos_tags']
    return [token for token, tag in pos_list if tag in verb_tags]

# extract_verbs reads row['pos_tags'], so it must be applied row-wise to the
# frame (axis=1). Applying it to the 'pos_tags' column instead hands it the
# bare list of pairs, and indexing a list with a string raises TypeError.
test = df.apply(extract_verbs, axis=1)
all_verbs = [token for tokens in test for token in tokens]
verb_counts = Counter(all_verbs)

# A bare trailing expression displays the result. `return` is only valid
# inside a function; at cell level it made the whole cell a SyntaxError.
verb_counts

Counter({'am': 59580,
         'is': 51754,
         'i': 38287,
         'do': 37777,
         'got': 33126,
         'are': 29904,
         'know': 23054,
         'get': 20000,
         'be': 19278,
         'want': 16822,
         'was': 14572,
         'go': 11239,
         'have': 9774,
         'let': 9162,
         'see': 9143,
         'make': 8917,
         'fuck': 8461,
         'say': 8236,
         'been': 7817,
         'take': 6694,
         'need': 6545,
         'tell': 6191,
         'put': 6064,
         'keep': 5939,
         'love': 5903,
         'had': 5810,
         'going': 5692,
         'come': 5208,
         'feel': 5080,
         'give': 5033,
         'think': 4914,
         'said': 4665,
         'gon': 4412,
         'shit': 4321,
         'did': 4315,
         'hit': 4311,
         'made': 3428,
         'told': 3280,
         'look': 3217,
         'call': 3197,
         'yeah': 3104,
         'came': 2919,
         'run': 2813,
         'done': 2687,


In [53]:
def get_most_common_pos(df, pos_tag_set):
    """Count how often each token carries one of the given POS tags.

    Parameters
    ----------
    df : DataFrame
        Must have a 'pos_tags' column whose cells are iterables of
        (token, tag) pairs.
    pos_tag_set : container of str
        Tags to keep; a token is counted when its tag is in this container.

    Returns
    -------
    collections.Counter
        Maps each matching token to its number of occurrences over all rows.
    """
    token_counts = Counter()
    for tagged_song in df['pos_tags']:
        # Feed the matching tokens of this row straight into the running tally.
        token_counts.update(
            tok for tok, tag in tagged_song if tag in pos_tag_set
        )
    return token_counts

In [55]:
# Penn Treebank noun tags: common nouns (singular, plural) and proper nouns
# (singular, plural).
noun_tags = {'NNP', 'NNPS', 'NN', 'NNS'}

# Last expression of the cell, so the resulting Counter is displayed.
get_most_common_pos(df, pos_tag_set=noun_tags)

Counter({'i': 123759,
         'bitch': 15974,
         'yeah': 13631,
         'nigga': 11250,
         'niggas': 10307,
         'shit': 9836,
         'money': 9074,
         'time': 8045,
         'man': 7310,
         'fuck': 6256,
         'life': 6042,
         'way': 5799,
         'baby': 5706,
         'bitches': 4988,
         'girl': 4074,
         'love': 3938,
         'day': 3923,
         'ass': 3827,
         'nothing': 3632,
         'night': 3301,
         'hoes': 2977,
         'ayy': 2674,
         'world': 2647,
         'everything': 2615,
         'something': 2598,
         'mind': 2528,
         'head': 2507,
         'look': 2466,
         'boy': 2412,
         'face': 2265,
         'game': 2262,
         'lot': 2252,
         'people': 2194,
         'thing': 2157,
         'name': 2069,
         'god': 2052,
         'heart': 2023,
         'everybody': 2022,
         'dick': 1978,
         'gang': 1959,
         'mama': 1889,
         'home': 1858,
      