In [87]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util
import nltk

import re
import requests
import pandas as pd
import numpy as np
import json
import os
import dotenv
import sys
import lyricsgenius
import base64
from glob import glob
sys.tracebacklimit = 0 # turn off the error tracebacks

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from scipy.linalg import norm

In [88]:
# Index levels of CORPUS in nesting order: decade -> song -> stanza -> line -> token.
OHCO = [
    'decade_id',
    'song_num',
    'stanza_num',
    'line_num',
    'token_num',
]

In [117]:
# Load the token table and index it by the full OHCO hierarchy.
CORPUS = pd.read_csv(
    'data/core_tables/CORPUS.csv',
    index_col=OHCO,
)
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
decade_id,song_num,stanza_num,line_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,0,0,0,"(""Here's"", 'NNP')",NNP,Here's,heres,NN
0,0,0,0,1,"('the', 'DT')",DT,the,the,DT
0,0,0,0,2,"('thing,', 'NN')",NN,"thing,",thing,NN
0,0,0,0,3,"('we', 'PRP')",PRP,we,we,PR
0,0,0,0,4,"('started', 'VBD')",VBD,started,started,VB
...,...,...,...,...,...,...,...,...,...
5,107,9,2,1,"(""y'all"", 'PRP')",PRP,y'all,yall,PR
5,107,9,2,2,"('deep', 'VBP')",VBP,deep,deep,VB
5,107,9,3,0,"(""Y'all"", 'DT')",DT,Y'all,yall,DT
5,107,9,3,1,"('are', 'VBP')",VBP,are,are,VB


In [90]:
# Load the vocabulary table, one row per term, keyed by term_str.
VOCAB = pd.read_csv(
    'data/core_tables/VOCAB.csv',
    index_col='term_str',
)
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,stem_porter,stop
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2,1,0.000010,16.660497,CD,CD,1,0
10k,2,3,0.000010,16.660497,CD,CD,10k,0
1k,1,2,0.000005,17.660497,CD,CD,1k,0
1s,1,2,0.000005,17.660497,CD,CD,1s,0
2,3,1,0.000014,16.075535,CD,CD,2,0
...,...,...,...,...,...,...,...,...
è,12,1,0.000058,14.075535,NNP,NN,è,0
él,3,2,0.000014,16.075535,NNP,NN,él,0
еlla,1,4,0.000005,17.660497,IN,IN,еlla,0
еsta,1,4,0.000005,17.660497,VBZ,VB,еsta,0


In [91]:
def gather_docs(CORPUS, ohco_level, term_col='term_str'):
    """Join the terms of each document into a single space-separated string.

    Parameters
    ----------
    CORPUS : DataFrame
        Token table whose index levels define the document hierarchy.
    ohco_level : int
        Number of leading index levels that identify one document.
    term_col : str
        Column holding the term text to concatenate.

    Returns
    -------
    DataFrame
        One row per document with a single column ``doc_str``.

    Notes
    -----
    ``CORPUS[term_col]`` is cast to ``str`` and written back to the frame
    that was passed in, so the caller's table is modified.
    """
    doc_levels = CORPUS.index.names[:ohco_level]
    # Cast first so non-string values (e.g. NaN or numbers) can be joined.
    CORPUS[term_col] = CORPUS[term_col].astype('str')
    terms_per_doc = CORPUS.groupby(doc_levels)[term_col]
    doc_strings = terms_per_doc.apply(' '.join)
    return doc_strings.to_frame('doc_str')

In [92]:
# One document per (decade_id, song_num), i.e. the first two OHCO levels.
DOC = gather_docs(CORPUS, ohco_level=2)
# Whitespace-delimited token count of each document string.
DOC['n_tokens'] = DOC['doc_str'].map(lambda doc: len(doc.split()))

In [93]:
# Keep only documents with more than one token.
DOC = DOC[DOC['n_tokens'] > 1]

DOC

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_str,n_tokens
decade_id,song_num,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,heres the thing we started out friends it was ...,279
0,1,sweet disposition never too soon oh reckless a...,231
0,2,walking on a dream how can i explain talking t...,252
0,3,five four three two one ah you were a child cr...,220
0,4,if i told you things i did before told you how...,307
...,...,...,...
5,103,oohohohoh come on oh yeah well i tried to tell...,517
5,104,you and me we come from different worlds you l...,299
5,105,lifes like a road that you travel on when ther...,390
5,106,earn their lives again earn their li to pull u...,453


# Create BOW

In [94]:
# Grouping keys for each bag size: every bag is a prefix of OHCO.
bags = {
    'SENTS': OHCO[:4],
    'STANZAS': OHCO[:3],
    'SONGS': OHCO[:2],
    'DECADES': OHCO[:1],
}

In [95]:
def create_bag_of_words(CORPUS, bag):
    """Count how often each term occurs within each bag.

    Parameters
    ----------
    CORPUS : DataFrame
        Token table with a ``term_str`` column.
    bag : list of str
        Grouping keys that identify one bag; ``'term_str'`` is appended to them.

    Returns
    -------
    DataFrame
        Indexed by the bag keys plus ``term_str``, with one column ``n``
        holding the number of non-null ``term_str`` values in each group.
    """
    group_keys = bag + ['term_str']
    term_counts = CORPUS.groupby(group_keys)['term_str'].count()
    return term_counts.to_frame('n')

In [96]:
# Term counts per decade (the coarsest bag).
BOW = create_bag_of_words(CORPUS, bag=bags['DECADES'])
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,n
decade_id,term_str,Unnamed: 2_level_1
0,2g,1
0,5,1
0,5foot,1
0,8th,1
0,9,1
...,...,...
5,youve,44
5,yow,1
5,zombie,8
5,zombieieie,2


# Compute TFIDF/DFIDF/DTM

In [97]:
def get_TFIDF(BOW, tf_method, idf_method='standard', tf_norm_k=0.5):
    """Compute TFIDF, DFIDF and the count matrix from a bag-of-words table.

    Parameters
    ----------
    BOW : DataFrame
        Must have a count column ``n`` and a MultiIndex whose last level is
        the term; that level is unstacked into columns, the remaining
        level(s) identify the document (row).
    tf_method : str
        How term frequency is derived from the counts of each document:

        * ``'sum'``          count / total count in the document
        * ``'max'``          count / largest count in the document
        * ``'log'``          log2(1 + count)
        * ``'raw'``          the count itself
        * ``'double_norm'``  tf_norm_k + (1 - tf_norm_k) * count / max count,
          applied to terms present in the document (absent terms stay 0)
        * ``'binary'``       1 if the term occurs in the document, else 0
    idf_method : str
        ``'standard'`` -> log2(N / DF); ``'max'`` -> log2(max(DF) / DF);
        ``'smooth'`` -> log2((1 + N) / (1 + DF)) + 1, where N is the number
        of documents and DF the number of documents containing the term.
    tf_norm_k : float
        Smoothing constant used only by ``tf_method='double_norm'``.

    Returns
    -------
    list
        ``[TFIDF, DFIDF, DTCM]``: the document-by-term TF*IDF frame, the
        per-term DF*IDF series, and the document-by-term count frame.

    Raises
    ------
    ValueError
        If ``tf_method`` or ``idf_method`` is not one of the names above.
    """
    tf_methods = ('sum', 'max', 'log', 'raw', 'double_norm', 'binary')
    idf_methods = ('standard', 'max', 'smooth')
    # Fail fast with a readable message instead of an UnboundLocalError later on.
    if tf_method not in tf_methods:
        raise ValueError(f"Unknown tf_method {tf_method!r}; expected one of {tf_methods}")
    if idf_method not in idf_methods:
        raise ValueError(f"Unknown idf_method {idf_method!r}; expected one of {idf_methods}")

    # Rows are documents, columns are terms; missing pairs become a count of 0.
    DTCM = BOW['n'].unstack(fill_value=0)

    DF = DTCM.astype('bool').sum()
    N = len(DTCM)

    if tf_method == 'sum':
        TF = DTCM.div(DTCM.sum(axis=1), axis=0)

    elif tf_method == 'max':
        TF = DTCM.div(DTCM.max(axis=1), axis=0)

    elif tf_method == 'log':
        TF = np.log2(1 + DTCM)

    elif tf_method == 'raw':
        TF = DTCM

    elif tf_method == 'double_norm':
        max_scaled = DTCM.div(DTCM.max(axis=1), axis=0)
        # Only terms that occur in the document get the K offset, so absent
        # terms keep a weight of exactly 0 and the matrix stays sparse.
        TF = (tf_norm_k + (1 - tf_norm_k) * max_scaled).where(DTCM > 0, 0.0)

    else:  # 'binary'
        TF = DTCM.astype('bool').astype('int')

    if idf_method == 'standard':
        IDF = np.log2(N / DF)

    elif idf_method == 'max':
        IDF = np.log2(DF.max() / DF)

    else:  # 'smooth'
        IDF = np.log2((1 + N) / (1 + DF)) + 1

    return [(TF * IDF), (DF * IDF), DTCM]

### TFIDF

In [98]:
# Element 0 of the result is the TF*IDF matrix (max-normalised TF, default IDF).
TFIDF = get_TFIDF(BOW, tf_method='max')[0]
TFIDF

term_str,1,10k,1k,1s,2,220kid,22nd,2g,2k,3,...,zone,zu,zum,zumban,zón,è,él,еlla,еsta,еyes
decade_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001685,0.0,0.0,...,0.000652,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.001098,0.0,0.00179,0.0,0.000693,0.0,0.0,0.0,0.0,0.0,...,0.001385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000812,0.002647,0.0,0.0,0.000512,0.001324,0.001324,0.0,0.003971,0.0,...,0.000512,0.0,0.0,0.002647,0.001324,0.0,0.003971,0.001324,0.001324,0.001324
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.002219,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.002219,0.004438,0.0,0.0,0.026626,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.00067,0.0,0.0,0.0,0.0,0.003463,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [99]:
# Reshape TFIDF to long form and keep only the (decade_id, term_str) pairs present in BOW.
BOW['tfidf'] = TFIDF.stack().reindex(BOW.index)

In [100]:
# The 50 rows with the highest raw counts.
BOW.sort_values(by='n', ascending=False).iloc[:50]

Unnamed: 0_level_0,Unnamed: 1_level_0,n,tfidf
decade_id,term_str,Unnamed: 2_level_1,Unnamed: 3_level_1
2,i,1953,0.0
0,you,1534,0.0
2,you,1494,0.0
0,i,1493,0.0
5,i,1493,0.0
1,i,1444,0.0
1,you,1444,0.0
0,the,1411,0.0
5,the,1209,0.0
1,the,1189,0.0


### DFIDF (and add to VOCAB)

In [101]:
# Element 1 of the result is the per-term DF*IDF series; turn it into a one-column frame.
DFIDF = get_TFIDF(BOW, tf_method='max')[1].to_frame('dfidf')
DFIDF

Unnamed: 0_level_0,dfidf
term_str,Unnamed: 1_level_1
1,3.169925
10k,2.584963
1k,2.584963
1s,2.584963
2,3.000000
...,...
è,2.584963
él,2.584963
еlla,2.584963
еsta,2.584963


In [102]:
# Add DFIDF to VOCAB.
# VOCAB is written back to the CSV it was loaded from, so on any re-run it may
# already carry a 'dfidf' column. Drop the stale copy first; otherwise the merge
# would produce 'dfidf_x'/'dfidf_y' and later lookups of 'dfidf' would fail.
VOCAB = (
    VOCAB
    .drop(columns=['dfidf'], errors='ignore')
    .merge(DFIDF, left_on='term_str', right_on='term_str', how='left')
)
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,stem_porter,stop,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2,1,0.000010,16.660497,CD,CD,1,0,3.169925
10k,2,3,0.000010,16.660497,CD,CD,10k,0,2.584963
1k,1,2,0.000005,17.660497,CD,CD,1k,0,2.584963
1s,1,2,0.000005,17.660497,CD,CD,1s,0,2.584963
2,3,1,0.000014,16.075535,CD,CD,2,0,3.000000
...,...,...,...,...,...,...,...,...,...
è,12,1,0.000058,14.075535,NNP,NN,è,0,2.584963
él,3,2,0.000014,16.075535,NNP,NN,él,0,2.584963
еlla,1,4,0.000005,17.660497,IN,IN,еlla,0,2.584963
еsta,1,4,0.000005,17.660497,VBZ,VB,еsta,0,2.584963


## List the 20 most significant words in the corpus by dfidf

In [103]:
# Terms ranked by descending dfidf; show the first 20.
VOCAB.sort_values('dfidf', ascending=False).head(20).index.tolist()

['1',
 'ridin',
 'extra',
 'ey',
 'roamin',
 'buzz',
 'buying',
 'timeless',
 'roadside',
 'rivers',
 'butter',
 'rises',
 'facts',
 'ting',
 'rips',
 'faded',
 'fags',
 'fail',
 'bursting',
 'rights']

### DTM

In [104]:
# Element 2 of the result is the raw document-by-term count matrix.
DTM = get_TFIDF(BOW, tf_method='max')[2]
DTM

term_str,1,10k,1k,1s,2,220kid,22nd,2g,2k,3,...,zone,zu,zum,zumban,zón,è,él,еlla,еsta,еyes
decade_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1,0,1,0,1,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
2,1,2,0,0,1,1,1,0,3,0,...,1,0,0,2,1,0,3,1,1,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,1,2,0,0,12,0,0,0,0
5,0,0,0,0,1,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0


## TFIDF_L2

In [105]:

# Wide document-by-term table rebuilt from the tfidf column stored on BOW.
TFIDF_reduced = BOW['tfidf'].unstack(fill_value=0)
pos_set = ['NN', 'VB']  # NOTE(review): not used by the filter below, which also admits 'JJ'
# Keep the 5000 highest-dfidf terms whose max_pos_group is NN, VB or JJ and whose max_pos is not NNP.
VSHORT = (
    VOCAB[VOCAB.max_pos_group.isin(['NN', 'VB', 'JJ']) & ~VOCAB.max_pos.isin(['NNP'])]
    .sort_values('dfidf', ascending=False)
    .head(5000)
)
TFIDF_reduced = TFIDF_reduced[VSHORT.index]

TFIDF_reduced

term_str,summat,tape,tamed,tan,pads,extra,ey,coat,internet,tap,...,irish,kufuffin,ladys,amar,lads,ladidah,laddie,ladadidada,ladadida,ladadadada
decade_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.001033,0.001033,0.001033,0.0,0.001033,0.0,0.015498,0.001033,0.0,0.002066,...,0.0,0.001685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.001098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001098,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000812,0.000812,0.000812,0.021912,0.0,0.001623,0.004869,...,0.002647,0.0,0.0,0.003971,0.002647,0.0,0.0,0.006618,0.001324,0.003971
3,0.0,0.003164,0.0,0.0,0.0,0.0,0.0,0.003164,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00516,0.0,0.0,0.0
4,0.0,0.0,0.00136,0.0,0.0,0.004081,0.0,0.0,0.0,0.0,...,0.0,0.0,0.002219,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.001062,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.003463,0.0,0.0,0.0,0.0


In [114]:
# Divide every row by its Euclidean (L2) norm.
TFIDF_L2 = TFIDF_reduced.div(norm(TFIDF_reduced, axis=1), axis=0)

TFIDF_L2

term_str,summat,tape,tamed,tan,pads,extra,ey,coat,internet,tap,...,irish,kufuffin,ladys,amar,lads,ladidah,laddie,ladadidada,ladadida,ladadadada
decade_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.006468,0.006468,0.006468,0.0,0.006468,0.0,0.097014,0.006468,0.0,0.012935,...,0.0,0.010548,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.006871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006871,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.004617,0.004617,0.004617,0.12466,0.0,0.009234,0.027702,...,0.01506,0.0,0.0,0.02259,0.01506,0.0,0.0,0.03765,0.00753,0.02259
3,0.0,0.010988,0.0,0.0,0.0,0.0,0.0,0.010988,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.017921,0.0,0.0,0.0
4,0.0,0.0,0.008398,0.0,0.0,0.025194,0.0,0.0,0.0,0.0,...,0.0,0.0,0.013697,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.004114,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.013421,0.0,0.0,0.0,0.0


### Write Derived Tables to CSV

In [115]:
# to_csv does not create missing parent directories, so make sure the
# output folder exists before the first write (e.g. on a fresh checkout).
os.makedirs('data/derived_tables', exist_ok=True)

TFIDF.to_csv('data/derived_tables/TFIDF.csv')
DFIDF.to_csv('data/derived_tables/DFIDF.csv')
DTM.to_csv('data/derived_tables/DTM.csv')
DOC.to_csv('data/derived_tables/DOC.csv')
TFIDF_L2.to_csv('data/derived_tables/TFIDF_L2.csv')
BOW.to_csv('data/derived_tables/BOW.csv')

# Update VOCAB: this overwrites the core table the notebook loaded earlier,
# now including the dfidf column merged in above.
VOCAB.to_csv('data/core_tables/VOCAB.csv')