In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util
import nltk

import re
import requests
import pandas as pd
import numpy as np
import json
import os
import dotenv
import sys
import lyricsgenius
import base64
from glob import glob
# Limit error output to the exception type and message (no stack frames).
# NOTE(review): this also hides *where* an error occurred, which makes the
# notebook harder to debug — consider removing it while developing.
sys.tracebacklimit = 0 # turn off the error tracebacks

In [2]:
# Load the token-level CORPUS table (one row per token, with its POS tag
# and normalized term string) from the core tables directory.
CORPUS = pd.read_csv(filepath_or_buffer='data/core_tables/CORPUS')
CORPUS

Unnamed: 0,decade_id,song_num,stanza_num,line_num,token_num,pos_tuple,pos,token_str,term_str,pos_group
0,0,0,0,0,0,"(""Here's"", 'NNP')",NNP,Here's,heres,NN
1,0,0,0,0,1,"('the', 'DT')",DT,the,the,DT
2,0,0,0,0,2,"('thing,', 'NN')",NN,"thing,",thing,NN
3,0,0,0,0,3,"('we', 'PRP')",PRP,we,we,PR
4,0,0,0,0,4,"('started', 'VBD')",VBD,started,started,VB
...,...,...,...,...,...,...,...,...,...,...
226225,5,148,9,2,2,"('deep', 'VBP')",VBP,deep,deep,VB
226226,5,148,9,3,0,"(""Y'all"", 'DT')",DT,Y'all,yall,DT
226227,5,148,9,3,1,"('are', 'VBP')",VBP,are,are,VB
226228,5,148,9,3,2,"('crazy,', 'JJ')",JJ,"crazy,",crazy,JJ


In [3]:
# Load the VOCAB table (one row per distinct term, with its corpus-level
# statistics) from the core tables directory.
VOCAB = pd.read_csv(filepath_or_buffer='data/core_tables/VOCAB')
VOCAB

Unnamed: 0,term_str,n,n_chars,p,i,max_pos,max_pos_group
0,1,17,1,0.000075,13.699968,CD,CD
1,10k,2,3,0.000009,16.787431,CD,CD
2,110th,2,5,0.000009,16.787431,CD,CD
3,1k,1,2,0.000004,17.787431,CD,CD
4,1s,1,2,0.000004,17.787431,CD,CD
...,...,...,...,...,...,...,...
11699,ọmọ,1,3,0.000004,17.787431,JJ,JJ
11700,ọpẹ,1,3,0.000004,17.787431,NNP,NN
11701,在中間,1,3,0.000004,17.787431,NN,NN
11702,站在中間,2,4,0.000009,16.787431,NNS,NN


# Create the bag-of-words (BOW) table

In [4]:
# Index columns of the corpus, ordered from the coarsest level (decade)
# down to the finest (token). Prefixes of this list define the bag levels.
OHCO = [
    'decade_id',
    'song_num',
    'stanza_num',
    'line_num',
    'token_num',
]

In [5]:
# Map each bag name to the OHCO prefix that identifies one bag at that level;
# a longer prefix means a finer-grained bag.
bags = {
    'SENTS': OHCO[:4],
    'STANZAS': OHCO[:3],
    'SONGS': OHCO[:2],
    'DECADES': OHCO[:1],
}

In [6]:
def create_bag_of_words(CORPUS, bag):
    """
    Count how often each term occurs within each bag.

    Parameters
    ----------
    CORPUS : pd.DataFrame
        Token table containing the columns listed in `bag` plus `term_str`.
    bag : list of str
        Column names that together identify one bag (e.g. ``['decade_id']``).

    Returns
    -------
    pd.DataFrame
        Single-column frame `n` (occurrence count), indexed by the bag
        columns followed by `term_str`.
    """
    group_keys = bag + ['term_str']
    term_counts = CORPUS.groupby(group_keys)['term_str'].count()
    return term_counts.to_frame('n')

In [7]:
# Build the bag of words with one bag per decade.
BOW = create_bag_of_words(CORPUS, bag=bags['DECADES'])

# Show the 50 most frequent (decade, term) pairs.
BOW.sort_values(by='n', ascending=False).head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,n
decade_id,term_str,Unnamed: 2_level_1
2,i,2061
2,you,1596
0,you,1555
0,i,1514
5,i,1506
1,you,1501
1,i,1479
0,the,1427
2,the,1314
1,the,1306


# Compute TFIDF, DFIDF, and the document-term matrix (DTM)

In [8]:
def get_TFIDF(BOW, tf_method, idf_method ='standard', tf_norm_k=0.5):
    """
    Compute TFIDF, DFIDF and the document-term count matrix from a bag of words.

    Parameters
    ----------
    BOW : pd.DataFrame
        Frame with a count column `n` and a MultiIndex whose last level is the
        term; the remaining level(s) identify the bag (document).
    tf_method : str
        Term-frequency weighting, one of:
        'sum'         - count divided by the total count in the document
        'max'         - count divided by the largest count in the document
        'log'         - log2(1 + count)
        'raw'         - the count itself
        'double_norm' - K + (1 - K) * count / max count for terms that occur
                        in the document, 0 for terms that do not
        'binary'      - 1 if the term occurs in the document, else 0
    idf_method : str, default 'standard'
        Inverse-document-frequency weighting, one of:
        'standard' - log2(N / DF)
        'max'      - log2(max(DF) / DF)
        'smooth'   - log2((1 + N) / (1 + DF)) + 1
    tf_norm_k : float, default 0.5
        The constant K used by the 'double_norm' term frequency.

    Returns
    -------
    list
        ``[TFIDF, DFIDF, DTCM]`` where TFIDF is a documents x terms frame,
        DFIDF is a Series indexed by term, and DTCM is the documents x terms
        count matrix.

    Raises
    ------
    ValueError
        If `tf_method` or `idf_method` is not one of the supported names.
    """
    tf_methods = ('sum', 'max', 'log', 'raw', 'double_norm', 'binary')
    idf_methods = ('standard', 'max', 'smooth')

    # Fail early with a clear message instead of an UnboundLocalError later on.
    if tf_method not in tf_methods:
        raise ValueError(f"Unknown tf_method {tf_method!r}; expected one of {tf_methods}")
    if idf_method not in idf_methods:
        raise ValueError(f"Unknown idf_method {idf_method!r}; expected one of {idf_methods}")

    # Documents x terms count matrix; terms absent from a document count 0.
    DTCM = BOW.n.unstack(fill_value=0)

    DF = DTCM.astype('bool').sum()  # number of documents containing each term
    N = len(DTCM)                   # number of documents

    # Work on terms x documents so per-document reductions align by column.
    counts = DTCM.T

    if tf_method == 'sum':
        TF = counts / counts.sum()

    elif tf_method == 'max':
        TF = counts / counts.max()

    elif tf_method == 'log':
        TF = np.log2(1 + counts)

    elif tf_method == 'raw':
        TF = counts

    elif tf_method == 'double_norm':
        # Previously identical to 'max'; apply the K-smoothing that gives the
        # method its name, keeping absent terms at 0.
        relative = counts / counts.max()
        TF = (tf_norm_k + (1 - tf_norm_k) * relative).where(counts > 0, 0.0)

    else:  # 'binary'
        TF = counts.astype('bool').astype('int')

    # Back to documents x terms.
    TF = TF.T

    if idf_method == 'standard':
        IDF = np.log2(N / DF)

    elif idf_method == 'max':
        IDF = np.log2(DF.max() / DF)

    else:  # 'smooth'
        IDF = np.log2((1 + N) / (1 + DF)) + 1

    return [(TF * IDF), (DF*IDF), DTCM]

### TFIDF

In [9]:
# First element of the result: the TFIDF matrix (max-normalized term
# frequency, standard IDF).
TFIDF = get_TFIDF(BOW, tf_method='max', idf_method='standard')[0]
TFIDF

term_str,1,10k,110th,1k,1s,2,220kid,22nd,24hoursav,24kgoldn,...,детство,еlla,еsta,еyes,ẹ,ọmọ,ọpẹ,在中間,站在中間,박혜진
decade_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.001722,0.0,0.001559,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001722,0.003444,0.0
2,0.0,0.002508,0.0,0.0,0.0,0.00369,0.001254,0.001254,0.001254,0.005383,...,0.0,0.001254,0.001254,0.001254,0.002508,0.001254,0.001254,0.0,0.0,0.001254
3,0.0,0.0,0.001463,0.0,0.0,0.00054,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.001329,0.0,0.002167,0.0,0.0,0.0,0.0,0.0,...,0.001329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.000777,0.0,0.0,0.0,0.001052,...,0.001052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### DFIDF (and add to VOCAB)

In [10]:
# Second element of the result: the per-term DFIDF Series, converted to a
# one-column frame so it can be merged into VOCAB.
DFIDF = get_TFIDF(BOW, tf_method='max')[1].to_frame('dfidf')
DFIDF

Unnamed: 0_level_0,dfidf
term_str,Unnamed: 1_level_1
1,0.000000
10k,2.584963
110th,3.169925
1k,2.584963
1s,2.584963
...,...
ọmọ,2.584963
ọpẹ,2.584963
在中間,2.584963
站在中間,2.584963


In [14]:
# Attach each term's DFIDF to VOCAB.
# Any existing 'dfidf' column is dropped first so this cell is idempotent:
# re-running it (or loading a VOCAB file that already contains 'dfidf')
# would otherwise yield 'dfidf_x'/'dfidf_y' and break later lookups of 'dfidf'.
VOCAB = (
    VOCAB
    .drop(columns='dfidf', errors='ignore')
    .merge(DFIDF, left_on='term_str', right_on='term_str', how='left')
)
VOCAB

Unnamed: 0,term_str,n,n_chars,p,i,max_pos,max_pos_group,dfidf
0,1,17,1,0.000075,13.699968,CD,CD,0.000000
1,10k,2,3,0.000009,16.787431,CD,CD,2.584963
2,110th,2,5,0.000009,16.787431,CD,CD,3.169925
3,1k,1,2,0.000004,17.787431,CD,CD,2.584963
4,1s,1,2,0.000004,17.787431,CD,CD,2.584963
...,...,...,...,...,...,...,...,...
11699,ọmọ,1,3,0.000004,17.787431,JJ,JJ,2.584963
11700,ọpẹ,1,3,0.000004,17.787431,NNP,NN,2.584963
11701,在中間,1,3,0.000004,17.787431,NN,NN,2.584963
11702,站在中間,2,4,0.000009,16.787431,NNS,NN,2.584963


## List the 20 most significant words in the corpus by dfidf

In [15]:
# Terms from the 20 rows with the highest DFIDF, in descending order.
VOCAB.sort_values(by='dfidf', ascending=False).head(20)['term_str'].tolist()

['manga',
 'stormy',
 'strawberries',
 'opens',
 'strapped',
 'opera',
 'strap',
 'strangest',
 'indecision',
 'stormin',
 'sweeter',
 'stories',
 'conversation',
 'conversations',
 'stoppin',
 'stoop',
 'stoner',
 'stomp',
 'streetlights',
 'consolation']

In [16]:
# Third element of the result: the document-term count matrix.
DTM = get_TFIDF(BOW, tf_method='max')[2]
DTM

term_str,1,10k,110th,1k,1s,2,220kid,22nd,24hoursav,24kgoldn,...,детство,еlla,еsta,еyes,ẹ,ọmọ,ọpẹ,在中間,站在中間,박혜진
decade_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,0,0,1,0,4,0,0,0,0,...,0,0,0,0,0,0,0,1,2,0
2,7,2,0,0,0,13,1,1,1,7,...,0,1,1,1,2,1,1,0,0,1
3,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,1,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,2,0,0,0,0,2,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0


## TODO: TFIDF_L2

### Write Derived Tables to CSV

In [17]:
# Write relative to the working directory, the same way the core tables are
# read above, so the notebook is not tied to one user's home directory.
# NOTE(review): assumes the notebook runs from the project root (the folder
# that contains `data/`) — confirm.
DERIVED_DIR = os.path.join('data', 'derived_tables')
CORE_DIR = os.path.join('data', 'core_tables')
os.makedirs(DERIVED_DIR, exist_ok=True)

TFIDF.to_csv(os.path.join(DERIVED_DIR, 'TFIDF'))
DFIDF.to_csv(os.path.join(DERIVED_DIR, 'DFIDF'))
DTM.to_csv(os.path.join(DERIVED_DIR, 'DTM'))
# TODO: also write TFIDF_L2 once it is computed

# Update VOCAB
# index=False: after the merge VOCAB carries only a positional index; writing
# it would add an unnamed junk column to the core table on the next read.
VOCAB.to_csv(os.path.join(CORE_DIR, 'VOCAB'), index=False)