In [1]:
import pandas as pd
pd.set_option('display.max_columns', None) # visualize all columns in console
import numpy as np
import pickle
from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF

## LyricsTFIDF CLASS 

In [15]:
class LyricsTFIDF:
    """TF-IDF exploration helper for a corpus of song lyrics.

    Wraps scikit-learn's ``TfidfVectorizer`` to rank n-grams by their summed
    TF-IDF weight, either over a whole corpus or separately per genre, and
    offers SVD / NMF factorizations of a fitted TF-IDF matrix.

    Parameters
    ----------
    lyrics_df : pandas.DataFrame
        Song metadata. ``tfidf_by_genre`` reads its ``genre`` column and
        treats row ``i`` (by position) as describing document ``i`` of the
        corpus.
    preprocessed_corpus : list of list of str
        One list of tokens/lemmas per song.

    Attributes
    ----------
    tfidf_matrix : sparse matrix or None
        Matrix from the last ``fit_transform(save_attributes=True)`` call.
    tfidf_vectorizer : TfidfVectorizer or None
        Vectorizer from the last ``fit_transform(save_attributes=True)`` call.
    """

    def __init__(self, lyrics_df, preprocessed_corpus):
        self.lyrics_df = lyrics_df
        self.preprocessed_corpus = preprocessed_corpus
        self.tfidf_matrix = None
        # Defined up front so "not fitted yet" is detectable instead of
        # surfacing as a missing-attribute error later on.
        self.tfidf_vectorizer = None

    @staticmethod
    def _feature_names(vectorizer):
        """Return the fitted vocabulary as a list, across scikit-learn versions."""
        # get_feature_names() was removed in scikit-learn 1.2; older releases
        # do not have get_feature_names_out(), so try the new name first.
        getter = getattr(vectorizer, "get_feature_names_out", None)
        if getter is None:
            getter = vectorizer.get_feature_names
        return list(getter())

    def fit_transform(self,
                      preprocessed_corpus=None,
                      compute_constituents=False,
                      save_attributes=True,
                      topn=20,
                      analyzer='word',
                      ngram_range=(1,1),
                      doc_frequency=(0.001,0.75),
                      stop_words_language='english',
                      logarithmic=True,
                      info=True):
        """Fit a TF-IDF vectorizer and return the highest-scoring n-grams.

        Parameters
        ----------
        preprocessed_corpus : list of list of str, optional
            Documents as token lists; defaults to the corpus given at
            construction time.
        compute_constituents : bool
            If True, also report the raw term count (``tf``) and the inverse
            document frequency (``idf``) of every n-gram.
        save_attributes : bool
            If True, store the fitted matrix and vectorizer on the instance.
        topn : int
            Number of rows to return.
        analyzer, ngram_range : forwarded to the vectorizer.
        doc_frequency : tuple
            ``(min_df, max_df)`` forwarded to the vectorizer.
        stop_words_language : forwarded as ``stop_words``.
        logarithmic : bool
            Forwarded as ``sublinear_tf``.
        info : bool
            If True, print progress messages.

        Returns
        -------
        pandas.DataFrame
            Columns ``ngram`` and ``tfidf`` (plus ``tf`` and ``idf`` when
            ``compute_constituents`` is True), sorted by ``tfidf`` in
            descending order and cut to the first ``topn`` rows.
        """
        if preprocessed_corpus is None:
            preprocessed_corpus = self.preprocessed_corpus

        # The vectorizers expect one string per document, so token lists are
        # joined back with spaces.
        if info:
            print("Joining tokens for each lyrics ...", flush=True)
        documents = [" ".join(lyrics) for lyrics in tqdm(preprocessed_corpus)]

        min_df, max_df = doc_frequency

        if info:
            print("Fitting TFIDF vectorizer ...", flush=True)
        tfidf_vectorizer = TfidfVectorizer(analyzer=analyzer,
                                           ngram_range=ngram_range,
                                           min_df=min_df,
                                           max_df=max_df,
                                           stop_words=stop_words_language,
                                           sublinear_tf=logarithmic)
        tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

        if save_attributes:
            self.tfidf_matrix = tfidf_matrix
            self.tfidf_vectorizer = tfidf_vectorizer

        # Column sums give one aggregate score per n-gram; np.asarray(...).ravel()
        # flattens the 1 x n_features result whatever matrix type sum() returns.
        data = {
            'ngram': self._feature_names(tfidf_vectorizer),
            'tfidf': np.asarray(tfidf_matrix.sum(axis=0)).ravel(),
        }

        if compute_constituents:
            if info:
                print("Fitting count vectorizer ...", flush=True)
            # Same tokenisation and document-frequency settings as the TF-IDF
            # vectorizer so the columns line up. sublinear_tf is a TF-IDF-only
            # option and is not a CountVectorizer parameter.
            count_vectorizer = CountVectorizer(analyzer=analyzer,
                                               ngram_range=ngram_range,
                                               min_df=min_df,
                                               max_df=max_df,
                                               stop_words=stop_words_language)
            count_matrix = count_vectorizer.fit_transform(documents)

            data['tf'] = np.asarray(count_matrix.sum(axis=0)).ravel()
            data['idf'] = tfidf_vectorizer.idf_

        ranking = (pd.DataFrame(data=data)
                   .sort_values("tfidf", ascending=False)
                   .reset_index(drop=True))

        # Positional slice: label-based .loc[:topn] is inclusive and would
        # return topn + 1 rows.
        return ranking.iloc[:topn]

    def tfidf_by_genre(self, genres, lyrics_df=None, preprocessed_corpus=None, **kwargs):
        """Run ``fit_transform`` separately on the songs of each genre.

        Parameters
        ----------
        genres : iterable of str
            Genres to analyse; each must occur in ``lyrics_df.genre``.
        lyrics_df : pandas.DataFrame, optional
            Defaults to the frame given at construction time.
        preprocessed_corpus : list of list of str, optional
            Defaults to the corpus given at construction time. Must have one
            entry per row of ``lyrics_df``, in the same order.
        **kwargs
            Forwarded to ``fit_transform`` (``info`` and ``save_attributes``
            are set by this method).

        Returns
        -------
        pandas.DataFrame
            Per-genre rankings side by side, with ``(genre, column)``
            MultiIndex columns. Empty if ``genres`` is empty.

        Raises
        ------
        KeyError
            If a requested genre does not occur in ``lyrics_df.genre``.
        ValueError
            If the corpus and ``lyrics_df`` have different lengths.
        """
        if lyrics_df is None:
            lyrics_df = self.lyrics_df
        if preprocessed_corpus is None:
            preprocessed_corpus = self.preprocessed_corpus

        genres = list(genres)

        # Validate every requested genre before doing any fitting.
        admissible = set(lyrics_df["genre"].unique())
        for genre in genres:
            if genre not in admissible:
                raise KeyError(f"{genre} is not an admissible genre")

        if len(preprocessed_corpus) != len(lyrics_df):
            raise ValueError(
                f"preprocessed_corpus has {len(preprocessed_corpus)} documents "
                f"but lyrics_df has {len(lyrics_df)} rows"
            )

        print("Fitting TFIDF vectorizer for all genres ...", flush=True)
        genre_frames = []
        for genre in genres:
            # Select documents by position with a boolean mask. This neither
            # depends on lyrics_df having a default 0..n-1 index nor needs the
            # ragged token lists to be packed into a NumPy array.
            in_genre = (lyrics_df["genre"] == genre).to_numpy()
            genre_corpus = [doc for doc, keep in zip(preprocessed_corpus, in_genre) if keep]

            genre_df = self.fit_transform(preprocessed_corpus=genre_corpus,
                                          info=False,
                                          save_attributes=False,
                                          **kwargs)

            # Prefix every column with the genre so the frames can sit side by side.
            genre_df.columns = pd.MultiIndex.from_product([[genre], genre_df.columns])
            genre_frames.append(genre_df)

        if not genre_frames:
            return pd.DataFrame()

        # One concat at the end instead of growing the frame inside the loop.
        return pd.concat(genre_frames, axis="columns")

    def matrix_factorization(self, method, n_components, tfidf_matrix=None):
        """Factorize a TF-IDF matrix with truncated SVD or NMF.

        Parameters
        ----------
        method : {'svd', 'nmf'}
        n_components : int
            Number of latent dimensions.
        tfidf_matrix : optional
            Matrix to factorize; defaults to the one stored by
            ``fit_transform``.

        Returns
        -------
        tuple
            ``(U, S, V)`` for ``'svd'``, where ``U`` is the output of
            ``TruncatedSVD.fit_transform``, ``S`` is ``singular_values_`` and
            ``V`` is ``components_``; ``(W, H)`` for ``'nmf'``, where ``W`` is
            the output of ``NMF.fit_transform`` and ``H`` is ``components_``.
            NOTE(review): scikit-learn documents the SVD transform output as
            the reduced data (U scaled by the singular values), not the
            unit-norm left singular vectors - confirm before treating ``U``
            as orthonormal.

        Raises
        ------
        ValueError
            If ``method`` is unknown, or no matrix is passed and none has
            been stored by ``fit_transform``.
        """
        if method not in ('svd', 'nmf'):
            raise ValueError(f"unknown method {method!r}; expected 'svd' or 'nmf'")

        if tfidf_matrix is None:
            tfidf_matrix = self.tfidf_matrix
        if tfidf_matrix is None:
            raise ValueError(
                "no TF-IDF matrix available: call fit_transform(save_attributes=True) "
                "first or pass tfidf_matrix explicitly"
            )

        if method == 'svd':
            # Seeded like the NMF branch so repeated runs give the same factors.
            svd = TruncatedSVD(n_components=n_components, random_state=0)
            U = svd.fit_transform(tfidf_matrix)
            return U, svd.singular_values_, svd.components_

        nmf = NMF(n_components=n_components, init='nndsvd', random_state=0)
        W = nmf.fit_transform(tfidf_matrix)
        return W, nmf.components_

    def print_latent_topics(self, lower_dimensional_words, vocabulary=None, topn=5):
        """Describe each latent component by its highest-weighted vocabulary terms.

        Despite the name, nothing is printed: the descriptions are returned.

        Parameters
        ----------
        lower_dimensional_words : iterable of 1-D sequences
            One weight vector over the vocabulary per component (e.g. the
            ``H`` / ``V`` factor returned by ``matrix_factorization``).
        vocabulary : sequence of str, optional
            Term for each column; defaults to the vocabulary of the stored
            vectorizer.
        topn : int
            Number of terms per component.

        Returns
        -------
        list of str
            One comma-separated string per component, highest weight first.

        Raises
        ------
        AttributeError
            If ``vocabulary`` is omitted and no vectorizer has been stored.
        """
        if vocabulary is None:
            vectorizer = getattr(self, "tfidf_vectorizer", None)
            if vectorizer is None:
                raise AttributeError(
                    "no fitted vectorizer stored: call fit_transform(save_attributes=True) "
                    "first or pass vocabulary explicitly"
                )
            vocabulary = self._feature_names(vectorizer)

        # argsort is ascending; the reversed slice walks back from the end and
        # yields the indices of the topn largest weights in descending order.
        return [', '.join(vocabulary[i] for i in np.argsort(weights)[:-topn-1:-1])
                for weights in lower_dimensional_words]

# TF-IDF 

In [3]:
# Song metadata. The path is relative, like the pickle paths used by the other
# loading cells, so the notebook no longer depends on one user's home directory.
# Assumes the kernel's working directory is the project folder that contains data/.
lyrics_df = pd.read_csv("./data/lyrics_cleaned.csv")
lyrics_df.shape

(43844, 7)

In [4]:
# Load the trimmed corpus from its pickle. pickle.load can execute arbitrary
# code, so only unpickle files produced by this project.
# (Disabled: loader for the older version of the trimmed corpus.)
#with open('./data/trimmed_corpus_old.pickle', 'rb') as pickled_object:
#    trimmed_corpus_old = pickle.load(pickled_object)
with open('./data/trimmed_corpus.pickle', 'rb') as pickled_object:
    trimmed_corpus = pickle.load(pickled_object)

# TODO: move short-lemma removal into preprocessing and report how many lemmas it removes.
# NOTE(review): the original note said "len(token) < 2", but the draft filter below
# keeps only lemmas with len > 2 (it also drops two-character lemmas) - confirm the threshold.
#trimmed_corpus = [[lemma for lemma in lyrics if len(lemma) > 2] for lyrics in trimmed_corpus]

In [5]:
# Sanity check: show entry 100 of the trimmed corpus (the saved output is a flat list of lemma strings).
trimmed_corpus[100]

['appreciate',
 'when',
 'young',
 'mama',
 'beef',
 'year',
 'old',
 'kick',
 'street',
 'back',
 'time',
 'never',
 'think',
 'see',
 'face',
 'woman',
 'alive',
 'could',
 'take',
 'mama',
 'place',
 'suspend',
 'school',
 'scare',
 'home',
 'fool',
 'big',
 'boy',
 'breakin',
 'rule',
 'shed',
 'tear',
 'baby',
 'sister',
 'year',
 'poor',
 'other',
 'little',
 'kid',
 'even',
 'different',
 'daddy',
 'same',
 'drama',
 'when',
 'thing',
 'go',
 'wrong',
 'blame',
 'reminisce',
 'stress',
 'cause',
 'hell',
 'huggin',
 'mama',
 'jail',
 'cell',
 'think',
 'elementary',
 'hey',
 'see',
 'penitentiary',
 'day',
 'police',
 'right',
 'mama',
 'catch',
 'put',
 'whoopin',
 'backside',
 'even',
 'crack',
 'fiend',
 'mama',
 'always',
 'black',
 'queen',
 'finally',
 'understand',
 'woman',
 'easy',
 'tryin',
 'raise',
 'man',
 'always',
 'commit',
 'poor',
 'single',
 'mother',
 'welfare',
 'tell',
 'how',
 'way',
 'can',
 'pay',
 'back',
 'plan',
 'show',
 'understand',
 'appreciate',


In [8]:
# Load the tagged corpus from its pickle (same caveat as any pickle: trusted files only).
with open('./data/tagged_corpus.pickle', 'rb') as pickled_object:
    tagged_corpus = pickle.load(pickled_object)

In [9]:
# Sanity check: show entry 100 of the tagged corpus (the saved output shows 'lemma_TAG' strings).
tagged_corpus[100]

['appreciate_VERB',
 'when_ADV',
 'young_ADJ',
 'mama_NOUN',
 'beef_NOUN',
 'year_NOUN',
 'old_ADJ',
 'kick_VERB',
 'street_NOUN',
 'back_ADV',
 'time_NOUN',
 'never_ADV',
 'think_VERB',
 'would_VERB',
 'see_VERB',
 'face_NOUN',
 'be_VERB',
 'woman_NOUN',
 'alive_ADJ',
 'could_VERB',
 'take_VERB',
 'mama_NOUN',
 'place_NOUN',
 'suspend_VERB',
 'school_NOUN',
 'scare_VERB',
 'go_VERB',
 'home_ADV',
 'fool_NOUN',
 'big_ADJ',
 'boy_NOUN',
 'breakin_NOUN',
 'rule_NOUN',
 'shed_VERB',
 'tear_NOUN',
 'baby_NOUN',
 'sister_NOUN',
 'year_NOUN',
 'poor_ADJ',
 'other_ADJ',
 'little_ADJ',
 'kid_NOUN',
 'even_ADV',
 'different_ADJ',
 'daddy_NOUN',
 'same_ADJ',
 'drama_NOUN',
 'when_ADV',
 'thing_NOUN',
 'go_VERB',
 'wrong_ADJ',
 'would_VERB',
 'blame_VERB',
 'reminisce_VERB',
 'stress_NOUN',
 'cause_VERB',
 'hell_NOUN',
 'huggin_ADJ',
 'mama_NOUN',
 'jail_NOUN',
 'cell_NOUN',
 "'d_VERB",
 'think_VERB',
 'elementary_ADJ',
 'would_VERB',
 'see_VERB',
 'penitentiary_ADJ',
 'day_NOUN',
 'police_NOUN',

In [12]:
# Bind the metadata frame and the trimmed corpus; methods fall back to these when no corpus is passed.
lyrics_tfidf = LyricsTFIDF(lyrics_df=lyrics_df, preprocessed_corpus=trimmed_corpus)

### 1. TF-IDF analysis of most important words in the whole corpus

The corpus-wide ranking is not very informative at any n-gram level: generic, high-frequency words dominate, with "love" ranking first.

In [13]:
# Rank unigrams over the whole corpus; doc_frequency is forwarded as (min_df, max_df).
# save_attributes defaults to True, so this call also stores the fitted matrix and
# vectorizer on lyrics_tfidf.
tfidf_whole_corpus = lyrics_tfidf.fit_transform(ngram_range=(1,1), doc_frequency=(0.001, 0.5), topn=20)
tfidf_whole_corpus

Joining tokens for each lyrics ...


100%|██████████| 43844/43844 [00:00<00:00, 184059.80it/s]

Fitting TFIDF vectorizer ...





Unnamed: 0,ngram,tfidf
0,love,1933.175291
1,say,1486.987418
2,come,1400.903199
3,make,1359.997496
4,time,1358.093903
5,yeah,1266.844007
6,let,1263.991387
7,want,1226.357716
8,feel,1187.808912
9,baby,1168.889552


### 2. TF-IDF analysis of n-grams within each musical genre

#### Lemmas

In [13]:
# Fit a separate vectorizer on each genre's songs and show the per-genre rankings side by side.
lyrics_tfidf.tfidf_by_genre(genres=["Hip-Hop", "Electronic", "Country", "Pop"])

Fitting TFIDF vectorizer for all genres ...


100%|██████████| 5712/5712 [00:00<00:00, 149234.85it/s]
100%|██████████| 1361/1361 [00:00<00:00, 234135.09it/s]
100%|██████████| 3666/3666 [00:00<00:00, 285137.38it/s]
100%|██████████| 9796/9796 [00:00<00:00, 234561.32it/s]


Unnamed: 0_level_0,Hip-Hop,Hip-Hop,Electronic,Electronic,Country,Country,Pop,Pop
Unnamed: 0_level_1,ngram,tfidf,ngram,tfidf,ngram,tfidf,ngram,tfidf
0,just,201.531755,love,62.770534,love,171.404108,love,504.436677
1,love,183.391465,know,57.876466,know,145.763925,know,472.331997
2,say,180.236696,just,46.493119,just,143.931513,just,392.819332
3,bitch,176.781107,feel,45.684078,say,121.59252,say,366.013818
4,make,176.19153,come,43.584496,make,115.585436,make,343.425022
5,let,168.557124,let,43.300451,time,115.160339,let,337.720165
6,come,167.525657,time,41.203833,come,114.138575,time,329.005521
7,fuck,159.747776,say,40.556409,way,103.444544,baby,328.813266
8,tell,154.620452,make,39.81855,heart,98.592403,come,326.532883
9,shit,153.629,want,38.281078,let,94.906574,feel,322.736016


#### Lemmas with POS tags

In [14]:
# Pass the tagged corpus as an argument instead of overwriting
# lyrics_tfidf.preprocessed_corpus. The instance keeps the trimmed corpus it was
# built with, so earlier cells give the same result if they are re-run.
lyrics_tfidf.tfidf_by_genre(genres=["Hip-Hop", "Electronic", "Country", "Pop"],
                            preprocessed_corpus=tagged_corpus,
                            doc_frequency=(0.001, 0.3))

Fitting TFIDF vectorizer for all genres ...


100%|██████████| 5712/5712 [00:00<00:00, 124915.22it/s]
100%|██████████| 1361/1361 [00:00<00:00, 190592.89it/s]
100%|██████████| 3666/3666 [00:00<00:00, 257086.08it/s]
100%|██████████| 9796/9796 [00:00<00:00, 194832.29it/s]


Unnamed: 0_level_0,Hip-Hop,Hip-Hop,Electronic,Electronic,Country,Country,Pop,Pop
Unnamed: 0_level_1,ngram,tfidf,ngram,tfidf,ngram,tfidf,ngram,tfidf
0,love_noun,123.828691,want_verb,34.466838,be_verb,89.520494,need_verb,241.798611
1,why_adv,98.466808,never_adv,33.841482,heart_noun,88.890043,be_verb,240.074769
2,gon_verb,94.076031,baby_noun,33.450649,let_verb,85.696241,give_verb,233.417344
3,right_adv,93.671018,love_verb,32.395608,night_noun,81.134652,life_noun,228.985391
4,ass_noun,89.826484,tell_verb,32.369741,could_verb,80.76852,here_adv,223.220594
5,real_adj,88.39961,think_verb,29.710738,think_verb,80.486791,day_noun,221.498472
6,night_noun,87.133135,need_verb,29.000614,little_adj,80.035263,away_adv,209.293946
7,hear_verb,85.884731,way_noun,28.927436,tell_verb,79.957868,thing_noun,208.280183
8,run_verb,85.314507,give_verb,28.877181,baby_noun,79.579543,night_noun,205.457199
9,really_adv,85.150709,would_verb,28.721585,good_adj,79.43391,girl_noun,202.81223


### 3. Latent Topics from Matrix Factorization 

In [8]:
# Factorize the TF-IDF matrix stored on lyrics_tfidf by an earlier fit_transform call
# (no matrix is passed here); W is NMF's fit_transform output and H its components_.
# NOTE(review): this cell's execution count (In [8]) is lower than that of the
# fit_transform cell above (In [13]) - re-run top to bottom on a fresh kernel.
W, H = lyrics_tfidf.matrix_factorization(method='nmf', n_components=10)

In [14]:
# List the 10 highest-weighted vocabulary terms of each NMF component (one row of H per component).
# NOTE(review): the saved output of this cell is a TypeError. The class cell carries a later
# execution count (In [15]) than this call (In [14]), so the error may come from an older
# definition - re-run after Restart & Run All to confirm.
lyrics_tfidf.print_latent_topics(H, topn=10)

TypeError: can only concatenate list (not "str") to list