In [1]:
import pickle

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

from sklearn.decomposition import TruncatedSVD
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# nbimporter must be imported before any module that lives in a notebook (tfidf.ipynb)
import nbimporter
from tfidf import LyricsTFIDF

Importing Jupyter notebook from tfidf.ipynb


# LyricsClassification CLASS

In [2]:
class LyricsClassification():
    """Genre-classification experiments on a dataframe of song lyrics.

    Bundles the steps used in this notebook: stratified train/dev/test
    splitting, label encoding, TF-IDF vectorisation (delegated to
    ``LyricsTFIDF``), a majority-class baseline, logistic-regression variants
    and a bootstrap comparison of two prediction vectors.

    Attributes are populated progressively by the methods:
    ``train_set``/``dev_set``/``test_set`` (``divide_dataset``),
    ``y_train``/``y_dev``/``y_test``/``label_encoder`` (``encode_labels``) and
    ``train_tfidf_matrix``/``dev_tfidf_matrix``/``test_tfidf_matrix``
    (``transform_input``).
    """

    def __init__(self, lyrics_df, genres=None, preprocessed_corpus=None):
        """Store the (optionally genre-filtered) dataframe and build the TF-IDF helper.

        Parameters
        ----------
        lyrics_df : pandas.DataFrame
            Must expose a ``genre`` column.
        genres : sequence of str, optional
            When given, only rows whose ``genre`` is in this collection are
            kept in ``self.lyrics_df``. When omitted, all rows are kept and
            ``self.genres`` is the array of unique genres in the frame.
        preprocessed_corpus : optional
            Forwarded unchanged to ``LyricsTFIDF``.
        """
        if genres is None:
            self.lyrics_df = lyrics_df
            self.genres = lyrics_df.genre.unique()
        else:
            self.lyrics_df = lyrics_df.loc[lyrics_df.genre.isin(genres), :]
            self.genres = genres

        self.preprocessed_corpus = preprocessed_corpus
        # NOTE(review): the helper receives the *unfiltered* frame, not
        # self.lyrics_df. transform_input() always passes the training corpus
        # explicitly, so this looks harmless here -- confirm against LyricsTFIDF.
        self.tfidf_model = LyricsTFIDF(lyrics_df, preprocessed_corpus)

    def divide_dataset(self, lyrics_df_preprocessed, train_size=0.7, dev_size=0.15, test_size=0.15,
                       random_state=None):
        """Split a dataframe into stratified train, dev and test parts.

        Parameters
        ----------
        lyrics_df_preprocessed : pandas.DataFrame
            Frame with a ``genre`` column used for stratification.
        train_size, dev_size, test_size : float
            Fractions of the whole dataset assigned to each part.
        random_state : int or None, optional
            Seed forwarded to both ``train_test_split`` calls. ``None``
            (default) keeps the previous, unseeded behaviour.

        Returns
        -------
        tuple
            ``(train, dev, test)`` dataframes; also stored on the instance as
            ``train_set``, ``dev_set`` and ``test_set``.
        """
        held_out = dev_size + test_size

        # stratified sampling respects class proportions
        train, dev_and_test = train_test_split(lyrics_df_preprocessed,
                                               train_size=train_size, test_size=held_out,
                                               stratify=lyrics_df_preprocessed.genre,
                                               random_state=random_state)

        # repeat on dev_and_test to obtain separate dev and test sets;
        # the fractions are re-expressed relative to the held-out part
        dev, test = train_test_split(dev_and_test,
                                     train_size=dev_size / held_out, test_size=test_size / held_out,
                                     stratify=dev_and_test.genre,
                                     random_state=random_state)

        self.train_set, self.dev_set, self.test_set = train, dev, test

        return train, dev, test

    def encode_labels(self, train_set, dev_set, test_set, target_variable='genre'):
        """Encode the target column of each split as integers.

        The ``LabelEncoder`` is fitted on the training labels only and then
        applied to dev and test. The fitted encoder is kept as
        ``self.label_encoder`` and the encoded vectors as ``self.y_train``,
        ``self.y_dev`` and ``self.y_test``.

        Returns
        -------
        tuple
            ``(y_train, y_dev, y_test)``.
        """
        # transform labels into numbers
        labels2numbers = LabelEncoder()

        y_train = labels2numbers.fit_transform(train_set.loc[:, target_variable])
        y_dev = labels2numbers.transform(dev_set.loc[:, target_variable])
        y_test = labels2numbers.transform(test_set.loc[:, target_variable])

        self.y_train, self.y_dev, self.y_test = y_train, y_dev, y_test
        self.label_encoder = labels2numbers

        return y_train, y_dev, y_test

    def _transform_split(self, split, name):
        """Join the tokens of each lyric in ``split`` and vectorise them with the fitted vectorizer."""
        print(f"Transforming {name} ...", flush=True)
        corpus = [" ".join(lyrics) for lyrics in tqdm(split.loc[:, "lyrics"])]
        return self.tfidf_model.tfidf_vectorizer.transform(corpus)

    @staticmethod
    def _append_year(tfidf_matrix, split):
        """Return ``tfidf_matrix`` with the ``year`` column of ``split`` appended as a last column (CSR)."""
        year_column = sparse.csr_matrix(split.loc[:, "year"].to_numpy().reshape((-1, 1)))
        # Stacking sparse blocks avoids materialising the full dense TF-IDF matrix.
        return sparse.hstack([tfidf_matrix, year_column], format="csr")

    def transform_input(self, train_set, dev_set, test_set, ngram_range=(1,3), add_year=False, **kwargs):
        """Turn the ``lyrics`` column of each split into a TF-IDF matrix.

        The TF-IDF model is fitted on the training split only; dev and test
        are transformed with the fitted vectorizer.

        Parameters
        ----------
        train_set, dev_set, test_set : pandas.DataFrame
            Splits with a ``lyrics`` column holding an iterable of tokens per
            row (and a ``year`` column when ``add_year`` is true).
        ngram_range : tuple
            Forwarded to ``LyricsTFIDF.fit_transform``.
        add_year : bool
            When true, the ``year`` column is appended to each matrix as one
            extra feature.
        **kwargs
            Forwarded to ``LyricsTFIDF.fit_transform``.

        Returns
        -------
        tuple
            ``(train, dev, test)`` matrices, also stored on the instance.
        """
        # train --> fit transform
        print("Transforming train ...", flush=True)
        self.tfidf_model.fit_transform(preprocessed_corpus=train_set.loc[:, "lyrics"],
                                       ngram_range=ngram_range, **kwargs)
        self.train_tfidf_matrix = self.tfidf_model.tfidf_matrix

        # dev / test --> transform only
        self.dev_tfidf_matrix = self._transform_split(dev_set, "dev")
        self.test_tfidf_matrix = self._transform_split(test_set, "test")

        if add_year:
            self.train_tfidf_matrix = self._append_year(self.train_tfidf_matrix, train_set)
            self.dev_tfidf_matrix = self._append_year(self.dev_tfidf_matrix, dev_set)
            self.test_tfidf_matrix = self._append_year(self.test_tfidf_matrix, test_set)

        return self.train_tfidf_matrix, self.dev_tfidf_matrix, self.test_tfidf_matrix

    def _label_mapping(self):
        """Return ``[(code, label), ...]`` for the classes known to the fitted label encoder."""
        # Use the encoder's own classes rather than len(self.genres): the two can
        # disagree when a requested genre does not occur in the training data.
        return list(enumerate(self.label_encoder.classes_))

    def dummy_classifier(self, train_tfidf, y_train, dev_tfidf, y_dev):
        """Fit a most-frequent-class baseline and report it on the dev set.

        Requires ``encode_labels`` to have been called (uses
        ``self.label_encoder`` to print the code-to-label mapping).

        Returns
        -------
        numpy.ndarray
            Baseline predictions for ``dev_tfidf``.
        """
        most_frequent = DummyClassifier(strategy='most_frequent')
        most_frequent.fit(train_tfidf, y_train)
        dumb_predictions = most_frequent.predict(dev_tfidf)

        print(f"Mapping: {self._label_mapping()}", flush=True)
        print(classification_report(y_dev, dumb_predictions))

        return dumb_predictions

    @staticmethod
    def _fit_logistic(X_train, y_train, X_dev, **logit_kwargs):
        """Fit a logistic regression on ``X_train`` and return its predictions for ``X_dev``."""
        classifier = LogisticRegression(n_jobs=-1, multi_class='auto', solver='lbfgs', **logit_kwargs)
        classifier.fit(X_train, y_train)
        return classifier.predict(X_dev)

    def _select_best(self, candidates, prepare, y_train, y_dev):
        """Return the candidate with the highest micro-F1 on the dev set.

        ``prepare(candidate)`` must return ``(X_train, X_dev, logit_kwargs)``.
        Ties keep the earliest candidate; ``None`` is returned when no
        candidate scores above 0.0.
        """
        best_candidate = None
        best_performance = 0.0
        for candidate in candidates:
            print(candidate)
            X_train, X_dev, logit_kwargs = prepare(candidate)
            predictions = self._fit_logistic(X_train, y_train, X_dev, **logit_kwargs)
            score = f1_score(y_dev, predictions, average='micro')
            if score > best_performance:
                best_performance = score
                best_candidate = candidate
                print("New best performance: {}".format(score))
            print(classification_report(y_dev, predictions))
        return best_candidate

    def logistic_classifier(self, train_tfidf, y_train, dev_tfidf, y_dev,
                            regularize=False, dimensionality_reduction=False, feature_selection=False, **kwargs):
        """Train logistic regression, optionally searching one hyper-parameter.

        Exactly one mode runs, in this priority order:

        * ``regularize`` -- iterable of ``C`` values; returns the best ``C``.
        * ``dimensionality_reduction`` -- iterable of ``TruncatedSVD``
          component counts; returns the best count.
        * ``feature_selection`` -- iterable of ``k`` for chi-squared
          ``SelectKBest``; returns the best ``k``.
        * otherwise -- a single fit; prints the label mapping and report and
          returns the dev predictions.

        "Best" is the highest micro-averaged F1 on the dev set. ``**kwargs``
        are forwarded to ``LogisticRegression`` in every mode.
        """
        # regularization
        if regularize is not False:
            def with_c(c):
                return train_tfidf, dev_tfidf, dict(C=c, **kwargs)

            return self._select_best(regularize, with_c, y_train, y_dev)

        # dimensionality reduction
        if dimensionality_reduction is not False:
            def reduced(k):
                svd = TruncatedSVD(n_components=k)
                # The classifier must see the *reduced* matrices; previously the
                # projections were computed and then ignored.
                return svd.fit_transform(train_tfidf), svd.transform(dev_tfidf), kwargs

            return self._select_best(dimensionality_reduction, reduced, y_train, y_dev)

        # feature selection
        if feature_selection is not False:
            def selected(k):
                selector = SelectKBest(chi2, k=k).fit(train_tfidf, y_train)
                return selector.transform(train_tfidf), selector.transform(dev_tfidf), kwargs

            return self._select_best(feature_selection, selected, y_train, y_dev)

        # simple logistic regression
        predictions = self._fit_logistic(train_tfidf, y_train, dev_tfidf, **kwargs)
        print(f"Mapping: {self._label_mapping()}", flush=True)
        print(classification_report(y_dev, predictions))

        return predictions

    def bootstrap_sample(self, system1, system2, gold, samples=1000, score=f1_score, average='micro',
                         random_state=None):
        """Bootstrap comparison of two prediction vectors against gold labels.

        The score difference between the better and the worse system is first
        computed on the whole sample (``basedelta``). Then ``samples``
        resamples (with replacement, same size as ``gold``) are drawn and the
        fraction of resamples whose difference exceeds ``2 * basedelta`` is
        returned as the p-value estimate.

        Parameters
        ----------
        system1, system2 : sequence
            Predictions of the two systems; they are swapped internally if
            ``system2`` scores higher.
        gold : sequence
            Reference labels, same length as both systems.
        samples : int
            Number of bootstrap resamples.
        score : callable
            Called as ``score(gold, predictions, average=average)``.
        average : str
            Forwarded to ``score``.
        random_state : int or None, optional
            Seed for the resampling. ``None`` (default) draws from NumPy's
            global random state, as before.

        Returns
        -------
        tuple
            ``(p, deltas)`` -- the fraction described above and the list of
            per-resample score differences.

        Raises
        ------
        AssertionError
            If the inputs differ in length or the two systems score equally.
        """
        N = len(gold) # number of instances

        # make sure the two systems have the same number of samples
        assert len(system1) == N and len(system2) == N, 'samples have different lengths'

        # compute performance score on entire sample
        base_score1 = score(gold, system1, average=average)
        base_score2 = score(gold, system2, average=average)

        # switch systems if system2 is better
        if base_score2 > base_score1:
            system1, system2 = system2, system1
            base_score1, base_score2 = base_score2, base_score1

        # compute the difference
        basedelta = base_score1 - base_score2
        assert basedelta > 0, 'Wrong system first, system1 needs to be better!'

        system1 = np.array(system1)
        system2 = np.array(system2)
        gold = np.array(gold)

        # Global NumPy state unless the caller asks for a reproducible run.
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        p = 0
        deltas = []
        for _ in range(samples):
            # select a subsample, with replacement
            sample = rng.choice(N, size=N, replace=True)

            # compute scores on subsample
            gold_sample = gold[sample]
            sample_score1 = score(gold_sample, system1[sample], average=average)
            sample_score2 = score(gold_sample, system2[sample], average=average)
            sample_delta = sample_score1 - sample_score2

            # check whether the observed sample difference is at least
            # twice as large as the base difference
            if sample_delta > 2*basedelta:
                p += 1
            deltas.append(sample_delta)

        return p/samples, deltas

## Analysis 

In [48]:
# Raw (cleaned) lyrics table.
lyrics_df = pd.read_csv("./data/lyrics_cleaned.csv")

# Lemmatized corpus produced by an earlier preprocessing step.
# NOTE: pickle.load can execute arbitrary code -- only load pickles created by this project.
with open('./data/lemmatized_corpus.pickle', 'rb') as pickled_object:
    lemmatized_corpus = pickle.load(pickled_object)

In [50]:
# Overwrite the "lyrics" column with the preprocessed (lemmatized) corpus.
# Assumes lemmatized_corpus is row-aligned with lyrics_df -- TODO confirm.
# NOTE: this mutates lyrics_df in place, so re-running the load cell is needed to get the raw text back.
lyrics_df.loc[:, "lyrics"] = lemmatized_corpus

In [51]:
# Spot-check a label-based slice of rows after replacing the "lyrics" column.
lyrics_df.loc[4748:4758, :]

Unnamed: 0,artist,song,album,release_date,genre,lyrics,year
4748,Bruce Springsteen,Blood Brothers,Greatest Hits (1995),1996-03-03,Rock,"[-PRON-, play, king, of, the, mountain, out, o...",1996
4749,Bruce Springsteen,Lonesome Day,The Rising,2002-07-30,Rock,"[baby, once, think, know, everything, need, to...",2002
4750,Bruce Springsteen,My Fathers House,Nebraska,1982-09-30,Rock,"[last, night, dream, that, be, child, out, whe...",1982
4751,Bruce Springsteen,Living Proof,Lucky Town,1992-03-31,Rock,"[well, now, on, summer, night, in, dusky, room...",1992
4752,Bruce Springsteen,She's The One,Born to Run,1975-08-25,Rock,"[with, -PRON-, killer, grace, and, -PRON-, sec...",1975
4753,Bruce Springsteen,Native American,,,Rock,"[jimmy, fallon, emmy, award, and, grammy, awar...",2002
4754,Bruce Springsteen,The Fever,18 Tracks,1999-04-13,Rock,"[when, get, home, from, -PRON-, job, turn, on,...",1999
4755,Bruce Springsteen,"Oh, Mary, Don't You Weep",,,Rock,"[well, if, could, surely, would, stand, on, th...",1986
4756,Bruce Springsteen,Cover Me,Born in the U.S.A.,1984-07-31,Rock,"[the, time, be, tough, now, just, get, tough, ...",1984
4757,Bruce Springsteen,Out of Work,,,Rock,"[when, somebody, like, harry, belafonte, make,...",1989


In [60]:
# Restrict the experiment to four genres; rows of any other genre are dropped from lyrics_classification.lyrics_df.
lyrics_classification = LyricsClassification(lyrics_df, genres=['Hip-Hop','Rock','Pop','Country'])

In [61]:
# Sanity check: only the requested genres should remain after filtering.
lyrics_classification.lyrics_df.genre.unique()

array(['Hip-Hop', 'Rock', 'Pop', 'Country'], dtype=object)

In [62]:
# Stratified 70/15/15 split (the method's defaults) of the genre-filtered frame.
train_set, dev_set, test_set = lyrics_classification.divide_dataset(
    lyrics_classification.lyrics_df
)
tuple(part.shape for part in (train_set, dev_set, test_set))

((24093, 7), (5163, 7), (5163, 7))

In [63]:
# Integer-encode the genre of every split (encoder is fitted on the training labels).
y_train, y_dev, y_test = lyrics_classification.encode_labels(
    train_set,
    dev_set,
    test_set,
    target_variable='genre',
)
tuple(labels.shape for labels in (y_train, y_dev, y_test))

((24093,), (5163,), (5163,))

In [79]:
# Character 2- to 6-gram TF-IDF features, fitted on train and applied to dev/test; year is not added.
train_tfidf, dev_tfidf, test_tfidf = lyrics_classification.transform_input(
    train_set, dev_set, test_set,
    ngram_range=(2,6),
    add_year=False,
    analyzer='char',
)
tuple(matrix.shape for matrix in (train_tfidf, dev_tfidf, test_tfidf))

Transforming train ...
Joining tokens for each lyrics ...



  0%|          | 0/24093 [00:00<?, ?it/s][A
  8%|▊         | 1911/24093 [00:00<00:01, 19107.10it/s][A
 16%|█▋        | 3929/24093 [00:00<00:01, 19414.07it/s][A
 25%|██▍       | 6020/24093 [00:00<00:00, 19838.56it/s][A
 34%|███▎      | 8127/24093 [00:00<00:00, 20190.71it/s][A
 43%|████▎     | 10366/24093 [00:00<00:00, 20801.03it/s][A
 52%|█████▏    | 12493/24093 [00:00<00:00, 20934.53it/s][A
 60%|██████    | 14511/24093 [00:00<00:00, 20699.43it/s][A
 69%|██████▉   | 16655/24093 [00:00<00:00, 20912.87it/s][A
 79%|███████▉  | 19058/24093 [00:00<00:00, 21757.58it/s][A
 89%|████████▊ | 21367/24093 [00:01<00:00, 22138.51it/s][A
 98%|█████████▊| 23537/24093 [00:01<00:00, 21985.52it/s][A
100%|██████████| 24093/24093 [00:01<00:00, 21259.23it/s][A

Fitting TFIDF vectorizer ...
Transforming dev ...



  0%|          | 0/5163 [00:00<?, ?it/s][A
 37%|███▋      | 1899/5163 [00:00<00:00, 18988.89it/s][A
 76%|███████▌  | 3918/5163 [00:00<00:00, 19332.16it/s][A
100%|██████████| 5163/5163 [00:00<00:00, 19718.68it/s][A

Transforming test ...



  0%|          | 0/5163 [00:00<?, ?it/s][A
 46%|████▌     | 2373/5163 [00:00<00:00, 23729.23it/s][A
 93%|█████████▎| 4788/5163 [00:00<00:00, 23851.58it/s][A
100%|██████████| 5163/5163 [00:00<00:00, 23790.33it/s][A

((24093, 166887), (5163, 166887), (5163, 166887))

#### Dummy Classifier

In [80]:
# Majority-class baseline evaluated on the dev set; its predictions are reused in the significance tests below.
dumb_predictions = lyrics_classification.dummy_classifier(train_tfidf, y_train, dev_tfidf, y_dev)

Mapping: [(0, 'Country'), (1, 'Hip-Hop'), (2, 'Pop'), (3, 'Rock')]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       550
           1       0.00      0.00      0.00       857
           2       0.00      0.00      0.00      1469
           3       0.44      1.00      0.61      2287

    accuracy                           0.44      5163
   macro avg       0.11      0.25      0.15      5163
weighted avg       0.20      0.44      0.27      5163



  'precision', 'predicted', average, warn_for)


#### Logistic Classifier

Defaults

In [81]:
# Logistic regression with library defaults (no search mode selected), evaluated on the dev set.
logistic_predictions = lyrics_classification.logistic_classifier(train_tfidf=train_tfidf, y_train=y_train, 
                                                                 dev_tfidf=dev_tfidf, y_dev=y_dev)

Mapping: [(0, 'Country'), (1, 'Hip-Hop'), (2, 'Pop'), (3, 'Rock')]
              precision    recall  f1-score   support

           0       0.71      0.24      0.36       550
           1       0.82      0.70      0.76       857
           2       0.59      0.46      0.52      1469
           3       0.62      0.84      0.72      2287

    accuracy                           0.65      5163
   macro avg       0.69      0.56      0.59      5163
weighted avg       0.66      0.65      0.63      5163



Balanced classes

In [70]:
# Same model, but class_weight='balanced' is forwarded to LogisticRegression to counter the genre imbalance.
logistic_predictions_balanced = lyrics_classification.logistic_classifier(train_tfidf=train_tfidf, y_train=y_train, 
                                                                  dev_tfidf=dev_tfidf, y_dev=y_dev, class_weight='balanced')

Mapping: [(0, 'Country'), (1, 'Hip-Hop'), (2, 'Pop'), (3, 'Rock')]
              precision    recall  f1-score   support

           0       0.47      0.64      0.54       550
           1       0.74      0.80      0.77       857
           2       0.56      0.54      0.55      1469
           3       0.70      0.64      0.67      2287

    accuracy                           0.64      5163
   macro avg       0.62      0.65      0.63      5163
weighted avg       0.64      0.64      0.64      5163



#### Regularization

In [82]:
# Search the inverse regularization strength C; the value with the highest dev micro-F1 is returned.
best_c = lyrics_classification.logistic_classifier(train_tfidf=train_tfidf, y_train=y_train, dev_tfidf=dev_tfidf, y_dev=y_dev, 
                                                  regularize=[50, 20, 10, 5, 2, 0.5, 0.1, 0.05, 0.01])

50
New best performance: 0.6159209761766414
              precision    recall  f1-score   support

           0       0.54      0.38      0.45       550
           1       0.78      0.69      0.73       857
           2       0.51      0.51      0.51      1469
           3       0.64      0.71      0.67      2287

    accuracy                           0.62      5163
   macro avg       0.62      0.57      0.59      5163
weighted avg       0.62      0.62      0.61      5163

20
New best performance: 0.6261863257795856
              precision    recall  f1-score   support

           0       0.56      0.43      0.49       550
           1       0.81      0.70      0.75       857
           2       0.53      0.50      0.51      1469
           3       0.64      0.72      0.68      2287

    accuracy                           0.63      5163
   macro avg       0.63      0.59      0.61      5163
weighted avg       0.63      0.63      0.62      5163

10
New best performance: 0.631222157660275

In [73]:
# Refit with the selected C and report on the dev set (fit() returns the estimator itself).
best_c_classifier = LogisticRegression(
    n_jobs=-1, multi_class='auto', solver='lbfgs', C=best_c
).fit(train_tfidf, y_train)

predictions_c_dev = best_c_classifier.predict(dev_tfidf)
print(classification_report(y_dev, predictions_c_dev))

              precision    recall  f1-score   support

           0       0.63      0.30      0.41       550
           1       0.81      0.71      0.76       857
           2       0.58      0.48      0.53      1469
           3       0.63      0.81      0.71      2287

    accuracy                           0.65      5163
   macro avg       0.66      0.58      0.60      5163
weighted avg       0.65      0.65      0.63      5163



In [74]:
# Evaluate the classifier fitted in the previous cell on the held-out test set.
# Refitting an identical model on the same training data here would only repeat the work.
predictions_c_test = best_c_classifier.predict(test_tfidf)

print(classification_report(y_test, predictions_c_test))

              precision    recall  f1-score   support

           0       0.70      0.37      0.49       550
           1       0.78      0.74      0.76       857
           2       0.58      0.51      0.54      1470
           3       0.65      0.79      0.71      2286

    accuracy                           0.66      5163
   macro avg       0.68      0.60      0.63      5163
weighted avg       0.66      0.66      0.65      5163



In [72]:
# Selected inverse regularization strength.
best_c

2

#### Feature Selection

The best classifier is the one trained on all features.

In [83]:
# Search the number of chi-squared-selected features, reusing the best C.
# The last candidate is "all features": take it from the matrix instead of hard-coding the
# vocabulary size, which changes whenever the split or the vectorizer settings change.
best_k = lyrics_classification.logistic_classifier(
    train_tfidf=train_tfidf, y_train=y_train, dev_tfidf=dev_tfidf, y_dev=y_dev,
    feature_selection=[30000, 50000, 70000, 90000, 110000, 130000, train_tfidf.shape[1]],
    C=best_c,
)

30000
New best performance: 0.6401317063722642
              precision    recall  f1-score   support

           0       0.67      0.23      0.34       550
           1       0.82      0.71      0.76       857
           2       0.59      0.45      0.51      1469
           3       0.61      0.84      0.71      2287

    accuracy                           0.64      5163
   macro avg       0.67      0.56      0.58      5163
weighted avg       0.65      0.64      0.62      5163

50000
New best performance: 0.6472980825101685
              precision    recall  f1-score   support

           0       0.70      0.27      0.39       550
           1       0.82      0.72      0.76       857
           2       0.58      0.46      0.51      1469
           3       0.63      0.83      0.72      2287

    accuracy                           0.65      5163
   macro avg       0.68      0.57      0.60      5163
weighted avg       0.65      0.65      0.63      5163

70000
New best performance: 0.648847

In [84]:
# Selected number of features.
best_k

110000

In [87]:
# Fit the chi-squared selector on the training data only, then project all three splits.
selector = SelectKBest(chi2, k=best_k)
selector.fit(train_tfidf, y_train)

X_train_sel, X_dev_sel, X_test_sel = (
    selector.transform(matrix) for matrix in (train_tfidf, dev_tfidf, test_tfidf)
)

In [86]:
# Train on the selected features with the best C and report on the test set.
best_k_classifier = LogisticRegression(
    n_jobs=-1, multi_class='auto', solver='lbfgs', C=best_c
).fit(X_train_sel, y_train)

predictions_k_test = best_k_classifier.predict(X_test_sel)
print(classification_report(y_test, predictions_k_test))

              precision    recall  f1-score   support

           0       0.70      0.36      0.48       550
           1       0.79      0.74      0.76       857
           2       0.59      0.50      0.54      1470
           3       0.65      0.80      0.72      2286

    accuracy                           0.66      5163
   macro avg       0.68      0.60      0.62      5163
weighted avg       0.66      0.66      0.65      5163



#### Dimensionality reduction

In [None]:
# Search the number of TruncatedSVD components on top of the selected features.
# NOTE(review): the training matrix has 24093 rows, so its rank cannot exceed that; the larger
# candidates below look infeasible and the recorded run shows no result past "10000" -- confirm
# whether a much smaller grid was intended.
best_dimension = lyrics_classification.logistic_classifier(train_tfidf=X_train_sel, y_train=y_train, 
                                                           dev_tfidf=X_dev_sel, y_dev=y_dev, 
                                                    dimensionality_reduction=[10000, 20000, 30000, 40000, 50000, 80000])

10000


In [None]:
# NOTE(review): leftover kernel-liveness check; safe to delete from the final notebook.
print("ciao")

In [None]:
# Project train/dev onto the number of SVD components chosen by the search above.
# NOTE(review): the search cell ran on X_train_sel / X_dev_sel; switch the inputs here
# if the reduced features are meant to be stacked on top of feature selection.
svd = TruncatedSVD(n_components=best_dimension)

X_train_dim = svd.fit_transform(train_tfidf)
X_dev_dim = svd.transform(dev_tfidf)

### Random Forest

In [None]:
# 200-tree random forest; n_jobs=-1 trains the trees in parallel, matching the logistic models above.
# RandomForestClassifier comes from sklearn.ensemble (imported in the first cell).
random_forest = RandomForestClassifier(n_estimators=200, n_jobs=-1)

In [None]:
# Fit on the full (unselected) TF-IDF training matrix.
random_forest.fit(train_tfidf, y_train)

In [None]:
# Predict genres for the dev set and display the encoded predictions.
rf_predictions = random_forest.predict(dev_tfidf)
rf_predictions

In [None]:
# Per-class precision/recall/F1 of the random forest on the dev set.
print(classification_report(y_dev, rf_predictions))

### Significance Testing

#### Logistic default

In [None]:
# Bootstrap test: default logistic regression vs. the majority-class baseline on the dev set.
# The default-model predictions are stored as `logistic_predictions` (see "Defaults" above).
p_value, deltas = lyrics_classification.bootstrap_sample(
    system1=logistic_predictions.tolist(),
    system2=dumb_predictions.tolist(),
    gold=y_dev.tolist(),
)
print(p_value, p_value < 0.01)

In [None]:
# Micro-F1 of the baseline and of the *default* logistic model (this section compares those two).
print(f1_score(y_dev, dumb_predictions, average='micro'), f1_score(y_dev, logistic_predictions, average='micro'))

#### Logistic regularized

In [None]:
# Bootstrap test: regularized logistic regression (best C) vs. the majority-class baseline on the dev set.
# Prints the estimated p-value and whether it falls below the 0.01 threshold.
p_value, deltas = lyrics_classification.bootstrap_sample(system1=predictions_c_dev.tolist(), system2=dumb_predictions.tolist(), gold=y_dev.tolist())
print(p_value, p_value < 0.01)

In [None]:
# Micro-F1 of the baseline and of the regularized logistic model on the dev set.
print(f1_score(y_dev, dumb_predictions, average='micro'), f1_score(y_dev, predictions_c_dev, average='micro'))

In [None]:
%matplotlib inline
import pandas as pd
pd.Series(deltas).plot.hist(bins=20)