In [1]:
# Seed passed as random_state to train_test_split below so the split is repeatable.
seed = 666

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from time import time

import re, random

from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

In [2]:
# Read the lyrics corpus and discard the 'Unnamed: 0' column stored in the CSV.
corpus_df = (
    pd.read_csv('corpus_df.csv')
    .drop('Unnamed: 0', axis='columns')
)
corpus_df.head()
# corpus_df.shape # (1693, 3)

Unnamed: 0,lyrics,artist,genre
0,I know a girl who thinks it goes she'll make y...,flaming lips,indie
1,"All those bugs buzzin' round your head Well, t...",flaming lips,indie
2,"Something in you, it jitters like a moth And I...",flaming lips,indie
3,Their wasn't any snow on Christmas eve and I k...,flaming lips,indie
4,"You can walk among us, but you can't walk on b...",flaming lips,indie


In [3]:
# corpus_df['genre'].value_counts()
# punk     1097
# indie     596

In [4]:
# Hold out 25% of the songs, stratified on genre so both splits keep the
# same genre proportions; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    corpus_df['lyrics'],
    corpus_df['genre'],
    test_size=.25,
    random_state=seed,
    stratify=corpus_df['genre'],
)
# train = pd.concat([X_train, y_train], axis=1)
# test = pd.concat([X_test, y_test], axis=1)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1269,), (424,), (1269,), (424,))

In [5]:
# train['genre'].value_counts()
# punk     822
# indie    447

# test['genre'].value_counts()
# punk     275
# indie    149

In [6]:
def inspect(vectoriser, X, n_examples=50, random_state=None):
    """Fit ``vectoriser`` on ``X`` and print a short summary of the result.

    Prints the number of columns produced by ``fit_transform``, how long the
    fit took, the first ``n_examples`` vocabulary tokens in alphabetical
    order, and up to ``n_examples`` tokens the vectoriser ignored.

    Parameters
    ----------
    vectoriser : object
        Must provide ``fit_transform(X)`` returning something with a
        ``.shape`` and, once fitted, a ``vocabulary_`` mapping. A
        ``stop_words_`` collection is read if present.
    X : iterable
        Documents handed to ``vectoriser.fit_transform``.
    n_examples : int, default 50
        Maximum number of kept / ignored tokens to print.
    random_state : int or None, default None
        Seed for the sample of ignored tokens. ``None`` draws from the
        module-level ``random`` generator, as before.

    Returns
    -------
    None
        Everything is reported through ``print``. The vectoriser is left
        fitted on ``X`` as a side effect.
    """
    # Time only the fit itself, not the formatting/printing of the result.
    start = time()
    n_columns = vectoriser.fit_transform(X).shape[1]
    elapsed = time() - start
    print(f"There are {n_columns} columns.\n")
    print(f"Took {round(elapsed, 2)} seconds.\n")

    # Inspect tokens (iterating the vocabulary mapping yields its keys).
    tokens = sorted(vectoriser.vocabulary_)
    print(f"Example tokens: {tokens[:n_examples]}\n")

    # Inspect ignored tokens. stop_words_ is a set, and random.sample()
    # rejects sets on Python >= 3.11, so materialise it as a sorted list.
    # Sorting also makes a seeded sample reproducible across runs. The
    # attribute may have been deleted or set to None, hence the fallback.
    ignored = sorted(getattr(vectoriser, 'stop_words_', None) or ())
    if not ignored:
        print("No token is ignored.")
    elif len(ignored) > n_examples:
        rng = random if random_state is None else random.Random(random_state)
        print(f"Example ignored tokens: {rng.sample(ignored, n_examples)}")
    else:
        print(f"Example ignored tokens: {ignored}")

In [7]:
# TF-IDF over runs of the letters a-z with the built-in English stop list.
# Per the scikit-learn docs, an integer min_df is a document count and a
# float max_df is a proportion of documents.
vectoriser = TfidfVectorizer(
    token_pattern=r'[a-z]+',
    stop_words='english',
    min_df=30,
    max_df=.7,
)
inspect(vectoriser, X_train)

There are 338 columns.

Took 0.13 seconds.

Example tokens: ['afraid', 'ain', 'air', 'alive', 'alright', 'anymore', 'apart', 'ask', 'away', 'baby', 'bad', 'beat', 'beautiful', 'bed', 'believe', 'best', 'better', 'big', 'black', 'blind', 'blood', 'blow', 'blue', 'body', 'born', 'boy', 'brain', 'break', 'bring', 'broken', 'brother', 'burn', 'burning', 'came', 'car', 'care', 'caught', 'cause', 'chance', 'change', 'chorus', 'city', 'close', 'cold', 'come', 'comes', 'coming', 'control', 'cool', 'couldn']

Example ignored tokens: ['ges', 'pasturelooks', 'blanks', 'squares', 'spout', 'mindwould', 'thresholdchange', 'mustard', 'affordable', 'casualty', 'plead', 'abberations', 'dime', 'energy', 'elsehello', 'threatearly', 'pathos', 'private', 'perceive', 'undeveloped', 'prouder', 'exito', 'uhh', 'molecule', 'convention', 'ex', 'tasted', 'herebut', 'astrayi', 'hypostatic', 'snowfall', 'peacocks', 'hunting', 'worries', 'werden', 'shiny', 'tonightgonna', 'licking', 'doled', 'coldcan', 'rising', 'c

In [8]:
# Fit the vectoriser on the training lyrics again and keep the resulting
# sparse TF-IDF matrix so it can be displayed below.
X = vectoriser.fit_transform(raw_documents=X_train)
X

<1269x338 sparse matrix of type '<class 'numpy.float64'>'
	with 27811 stored elements in Compressed Sparse Row format>

In [9]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on older releases that lack it.
feature_names = (
    vectoriser.get_feature_names_out()
    if hasattr(vectoriser, 'get_feature_names_out')
    else vectoriser.get_feature_names()
)
# toarray() returns a plain ndarray rather than the legacy np.matrix that
# todense() produces.
pd.DataFrame(X.toarray(), columns=feature_names)

Unnamed: 0,afraid,ain,air,alive,alright,anymore,apart,ask,away,baby,...,work,world,wouldn,wrong,x,ya,yeah,years,young,yr
0,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.263042,0.000000,0.000000,...,0.000000,0.177653,0.0,0.000000,0.0,0.0,0.000,0.000000,0.0,0.0
1,0.0,0.068964,0.045186,0.0,0.04422,0.0,0.0,0.000000,0.055089,0.171971,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.271,0.000000,0.0,0.0
2,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.087184,0.0,0.000000,0.0,0.0,0.000,0.000000,0.0,0.0
3,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000,0.000000,0.0,0.0
4,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.082321,0.000000,...,0.000000,0.000000,0.0,0.314034,0.0,0.0,0.000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1264,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.160742,0.0,0.0,0.000,0.000000,0.0,0.0
1265,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000,0.000000,0.0,0.0
1266,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.079974,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000,0.237552,0.0,0.0
1267,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.074868,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000,0.000000,0.0,0.0
