In [1]:
seed = 666

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

from time import time

import re, random

from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

import spacy
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])

In [2]:
corpus_df = pd.read_csv('corpus_data.csv').drop(columns=['Unnamed: 0'])
corpus_df.head()
# corpus_df.shape # (2146, 3)

Unnamed: 0,lyrics,artist,genre
0,I know a girl who thinks it goes she'll make y...,flaming lips,indie
1,"All those bugs buzzin' round your head Well, t...",flaming lips,indie
2,"Something in you, it jitters like a moth And I...",flaming lips,indie
3,Their wasn't any snow on Christmas eve and I k...,flaming lips,indie
4,"You can walk among us, but you can't walk on b...",flaming lips,indie


In [3]:
# corpus_df['genre'].value_counts()
# punk     1097
# indie    1049

In [4]:
def lemma_w_sp(df):
    
    """Function that takes in a DF column ['lyrics'];
       returns lemmatized version of that column with spacy"""
    
    df = df.copy()
    
    df['lyrics'] = df['lyrics'].apply(lambda row: " ".join([w.lemma_ for w in nlp(row)]))
    
    return df

In [5]:
corpus_df = lemma_w_sp(corpus_df)
corpus_df.head()

Unnamed: 0,lyrics,artist,genre
0,I know a girl who think it go -PRON- will make...,flaming lips,indie
1,"All that bug buzzin ' round your head Well , t...",flaming lips,indie
2,"Something in you , it jitter like a moth And I...",flaming lips,indie
3,Their be not any snow on Christmas eve and I k...,flaming lips,indie
4,"You can walk among us , but you can not walk o...",flaming lips,indie


In [6]:
X_train, X_test, y_train, y_test = train_test_split(corpus_df['lyrics'], corpus_df['genre'], test_size=.25, random_state=seed,
                                                    stratify=corpus_df['genre'])
# train = pd.concat([X_train, y_train], axis=1)
# test = pd.concat([X_test, y_test], axis=1)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1609,), (537,), (1609,), (537,))

In [7]:
# train['genre'].value_counts()
# punk     822
# indie    787

# test['genre'].value_counts()
# punk     275
# indie    262

In [8]:
vectoriser = TfidfVectorizer(token_pattern=r'[a-z]+', stop_words='english', min_df=10, max_df=.97)
#inspect(vectoriser, X_train)

In [9]:
X_train_pr = vectoriser.fit_transform(X_train)
X_train_pr

<1609x1124 sparse matrix of type '<class 'numpy.float64'>'
	with 47874 stored elements in Compressed Sparse Row format>

In [10]:
pd.DataFrame(X_train_pr.todense(), columns=vectoriser.get_feature_names())

Unnamed: 0,able,abuse,accept,act,action,admit,afraid,age,ago,ah,...,yea,yeah,year,yes,yesterday,york,youi,young,youth,yr
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.190876,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.245337,0.0,0.232691,0.0,0.0,0.0,0.0,0.0,0.0
1605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
m = RandomForestClassifier(max_depth=5, random_state=seed, n_jobs=-1)
m.fit(X_train_pr, y_train)
m.score(X_train_pr, y_train)

0.8321939092604101

In [12]:
X_test_pr = vectoriser.transform(X_test)
pd.DataFrame(X_test_pr.todense(), columns=vectoriser.get_feature_names())

Unnamed: 0,able,abuse,accept,act,action,admit,afraid,age,ago,ah,...,yea,yeah,year,yes,yesterday,york,youi,young,youth,yr
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.254946,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.068621,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.232066,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
533,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
534,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.189403,0.0,0.0,0.0,0.0,0.0,0.0
535,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
m.score(X_test_pr, y_test)

0.7057728119180633

In [14]:
ypred_rf = m.predict(X_test_pr)
# print(f'The accuracy of the model is: {round(accuracy_score(y_test, ypred_rf), 3)}')
print(f'The precision using punk is: {round(precision_score(y_test, ypred_rf, average="binary", pos_label="punk"), 3)}')
print(f'The precision using indie is: {round(precision_score(y_test, ypred_rf, average="binary", pos_label="indie"), 3)}')
print(f'The recall of using punk is: {round(recall_score(y_test, ypred_rf, average="binary", pos_label="punk"), 3)}')
print(f'The recall of using indie is: {round(recall_score(y_test, ypred_rf, average="binary", pos_label="indie"), 3)}')
print(f'The f1-score using punk is: {round(f1_score(y_test, ypred_rf, average="binary", pos_label="punk"), 3)}')
print(f'The f1-score using indie is: {round(f1_score(y_test, ypred_rf, average="binary", pos_label="indie"), 3)}')

The precision using punk is: 0.679
The precision using indie is: 0.748
The recall of using punk is: 0.807
The recall of using indie is: 0.599
The f1-score using punk is: 0.738
The f1-score using indie is: 0.665


In [15]:
text_pipeline = make_pipeline(#FunctionTransformer(lemma_w_sp),
                              TfidfVectorizer(token_pattern=r'[a-z]+', stop_words='english', min_df=10, max_df=.97),
                              RandomForestClassifier(max_depth=5, random_state=seed, n_jobs=-1))
text_pipeline.fit(X_train, y_train)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_df=0.97, min_df=10, stop_words='english',
                                 token_pattern='[a-z]+')),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=5, n_jobs=-1,
                                        random_state=666))])

In [16]:
text_pipeline.predict_proba(['Sometimes I give myself the creeps'])

array([[0.52417245, 0.47582755]])