In [1]:
# Shared random seed; passed as `random_state` to the random forest classifiers below.
seed = 666

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.naive_bayes import MultinomialNB

from time import time

import re, random

from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

import spacy
# Load the small English spaCy model with the tagger, parser and NER components disabled.
# NOTE(review): in spaCy v3 the default English lemmatizer is rule-based and expects
# POS annotation from the tagger/attribute_ruler; with 'tagger' disabled the lemmas
# may degrade to little more than lowercased tokens — confirm against the installed version.
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])

In [2]:
# Read the lyrics corpus, then discard the 'Unnamed: 0' column
# (presumably an index column left over from an earlier to_csv — verify).
corpus_df = pd.read_csv('corpus_data.csv')
corpus_df = corpus_df.drop(columns=['Unnamed: 0'])

In [3]:
def lemma_w_sp(df, column='lyrics'):
    """Return a copy of ``df`` whose text column has been lemmatized with spaCy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column. It is copied, never modified in place.
    column : str, default 'lyrics'
        Name of the column holding the text to lemmatize.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each entry of ``column`` is replaced by the
        lemmas of its tokens joined with single spaces.
    """
    lemmatized = df.copy()

    # `nlp` is the module-level spaCy pipeline; each document is tokenized and
    # every token contributes its `lemma_` string.
    lemmatized[column] = lemmatized[column].apply(
        lambda text: " ".join(token.lemma_ for token in nlp(text))
    )

    return lemmatized

In [4]:
# Replace the raw lyrics with their lemmatized form.
# NOTE: this rebinds `corpus_df`, so re-running the cell lemmatizes text that
# has already been lemmatized; reload the CSV first if a clean re-run is needed.
corpus_df = lemma_w_sp(corpus_df)

In [5]:
# Documents for the vectoriser: one lemmatized lyric string per row, as a plain list.
CORPUS = corpus_df['lyrics'].tolist()

In [6]:
# Target labels aligned row-for-row with CORPUS, as a plain list.
LABELS = corpus_df['genre'].tolist()

In [7]:
# TF-IDF features over purely alphabetic tokens, with English stop words removed.
# Terms seen in fewer than 10 documents or in more than 97% of documents are dropped.
vectoriser = TfidfVectorizer(
    token_pattern=r'[a-z]+',
    stop_words='english',
    min_df=10,
    max_df=.97,
)
X = vectoriser.fit_transform(CORPUS)

In [8]:
# Shallow random forest fitted on the full TF-IDF matrix.
m = RandomForestClassifier(
    max_depth=5,
    random_state=seed,
    n_jobs=-1,
).fit(X, LABELS)

# NOTE: scored on the same data the model was fitted on, so this is training
# accuracy rather than a held-out estimate.
m.score(X, LABELS)

0.8196644920782852

In [9]:
# Predictions on the data the forest was fitted on (in-sample).
XRF_pred = m.predict(X)

# Report each metric twice, treating each genre in turn as the positive class.
# The phrases reproduce the wording of the printed report exactly.
metric_table = (
    ('precision using', precision_score),
    ('recall of using', recall_score),
    ('f1-score using', f1_score),
)
for phrase, scorer in metric_table:
    for positive_genre in ('punk', 'indie'):
        value = scorer(LABELS, XRF_pred, average="binary", pos_label=positive_genre)
        print(f'The {phrase} {positive_genre} is: {round(value, 3)}')

The precision using punk is: 0.808
The precision using indie is: 0.834
The recall of using punk is: 0.85
The recall of using indie is: 0.788
The f1-score using punk is: 0.828
The f1-score using indie is: 0.81


In [10]:
# End-to-end text classifier: TF-IDF features feeding a shallow random forest.
# A FunctionTransformer(lemma_w_sp) step is intentionally not part of the pipeline.
# NOTE(review): CORPUS was lemmatized before fitting, while text passed to
# predict() is not — confirm this train/predict mismatch is acceptable.
tfidf_step = TfidfVectorizer(
    token_pattern=r'[a-z]+',
    stop_words='english',
    min_df=10,
    max_df=.97,
)
forest_step = RandomForestClassifier(max_depth=5, random_state=seed, n_jobs=-1)

text_pipeline = make_pipeline(tfidf_step, forest_step)
text_pipeline.fit(CORPUS, LABELS)

# Predicted genre for a single sample lyric.
text_pipeline.predict(['Sometimes I give myself the creeps'])[0]

'indie'

In [11]:
# Confidence of that prediction: the largest class probability for the sample lyric.
sample_probs = text_pipeline.predict_proba(['Sometimes I give myself the creeps'])
sample_probs.max()

0.5265078156019136

In [12]:
def predict_genre(pipeline, new_text):
    """Predict the genre of new lyrics with an already-fitted pipeline.

    Parameters
    ----------
    pipeline : object
        Fitted estimator exposing ``predict`` and ``predict_proba`` that accepts
        a list of raw text documents (e.g. the fitted ``text_pipeline``).
    new_text : str or iterable of str
        The lyrics to classify. A single string is treated as one document.

    Returns
    -------
    tuple
        ``(label, probability)`` for the first document, where ``probability``
        is the largest class probability the pipeline assigns to it.

    Raises
    ------
    ValueError
        If ``new_text`` contains no documents.
    """
    # The estimator expects a sequence of documents, so wrap a bare string
    # instead of discarding it (passing an empty list yields a zero-sample error).
    samples = [new_text] if isinstance(new_text, str) else list(new_text)
    if not samples:
        raise ValueError('new_text must contain at least one document')

    # Use the pipeline that was passed in; it is assumed to be fitted already,
    # so nothing is re-trained per call.
    label = pipeline.predict(samples)[0]
    probability = pipeline.predict_proba(samples)[0].max()
    return label, probability

In [13]:
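# Call left disabled: the ValueError recorded below came from a draft of predict_genre
# that reassigned `new_text` to an empty list, so the pipeline was handed zero samples.
# predict_genre(text_pipeline, 'Sometimes I give myself the creeps')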

ValueError: Found array with 0 sample(s) (shape=(0, 1427)) while a minimum of 1 is required.