In [1]:
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
import sklearn as sk
import os
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from numpy.random import uniform
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import unidecode
import pandas as pd
import nltk
import re
from os import listdir
from os.path import isfile, join
from sklearn.metrics import f1_score
from sklearn import svm
import pickle

In [2]:
"""definitions of classifier and other functions, 
each call to the classifier function auto prints various metrics"""

def read_csv(file_name, sep=','):
    """Load a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    file_name : path or file-like object
        Passed unchanged to ``pd.read_csv``.
    sep : str, default ','
        Field delimiter.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` produces for the given input.
    """
    return pd.read_csv(file_name, sep=sep)

def print_f1(Y_pred, Y_test):
    """Print the F1 score of ``Y_pred`` measured against ``Y_test``.

    Note the argument order: predictions come first and the ground truth
    second, which is the reverse of ``f1_score`` itself.
    """
    score = f1_score(y_true=Y_test, y_pred=Y_pred)
    print(score)

def evaluate_model(Y_test,Y_pred):
    """Print the confusion-matrix counts and the F1 score of a prediction.

    Parameters
    ----------
    Y_test : array-like
        Ground-truth labels.
    Y_pred : array-like
        Predicted labels, aligned element-for-element with ``Y_test``.

    Returns
    -------
    None
        Everything is reported through ``print``.

    A prediction counts as positive when it is ``> 0`` and as negative when
    it is ``< 1``; this assumes 0/1 labels -- TODO confirm for other encodings.
    """
    # Coerce to arrays so plain Python lists work too: `list > 0` raises
    # TypeError and `list == list` yields a single bool, not a mask.
    Y_test = np.asarray(Y_test)
    Y_pred = np.asarray(Y_pred)

    agree = Y_pred == Y_test
    predicted_pos = Y_pred > 0
    predicted_neg = Y_pred < 1

    TP = (agree & predicted_pos).sum() #true positives
    TN = (agree & predicted_neg).sum() #true negatives
    FP = (~agree & predicted_pos).sum() #false positives
    FN = (~agree & predicted_neg).sum() #false negatives

    print("TP: ", TP)
    print("TN: ", TN)
    print("FP: ", FP)
    print("FN: ", FN)

    f1 = sk.metrics.f1_score(Y_test, Y_pred)
    print("f1 score is: ", f1)
    

def mnb_simple(X_train, Y_train, X_test, Y_test):
    """Fit a multinomial naive Bayes model and report its test metrics.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels handed to ``MultinomialNB.fit``.
    X_test, Y_test
        Held-out features and labels used for the printed evaluation.

    Returns
    -------
    MultinomialNB
        The fitted classifier.
    """
    # Plain multinomial naive Bayes with scikit-learn's default settings.
    clf = MultinomialNB()
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)

    # evaluate_model prints the confusion counts and the f1 score, so the
    # score is not computed and printed a second time here.
    evaluate_model(Y_test, Y_pred)
    return clf


def predict_from_proba(pred_proba, threshold):
    """Turn per-class probabilities into hard 0/1 labels.

    Parameters
    ----------
    pred_proba : iterable of indexable rows
        Each row's element at index 1 is compared against ``threshold``.
    threshold : float
        A row is labelled 1 when ``row[1] >= threshold``, otherwise 0.

    Returns
    -------
    numpy.ndarray
        One label per input row, in input order.
    """
    labels = [1 if row[1] >= threshold else 0 for row in pred_proba]
    return np.array(labels)

def mismatched_indices(Y_test, Y_pred):
    """Return the positions at which ``Y_test`` and ``Y_pred`` disagree.

    Both arguments are indexed with ``0 .. len(Y_test) - 1``; the result is
    the ascending list of indices ``i`` where ``Y_test[i] != Y_pred[i]``.
    """
    total = len(Y_test)
    return [pos for pos in range(total) if Y_test[pos] != Y_pred[pos]]

In [3]:
all_lyrics_data = []  # not used in this cell; kept so the name stays defined

rap_file = ['rap'] #EDIT THESE LINES ACCORDING TO THE DIRECTORY AND FILE NAMES
nrap_files = ['nrap','nrap2','nrap3'] #EDIT THESE LINES ACCORDING TO THE DIRECTORY AND FILE NAMES
file_root = '../resources/'#EDIT THESE LINES ACCORDING TO THE DIRECTORY AND FILE NAMES
file_format = '.csv' #EDIT THESE LINES ACCORDING TO THE DIRECTORY AND FILE NAMES


def _clean_lyrics(text):
    """Lower-case ``text``, blank out every character other than letters,
    digits, ``-`` and ``'``, then collapse each whitespace run to one space."""
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\-\']',' ',text)
    return re.sub(r'[\s,.?]+',' ',text)


def _load_lyrics(names):
    """Read the ``LYRICS`` column of every ``file_root + name + file_format``
    file, in the given order, and return the cleaned lyrics as one flat list."""
    lyrics = []
    for name in names:
        frame = read_csv(file_root + name + file_format)
        # Store the cleaned string itself. Rebinding a loop variable (as the
        # previous version did) leaves the list holding the raw text.
        lyrics.extend(_clean_lyrics(entry) for entry in frame['LYRICS'].values.tolist())
    return lyrics


#GET ALL LYRICS FROM RAP SONGS rap_lyrics_list, AND CREATE CORRESPONDING LABEL LIST rap_label_list
# Lyrics are accumulated across files, so listing several rap files keeps
# rap_lyrics_list and rap_label_list the same length.
rap_lyrics_list = _load_lyrics(rap_file)
rap_label_list = [1] * len(rap_lyrics_list)

#GET ALL LYRICS FROM NON-RAP SONGS nrap_lyrics_list, AND CREATE CORRESPONDING LABEL LIST nrap_label_list
nrap_lyrics_list = _load_lyrics(nrap_files)
nrap_label_list = [0] * len(nrap_lyrics_list)

# Rap lyrics first, then non-rap lyrics in file order.
all_lyrics_list = rap_lyrics_list + nrap_lyrics_list

# Every lyric has exactly one label.
assert len(rap_lyrics_list) == len(rap_label_list)
assert len(nrap_lyrics_list) == len(nrap_label_list)
    
#ALSO SAVES ALL LYRICS (RAP AND NON-RAP) INTO all_lyrics_list

In [4]:
#MANUALLY FORCING THE TRAINING DATASET TO BE BALANCED
# Keeps every `sample_step`-th non-rap song, starting at offset `x`, and all
# rap songs. NOTE(review): a stride of 30 presumably reflects the non-rap set
# being about 30x larger than the rap set -- confirm against the data.
x=4 #ADJUSTABLE STARTING ELEMENT, X MUST BE SMALLER THAN sample_step
sample_step = 30 #ADJUSTABLE STRIDE THROUGH THE NON-RAP SONGS
assert 0 <= x < sample_step, "x must satisfy 0 <= x < sample_step"

# One slice object for lyrics and labels so the two can never drift apart.
nrap_sample = slice(x, None, sample_step)
nrap_lyrics_list_sampled = nrap_lyrics_list[nrap_sample]
all_lyrics_list_sampled = rap_lyrics_list + nrap_lyrics_list_sampled
label_list_sampled = rap_label_list + nrap_label_list[nrap_sample]

assert len(all_lyrics_list_sampled) == len(label_list_sampled), \
    "lyrics and labels are misaligned"

In [5]:
#TF-IDF VECTORIZER, PARAMETERS ADJUSTABLE
#FITTED ON all_lyrics_list; ONLY all_lyrics_list_sampled IS THEN VECTORIZED
tfidf_dictionary = TfidfVectorizer(
    ngram_range=(1,1),
    max_df=0.6,
    max_features=5000,
    min_df=0.00002,
    token_pattern=r"[a-zA-Z0-9\-\']+",
)
tfidf_dictionary.fit(all_lyrics_list)
lyrics_vectorized = tfidf_dictionary.transform(all_lyrics_list_sampled)

In [6]:
#SPLIT DATASET FOR TRAINING AND TESTING, PARAMETERS ADJUSTABLE
# 25% of the sampled songs are held out; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    lyrics_vectorized,
    label_list_sampled,
    test_size=0.25,
    random_state=42,
)

In [7]:
# Fit a default multinomial naive Bayes model on the training split
# (`fit` returns the estimator itself, so the call can be chained).
mnb_train = MultinomialNB().fit(X_train, Y_train)

# One row per test song, one column per class in the order of
# `mnb_train.classes_`.
mnb_Y_pred_proba = mnb_train.predict_proba(X_test)
print(mnb_Y_pred_proba)

[[0.00220033 0.99779967]
 [0.24172731 0.75827269]
 [0.48412026 0.51587974]
 ...
 [0.59294522 0.40705478]
 [0.90909443 0.09090557]
 [0.14283684 0.85716316]]


In [8]:
#Export classifier as pickled file
"""following lines commented out as the output file already exist in the 'resources' folder"""
#picklefile = '../resources/mnb_classifier.p'
#with open(picklefile,'wb') as f:
    #pickle.dump(mnb_train,f)

In [9]:
#Export tfidf vectorizer as pickled file
"""following lines commented out as the output file already exist in the 'resources' folder"""
#picklefile = '../resources/tfidf_dict.p'
#with open(picklefile,'wb') as f:
    #pickle.dump(tfidf_dictionary,f)

In [10]:
# Reload the pickled TF-IDF vectorizer to confirm the exported file is readable.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by this project.
with open('../resources/tfidf_dict.p', 'rb') as f:  # `with` closes the handle
    tf_load = pickle.load(f)
print(tf_load)

TfidfVectorizer(max_df=0.6, max_features=5000, min_df=2e-05,
                token_pattern="[a-zA-Z0-9\\-\\']+")
