In [134]:
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
import sklearn as sk
import os
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from numpy.random import uniform
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import unidecode
import pandas as pd
import nltk
import re
from os import listdir
from os.path import isfile, join
from keras.models import Sequential
from keras.layers import Dense
from keras import backend as K
from sklearn.metrics import f1_score
from sklearn import svm
import pickle

In [None]:
"""
https://drive.google.com/drive/folders/1qD0tbXwpeD-cU8phT_w9-flJoj_0a-0s?usp=sharing

GO TO THE LINK ABOVE, DOWNLOAD THE 4 CSV FILES AND UPLOAD THEM TO THE DIRECTORY SHOWN ON THE LEFT SIDE:
CLICK THE ICON NAMED 'FILES', THEN CLICK THE UPLOAD ICON

"""

In [70]:
"""definitions of different classifiers and other functions, 
each call to the classifier function auto prints various metrics"""

def read_csv(file_name, sep=','):
    """Load a delimited text file into a pandas DataFrame.

    file_name: path or file-like object, handed straight to pandas.read_csv.
    sep: field delimiter; a comma unless the caller overrides it.
    """
    return pd.read_csv(file_name, sep=sep)

def print_f1(Y_pred, Y_test):
    """Print the F1 score of the predictions Y_pred measured against Y_test."""
    # f1_score takes the ground truth first, the reverse of this function's
    # own parameter order.
    score = f1_score(Y_test, Y_pred)
    print(score)

def evaluate_model(Y_test,Y_pred):
    """Print the confusion-matrix counts (TP, TN, FP, FN) for 0/1 labels.

    Y_test: ground-truth labels (list or array).
    Y_pred: predicted labels (list or array), same length as Y_test.

    A prediction counts as positive when it is > 0 and as negative when it
    is < 1. Nothing is returned; the four counts are printed.
    """
    # Coerce to arrays so the comparisons below are elementwise even when the
    # caller passes plain Python lists (list == list yields a single bool and
    # list > 0 raises TypeError).
    actual = np.asarray(Y_test)
    predicted = np.asarray(Y_pred)

    correct = predicted == actual
    incorrect = predicted != actual
    positive = predicted > 0
    negative = predicted < 1

    TP = (correct & positive).sum() #true positives
    TN = (correct & negative).sum() #true negatives
    FP = (incorrect & positive).sum() #false positives
    FN = (incorrect & negative).sum() #false negatives

    print("TP: ", TP)
    print("TN: ", TN)
    print("FP: ", FP)
    print("FN: ", FN)

def cnb_simple(X_train, Y_train, X_test, Y_test):
    """Fit a Complement Naive Bayes classifier (alpha=1) and report test metrics.

    Prints the mean accuracy, the F1 score and the TP/TN/FP/FN counts for the
    test split, then returns the fitted classifier.
    """
    model = ComplementNB(alpha = 1)
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)

    print("mean acc is: ", model.score(X_test, Y_test))
    print("f1 score is: ", sk.metrics.f1_score(Y_test, predictions))

    evaluate_model(Y_test, predictions)
    return model
    
def gnb_simple(X_train, Y_train, X_test, Y_test):
    """Fit a Gaussian Naive Bayes classifier and report test metrics.

    Prints the F1 score and the TP/TN/FP/FN counts for the test split, then
    returns the fitted classifier.
    """
    model = GaussianNB()  # Gaussian naive Bayes with default settings
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)

    print("f1 score is: ", sk.metrics.f1_score(Y_test, predictions))

    evaluate_model(Y_test, predictions)
    return model


def mismatched_indices(Y_test, Y_pred):
    """Return the positions at which Y_test and Y_pred disagree.

    Positions are visited in order from 0 to len(Y_test) - 1, so the result
    is sorted ascending.
    """
    return [pos for pos in range(len(Y_test)) if Y_test[pos] != Y_pred[pos]]

In [4]:
all_lyrics_data = []

rap_file = ['rap'] #EDIT THESE LINES ACCORDING TO THE DIRECTORY AND FILE NAMES
nrap_files = ['nrap','nrap2','nrap3'] #EDIT THESE LINES ACCORDING TO THE DIRECTORY AND FILE NAMES
file_root = 'Dataset/AZlyrics/filtered/'#EDIT THESE LINES ACCORDING TO THE DIRECTORY AND FILE NAMES (USE '' IF THE FILES ARE IN THE WORKING DIRECTORY)
file_format = '.csv' #EDIT THESE LINES ACCORDING TO THE DIRECTORY AND FILE NAMES


def _clean_lyrics(text):
    """Lower-case one lyric and reduce it to letters, digits, '-' and "'" separated by single spaces."""
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\-\']',' ',text)
    return re.sub(r'[\s,.?]+',' ',text)


def _load_lyrics(names):
    """Read the LYRICS column of every named CSV under file_root and return the cleaned lyrics as one list."""
    lyrics = []
    for name in names:
        data = read_csv(file_root + name + file_format)
        # Store the cleaned text; rebinding a loop variable alone would leave
        # the list holding the raw, uncleaned lyrics.
        lyrics.extend(_clean_lyrics(entry) for entry in data['LYRICS'].values.tolist())
    return lyrics


#GET ALL LYRICS FROM RAP SONGS rap_lyrics_list, AND CREATE CORRESPONDING LABEL LIST rap_label_list
# Both groups are read from the same file_root so a single setting controls
# where every CSV is looked up.
rap_lyrics_list = _load_lyrics(rap_file)
rap_label_list = [1] * len(rap_lyrics_list)

#GET ALL LYRICS FROM NON-RAP SONGS nrap_lyrics_list, AND CREATE CORRESPONDING LABEL LIST nrap_label_list
nrap_lyrics_list = _load_lyrics(nrap_files)
nrap_label_list = [0] * len(nrap_lyrics_list)

#ALSO SAVES ALL LYRICS (RAP AND NON-RAP) INTO all_lyrics_list
all_lyrics_list = rap_lyrics_list + nrap_lyrics_list

In [140]:
#MANUALLY FORCING THE TRAINING DATASET TO BE BALANCED BY KEEPING ONLY EVERY 30TH NON-RAP SONG
x = 4 #ADJUSTABLE STARTING ELEMENT, X MUST BE SMALLER THAN 30
every_30th_from_x = slice(x, None, 30)  # same selection as [x::30], shared by lyrics and labels
nrap_lyrics_list_sampled = nrap_lyrics_list[every_30th_from_x]
label_list_sampled = rap_label_list + nrap_label_list[every_30th_from_x]
all_lyrics_list_sampled = rap_lyrics_list + nrap_lyrics_list_sampled

In [141]:
#TF-IDF VECTORIZER, PARAMETERS ADJUSTABLE, FITTED ON all_lyrics_list (RAP AND NON-RAP),
#THEN USED TO VECTORIZE THE BALANCED SAMPLE all_lyrics_list_sampled
tfidf_dictionary = TfidfVectorizer(
    ngram_range=(1,1),
    max_df=0.8,
    max_features=10000,
    min_df=0.00002,
    token_pattern=r"[a-zA-Z0-9\-\']+",
).fit(all_lyrics_list)
lyrics_vectorized = tfidf_dictionary.transform(all_lyrics_list_sampled)

In [142]:
#SPLIT DATASET FOR TRAINING AND TESTING, PARAMETERS ADJUSTABLE
#25% OF THE SAMPLED DATA IS HELD OUT FOR TESTING; random_state FIXES THE SHUFFLE
X_train, X_test, Y_train, Y_test = train_test_split(
    lyrics_vectorized,
    label_list_sampled,
    test_size=0.25,
    random_state=42,
)

In [143]:
# Train and evaluate Gaussian Naive Bayes on the train/test split above.
# The vectorized matrices are converted to dense arrays with .toarray() before
# being passed in; gnb_simple prints the metrics and returns the classifier.
gnb_simple(X_train.toarray(), Y_train, X_test.toarray(), Y_test)

f1 score is:  0.69097605893186
TP:  938
TN:  292
FP:  742
FN:  97


GaussianNB()

In [97]:
# Alternative paths when the generated lists are kept inside the dataset folder:
# test_file_list = ['Dataset/Generated/mc_list.p',
#                  'Dataset/Generated/onelyr_list.p',
#                  'Dataset/Generated/random_list.p',
#                  'Dataset/Generated/twolyr_list.p']

test_file_list = ['mc_list.p',
                 'onelyr_list.p',
                 'random_list.p',
                 'twolyr_list.p']

# Change the above list accordingly, each file should contain a list of songs/documents (a list of lists)

collated_list = []

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files that you generated yourself.
for pickle_path in test_file_list:
    # The context manager closes the file even if unpickling raises.
    with open(pickle_path, "rb") as pickle_file:
        # Keep reading pickled objects until the end of the file is reached.
        while True:
            try:
                collated_list.append(pickle.load(pickle_file))
            except EOFError:
                break

# The indices below assume one pickled object per file, in test_file_list order.
mc = collated_list[0]
onelyr = collated_list[1]
random = collated_list[2]
twolyr = collated_list[3]

In [147]:
#prepares the test set and labels

# Change this line to one of the names of the test lists: mc/onelyr...
# (The note is a comment because a string literal placed directly after the
# assignment is a syntax error.)
test_list = twolyr

# Every document in the chosen test list gets the label 1.
test_labels = [1 for _ in test_list]

# NOTE: X_test / Y_test are rebound here, replacing the held-out split made by
# train_test_split with the chosen test list.
test_vectorized = tfidf_dictionary.transform(test_list)
X_test = test_vectorized
Y_test = test_labels

In [None]:
# Retrain Gaussian Naive Bayes on the training split and score it on the
# currently selected test set (X_test / Y_test).
gnb_clf = gnb_simple(X_train.toarray(), Y_train, X_test.toarray(), Y_test)
gnb_Y_pred = gnb_clf.predict(X_test.toarray())

#prints out the lines which the classifier deems not rap
wrong = mismatched_indices(Y_test, gnb_Y_pred)
for ind in wrong:
    print(ind)
    # Index into the list that was actually evaluated; a hard-coded list would
    # show unrelated documents whenever a different test list is selected.
    print(test_list[ind])