In [2]:
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
import sklearn as sk
import os
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from numpy.random import uniform
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import unidecode
import pandas as pd
import nltk
import re
from os import listdir
from os.path import isfile, join
from keras.models import Sequential
from keras.layers import Dense
from keras import backend as K
from sklearn.metrics import f1_score
from sklearn import svm

def read_csv(file_name, sep=','):
    """Load a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    file_name : path or file-like object
        Source handed straight to ``pd.read_csv``.
    sep : str, default ','
        Field delimiter.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(file_name, sep=sep)

In [3]:
"""definitions of different classifiers, each call to the classifier function auto prints various metrics"""
def evaluate_model(Y_test,Y_pred):
    """Print the confusion-matrix counts of a binary prediction.

    A prediction counts as positive when it is greater than 0 and as
    negative when it is smaller than 1; it is "true" when it equals the
    corresponding ground-truth label.

    Parameters
    ----------
    Y_test : array-like
        Ground-truth labels.
    Y_pred : array-like
        Predicted labels, one per entry of ``Y_test``.

    Raises
    ------
    ValueError
        If the two inputs do not hold the same number of labels.
    """
    # Plain lists support neither `> 0` nor an element-wise `==`, and an
    # (n, 1) prediction column would broadcast against 1-D labels into an
    # (n, n) matrix, so coerce both inputs to flat arrays first.
    y_true = np.asarray(Y_test).ravel()
    y_hat = np.asarray(Y_pred).ravel()
    if y_true.shape != y_hat.shape:
        raise ValueError(
            "Y_test and Y_pred must have the same number of labels, got "
            f"{y_true.size} and {y_hat.size}"
        )

    correct = y_hat == y_true
    predicted_pos = y_hat > 0
    predicted_neg = y_hat < 1

    TP = int((correct & predicted_pos).sum()) #true positives
    TN = int((correct & predicted_neg).sum()) #true negatives
    FP = int((~correct & predicted_pos).sum()) #false positives
    FN = int((~correct & predicted_neg).sum()) #false negatives

    print("TP: ", TP)
    print("TN: ", TN)
    print("FP: ", FP)
    print("FN: ", FN)

def cnb_simple(X_train, Y_train, X_test, Y_test):
    """Fit a Complement Naive Bayes classifier and print its test metrics.

    Prints the mean accuracy and the F1 score on the test split, followed
    by the TP/TN/FP/FN counts from ``evaluate_model``.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels.
    X_test, Y_test
        Held-out features and labels used for the printed metrics.
    """
    clf = ComplementNB(alpha = 1)

    Y_pred = clf.fit(X_train, Y_train).predict(X_test) #fitting and predicting
    # clf.score(X_test, Y_test) would run predict(X_test) a second time;
    # the accuracy of the predictions we already hold is the same number.
    print("mean acc is: ", sk.metrics.accuracy_score(Y_test, Y_pred))
    f1 = sk.metrics.f1_score(Y_test, Y_pred)
    print("f1 score is: ", f1)

    evaluate_model(Y_test,Y_pred)

    
def lgr_simple(X_train, Y_train, X_test, Y_test):
    """Fit a logistic-regression classifier and print its test metrics.

    Prints the mean accuracy and the F1 score on the test split, followed
    by the TP/TN/FP/FN counts from ``evaluate_model``.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels.
    X_test, Y_test
        Held-out features and labels used for the printed metrics.
    """
    clf = LogisticRegression(max_iter=100)

    Y_pred = clf.fit(X_train, Y_train).predict(X_test) #fitting and predicting
    # clf.score(X_test, Y_test) would run predict(X_test) a second time;
    # the accuracy of the predictions we already hold is the same number.
    print("mean acc is: ", sk.metrics.accuracy_score(Y_test, Y_pred))
    f1 = sk.metrics.f1_score(Y_test, Y_pred)
    print("f1 score is: ", f1)

    evaluate_model(Y_test,Y_pred)
    

def svm_simple(X_train, Y_train, X_test, Y_test):
    """Fit a support-vector classifier (C=10) and print its test metrics.

    Prints the mean accuracy and the F1 score on the test split, followed
    by the TP/TN/FP/FN counts from ``evaluate_model``.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels.
    X_test, Y_test
        Held-out features and labels used for the printed metrics.
    """
    clf = svm.SVC(C=10)
    Y_pred = clf.fit(X_train, Y_train).predict(X_test) #fitting and predicting
    # clf.score(X_test, Y_test) would run predict(X_test) a second time;
    # the accuracy of the predictions we already hold is the same number.
    print("mean acc is: ", sk.metrics.accuracy_score(Y_test, Y_pred))
    f1 = sk.metrics.f1_score(Y_test, Y_pred)
    print("f1 score is: ", f1)

    evaluate_model(Y_test,Y_pred)

In [None]:
all_lyrics_data = []

rap_file = ['rap'] #EDIT THESE LINES ACCORDING TO THE DIRECTORY AND FILE NAMES
nrap_files = ['nrap','nrap2','nrap3'] #EDIT THESE LINES ACCORDING TO THE DIRECTORY AND FILE NAMES
file_root = 'Dataset/AZlyrics/filtered/'#EDIT THESE LINES ACCORDING TO THE DIRECTORY AND FILE NAMES
file_format = '.csv' #EDIT THESE LINES ACCORDING TO THE DIRECTORY AND FILE NAMES


def clean_lyrics(text):
    """Lower-case `text` and normalise it for tokenisation.

    Every character other than ASCII letters, digits, '-' and "'" is
    replaced by a space, then runs of whitespace / ',' / '.' / '?' are
    collapsed into a single space.
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\-\']',' ',text)
    return re.sub(r'[\s,.?]+',' ',text)


def load_lyrics(names, root, extension):
    """Return the cleaned LYRICS entries of every listed file, in file order.

    Each file is read from ``root + name + extension``. Rows whose LYRICS
    value is missing are dropped, because they cannot be cleaned.
    """
    lyrics = []
    for name in names:
        data = read_csv(root + name + extension)
        raw_entries = data['LYRICS'].dropna().values.tolist()
        # Keep the cleaned text: re-assigning the loop variable alone would
        # throw the cleaned string away and leave the raw lyrics in the list.
        lyrics.extend(clean_lyrics(entry) for entry in raw_entries)
    return lyrics


#GET ALL LYRICS FROM RAP SONGS rap_lyrics_list, AND CREATE CORRESPONDING LABEL LIST rap_label_list
rap_lyrics_list = load_lyrics(rap_file, file_root, file_format)
rap_label_list = [1] * len(rap_lyrics_list)

#GET ALL LYRICS FROM NON-RAP SONGS nrap_lyrics_list, AND CREATE CORRESPONDING LABEL LIST nrap_label_list
nrap_lyrics_list = load_lyrics(nrap_files, file_root, file_format)
nrap_label_list = [0] * len(nrap_lyrics_list)

#ALSO SAVES ALL LYRICS (RAP AND NON-RAP) INTO all_lyrics_list
# Built once from the two lists above, so lyrics and labels always have
# matching lengths and order (rap first, then non-rap).
all_lyrics_list = rap_lyrics_list + nrap_lyrics_list

In [None]:
# Manually force the dataset towards balance by down-sampling the non-rap
# songs: keep every 30th entry, starting at offset x.
x = 0  # adjustable starting element; x must be smaller than 30
every_30th = slice(x, None, 30)

nrap_lyrics_list_sampled = nrap_lyrics_list[every_30th]
label_list_sampled = rap_label_list + nrap_label_list[every_30th]
all_lyrics_list_sampled = rap_lyrics_list + nrap_lyrics_list_sampled

In [None]:
# TF-IDF vectorizer (parameters adjustable). The vocabulary and IDF weights
# are fitted on the rap lyrics only; the fitted vectorizer is then applied to
# the sampled rap + non-rap lyrics.
tfidf_dictionary = TfidfVectorizer(
    ngram_range=(1,1),
    max_df=0.7,
    min_df=0.0005,
    token_pattern=r"[a-zA-Z0-9\-\']+",
)
tfidf_dictionary.fit(rap_lyrics_list)
lyrics_vectorized = tfidf_dictionary.transform(all_lyrics_list_sampled)

In [None]:
# Split the vectorized lyrics and their labels into training and test sets
# (parameters adjustable): 25% is held out, with a fixed seed for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    lyrics_vectorized,
    label_list_sampled,
    test_size=0.25,
    random_state=42,
)

In [None]:
def gnb_simple(X_train, Y_train, X_test, Y_test):
    """Fit a Gaussian Naive Bayes classifier and print its test metrics.

    Prints the mean accuracy and the F1 score on the test split, followed
    by the TP/TN/FP/FN counts from ``evaluate_model``.

    Parameters
    ----------
    X_train, Y_train
        Training features (dense) and labels.
    X_test, Y_test
        Held-out features (dense) and labels used for the printed metrics.
    """
    clf = GaussianNB()

    Y_pred = clf.fit(X_train, Y_train).predict(X_test) #fitting and predicting
    print("mean acc is: ", sk.metrics.accuracy_score(Y_test, Y_pred))
    f1 = sk.metrics.f1_score(Y_test, Y_pred)
    print("f1 score is: ", f1)

    evaluate_model(Y_test,Y_pred)


# The TF-IDF matrices are sparse; they are densified here before being
# handed to the Gaussian NB helper.
gnb_simple(X_train.toarray(), Y_train, X_test.toarray(), Y_test)

In [None]:
# Train the SVM on the training split and print its metrics on the test split.
svm_simple(X_train=X_train, Y_train=Y_train, X_test=X_test, Y_test=Y_test)

In [None]:
# Train logistic regression on the training split and print its metrics on the test split.
lgr_simple(X_train=X_train, Y_train=Y_train, X_test=X_test, Y_test=Y_test)

In [None]:
"""start of keras, execute this only if gpu present locally"""
# TF1-style session configured with log_device_placement=True.
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))
# List the GPUs TensorFlow can see through its public API. The previous call,
# K.tensorflow_backend._get_available_gpus(), is a private Keras helper that
# does not exist when `keras` is backed by tf.keras.
# NOTE(review): tf.config.list_physical_devices needs TF >= 2.1; on older
# releases use tf.config.experimental.list_physical_devices instead.
tf.config.list_physical_devices('GPU')

In [None]:
#TRAINING NN
# Small fully connected binary classifier: 30 -> 8 -> 1 units, with a sigmoid
# output trained on binary cross-entropy.
model = Sequential()
model.add(Dense(30, input_dim=lyrics_vectorized.shape[1], activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Y_train comes out of train_test_split as a plain Python list; Keras expects
# the targets as an array (a list is not treated as one target array), so
# convert it without touching Y_train itself.
# NOTE(review): X_train is still the sparse TF-IDF matrix - confirm the
# installed Keras accepts scipy sparse input, otherwise pass X_train.toarray().
model.fit(X_train, np.asarray(Y_train), epochs=100, batch_size=32,verbose=2)

In [None]:
#PREDICTING AND EVALUATING
y_pred_prob = model.predict(X_test)
# Threshold the sigmoid output at 0.5 to get class labels. This is what
# Sequential.predict_classes returned for a single sigmoid unit, but it works
# on Keras versions where that method has been removed and reuses the
# probabilities instead of running inference a second time.
y_pred_classes = (y_pred_prob > 0.5).astype("int32")
evaluate_model(Y_test, y_pred_classes[:,0])
# print_f1 is not defined anywhere in this notebook; report the F1 score with
# the imported f1_score, using the same message as the other classifiers.
print("f1 score is: ", f1_score(Y_test, y_pred_classes[:,0]))