Imports

In [None]:
# Fix the seed (23) of Python's `random`, NumPy's global RNG and TensorFlow's
# global RNG right after each library is imported, before any other work runs.
import random
random.seed(23)
from numpy.random import seed
seed(23)
import tensorflow as tf
tf.random.set_seed(23)

from numpy import array
from string import punctuation
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

Input/Output functions

In [None]:
def readData(filePath):
    """Load the CSV file at ``filePath`` and return it as a pandas DataFrame."""
    return pd.read_csv(filePath)

def writePrediction(results):
    """Write every item of ``results`` on its own line of ``upload_predictions.txt``.

    Each item is converted with ``str``. The file is opened in write mode with
    UTF-8 encoding, relative to the current working directory, so any existing
    file of that name is overwritten.
    """
    with open('upload_predictions.txt', 'w', encoding = 'utf-8') as outputFile:
        outputFile.writelines(str(item) + '\n' for item in results)

In [None]:
def createTokens(textLyricsList):
    """Split each lyrics string on whitespace.

    Returns a list with one token list per input string, in input order.
    """
    return [lyricsText.split() for lyricsText in textLyricsList]

def cleanTokens(textTokensList):
    """Remove ``string.punctuation`` characters from every token.

    Tokens that are empty (or whitespace only) after the removal are dropped.

    Args:
        textTokensList: iterable of token lists, one per text.

    Returns:
        A new list of token lists, in the same order as the input.
    """
    # The table depends only on the constant `punctuation`, so build it once
    # instead of once per text.
    translationTable = str.maketrans('', '', punctuation)

    cleanTextTokensList = []
    for textTokens in textTokensList:
        strippedTokens = (token.translate(translationTable) for token in textTokens)
        cleanTextTokensList.append(
            [token for token in strippedTokens if token.strip() != '']
        )
    return cleanTextTokensList

def filterVacabularyTokens(textTokensList, vocabularySet):
    """Keep only the tokens that are members of ``vocabularySet``.

    Returns one filtered token list per input token list; token order and
    duplicates are preserved.
    """
    return [
        [word for word in tokens if word in vocabularySet]
        for tokens in textTokensList
    ]

def getVocabulary(textTokensList):
    """Return the set of distinct tokens found across all token lists."""
    return {word for tokens in textTokensList for word in tokens}

def getEncodedData(textTokensList, vocabularySet, mode):
    """Encode tokenised texts as a matrix with one row per text.

    A fresh Keras ``Tokenizer`` is fitted on the vocabulary entries and then
    used to convert every token list with ``texts_to_matrix``.

    Args:
        textTokensList: list of token lists, one per text.
        vocabularySet: collection of string tokens the tokenizer is fitted on.
        mode: passed straight through to ``Tokenizer.texts_to_matrix``
            (this notebook uses ``'tfidf'``).

    Returns:
        The matrix produced by ``texts_to_matrix``.

    NOTE(review): the tokenizer is fitted on the vocabulary entries rather
    than on the documents, so any document statistics it uses for
    ``mode='tfidf'`` presumably come from the vocabulary, not the corpus.
    Confirm this is intended.
    """
    tokenizer = Tokenizer()
    # Iterating a set of strings can give a different order on every
    # interpreter run (hash randomisation). The tokenizer builds its word
    # index from what it is fitted on, so fit on a sorted list to keep the
    # feature column layout identical between runs.
    tokenizer.fit_on_texts(sorted(vocabularySet))
    return tokenizer.texts_to_matrix(textTokensList, mode=mode)

Main Code

In [None]:
# Main Code Start

# Load the lyrics dataset; later cells read its 'Class' and 'Lyrics' columns.
df = readData('cleaned_lyrics.csv')

In [None]:
# Per-class count of non-null values in every other column (class balance check).
df.groupby('Class').count()

Unnamed: 0_level_0,Artist,Song,Lyrics
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Funny,200,200,200
Happy,126,126,126
Motivational,103,103,103
Peaceful,145,145,145
Sad,280,280,280
hate,199,199,199
romantic,230,230,230


In [None]:
def getModelData(textTokensList, y, vocabularySet, encoding):
    """Return ``(X, y)`` ready for a model.

    ``X`` is the output of ``getEncodedData`` for the given token lists,
    vocabulary and encoding mode; ``y`` is the labels converted with
    ``numpy.array``.
    """
    featureMatrix = getEncodedData(textTokensList, vocabularySet, encoding)
    return featureMatrix, array(y)

def getModelDataTrainTest(trainTextTokensList, testTextTokensList, vocabularySet, encoding,
                          trainLabels=None, testLabels=None):
    """Encode train and test token lists and return them with their labels.

    Args:
        trainTextTokensList: token lists of the training texts.
        testTextTokensList: token lists of the test texts.
        vocabularySet: vocabulary passed to ``getEncodedData``.
        encoding: encoding mode passed to ``getEncodedData``.
        trainLabels: optional training labels. When omitted, the notebook
            global ``train_y`` is used (the original behaviour).
        testLabels: optional test labels. When omitted, the notebook global
            ``test_y`` is used (the original behaviour).

    Returns:
        ``(Xtrain, Xtest, ytrain, ytest)`` with the labels as NumPy arrays.

    Raises:
        NameError: if a label argument is omitted and the matching global
            has not been defined yet.
    """
    # Prefer explicit arguments; fall back to the globals only for callers
    # that still rely on the old hidden-state behaviour.
    if trainLabels is None:
        trainLabels = train_y
    if testLabels is None:
        testLabels = test_y

    Xtrain = getEncodedData(trainTextTokensList, vocabularySet, encoding)
    Xtest = getEncodedData(testTextTokensList, vocabularySet, encoding)

    ytrain = array(trainLabels)
    ytest = array(testLabels)

    return Xtrain, Xtest, ytrain, ytest


def getTrainedModel(Xtrain, ytrain):
    """Build, compile and fit the dense binary classifier; return the model.

    Architecture: a 100-unit ReLU layer sized to the number of columns of
    ``Xtrain``, then five consecutive (175-unit, 150-unit) ReLU pairs, then a
    single sigmoid output unit. Trained for 10 epochs with Adam and binary
    cross-entropy, tracking accuracy.
    """
    featureCount = Xtrain.shape[1]

    network = Sequential()
    network.add(Dense(100, input_shape=(featureCount,), activation='relu'))
    # Five alternating 175/150 pairs, added in the same order as before.
    for _ in range(5):
        network.add(Dense(175, activation='relu'))
        network.add(Dense(150, activation='relu'))
    network.add(Dense(1, activation='sigmoid'))

    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    network.fit(Xtrain, ytrain, epochs=10, verbose=2)

    return network

In [None]:
def get1vAllReplaceMap(mood):
    """Return a one-vs-all label map: ``mood`` -> 1, every other known mood -> 0.

    The known moods are the seven class names hard-coded below. ``mood`` is
    always the first key of the returned dict.
    """
    knownMoods = ('Happy', 'Sad', 'romantic', 'hate', 'Peaceful', 'Motivational', 'Funny')
    oneVsAllMap = {mood: 1}
    oneVsAllMap.update((other, 0) for other in knownMoods if other != mood)
    return oneVsAllMap

In [None]:
def evaluateModels(train_X_data, test_X_data, train_y_data, test_y_data, vocabularySet):
    """Train, evaluate and save one "mood vs. not mood" classifier per mood.

    For each of the seven hard-coded moods the labels are binarised with
    ``get1vAllReplaceMap``, a model is trained with ``getTrainedModel`` on the
    TF-IDF encoded training lyrics, and its accuracy is measured on both the
    test and the training data. Each model's architecture is written to
    ``"<mood> vs Not <mood>.json"`` and its weights to
    ``"<mood> vs Not <mood>.h5"`` in the current working directory.

    Args:
        train_X_data: training features; 'Lyrics' must be among the first
            three columns.
        test_X_data: test features with the same layout.
        train_y_data: training mood labels (used as the last column after
            concatenation with the features).
        test_y_data: test mood labels.
        vocabularySet: vocabulary used for encoding and for filtering the
            test tokens.

    Returns:
        ``(moods, models, train_accuracies, test_accuracies, results)`` where
        ``results`` holds ``[pair name, train accuracy, test accuracy]`` rows.

    Raises:
        ValueError: if a label is not one of the seven known moods, because
            it cannot be binarised.
    """
    moods = ['Happy', 'Sad', 'romantic', 'hate', 'Peaceful', 'Motivational', 'Funny']

    models = []
    train_accuracies = []
    test_accuracies = []
    results = []

    train_df = pd.concat((train_X_data, train_y_data), axis=1)
    test_df = pd.concat((test_X_data, test_y_data), axis=1)

    # The lyrics and their encoding are the same for every mood, so tokenise
    # and encode them once instead of once per mood.
    train_lyrics_list = train_df.iloc[:, :3]['Lyrics'].tolist()
    trainTextTokensList = cleanTokens(createTokens(train_lyrics_list))

    test_lyrics_list = test_df.iloc[:, :3]['Lyrics'].tolist()
    testTextTokensList = cleanTokens(createTokens(test_lyrics_list))
    testTextTokensList = filterVacabularyTokens(testTextTokensList, vocabularySet)

    Xtrain = getEncodedData(trainTextTokensList, vocabularySet, 'tfidf')
    Xtest = getEncodedData(testTextTokensList, vocabularySet, 'tfidf')

    train_labels = train_df.iloc[:, -1]
    test_labels = test_df.iloc[:, -1]

    for mood in moods:

        replaceMap = get1vAllReplaceMap(mood)

        # Binarise the label column only. Mapping the whole frame would also
        # rewrite feature cells that happen to equal a mood name (e.g. a song
        # titled "Happy"). The explicit cast gives integer labels regardless
        # of how the pandas version infers the mapped dtype.
        ytrain = train_labels.map(replaceMap).astype('int64').values
        ytest = test_labels.map(replaceMap).astype('int64').values

        model = getTrainedModel(Xtrain, ytrain)
        models.append(model)

        loss, test_acc = model.evaluate(Xtest, ytest, verbose=0)
        loss, train_acc = model.evaluate(Xtrain, ytrain, verbose=0)

        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)

        string = mood + ' vs Not ' + mood
        results.append([string, train_acc, test_acc])

        # serialize model to JSON
        model_json = model.to_json()
        with open("{}.json".format(string), "w") as json_file:
            json_file.write(model_json)
        # serialize weights to HDF5
        model.save_weights("{}.h5".format(string))
        print("Saved model to disk")

    return moods, models, train_accuracies, test_accuracies, results

# Features are the first three columns of df; the label is its last column.
X = df.iloc[:,:3]
y = df.iloc[:,-1]
# Hold out 30% of the rows for testing, stratified on the label.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, stratify=y)

# Build the vocabulary from the training lyrics only.
train_lyrics_list = train_X['Lyrics'].tolist()
trainTextTokensList = createTokens(train_lyrics_list)
trainTextTokensList = cleanTokens(trainTextTokensList)
vocabularySet = getVocabulary(trainTextTokensList)

# Train one one-vs-all model per mood and tabulate train/test accuracy.
moods, models, train_accuracies, test_accuracies, results = evaluateModels(train_X, test_X, train_y, test_y, vocabularySet)
results_df = pd.DataFrame(results, columns=['Model Pair', 'Train Accuracy', 'Test Accuracy'])
display(results_df)
    

In [None]:
# Tokenise the test lyrics and keep only tokens found in the training vocabulary.
test_lyrics_list = test_X['Lyrics'].tolist()
testTextTokensList = createTokens(test_lyrics_list)
testTextTokensList = cleanTokens(testTextTokensList)
testTextTokensList = filterVacabularyTokens(testTextTokensList, vocabularySet)

# Tokenise the lyrics of the full dataset X (no vocabulary filtering is applied here).
data_lyrics_list = X['Lyrics'].tolist()
dataTextTokensList = createTokens(data_lyrics_list)
dataTextTokensList = cleanTokens(dataTextTokensList)

# TF-IDF encode the full, training and test sets; the y arrays hold the original labels.
Xdata, ydata = getModelData(dataTextTokensList, y, vocabularySet, 'tfidf')

Xtrain, ytrain = getModelData(trainTextTokensList, train_y, vocabularySet, 'tfidf')
Xtest, ytest = getModelData(testTextTokensList, test_y, vocabularySet, 'tfidf')

def getPredictedProbabilities(moods, models, X):
    """Collect, for each model, the first output value it predicts per row of ``X``.

    ``moods`` and ``models`` are walked in parallel (stopping at the shorter
    one). The result has one inner list per model, holding one value per row
    returned by ``model.predict(X)``.
    """
    return [
        [row[0] for row in classifier.predict(X)]
        for _, classifier in zip(moods, models)
    ]

# Per-mood predicted probabilities for the training and test sets.
train_predicted_labels_probability = getPredictedProbabilities(moods, models, Xtrain)
test_predicted_labels_probability = getPredictedProbabilities(moods, models, Xtest)

# Same for the full dataset (training and test rows together).
data_predicted_labels_probability = getPredictedProbabilities(moods, models, Xdata)


In [None]:
def getProbabilityAndLabelVectors(predicted_labels_probability):
    """Turn per-mood probability lists into per-sample probability and label vectors.

    The input holds one list of probabilities per mood; it is transposed so
    that each row belongs to one sample. A mood is labelled 1 when its
    probability is not below 0.5, otherwise 0. If no mood reaches the
    threshold, the mood with the highest probability is labelled 1. Both the
    probability table and the label table are shown with ``display``.

    Returns:
        ``(probability_rows, label_rows)`` as lists of lists, one row per sample.
    """
    moodColumns = ['Happy', 'Sad', 'romantic', 'hate', 'Peaceful', 'Motivational', 'Funny']

    perSampleProbabilities = pd.DataFrame(predicted_labels_probability).T.values.tolist()
    display(pd.DataFrame(perSampleProbabilities, columns=moodColumns))

    perSampleLabels = []
    for sampleProbabilities in perSampleProbabilities:
        flags = [0 if probability < 0.5 else 1 for probability in sampleProbabilities]
        # Guarantee at least one predicted mood per sample.
        if 1 not in flags:
            flags[np.argmax(sampleProbabilities)] = 1
        perSampleLabels.append(flags)

    display(pd.DataFrame(perSampleLabels, columns=moodColumns))

    return perSampleProbabilities, perSampleLabels
    
# Per-sample probability and 0/1 label vectors for the training and test sets
# (each call also displays both tables).
train_predicted_probabilities_transpose, train_labels_for1vAll = getProbabilityAndLabelVectors(train_predicted_labels_probability)
test_predicted_probabilities_transpose, test_labels_for1vAll = getProbabilityAndLabelVectors(test_predicted_labels_probability)

# Same for the full dataset; these vectors are exported in the next cell.
data_predicted_probabilities_transpose, data_labels_for1vAll = getProbabilityAndLabelVectors(data_predicted_labels_probability)

  

Unnamed: 0,Happy,Sad,romantic,hate,Peaceful,Motivational,Funny
0,3.477092e-16,1.597027e-14,7.849911e-13,1.589874e-28,4.260453e-22,1.270425e-18,1.000000e+00
1,5.147244e-05,1.831003e-08,9.999998e-01,1.988740e-09,7.453825e-09,9.557642e-08,1.736540e-23
2,4.825235e-09,2.709066e-13,6.168367e-13,1.000000e+00,2.863310e-24,6.320928e-12,2.697188e-16
3,9.999920e-01,9.448599e-08,5.616643e-05,6.124266e-09,8.740545e-15,5.734829e-10,8.367866e-08
4,8.128574e-17,1.759660e-38,5.699931e-18,1.729412e-25,0.000000e+00,1.556363e-24,1.000000e+00
...,...,...,...,...,...,...,...
893,9.999988e-01,4.789338e-09,3.068970e-07,3.047691e-10,6.757332e-17,5.995125e-08,1.065814e-16
894,7.840059e-07,3.433013e-06,4.481842e-06,9.999842e-01,5.090426e-11,1.861933e-06,1.447534e-15
895,4.530512e-09,4.727944e-06,9.999862e-01,1.948783e-05,2.723086e-08,3.280081e-11,8.195218e-21
896,1.190745e-04,4.933721e-09,1.000000e+00,1.513667e-12,3.305955e-13,5.594250e-14,6.887217e-14


Unnamed: 0,Happy,Sad,romantic,hate,Peaceful,Motivational,Funny
0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0
3,1,0,0,0,0,0,0
4,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...
893,1,0,0,0,0,0,0
894,0,0,0,1,0,0,0
895,0,0,1,0,0,0,0
896,0,0,1,0,0,0,0


Unnamed: 0,Happy,Sad,romantic,hate,Peaceful,Motivational,Funny
0,2.005668e-06,9.999151e-01,2.347589e-03,5.584594e-13,2.600411e-11,2.419475e-08,7.909217e-20
1,3.093229e-05,9.922934e-01,9.390550e-01,5.511045e-04,4.061725e-05,3.018474e-08,5.519873e-15
2,1.014535e-06,3.676577e-01,1.106664e-01,2.354244e-09,2.193400e-02,8.310528e-15,6.339940e-17
3,2.984315e-06,3.896605e-11,3.324726e-06,1.844236e-06,6.531609e-12,1.531007e-09,7.743075e-11
4,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00
...,...,...,...,...,...,...,...
380,9.804899e-16,1.096542e-26,2.170879e-21,1.901668e-02,1.523886e-32,5.523185e-06,4.218173e-14
381,1.080922e-04,1.217679e-09,6.128474e-05,9.979453e-01,1.450704e-12,5.440266e-07,1.979752e-12
382,5.057884e-05,4.747302e-09,9.994612e-01,1.062415e-06,1.862617e-08,8.357378e-10,1.106716e-11
383,3.329515e-04,1.218422e-04,9.042323e-04,2.261892e-09,5.196647e-01,3.588219e-05,5.820345e-07


Unnamed: 0,Happy,Sad,romantic,hate,Peaceful,Motivational,Funny
0,0,1,0,0,0,0,0
1,0,1,1,0,0,0,0
2,0,1,0,0,0,0,0
3,0,0,1,0,0,0,0
4,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...
380,0,0,0,1,0,0,0
381,0,0,0,1,0,0,0
382,0,0,1,0,0,0,0
383,0,0,0,0,1,0,0


Unnamed: 0,Happy,Sad,romantic,hate,Peaceful,Motivational,Funny
0,9.620786e-04,3.158748e-04,1.326075e-02,2.118475e-05,1.620459e-08,4.754540e-06,1.528699e-05
1,9.999920e-01,9.448599e-08,5.616643e-05,6.124266e-09,8.740545e-15,5.734829e-10,8.367866e-08
2,5.775633e-01,6.155277e-16,5.057429e-01,1.245151e-12,1.451324e-12,1.542747e-12,2.211040e-26
3,9.999845e-01,6.240443e-07,3.625154e-04,2.072424e-09,1.735453e-13,7.147401e-12,1.057985e-11
4,9.999996e-01,4.441515e-11,1.418144e-03,2.695434e-13,1.014798e-09,1.724755e-07,1.471188e-23
...,...,...,...,...,...,...,...
1278,9.545593e-17,7.797602e-18,1.279645e-21,3.853902e-31,1.784332e-31,1.981042e-25,1.000000e+00
1279,3.080325e-36,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00
1280,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00
1281,6.211522e-20,0.000000e+00,2.961389e-24,2.072348e-25,0.000000e+00,1.768166e-29,1.000000e+00


Unnamed: 0,Happy,Sad,romantic,hate,Peaceful,Motivational,Funny
0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0
2,1,0,1,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...
1278,0,0,0,0,0,0,1
1279,0,0,0,0,0,0,1
1280,0,0,0,0,0,0,1
1281,0,0,0,0,0,0,1


In [None]:
# Rebuild the full table (features + label), drop the lyrics text, and attach
# each row's probability vector and 0/1 label vector.
data_df = pd.concat((X,y), axis=1)
data_df = data_df.drop(['Lyrics'], axis=1)
data_df['probability_vector'] = data_predicted_probabilities_transpose
data_df['label_vector'] = data_labels_for1vAll

# Save the result to prob_vec.csv in the current working directory.
data_df.to_csv('prob_vec.csv')

