In [1]:
import tensorflow as tf
tf.enable_eager_execution()
import pandas as pd
from tensorflow.python.keras import models
from tensorflow.python.keras import initializers
from tensorflow.python.keras import regularizers

from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.layers import SeparableConv1D
from tensorflow.python.keras.layers import MaxPooling1D
from tensorflow.python.keras.layers import GlobalAveragePooling1D
from stemming.porter2 import stem

In [2]:
training_data = pd.read_csv("../Data/train_lyrics_1000.csv",header=0,encoding='utf-8')
testing_data = pd.read_csv("../Data/valid_lyrics_200.csv",header=0,encoding='utf-8')

In [3]:
training_data.head()

Unnamed: 0,file,artist,title,lyrics,genre,mood,year
0,TRAAAAW128F429D538.h5,Casual,I Didn't Mean To,Verse One:\n\nAlright I might\nHave had a litt...,Hip Hop/Rap,sad,1994
1,TRAAAEF128F4273421.h5,Adam Ant,Something Girls,Adam Ant/Marco Pirroni\nEvery girl is a someth...,Rock,happy,1982
2,TRAAAFD128F92F423A.h5,Gob,Face the Ashes,"I've just erased it's been a while, I've got a...",Rock,sad,2007
3,TRAABJV128F1460C49.h5,Lionel Richie,Tonight Will Be Alright,Little darling \nWhere you've been so long \nI...,R&B,happy,1986
4,TRAABLR128F423B7E3.h5,Blue Rodeo,Floating,"Lead Vocal by Greg\n\nWell, these late night c...",Rock,sad,1987


In [4]:
# Splitting the Dataset into training and test data

X_train = training_data['lyrics']
y_train = testing_data['lyrics']
X_test = training_data['mood']
y_test = testing_data['mood']

In [5]:
# Label Encoding the Target Variable
from sklearn import preprocessing

# Create a label (category) encoder object
le = preprocessing.LabelEncoder()

# Fit the encoder to the pandas column
le.fit(X_test)
le.fit(y_test)

# View the labels (if you want)
list(le.classes_)

# Apply the fitted encoder to the pandas column
X_test = le.transform(X_test) 
y_test = le.transform(y_test)

In [7]:
with open('../Data/stopwords_eng.txt', 'r') as infile:
    stop_words = infile.read().splitlines()
print('stop words %s ...' %stop_words[:5])

stop words ['i', 'me', 'my', 'myself', 'we'] ...


# #Vectorizing Using TF-IDF Vectorizer

In [8]:
# Vectorizing Using TFIDF Vectorizer

NGRAM_RANGE = (1, 2)

# Limit on the number of features. We use the top 20K features.
#TOP_K = 20000

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'

# Minimum document/corpus frequency below which a token will be discarded.
#MIN_DOCUMENT_FREQUENCY = 2

# Limit on the length of text sequences. Sequences longer than this
# will be truncated.
MAX_SEQUENCE_LENGTH = 500


kwargs = {
            'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,  # Split text into word tokens.
            'stop_words':stop_words,
            'binary':False,
            'encoding':'utf-8',
        }

In [9]:

print(porter_tokenizer('jjdksd on that is loko'))

['jjdksd', 'on', 'that', 'is', 'loko']


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(**kwargs)

In [11]:
# Learn vocabulary from training texts and vectorize training texts.
x_train_tfidf = vectorizer.fit_transform(X_train)

# Vectorize validation texts.
x_val_tfidf = vectorizer.transform(y_train)



In [12]:
#Defining Parameters for the Model

learning_rate=1e-3
epochs=1000
batch_size=128
layers=2
units=64
dropout_rate=0.2

In [13]:
def get_num_classes(labels):
    """Gets the total number of classes.
    # Arguments
        labels: list, label values.
            There should be at lease one sample for values in the
            range (0, num_classes -1)
    # Returns
        int, total number of classes.
    # Raises
        ValueError: if any label value in the range(0, num_classes - 1)
            is missing or if number of classes is <= 1.
    """
    num_classes = max(labels) + 1
    missing_classes = [i for i in range(num_classes) if i not in labels]
    if len(missing_classes):
        raise ValueError('Missing samples with label value(s) '
                         '{missing_classes}. Please make sure you have '
                         'at least one sample for every label value '
                         'in the range(0, {max_class})'.format(
                            missing_classes=missing_classes,
                            max_class=num_classes - 1))

    if num_classes <= 1:
        raise ValueError('Invalid number of labels: {num_classes}.'
                         'Please make sure there are at least two classes '
                         'of samples'.format(num_classes=num_classes))
    return num_classes

In [14]:
get_num_classes(X_test)

2

In [15]:
def mlp_model(layers, units, dropout_rate, input_shape, num_classes):
    """Creates an instance of a multi-layer perceptron model.
    # Arguments
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of the layers.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        input_shape: tuple, shape of input to the model.
        num_classes: int, number of output classes.
    # Returns
        An MLP model instance.
    """
    op_units, op_activation = _get_last_layer_units_and_activation(num_classes)
    model = models.Sequential()
    model.add(Dropout(rate=dropout_rate, input_shape=input_shape))

    for _ in range(layers-1):
        model.add(Dense(units=units, activation='relu'))
        model.add(Dropout(rate=dropout_rate))

    model.add(Dense(units=op_units, activation=op_activation))
    return model

In [16]:
def _get_last_layer_units_and_activation(num_classes):
    """Gets the # units and activation function for the last network layer.
    # Arguments
        num_classes: int, number of classes.
    # Returns
        units, activation values.
    """
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation

In [17]:
_get_last_layer_units_and_activation(2)

(1, 'sigmoid')

In [18]:
op_units, op_activation = _get_last_layer_units_and_activation(2)

In [19]:
# Create model instance.

model = mlp_model(layers=layers,
                  units=units,
                  dropout_rate=dropout_rate,
                  input_shape=x_train_tfidf.shape[1:],
                  num_classes=2)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Colocations handled automatically by placer.


In [20]:
# Compile model with learning parameters.

loss = 'binary_crossentropy'
optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

In [21]:
#Create callback for early stopping on validation loss. If the loss does not decrease in two consecutive tries, stop training.
callbacks = [tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=2)]

In [22]:
# Train and validate model.
history = model.fit(
            x_train_tfidf,
            X_test,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=(x_val_tfidf, y_test),
            verbose=2,  # Logs once per epoch.
            batch_size=batch_size)

Train on 1000 samples, validate on 200 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/1000
 - 3s - loss: 0.6916 - acc: 0.5300 - val_loss: 0.6909 - val_acc: 0.4800
Epoch 2/1000
 - 2s - loss: 0.6610 - acc: 0.6520 - val_loss: 0.6877 - val_acc: 0.4850
Epoch 3/1000
 - 2s - loss: 0.6174 - acc: 0.8370 - val_loss: 0.6796 - val_acc: 0.5050
Epoch 4/1000
 - 2s - loss: 0.5612 - acc: 0.9760 - val_loss: 0.6697 - val_acc: 0.5300
Epoch 5/1000
 - 2s - loss: 0.4954 - acc: 0.9930 - val_loss: 0.6586 - val_acc: 0.5650
Epoch 6/1000
 - 2s - loss: 0.4288 - acc: 0.9980 - val_loss: 0.6483 - val_acc: 0.5700
Epoch 7/1000
 - 2s - loss: 0.3661 - acc: 0.9980 - val_loss: 0.6369 - val_acc: 0.6200
Epoch 8/1000
 - 2s - loss: 0.3069 - acc: 0.9960 - val_loss: 0.6264 - val_acc: 0.6800
Epoch 9/1000
 - 2s - loss: 0.2569 - acc: 0.9980 - val_loss: 0.6170 - val_acc: 0.6850
Epoch 10/1000
 - 2s - loss: 0.2146 - acc: 0.9960 - val_loss: 0.6070 - val_acc: 0.6900
Epoch 11/1000
 - 2s - loss: 0.1811 - acc: 0.9960 - val

In [23]:
# Print results.
history = history.history
print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

Validation accuracy: 0.7350000143051147, loss: 0.563155632019043


# test

In [24]:
x = 'Without you, I feel broke Like Im half of a whole Without you, Ive got no hand to hold Without you, I feel torn Like a sail in a storm Without you, Im just a sad song Im just a sad song'

In [30]:
#x_token=porter_tokenizer(x)
x_input = vectorizer.transform([x])

In [31]:
#x_input = selector.transform(x_input).astype('float32')

predictions = model.predict(x_input)

In [34]:
print(x_input)

  (0, 81233)	0.43776571751094345
  (0, 80615)	0.12663762343763518
  (0, 74701)	0.17355939793195005
  (0, 68979)	0.1500728907353273
  (0, 66441)	0.250859917962649
  (0, 60736)	0.17634370196165494
  (0, 60458)	0.32116958845060173
  (0, 40229)	0.1209677318943262
  (0, 35700)	0.21791986100660443
  (0, 35699)	0.21791986100660443
  (0, 34791)	0.47656404134585845
  (0, 33323)	0.09987826363699612
  (0, 30891)	0.2308944647132536
  (0, 30866)	0.11758620944781517
  (0, 30797)	0.1500728907353273
  (0, 29453)	0.2308944647132536
  (0, 29284)	0.06770225770289866
  (0, 23005)	0.1615491386047857
  (0, 8419)	0.1488178549749624


In [32]:
import numpy as np
rounded = [np.round(x) for x in predictions]

In [33]:
if rounded[0] == 1:
    print('happy')
else:
    print('sad')

happy


In [133]:
def input_converter(string):
    x = string
    x_input = vectorizer.transform([x])
    print(x_input)
    predictions = model.predict(x_input)
    rounded = [np.round(x) for x in predictions]
    if rounded[0] == 1:
        out = 'sad'
    else:
        out = 'happy'
    return out

In [134]:
input_converter('I re ihe sad joy happy happy sad sad happy happy')

  (0, 49760)	0.2294992869652013
  (0, 49743)	0.5038558268933042
  (0, 47075)	0.2538883630961472
  (0, 29840)	0.17467441170297782
  (0, 25772)	0.2538883630961472
  (0, 25756)	0.4792433362646209
  (0, 25733)	0.5516841992471969


'happy'

In [135]:
input_converter('sad sad happy')

  (0, 49760)	0.5342634078961679
  (0, 49743)	0.7819682426149922
  (0, 25733)	0.32107363411652745


'sad'

In [36]:
import pickle

In [41]:
with open('/Users/rohitjain/Documents/MLPresentation/song-mood/model/vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer,file)
with open('/Users/rohitjain/Documents/MLPresentation/song-mood/model/kwargs.pkl', 'wb') as file:
    pickle.dump(kwargs,file)

{'ngram_range': (1, 2), 'dtype': 'int32', 'strip_accents': 'unicode', 'decode_error': 'replace', 'analyzer': 'word', 'stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 's

In [42]:
model_json = model.to_json()
with open("/Users/rohitjain/Documents/MLPresentation/song-mood/model/model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("/Users/rohitjain/Documents/MLPresentation/song-mood/model/model_weights.h5")
