In [501]:
import numpy as np
import math
from datascience import *
import matplotlib
matplotlib.use('Agg', warn=False)
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

import tensorflow as tf
from sklearn.decomposition import PCA

In [516]:
# Get lyrics
lyrics = Table.read_table('lyrics.csv').sample(with_replacement=False)
holdout = Table.read_table('holdout.csv').drop('Id')

words = lyrics.drop(np.arange(3)).labels
titles = lyrics.column('Title')

num_words = len(words)
num_songs = len(titles)

num_valid = 200
test_lyrics = lyrics.take(np.arange(0, num_valid))
train_lyrics = lyrics.take(np.arange(num_valid, num_songs))

print("Number of Words: %d" % (num_words))
print("Number of Songs: %d" % (num_songs))
print("Training Size: %d" % (train_lyrics.num_rows))
print("Testing Size: %d" % (test_lyrics.num_rows))

# Retrieves row that corresponds to title
title_index = lyrics.index_by('Title')
def row_for_title(title):
    """Return the row for a title, similar to the following expression (but faster)
    
    lyrics.where('Title', title).row(0)
    """
    return title_index.get(title)[0]

Number of Words: 4817
Number of Songs: 1721
Training Size: 1521
Testing Size: 200


In [517]:
# Get document frequency
def document_frequency(word):
    return np.count_nonzero(lyrics.column(word) > 0) + np.count_nonzero(holdout.column(word) > 0)

# Get document frequencies
def document_frequencies():
    return Table().with_column('Word', words).apply(document_frequency, 'Word')

# Get inverse document frequencies
def generate_idf():
    frequencies = document_frequencies()
    return np.log(num_songs / (frequencies + 1))

# Get tfidf table
def tfidf(tf):
    return tf * idf


### Vector of document frequencies for words
idf = generate_idf()

In [518]:
train_data = np.array([tfidf(list(r)) for r in train_lyrics.drop(np.arange(3)).rows])
test_data = np.array([tfidf(list(r)) for r in test_lyrics.drop(np.arange(3)).rows])
holdout_data = np.array([tfidf(list(r)) for r in holdout.rows])
full_data = np.concatenate((train_data, test_data), axis=0)

print(train_data.shape)
print(test_data.shape)
print(holdout_data.shape)
print(full_data.shape)

(1521, 4817)
(200, 4817)
(100, 4817)
(1721, 4817)


In [519]:
n_components = 600
pca = PCA(n_components, whiten=True)
train_pca_data = pca.fit_transform(train_data)
holdout_pca_data = pca.transform(holdout_data)
test_pca_data = pca.transform(test_data)
print(train_pca_data.shape)
print(holdout_pca_data.shape)
print(test_pca_data.shape)

(1521, 600)
(100, 600)
(200, 600)


In [522]:
def genreToIDVec(vec):
    return [[1, 0] if e == 'Hip-hop' else [0, 1] for e in vec]

def iDtoGenre(id):
    return 'Hip-hop' if id == 0 else 'Country'

In [523]:
train_classes = genreToIDVec(train_lyrics.column('Genre'))
test_classes = genreToIDVec(test_lyrics.column('Genre'))

## Single Layer

In [458]:
# Parameters
learning_rate = 0.01
training_epochs = 25
batch_size = 100
display_step = 1

# tf Graph Input
x = tf.placeholder(tf.float32, [None, n_components])
y = tf.placeholder(tf.float32, [None, 2])

# Set model weights
W = tf.Variable(tf.zeros([n_components, 2]))
b = tf.Variable(tf.zeros([2]))

# Construct model
pred = tf.nn.softmax(tf.matmul(x, W) + b) # Softmax

# Minimize error using cross entropy
cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1))
# Gradient Descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Initializing the variables
init = tf.global_variables_initializer()

In [459]:
# Launch the graph
with tf.Session() as sess:
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(len(train_pca_data) / batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_xs = train_pca_data[i * batch_size : min(len(train_pca_data), (i + 1) * batch_size)]
            batch_ys = train_classes[i * batch_size : min(len(train_pca_data), (i + 1) * batch_size)]
            # Fit training using batch data
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_xs,
                                                          y: batch_ys})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if (epoch+1) % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))

    print("Optimization Finished!")

    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    test_accuracy = accuracy.eval({x: test_pca_data, y: test_classes})
    print("Accuracy:", test_accuracy)
    
    holdout_predictions = tf.argmax(sess.run(pred, feed_dict={x: holdout_pca_data}), 1).eval()
    holdout_predictions = np.array([iDtoGenre(e) for e in holdout_predictions])
    print(holdout_predictions)
    
    country = np.count_nonzero(holdout_predictions == 'Country') / len(holdout_predictions)
    print("Country Proportion: %f" % country)

Epoch: 0001 cost= 0.680658396
Epoch: 0002 cost= 0.627606730
Epoch: 0003 cost= 0.582078969
Epoch: 0004 cost= 0.542888455
Epoch: 0005 cost= 0.509008863
Epoch: 0006 cost= 0.479572852
Epoch: 0007 cost= 0.453857976
Epoch: 0008 cost= 0.431267430
Epoch: 0009 cost= 0.411310093
Epoch: 0010 cost= 0.393582147
Epoch: 0011 cost= 0.377751054
Epoch: 0012 cost= 0.363542271
Epoch: 0013 cost= 0.350728238
Epoch: 0014 cost= 0.339119546
Epoch: 0015 cost= 0.328557773
Epoch: 0016 cost= 0.318909818
Epoch: 0017 cost= 0.310063245
Epoch: 0018 cost= 0.301922625
Epoch: 0019 cost= 0.294406619
Epoch: 0020 cost= 0.287445505
Epoch: 0021 cost= 0.280979290
Epoch: 0022 cost= 0.274956121
Epoch: 0023 cost= 0.269330973
Epoch: 0024 cost= 0.264064592
Epoch: 0025 cost= 0.259122663
Optimization Finished!
Accuracy: 0.885
['Country' 'Hip-hop' 'Country' 'Country' 'Country' 'Country' 'Hip-hop'
 'Hip-hop' 'Hip-hop' 'Hip-hop' 'Hip-hop' 'Hip-hop' 'Country' 'Country'
 'Hip-hop' 'Country' 'Country' 'Country' 'Hip-hop' 'Hip-hop' 'Hip-hop

# Multi Layer Neural Network (Fully Connected, Dropout, Softmax)

In [524]:
# Parameters
learning_rate = 0.0001
training_epochs = 50
batch_size = 100
display_step = 10

# Network Parameters
n_input = n_components
n_hidden1 = 1000
n_classes = 2
dropout = 0.5

# tf Graph Input
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_classes])
keep_prob = tf.placeholder(tf.float32)

In [525]:
def neural_network(x, weights, biases, dropout):
    fc1 = tf.matmul(x, weights['h1']) + biases['b1']
    fc1 = tf.nn.relu(fc1)
    fc1 = tf.nn.dropout(fc1, dropout)
    
    out = tf.nn.softmax(tf.matmul(fc1, weights['out']) + biases['out'])
    return out

In [526]:
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden1], stddev=(1/n_input))),
    'out': tf.Variable(tf.random_normal([n_hidden1, n_classes], stddev=(1/n_hidden1)))
}

biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden1])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

pred = neural_network(x, weights, biases, keep_prob)

# Minimize error using cross entropy
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
# Gradient Descent
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Initializing the variables
init = tf.global_variables_initializer()

In [527]:
# Launch the graph
with tf.Session() as sess:
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(len(train_pca_data) / batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_xs = train_pca_data[i * batch_size : min(len(train_pca_data), (i + 1) * batch_size)]
            batch_ys = train_classes[i * batch_size : min(len(train_pca_data), (i + 1) * batch_size)]
            # Fit training using batch data
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_xs,
                                                          y: batch_ys,
                                                          keep_prob: dropout})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if (epoch+1) % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))

    print("Optimization Finished!")

    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    test_accuracy = accuracy.eval({x: test_pca_data, y: test_classes,  keep_prob: 1})
    print("Accuracy:", test_accuracy)
    
    holdout_predictions = tf.argmax(sess.run(pred, feed_dict={x: holdout_pca_data, keep_prob: 1}), 1).eval()
    holdout_predictions = np.array([iDtoGenre(e) for e in holdout_predictions])
    print(holdout_predictions)
    
    country = np.count_nonzero(holdout_predictions == 'Country') / len(holdout_predictions)
    print("Country Proportion: %f" % country)

Epoch: 0010 cost= 0.572317843
Epoch: 0020 cost= 0.409702359
Epoch: 0030 cost= 0.369266750
Epoch: 0040 cost= 0.354374425
Epoch: 0050 cost= 0.343652113
Optimization Finished!
Accuracy: 0.9
['Country' 'Hip-hop' 'Country' 'Country' 'Country' 'Country' 'Hip-hop'
 'Hip-hop' 'Hip-hop' 'Hip-hop' 'Hip-hop' 'Country' 'Country' 'Hip-hop'
 'Hip-hop' 'Country' 'Country' 'Country' 'Hip-hop' 'Hip-hop' 'Hip-hop'
 'Hip-hop' 'Hip-hop' 'Country' 'Hip-hop' 'Hip-hop' 'Hip-hop' 'Hip-hop'
 'Country' 'Country' 'Country' 'Country' 'Hip-hop' 'Hip-hop' 'Hip-hop'
 'Country' 'Country' 'Hip-hop' 'Country' 'Country' 'Hip-hop' 'Country'
 'Hip-hop' 'Hip-hop' 'Hip-hop' 'Country' 'Country' 'Hip-hop' 'Hip-hop'
 'Hip-hop' 'Country' 'Hip-hop' 'Country' 'Country' 'Country' 'Country'
 'Country' 'Hip-hop' 'Country' 'Country' 'Country' 'Hip-hop' 'Country'
 'Country' 'Country' 'Country' 'Hip-hop' 'Country' 'Hip-hop' 'Country'
 'Country' 'Country' 'Hip-hop' 'Country' 'Country' 'Country' 'Country'
 'Country' 'Hip-hop' 'Hip-hop' '

# Create Submission

In [440]:
result_id = 1

def create_competition_submission(predictions, filename='master.csv'):
    """
    Create a submission CSV for the Kaggle competition.
    
    Inputs:
      predictions - list or array of your predictions (Generated as in Question 3.3.1.)
    """
    Table().with_columns('Id', np.arange(len(predictions)), 'Predictions', predictions).to_csv(filename)
    print('Created', filename)

In [528]:
result_name = 'tensorflow-' + str(int(test_accuracy * 100)) + '-' + str(result_id) + '.csv'
create_competition_submission(holdout_predictions, result_name)
result_id += 1

Created tensorflow-89-3.csv
