# Memory-efficient SAGA

In [10]:
import pyximport; pyximport.install()
# import saga

import numpy as np
import scipy.sparse as sparse
import sklearn.linear_model, sklearn.metrics
import matplotlib.pyplot as plt
import time

%matplotlib inline

## Data

Problem: Predict the release year of a song from its audio features. Songs are mostly western, commercial tracks ranging from 1922 to 2011, with a peak in the 2000s.

* Dataset can be downloaded at https://archive.ics.uci.edu/ml/datasets/YearPredictionMSD.
* 90 audio attributes: 12 = timbre average, 78 = timbre covariance
    * Features extracted from the 'timbre' features from The Echo Nest API. We take the average and covariance over all 'segments', each segment being described by a 12-dimensional timbre vector.
* The first value is the year (target), ranging from 1922 to 2011. 
* train: first 463,715 examples, test: last 51,630 examples
    * It avoids the 'producer effect' by making sure no song from a given artist ends up in both the train and test set.

In [11]:
# Load the whole comma-separated dataset into a single array
# (column 0 is the target year, the remaining columns are the audio features).
data = np.loadtxt(fname='YearPredictionMSD.txt', delimiter=',')

FileNotFoundError: [Errno 2] No such file or directory: 'YearPredictionMSD.txt'

In [None]:
# Column 0 holds the song release year, the remaining columns the audio features.
targets, features = data[:, 0], data[:, 1:]

# Fixed split: the first 463,715 rows are for training, the rest for testing.
train_target, test_target = targets[:463715], targets[463715:]
train_data, test_data = features[:463715], features[463715:]
assert test_target.shape == (51630,)
assert train_data.shape == (463715, 90)
assert test_data.shape == (51630, 90)

# Keep only the first n training rows so that experiments run quickly.
n = 10000
train_data, train_target = features[:n], targets[:n]

# Center targets and features with the training-set statistics so that the
# model does not need an intercept term. The same shifts are applied to the
# test set.
intercept = train_target.mean()
data_mean = train_data.mean(axis=0)
train_target, test_target = train_target - intercept, test_target - intercept
train_data, test_data = train_data - data_mean, test_data - data_mean

## Linear regression with scikit-learn

Sanity check of the performance of a linear regression model.

In [None]:
# Ordinary least-squares baseline fitted on the centered training data.
cls = sklearn.linear_model.LinearRegression(fit_intercept=True)
cls.fit(train_data, train_target)
# Targets and features were centered, so the fitted intercept should be
# numerically zero. Compare its magnitude: a plain `<` would also accept an
# arbitrarily large negative intercept.
assert abs(cls.intercept_) < 1e-10

def score(x, dataset, n_plot=2000):
    """
    Print the R^2 score of the linear model `x` and optionally plot predictions.

    Parameters:
        x:       coefficient vector; predictions are `A.dot(x) + intercept`
        dataset: prefix of the global arrays to evaluate on, e.g. 'train' uses
                 the globals `train_data` and `train_target`
        n_plot:  number of leading samples to plot; no figure if not positive

    Raises:
        NameError: if `<dataset>_data` or `<dataset>_target` is not a global.
    """
    # Explicit namespace lookup instead of eval(): same result for valid
    # dataset names, but arbitrary expressions can no longer be executed.
    namespace = globals()
    data_name = '{}_data'.format(dataset)
    target_name = '{}_target'.format(dataset)
    for name in (data_name, target_name):
        if name not in namespace:
            raise NameError("name '{}' is not defined".format(name))
    A = namespace[data_name]
    # The global mean `intercept` is added back to both the ground truth and
    # the predictions so that the plot shows actual release years.
    y = namespace[target_name] + intercept
    pred = A.dot(x) + intercept
    r2 = sklearn.metrics.r2_score(y, pred)
    print('R^2 score on {} set: {:.4f}'.format(dataset, r2))

    if n_plot > 0:
        plt.figure(figsize=(17,5))
        plt.plot(pred[:n_plot], '.', label='predicted')
        plt.plot(y[:n_plot], '.', label='ground truth')
        plt.title(dataset)
        plt.xlabel('sample')
        plt.ylabel('release year')
        plt.legend()
# Evaluate the least-squares baseline on both splits.
for split in ('train', 'test'):
    score(cls.coef_, split)

## SAGA

* Cython code from the authors.

In [None]:
def saga_authors(A, y, maxiter, gamma, reg=0):
    """
    Run the authors' Cython SAGA least-squares solver and report the CPU time.

    Intended objective: min_x ||Ax - y||_2^2 + reg*||x||_2^2
    NOTE(review): the callers in this notebook pass the data transposed (one
    sample per column), so the solver presumably works with A^T x; the exact
    objective and scaling are defined by saga.saga_lstsq -- confirm there.

    Parameters:
        A:       data matrix; converted to CSC format with 64-bit indices
        y:       targets; converted to a float64 array
        maxiter: forwarded unchanged to saga.saga_lstsq
        gamma:   step size or learning rate, must be strictly positive
                 (the solver receives its inverse as 'eta')
        reg:     amount of L2 regularization

    Returns:
        The result of saga.saga_lstsq (used by the callers as the weight vector).

    Raises:
        ValueError: if gamma is not strictly positive.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (or a
    # meaningless negative inverse step size) after the conversion work.
    if gamma <= 0:
        raise ValueError('gamma must be strictly positive, got {}'.format(gamma))

    # process_time() counts CPU time of this process, not wall-clock time.
    tstart = time.process_time()

    # Proper data type.
    A = sparse.csc_matrix(A)
    A.indices = A.indices.astype(np.int64)
    A.indptr = A.indptr.astype(np.int64)
    # np.array always copies (like astype) and also accepts plain sequences.
    y = np.array(y, dtype=np.float64)

    # Algorithm.
    # NOTE(review): `saga` must be importable here; the `import saga` line at
    # the top of this notebook is commented out -- confirm it is enabled.
    props = {'eta': 1 / gamma, 'reg': reg}
    x = saga.saga_lstsq(A, y, maxiter, props)

    print('Elapsed time: {:.2f}s'.format(time.process_time() - tstart))
    return x

In [None]:
# Sanity check on a trivial problem: with an identity data matrix, the
# solution is expected to reproduce the targets.
n = 100
y = np.arange(n)
A = sparse.identity(n)

# gamma is the step size; the solver is given its inverse, eta.
x = saga_authors(A, y, maxiter=1000, gamma=0.1, reg=0)
residual = np.abs(x - y)
assert (residual < 1e-10).all()

## Linear regression with SAGA

In [None]:
# Fit with SAGA (data passed transposed, i.e. one sample per column), then
# evaluate on both splits.
x = saga_authors(train_data.T, train_target, maxiter=1000, gamma=1e-9, reg=0)
for split in ('train', 'test'):
    score(x, split)