In [1]:
import pandas as pd
from sklearn import neighbors
from sklearn import cross_validation
import numpy as np
import cvxpy as cvx

##Load data

In [2]:
df = pd.read_csv("../data/simpel_carbonfoot_df.csv")
df.head()

Unnamed: 0,additives_n,ingredients_from_palm_oil_n,ingredients_that_may_be_from_palm_oil_n,energy_100g,fat_100g,saturated_fat_100g,carbohydrates_100g,sugars_100g,proteins_100g,salt_100g,sodium_100g,carbon_footprint_100g,nutrition_score_fr_100g,nutrition_score_uk_100g
0,0,0,0,83,0.5,0.0,3,0.3,1.1,0.254,0.1,0.0,1,1
1,1,0,0,397,2.9,1.7,14,14.0,3.0,0.1016,0.04,193.4,4,4
2,1,0,0,55,0.1,0.1,7,5.0,5.0,0.127,0.05,198.7,-6,-6
3,0,0,0,330,4.0,2.0,6,6.0,4.5,0.1524,0.06,221.6,0,0
4,2,0,0,251,0.9,0.5,10,10.0,2.5,0.1016,0.04,136.0,1,1


In [3]:
y = np.array(df.carbon_footprint_100g)
X = np.array(df.drop(['carbon_footprint_100g'], axis = 1))
len(y)

143

## Compute pairwise differences

In [4]:
def pairwise_diffs(np_arr):
    np_diffs = np.empty((len(np_arr)*(len(np_arr)-1)/2, np_arr.shape[1]))
    start_ind = 0
    for i in range(len(np_arr)-1):
        sample = np_arr[i,:]
        diffs = np.sqrt((np_arr[i+1:,:] - sample)**2)
        end_ind = start_ind+len(diffs)
        np_diffs[start_ind:end_ind,:] = diffs
        start_ind = end_ind
    return np_diffs

In [5]:
X_diffs = pairwise_diffs(X)
y_diffs = pairwise_diffs(y[np.newaxis].T)

In [6]:
print X_diffs.shape
print y_diffs.shape
print np.mean(y_diffs)

(10153, 13)
(10153, 1)
448.978025549


In [7]:
print np.linalg.norm(X_diffs)
print np.linalg.norm(X_diffs/np.linalg.norm(X_diffs))
print np.linalg.norm(np.dot(X_diffs.T/np.linalg.norm(X_diffs),X_diffs/np.linalg.norm(X_diffs)))

120360.654457
1.0
0.998361433965


## Learn optimal qeights

In [8]:
def optimize_weights(X_diffs, y_diffs):
    #sc = (np.linalg.norm(np.dot(X_diffs.T,X_diffs)))**.5
    sc = np.linalg.norm(X_diffs)
    A = X_diffs/sc
    b = y_diffs/sc
    w = cvx.Variable(X_diffs.shape[1])
    #objective = cvx.Minimize(cvx.sum_entries(cvx.huber(A*w - b,1000)))
    objective = cvx.Minimize(cvx.norm(A*w - b,2))
    constraints = [0 <= w]

    prob = cvx.Problem(objective, constraints)
    prob.solve()
    return prob.status, w.value

In [9]:
statusprob, weights = optimize_weights(X_diffs, y_diffs)

In [10]:
np.round(weights)

matrix([[   2.],
        [   0.],
        [   0.],
        [   0.],
        [   0.],
        [  14.],
        [   0.],
        [   0.],
        [  16.],
        [ 156.],
        [  29.],
        [   4.],
        [   0.]])

## LOOCV KNN with weights

In [11]:
loo = cross_validation.LeaveOneOut(len(y)-1)

resid = []

for train_index, test_index in loo:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    '''
    compute weights
    '''
    X_diffs = []
    y_diffs = []
    X_diffs = pairwise_diffs(X_train)
    y_diffs = pairwise_diffs(y_train[np.newaxis].T)
    statusprob, weights = optimize_weights(X_diffs, y_diffs)
    '''
    predict
    '''
    knn = neighbors.KNeighborsRegressor(n_neighbors=1)
    y_pred = knn.fit(X_train*np.array(weights.T), y_train).predict(X_test*np.array(weights.T))
    
    resid.append(abs(y_pred - y_test))
print 'mean resid', np.mean(resid)
print 'median resid', np.median(resid)

mean resid 164.14701338
median resid 42.5
