In [1]:
%matplotlib inline

# Linear Regression task - Diamond price training

### Imports

In [2]:
print(__doc__)

import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error

Automatically created module for IPython interactive environment


### CSV reading

In [3]:
# columns 1, 5, 6, 8, 9, 10 have numerical variables
# column 7 contains the target: diamond price
diamonds_data = np.genfromtxt('diamonds.csv', delimiter=",", skip_header=1,
                       usecols=(1, 5, 6, 8, 9, 10, 7))

print(diamonds_data.shape)
print(diamonds_data[:, np.newaxis, 6]) # target

(53940, 7)
[[  326.]
 [  326.]
 [  327.]
 ..., 
 [ 2757.]
 [ 2757.]
 [ 2757.]]


### Feature and target selection

In [4]:
diamonds_features_df = pd.DataFrame(diamonds_data[:, 0:6])
diamonds_target_df = pd.DataFrame(diamonds_data[:, np.newaxis, 6])

print(diamonds_features_df)
print(diamonds_target_df)

          0     1   2     3     4     5
0      0.23  61.5  55  3.95  3.98  2.43
1      0.21  59.8  61  3.89  3.84  2.31
2      0.23  56.9  65  4.05  4.07  2.31
3      0.29  62.4  58  4.20  4.23  2.63
4      0.31  63.3  58  4.34  4.35  2.75
5      0.24  62.8  57  3.94  3.96  2.48
6      0.24  62.3  57  3.95  3.98  2.47
7      0.26  61.9  55  4.07  4.11  2.53
8      0.22  65.1  61  3.87  3.78  2.49
9      0.23  59.4  61  4.00  4.05  2.39
10     0.30  64.0  55  4.25  4.28  2.73
11     0.23  62.8  56  3.93  3.90  2.46
12     0.22  60.4  61  3.88  3.84  2.33
13     0.31  62.2  54  4.35  4.37  2.71
14     0.20  60.2  62  3.79  3.75  2.27
15     0.32  60.9  58  4.38  4.42  2.68
16     0.30  62.0  54  4.31  4.34  2.68
17     0.30  63.4  54  4.23  4.29  2.70
18     0.30  63.8  56  4.23  4.26  2.71
19     0.30  62.7  59  4.21  4.27  2.66
20     0.30  63.3  56  4.26  4.30  2.71
21     0.23  63.8  55  3.85  3.92  2.48
22     0.23  61.0  57  3.94  3.96  2.41
23     0.31  59.4  62  4.39  4.43  2.62


In [None]:


#print(diamonds_features_df)
#print(diamonds_target_df)

#print(diamonds_features_df[0:10])

#print(list(enumerate(diamonds_features_df[0])))
#print()

#print(list(diamonds_features_df.iloc[0, :]))



# Linear Regression implementation


The main formula for the iteration steps in the Gradient Descent algorithm is

$$
\theta_j = \theta_j - \alpha \frac{1}{m}
    \sum_{i=1}^{m}{\left(h_\theta\left(x^{\left(i\right)}\right) -
    y^{\left(i\right)}\right) x_j^{\left(i\right)}}
$$

for all $j = 0, 1, ..., n$, $j$ being the index of the parameter, and
$i$ being the index of the data example.

$$
\theta_0 = \theta_0 - \alpha \frac{1}{m}
    \sum_{i=1}^{m}{\left(h_\theta\left(x^{\left(i\right)}\right) -
    y^{\left(i\right)}\right) x_0^{\left(i\right)}}
\\
\theta_1 = \theta_1 - \alpha \frac{1}{m}
    \sum_{i=1}^{m}{\left(h_\theta\left(x^{\left(i\right)}\right) -
    y^{\left(i\right)}\right) x_1^{\left(i\right)}}
\\
\theta_2 = \theta_2 - \alpha \frac{1}{m}
    \sum_{i=1}^{m}{\left(h_\theta\left(x^{\left(i\right)}\right) -
    y^{\left(i\right)}\right) x_2^{\left(i\right)}}
\\
...
$$

We define a difference between the cost and the target, $k$, such that
$$
k^{\left(i\right)} = h_\theta\left(x^{\left(i\right)}\right) - y^{\left(i\right)}
$$
It can be noted that $k_i$ is used repeatedly among all the
parameter calculations. For this reason, we will calculate all the
$k_i$ just once and use it for all parameters.

$$
\theta_j = \theta_j - \alpha \frac{1}{m}
    \sum_{i=1}^{m}{k^{\left(i\right)} ~ x_j^{\left(i\right)}}
$$


In [10]:
def cost(parameters, features):
    '''Calculates the cost of the data example

    Calculates the cost of a data example given the
    current parameter values and the data example features'
    values, using the general formula
    
        cost = param1 * feat1 + param2 * feat2 + ... +
               paramM * featM

    Arguments:
        parameters (np.ndarray(float64)):
            Current parameter values.
        features (np.ndarray(float64)):
            Data example features' values.

    Returns:
        float: the cost function value for the given data example.
    '''
    sum(param * feat for param, feat in zip(parameters, features))

In [11]:
# this is our model, what we want to have as a final result
parameters = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

# this is the number of data examples that we have. for this dataset: 53940
data_size = diamonds_target_df.shape[0]

#costs = np.array(examples_size * [0.0])[:, np.newaxis]
#print(costs.shape)


#k_diff = [cost(parameters, diamonds_features_df[i])
#          for i in range(data_size)]

k_diff = [cost(parameters, diamonds_features_df[i]) - diamonds_target_df[i]
          for i in range(data_size)]


#for i in range(data_size):
#    cost_i = cost(parameters, diamonds_target_df.shape)


TypeError: unsupported operand type(s) for -: 'NoneType' and 'float'

In [None]:
def cost(example):
    '''Calculates the cost of the data example

    Calculates the cost of a data example received as tuples
    in which the first element is the parameter value and
    the second element is the variable value.

    Arguments:
        example (list(tuple(int, int))): List of tuples
            containing the parameters and the variables.

    Returns:
        float: the cost of this data example.
    '''
    sum(param * feat for param, feat in example)

# def descend_parameter(val):
#     new_val = val - lrate / M * (sum (i = 1 to M) -> cost(x(i)) - y(i) * x(j)(i) )

# costs = []

parameters = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


examples_size = diamonds_target_df.shape[0]

costs = 

print(costs)
