In [1]:
print(__doc__)

# Code source adapted from: Jaques Grobler
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error
import pandas as pd
import os
import csv

Automatically created module for IPython interactive environment


In [2]:
def treat_dataset(dataset):
    vcut = {'Fair': 0, 'Good': 1, 'Very Good': 2, 'Premium': 3, 'Ideal': 4}
    vcolor = {'D': 6, 'E': 5, 'F': 4, 'G': 3, 'H': 2, 'I': 1, 'J': 0}
    vclarity = {'I1': 0, 'SI2': 1, 'SI1': 2, 'VS2': 3, 'VS1': 4, 'VVS2': 5, 'VVS1': 6, 'IF': 7}
    
    for row in dataset:
        # Modify string to number values
        row["cut"] = vcut[row["cut"]]
        row["color"] = vcolor[row["color"]]
        row["clarity"] = vclarity[row["clarity"]]
        
        # Normalize values
        row["carat"] = (float(row["carat"]) - (0.2+5.01)/2)/(0.2+5.01)
        row["cut"] = (float(row["cut"]) - (4/2))/4
        row["color"] = (float(row["color"]) - (6/2))/6
        row["clarity"] = (float(row["clarity"]) - (7/2))/7
        row["x"] = (float(row["x"]) - (10.74/2))/10.74
        row["y"] = (float(row["y"]) - (58.9/2))/58.9
        row["z"] = (float(row["z"]) - (31.8/2))/31.8
        row["depth"] = (float(row["depth"]) - (43+79)/2)/(43+79)
        row["table"] = (float(row["table"]) - (43+95)/2)/(43+95)
        
        # Add X0 for ease of use
        row["x0"] = 1

In [3]:
# Read and treat training dataset
dataset_train = []
reader = csv.DictReader(open('diamonds-train.csv', 'r'))
for line in reader:
     dataset_train.append(line)

treat_dataset(dataset_train)
train_df = pd.DataFrame(dataset_train)
print(train_df)

# Read and treat test dataset
dataset_test = []
reader = csv.DictReader(open('diamonds-test.csv', 'r'))
for line in reader:
     dataset_test.append(line)

treat_dataset(dataset_test)
test_df = pd.DataFrame(dataset_test)
print(test_df)

# Auxilary vector for name to number mapping
ds_index = ["x0", "carat", "clarity", "color", "cut", "depth", "table", "x", "y", "z", "price"]

          carat   clarity     color   cut     depth  price     table  \
0     -0.189060 -0.071429 -0.333333  0.50  0.009016  10501 -0.101449   
1     -0.442418 -0.214286  0.000000  0.25 -0.018033    574 -0.065217   
2     -0.237044  0.071429  0.166667  0.50 -0.011475  11649 -0.086957   
3     -0.436660  0.214286 -0.166667 -0.50  0.033607    922 -0.072464   
4     -0.438580 -0.071429  0.166667  0.00 -0.018033    602 -0.050725   
5     -0.402111 -0.357143  0.333333  0.25  0.013934   1205 -0.123188   
6     -0.212092 -0.071429 -0.166667 -0.25  0.021311  10291 -0.079710   
7     -0.336852  0.071429  0.000000  0.50  0.006557   4373 -0.101449   
8     -0.436660  0.071429  0.166667  0.50  0.001639    723 -0.094203   
9     -0.288868  0.071429  0.000000  0.50  0.002459   6535 -0.108696   
10    -0.402111  0.214286  0.333333  0.00 -0.008197   2365 -0.079710   
11    -0.396353 -0.071429  0.333333  0.50  0.005738   1754 -0.101449   
12    -0.210173 -0.071429  0.000000  0.25  0.013934  12756 -0.07

In [28]:
def calculate_cost_function(thetas, data):
    m = len(data)
    s = 0
    for row in range(data):
        h = 0
        for i in range(thetas):
            h += thetas[i] * row[ds_index[i]]
        s += (h - float(row["price"]))*(h - float(row["price"]))
    return (1/(2*m)) * s

In [10]:
# STOCHASTIC WITH NUMBER OF ITERATIONS

# Parameters
# Alpha
learningRate = 1e-02
# Number of cases in training
m = len(dataset_train)
# Thetas Vector
thetas = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
# Number of iterations
iterations = 10000000

# In the Stochastic mode, the error is calculated using only one row of the data set
# We'll use them one by one 
row = 0
while(iterations > 0):
    
    new_thetas = thetas.copy() 
        
    # For each theta we do the following
    for k in range(len(thetas)):
        h = 0
        
        for i in range(len(thetas)):
            h += thetas[i] * dataset_train[row][ds_index[i]]

        # Updating the new thetas vector values
        new_thetas[k] = thetas[k] - (learningRate * (h - float(dataset_train[row]["price"])) * dataset_train[row][ds_index[k]])
        
    # Updating row that will be used to calculate the error
    row = (row + 1) % m
            
     # Atualization of the values of the thetas
    thetas = new_thetas.copy()
    iterations = iterations - 1

# Printing Thetas     
print(thetas)

[21387.540620268242, 56335.38207639048, 3500.3465747941714, 1780.0263511806272, 484.4891375708398, -9044.444555218413, -4386.116955667563, -8366.306011698367, 718.0776041618209, -5893.918139873633]


In [18]:
# STOCHASTIC WITH STOP CONDITION

# Parameters
# Alpha
learningRate = 1e-02
# Number of cases in training
m = len(dataset_train)
# Thetas Vector
thetas = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
# Stop Conditions
stopCondition = 1e-06
done = False

# In the Stochastic mode, the error is calculated using only one row of the data set
# We'll use them one by one 
row = 0
while(not done):
    
    new_thetas = thetas.copy() 
        
    # For each theta we do the following
    for k in range(len(thetas)):
        h = 0
        
        for i in range(len(thetas)):
            h += thetas[i] * dataset_train[row][ds_index[i]]

        # Updating the new thetas vector values
        new_thetas[k] = thetas[k] - (learningRate * (h - float(dataset_train[row]["price"])) * dataset_train[row][ds_index[k]])
        
    # Updating row that will be used to calculate the error
    row = (row + 1) % m
            
    # If the change in value for new thetas is too small, we can stop iterating
    done = True
    for k in range(len(thetas)):
        done = abs(thetas[k] - new_thetas[k]) < stopCondition and done       
    

    # Atualization of the values of the thetas
    thetas = new_thetas.copy()

# Printing Thetas
print(thetas)

[20335.04534561404, 56366.87506396955, 3560.2557854440156, 2103.5441517503214, 445.4072566808454, -8765.98482168368, -4373.871137887192, -7803.213670927394, -774.3454900983065, -6991.270432234771]


In [35]:
# BATCH GRADIENT ALGORITHM WITH NUMBER OF ITERATIONS

# Parameters
# Alpha
learningRate = 1e-01
# Number of cases in training
m = len(dataset_train)
# Thetas Vector
thetas = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
# Number of iterations
iterations = 500


while(iterations > 0):
    
    new_thetas = thetas.copy()
    
    # For each theta we do the following
    for k in range(len(thetas)):
        
        s = 0
        # We add every row of the dataset to the error calculation (Batch)
        for row in dataset_train:
            
            h = 0
            # Calculating the value for theta
            for i in range(len(thetas)):
                h += thetas[i] * row[ds_index[i]]
                
            s += (h - float(row["price"])) * row[ds_index[k]]
        
        # Updating the new thetas vector values
        new_thetas[k] = thetas[k] - ((learningRate / m) * s)

    # Atualization of the values of the thetas
    thetas = new_thetas.copy()
    iterations = iterations - 1
    
# Printing Thetas    
print(thetas)


[6317.864335434728, 8806.014460402721, -223.1998447334793, -728.6385472819542, 305.79922514735097, 4.499858042021787, -301.74968886652266, 12053.276565945655, -384.8146964431467, -6.621754364416988]


In [None]:
# BATCH GRADIENT ALGORITHM WITH STOP CONDITION

# Parameters
# Alpha
learningRate = 1e-06
# Number of cases in training
m = len(dataset_train)
# Thetas Vector
thetas = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
# Stop Conditions
iterations = 1e-03
done = False

while(not done):
    
    new_thetas = thetas.copy()
    
    # For each theta we do the following
    for k in range(len(thetas)):
        
        s = 0
        # We add every row of the dataset to the error calculation (Batch)
        for row in dataset_train:
            
            h = 0
            # Calculating the value for theta
            for i in range(len(thetas)):
                h += thetas[i] * row[ds_index[i]]
                
            s += (h - float(row["price"])) * row[ds_index[k]]
        
        # Updating the new thetas vector values
        new_thetas[k] = thetas[k] - ((learningRate / m) * s)
    
    # If the change in value for new thetas is too small, we can stop iterating
    done = True
    for k in range(len(thetas)):
        done = abs(thetas[k] - new_thetas[k]) < stopCondition and done
    
    # Atualization of the values of the thetas
    thetas = new_thetas.copy()
    
# Printing Thetas    
print(thetas)


In [50]:
# MINI BATCH GRADIENT ALGORITHM WITH NUMBER OF ITERATIONS

# Parameters
# Alpha
learningRate = 1e-01
# Number of cases in training
m = len(dataset_train)
# Thetas Vector
thetas = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
# Number of iterations
iterations = 100000
batch_size = 10


while(iterations > 0):
    
    new_thetas = thetas.copy()
    
    # Step through the dataset in chuncks
    for row in range(0, len(dataset_train), batch_size):
        # For each theta we do the following
        for k in range(len(thetas)):

            s = 0
            # We add every row of the dataset to the error calculation (Batch)
            for offset in range(batch_size):
                if row + offset >= m:
                    break
                
                h = 0
                # Calculating the value for theta
                for i in range(len(thetas)):
                    h += thetas[i] * dataset_train[row+offset][ds_index[i]]

                s += (h - float(dataset_train[row]["price"])) * dataset_train[row+offset][ds_index[k]]

            # Updating the new thetas vector values
            new_thetas[k] = thetas[k] - ((learningRate / m) * s)
        
        iterations = iterations - 1

    # Atualization of the values of the thetas
    thetas = new_thetas.copy()
    
# Printing Thetas    
print(thetas)

[12.43604698697453, -2.9691764985381304, 0.3645707306582003, 2.693597835348101, 4.811201079709588, 0.9958056491450533, -0.01281832570631184, 1.4521309395756004, -3.5853871249860463, -3.4449224040921576]


In [48]:
print(dataset_test[2]["price"])

teste = 0
for i in range(len(thetas)):
    teste += thetas[i] * dataset_test[2][ds_index[i]]

print(teste)


6145
164.70116871484413
