In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import csv
import random
import math
import operator

###KNN Regression
#loadDataset and splitting
def loadDataset(filename, split, trainingSet=None, testSet=None, numColumns=11):
    """Read a CSV file and randomly split its rows into training and test sets.

    The first line of the file is treated as a header and skipped. For every
    remaining non-empty row, the first ``numColumns`` fields are converted to
    ``float`` in place; any further fields are left as the strings read from
    the file. Each row is then appended to ``trainingSet`` when a fresh
    ``random.random()`` draw is below ``split``, otherwise to ``testSet``.

    Args:
        filename: Path of the CSV file to read.
        split: Probability, in [0, 1], that a row lands in the training set.
        trainingSet: Optional list to append training rows to. A new list is
            created when omitted (the lists are no longer shared between calls).
        testSet: Optional list to append test rows to. A new list is created
            when omitted.
        numColumns: Number of leading columns to convert to ``float``.

    Returns:
        The tuple ``(trainingSet, testSet)``; these are the same list objects
        the caller passed in, when they were supplied.

    Raises:
        ValueError: If one of the leading fields cannot be parsed as a float.
        IndexError: If a non-empty row has fewer than ``numColumns`` fields.
    """
    # Fresh lists per call: mutable default arguments would silently
    # accumulate rows across calls.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    # newline='' is what the csv module expects; the context manager makes
    # sure the file handle is closed even if a row fails to parse.
    with open(filename, newline='') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header row (no error on an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to load.
                continue
            for column in range(numColumns):
                row[column] = float(row[column])  # changing the datatype
            # One random draw per data row decides which side of the split it joins.
            if random.random() < split:
                trainingSet.append(row)
            else:
                testSet.append(row)

    return trainingSet, testSet
                
#compute euclideanDistance
def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance between two points.

    Only the first ``length`` coordinates of ``instance1`` and ``instance2``
    are compared; anything after that index is ignored.
    """
    squaredSum = 0
    for index in range(length):
        delta = instance1[index] - instance2[index]
        squaredSum += delta ** 2
    return math.sqrt(squaredSum)

#compute neighbors
def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` training rows closest to ``testInstance``.

    The distance is computed with ``euclideanDistance`` over every element of
    ``testInstance`` except the last one. Rows at equal distance keep their
    original relative order because the sort is stable. An ``IndexError`` is
    raised when ``k`` is larger than the number of training rows.
    """
    featureCount = len(testInstance) - 1
    scored = [
        (candidate, euclideanDistance(testInstance, candidate, featureCount))
        for candidate in trainingSet
    ]
    scored.sort(key=lambda pair: pair[1])
    return [scored[rank][0] for rank in range(k)]

def getResponse(neighbors, targetIndex=10):
    """Return the mean target value of the given neighbour rows.

    Args:
        neighbors: Non-empty sequence of rows (each an indexable sequence).
        targetIndex: Position of the target value inside each row. Defaults to
            10, the column this notebook uses as the regression target.

    Returns:
        The arithmetic mean of ``row[targetIndex]`` over all rows, as a
        ``numpy.float64``.

    Raises:
        ValueError: If ``neighbors`` is empty.
    """
    # Pull out only the target column. Converting whole rows to one array
    # would coerce every value to a string if any row carries a non-numeric
    # trailing field.
    targets = np.array([row[targetIndex] for row in neighbors], dtype=float)
    if targets.size == 0:
        raise ValueError("getResponse() needs at least one neighbor")
    return targets.sum() / len(targets)

##Model Evaluation
#RMSE
def rmse(Y, Y_pred):
    """Return the root-mean-square error between targets and predictions.

    Args:
        Y: Observed values (list or array).
        Y_pred: Predicted values, same length as ``Y`` (list or array).

    Returns:
        ``sqrt(sum((Y - Y_pred) ** 2) / len(Y))``. The sum runs over the first
        axis, so 1-D inputs give a scalar.

    Raises:
        ValueError: If ``Y`` is empty.
    """
    # Coerce both inputs so plain Python lists work too (list - list is a
    # TypeError) and the reduction runs inside numpy.
    actual = np.asarray(Y, dtype=float)
    predicted = np.asarray(Y_pred, dtype=float)
    if len(actual) == 0:
        raise ValueError("rmse() requires at least one sample")
    squaredErrors = (actual - predicted) ** 2
    return np.sqrt(np.sum(squaredErrors, axis=0) / len(actual))

#R2
def r2_score(Y, Y_pred):
    """Return the coefficient of determination (R^2) of the predictions.

    Computed as ``1 - ss_res / ss_tot``, where ``ss_res`` is the sum of
    squared residuals and ``ss_tot`` is the sum of squared deviations of
    ``Y`` from its mean.

    Args:
        Y: Observed values (list or array).
        Y_pred: Predicted values, same length as ``Y`` (list or array).

    Returns:
        The R^2 score. When every value in ``Y`` is identical, ``ss_tot`` is
        zero and the result is not finite (numpy emits a runtime warning).

    Raises:
        ValueError: If ``Y`` is empty.
    """
    # Coerce both inputs so plain Python lists work too (list - list is a
    # TypeError) and the reductions run inside numpy.
    actual = np.asarray(Y, dtype=float)
    predicted = np.asarray(Y_pred, dtype=float)
    if len(actual) == 0:
        raise ValueError("r2_score() requires at least one sample")
    mean_y = np.mean(actual)
    ss_tot = np.sum((actual - mean_y) ** 2, axis=0)
    ss_res = np.sum((actual - predicted) ** 2, axis=0)
    return 1 - (ss_res / ss_tot)

In [2]:
##initialize train and set
# Seed the RNG first: loadDataset() assigns rows to train/test with
# random.random(), so without a fixed seed every run produces a different
# split and different metrics.
SEED = 42
random.seed(SEED)

trainingSet=[]
testSet=[]
split = 0.8
loadDataset('Cleancp.csv', split, trainingSet, testSet) #load split the dataset
print('Train set: ' + repr(len(trainingSet))) #display number of training and test set
print('Test set: ' + repr(len(testSet)))


# generate predictions: for each test row, average the target of its k
# nearest training rows
predictions=[]
k = 3
for x in range(len(testSet)):
    neighbors = getNeighbors(trainingSet, testSet[x], k)
    result = getResponse(neighbors)
    predictions.append(result)
Y = np.array(testSet)[:,10]
# Keep the predictions as an ndarray so later element-wise arithmetic with
# other models' predictions cannot fall back to list concatenation.
Y_pred_knn = np.array(predictions)

print("RMSE (KNN)") #calculate for rmse and R2
print(rmse(Y, Y_pred_knn))
print("R2 Score (KNN)")
print(r2_score(Y, Y_pred_knn))

Train set: 7655
Test set: 1890
RMSE (KNN)
4725.255762730904
R2 Score (KNN)
0.9108463832701784


In [3]:
### Multivariate Linear Regression
#Getting the variables to array.
# Convert the training rows to an array once and unpack feature columns 0-9
# from it, rather than rebuilding the whole array for every column.
# Column 10 (the regression target, presumably MSRP - confirm against the
# CSV header) is not unpacked here.
(EngineFuelType, EngineHP, EngineCylinders, TransmissionType, Driven_Wheels,
 NumberofDoors, VehicleSize, highwayMPG, citympg, Age) = np.array(trainingSet)[:, :10].T

In [4]:
# Build the inputs for gradient descent (the theta values start at zero).
m = len(trainingSet)
# Column of ones so the first coefficient acts as the intercept.
x0 = np.ones(m)
X = np.array((
    x0,
    EngineFuelType,
    EngineHP,
    EngineCylinders,
    TransmissionType,
    Driven_Wheels,
    NumberofDoors,
    VehicleSize,
    highwayMPG,
    citympg,
    Age,
)).T
# Initial coefficients: one zero per column of X.
B = np.array([0] * 11)
# Targets: column 10 of every training row.
Y = np.array(trainingSet)[:,10]
# Learning rate (step size) for gradient descent.
alpha = 1e-05

# define our cost function.
def cost_function(X, Y, B):
    """Return the halved mean squared error of the linear model ``X.dot(B)``.

    Computes ``sum((X.dot(B) - Y) ** 2) / (2 * len(Y))``.
    """
    residuals = X.dot(B) - Y
    sampleCount = len(Y)
    return np.sum(residuals ** 2) / (2 * sampleCount)

# Cost of the untrained model (all coefficients zero), used as a baseline.
inital_cost = cost_function(X, Y, B)
print("Initial Cost", inital_cost, sep="\n")

Initial Cost
582436999.8274983


In [5]:
#Defining the Gradient Descent
def gradient_descent(X, Y, B, alpha, iterations):
    """Run batch gradient descent for a fixed number of iterations.

    Each iteration moves the coefficients by ``alpha`` times the gradient
    ``X.T.dot(X.dot(B) - Y) / len(Y)`` and records ``cost_function`` for the
    updated coefficients. The array passed in as ``B`` is not modified.

    Returns:
        A tuple ``(B, cost_history)``: the final coefficients and a list with
        one cost value per iteration.
    """
    sampleCount = len(Y)
    costs = []

    for _ in range(iterations):
        # Residual between the current hypothesis and the observed targets.
        residual = X.dot(B) - Y
        # Average gradient of the cost with respect to each coefficient.
        slope = X.T.dot(residual) / sampleCount
        # Rebind (rather than update in place) so the caller's array is untouched.
        B = B - alpha * slope
        costs.append(cost_function(X, Y, B))

    return B, costs

# Fit the coefficients with 10000 gradient-descent steps.
newB, cost_history = gradient_descent(X, Y, B, alpha, 10000)

# Show the fitted coefficients (intercept first).
print("New Coefficients", newB, sep="\n")

New Coefficients
[ 5.97708652e-01 -2.84850054e+01  1.28809034e+02 -8.12204521e+01
 -3.92879628e+01 -8.61753927e+00  6.52029981e+01 -5.58380460e+00
  9.73589333e+01  1.39136342e+02 -8.24918369e+02]


In [6]:
# Cost after the last gradient-descent iteration (i.e. for newB).
print("Final Cost", cost_history[-1], sep="\n")

Final Cost
36615202.15912354


In [7]:
##predicting
#Getting the variables to array for testSet
# Convert the test rows to an array once and slice every column from it,
# rather than rebuilding the whole array for each column.
testArray = np.array(testSet)
(EngineFuelType_T, EngineHP_T, EngineCylinders_T, TransmissionType_T,
 Driven_Wheels_T, NumberofDoors_T, VehicleSize_T, highwayMPG_T, citympg_T,
 Age_T) = testArray[:, :10].T

# NOTE: m and x0 are rebound here to the test-set size, replacing the
# training-set values defined earlier in the notebook.
m = len(testSet)
x0 = np.ones(m)
# Same column order as the training design matrix: intercept, then features 0-9.
X_test = np.array([x0, EngineFuelType_T, EngineHP_T,EngineCylinders_T, TransmissionType_T,
              Driven_Wheels_T, NumberofDoors_T, VehicleSize_T, highwayMPG_T, citympg_T, Age_T]).T
Y_test = testArray[:,10]

# Linear-model predictions using the coefficients fitted by gradient descent.
Y_pred_mlr = X_test.dot(newB)

print("RMSE (MLR)")
print(rmse(Y_test, Y_pred_mlr))
print("R2 Score (MLR)")
print(r2_score(Y_test, Y_pred_mlr))

RMSE (MLR)
8691.501215956838
R2 Score (MLR)
0.6983676513530199


In [8]:
###Averaging MLR and KNN model
# Equal-weight ensemble: element-wise mean of the two models' predictions.
pred_mlr_knn = 0.5 * (Y_pred_mlr + Y_pred_knn)

In [9]:
# Score the averaged predictions against the held-out targets.
print("RMSE (Final Result)", rmse(Y_test, pred_mlr_knn), sep="\n")
print("R2 Score (Final Result)", r2_score(Y_test, pred_mlr_knn), sep="\n")

RMSE (Final Result)
5830.730803352818
R2 Score (Final Result)
0.8642517287903838
