In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import csv
import random
import math
import operator
from copy import deepcopy

###KNN Regression
#loadDataset and splitting
def loadDataset(filename, split, trainingSet=None, testSet=None, numColumns=11):
    """Read a CSV file and randomly partition its rows into train and test lists.

    The first line of the file is treated as a header and skipped. The first
    ``numColumns`` fields of every remaining row are converted to ``float`` in
    place; any further fields are left as strings.

    Parameters:
        filename: path of the CSV file to read.
        split: probability that a row is placed in the training set; a row
            goes to ``trainingSet`` when ``random.random() < split``.
        trainingSet: optional list that receives the training rows (appended
            in place). A fresh list is created when omitted.
        testSet: optional list that receives the test rows (appended in
            place). A fresh list is created when omitted.
        numColumns: number of leading columns to convert to float.

    Returns:
        The tuple ``(trainingSet, testSet)``; these are the same list objects
        that were passed in, when given.

    Raises:
        ValueError: if one of the first ``numColumns`` fields is not numeric.
        IndexError: if a row has fewer than ``numColumns`` fields.
    """
    # Fresh lists per call: a mutable default would be shared by every call
    # and silently accumulate rows from earlier loads.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    # The context manager closes the file even if a row fails to parse;
    # newline='' is what the csv module expects from file objects.
    with open(filename, newline='') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header row, if there is one
        for row in reader:
            for column in range(numColumns):
                row[column] = float(row[column])
            if random.random() < split:
                trainingSet.append(row)
            else:
                testSet.append(row)

    return trainingSet, testSet
                
#compute euclideanDistance
def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance between two points.

    Only the first ``length`` coordinates of ``instance1`` and ``instance2``
    take part in the computation; anything after that is ignored.
    """
    squared_total = 0
    for axis in range(length):
        gap = instance1[axis] - instance2[axis]
        squared_total += gap ** 2
    return math.sqrt(squared_total)

#compute neighbors
def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` rows of ``trainingSet`` closest to ``testInstance``.

    Distances are Euclidean and use every element of the rows except the
    last one. Rows are returned nearest first; ties keep their training-set
    order. Raises IndexError when ``k`` exceeds the number of training rows.
    """
    feature_count = len(testInstance) - 1  # last element is left out of the distance
    scored = [
        (candidate, euclideanDistance(testInstance, candidate, feature_count))
        for candidate in trainingSet
    ]
    ranked = sorted(scored, key=lambda pair: pair[1])
    return [ranked[rank][0] for rank in range(k)]

def getResponse(neighbors):
    """Predict by averaging column 10 over the given neighbour rows."""
    targets = np.array(neighbors)[:, 10]
    total = sum(targets)
    return total / len(targets)

def getResponseweighted(neighbors):
    """Predict by averaging column 4 over the given neighbour rows.

    Despite the name, this is a plain (unweighted) mean; it differs from
    ``getResponse`` only in which column holds the value being averaged.
    """
    targets = np.array(neighbors)[:, 4]
    total = sum(targets)
    return total / len(targets)

##Model Evaluation
#RMSE
def rmse(Y, Y_pred):
    """Root-mean-square error between actual values ``Y`` and ``Y_pred``.

    ``Y - Y_pred`` must support element-wise subtraction (e.g. ``Y`` is a
    NumPy array).
    """
    residuals = Y - Y_pred
    mean_squared_error = sum(residuals ** 2) / len(Y)
    return np.sqrt(mean_squared_error)

#R2
def r2_score(Y, Y_pred):
    """Coefficient of determination: 1 - (residual SS / total SS).

    The total sum of squares is taken around the mean of ``Y``; the residual
    sum of squares is taken between ``Y`` and ``Y_pred``.
    """
    centered = Y - np.mean(Y)
    residuals = Y - Y_pred
    total_ss = sum(centered ** 2)
    residual_ss = sum(residuals ** 2)
    return 1 - (residual_ss / total_ss)

In [2]:
## Load the data and split it into training and test rows
split = 0.8
trainingSet, testSet = [], []
loadDataset('cp.csv', split, trainingSet, testSet)  # appends to both lists in place
print('Train set: ' + repr(len(trainingSet)))
print('Test set: ' + repr(len(testSet)))

# Reduced-feature copies of both sets: columns 0 and 2-6 are removed, which
# leaves original columns 1, 7, 8, 9 and 10 (the target moves to index 4).
weighted_train = deepcopy(trainingSet)
weighted_test = deepcopy(testSet)
for rows in (weighted_train, weighted_test):
    for row in rows:
        # Delete from the highest index down so earlier deletions do not
        # shift the positions still to be removed.
        for column in (6, 5, 4, 3, 2, 0):
            del row[column]

# KNN predictions for every test row, on the full and the reduced feature sets
k = 3
predictions = [
    getResponse(getNeighbors(trainingSet, instance, k))
    for instance in testSet
]
predictions_w = [
    getResponseweighted(getNeighbors(weighted_train, instance, k))
    for instance in weighted_test
]

# Actual target values and the matching prediction lists
Y = np.array(testSet)[:, 10]
Y_w = np.array(weighted_test)[:, 4]
Y_pred_knn = predictions
Y_pred_knn_w = predictions_w

# Evaluation: full-feature KNN
print("RMSE (KNN)")
print(rmse(Y, Y_pred_knn))
print("R2 Score (KNN)")
print(r2_score(Y, Y_pred_knn))
# Evaluation: reduced-feature KNN
print("RMSE (KNN_W)")
print(rmse(Y_w, Y_pred_knn_w))
print("R2 Score (KNN_W)")
print(r2_score(Y_w, Y_pred_knn_w))

Train set: 6997
Test set: 1723
RMSE (KNN)
4730.331821328421
R2 Score (KNN)
0.8996727251917191
RMSE (KNN_W)
5319.30951237083
R2 Score (KNN_W)
0.8731336851278295


In [3]:
### Multivariate Linear Regression
# Convert the training rows to an array once and slice each feature column
# from it, rather than rebuilding the whole array for every column.
train_matrix = np.array(trainingSet)
EngineFuelType = train_matrix[:, 0]
EngineHP = train_matrix[:, 1]
EngineCylinders = train_matrix[:, 2]
TransmissionType = train_matrix[:, 3]
Driven_Wheels = train_matrix[:, 4]
NumberofDoors = train_matrix[:, 5]
VehicleSize = train_matrix[:, 6]
highwayMPG = train_matrix[:, 7]
citympg = train_matrix[:, 8]
Age = train_matrix[:, 9]
# Column 10 is not a feature: it is the regression target (presumably MSRP)
# and is read separately where the target vector is built.

In [4]:
# Show the first test row as a quick look at the parsed column values.
print(testSet[0])

[1.0, 300.0, 6.0, 1.0, 1.0, 2.0, 1.0, 28.0, 19.0, 6.0, 40650.0]


In [5]:
# Build the design matrices and the starting coefficients (the theta values)
m = len(trainingSet)
x0 = np.ones(m)  # column of ones for the intercept term
full_features = [x0, EngineFuelType, EngineHP, EngineCylinders, TransmissionType,
                 Driven_Wheels, NumberofDoors, VehicleSize, highwayMPG, citympg, Age]
selected_features = [x0, EngineHP, Age, highwayMPG, citympg]
# Each list entry is one column, so transpose to get one row per sample.
X = np.array(full_features).T
X_w = np.array(selected_features).T
# Initial coefficients: one integer zero per design-matrix column
B = np.zeros(X.shape[1], dtype=int)
B_w = np.zeros(X_w.shape[1], dtype=int)
# Training target (column 10); note this rebinds Y
Y = np.array(trainingSet)[:, 10]
# learning rate
alpha = 1e-5

# define our cost function.
def cost_function(X, Y, B):
    """Half mean squared error of the linear model ``X.dot(B)`` against ``Y``.

    Computes ``sum((X.dot(B) - Y) ** 2) / (2 * len(Y))``.
    """
    residuals = X.dot(B) - Y
    sample_count = len(Y)
    return np.sum(residuals ** 2) / (2 * sample_count)

# Cost of the all-zero starting coefficients on the full design matrix.
inital_cost = cost_function(X, Y, B)
print("Initial Cost")
print(inital_cost)
# Same check for the reduced design matrix; inital_cost is reused for it.
# With all-zero coefficients every prediction is 0, so both costs are equal.
inital_cost = cost_function(X_w, Y, B_w)
print("Initial Cost weighted")
print(inital_cost)

Initial Cost
593336899.2428184
Initial Cost weighted
593336899.2428184


In [6]:
#Defining the Gradient Descent
def gradient_descent(X, Y, B, alpha, iterations):
    """Run batch gradient descent for a linear model and record the cost.

    Parameters:
        X: design matrix, one row per sample.
        Y: target values, one per sample.
        B: starting coefficients; the caller's array is not modified.
        alpha: learning rate (step size).
        iterations: number of update steps to perform.

    Returns:
        ``(B, cost_history)``: the final coefficients and a list holding the
        cost after each update.
    """
    sample_count = len(Y)
    cost_history = []

    for _ in range(iterations):
        # Residual of the current hypothesis against the targets
        residual = X.dot(B) - Y
        # Step against the averaged gradient; this rebinds B to a new array
        B = B - alpha * (X.T.dot(residual) / sample_count)
        # Cost is recorded after the update
        cost_history.append(cost_function(X, Y, B))

    return B, cost_history

# Fit both models with the same number of gradient-descent steps
n_iterations = 10000
newB, cost_history = gradient_descent(X, Y, B, alpha, n_iterations)
newB_w, cost_history_w = gradient_descent(X_w, Y, B_w, alpha, n_iterations)
# Fitted coefficients: full model first, then the reduced-feature model
print("New Coefficients")
print(newB)
print("New Coefficients Weighted")
print(newB_w)

New Coefficients
[  -4.64705236  -37.41825681  127.6934206  -105.08366471   70.62263181
  105.80212381   39.30575845  -17.61690202  125.43327001  106.6959922
 -799.39711222]
New Coefficients Weighted
[  -4.77896927  126.2752366  -808.78897305  129.60564634  110.67120194]


In [7]:
# Cost after the last gradient-descent iteration, for the full model and
# for the reduced-feature ("weighted") model.
print("Final Cost")
print(cost_history[-1])
print("Final Cost Weighted")
print(cost_history_w[-1])

Final Cost
35574006.380012535
Final Cost Weighted
35791826.0090052


In [8]:
## Predicting on the test set
# Convert the test rows to an array once and slice every column from it,
# rather than rebuilding the whole array for each feature.
test_matrix = np.array(testSet)
EngineFuelType_T = test_matrix[:, 0]
EngineHP_T = test_matrix[:, 1]
EngineCylinders_T = test_matrix[:, 2]
TransmissionType_T = test_matrix[:, 3]
Driven_Wheels_T = test_matrix[:, 4]
NumberofDoors_T = test_matrix[:, 5]
VehicleSize_T = test_matrix[:, 6]
highwayMPG_T = test_matrix[:, 7]
citympg_T = test_matrix[:, 8]
Age_T = test_matrix[:, 9]

# NOTE: m and x0 are rebound here to the test-set size, replacing the
# training-set values assigned when the training design matrix was built.
m = len(testSet)
x0 = np.ones(m)
# Column order must match the training design matrices so that each fitted
# coefficient multiplies the feature it was trained on.
X_test = np.array([x0, EngineFuelType_T, EngineHP_T, EngineCylinders_T, TransmissionType_T,
                   Driven_Wheels_T, NumberofDoors_T, VehicleSize_T, highwayMPG_T, citympg_T, Age_T]).T
X_test_w = np.array([x0, EngineHP_T, Age_T, highwayMPG_T, citympg_T]).T
Y_test = test_matrix[:, 10]

# Linear-model predictions: full features, then the reduced feature set
Y_pred_mlr = X_test.dot(newB)
Y_pred_mlr_w = X_test_w.dot(newB_w)

print("RMSE (MLR)")
print(rmse(Y_test, Y_pred_mlr))
print("R2 Score (MLR)")
print(r2_score(Y_test, Y_pred_mlr))
print("RMSE (MLR) Weighted")
print(rmse(Y_test, Y_pred_mlr_w))
print("R2 Score (MLR) Weighted")
print(r2_score(Y_test, Y_pred_mlr_w))

RMSE (MLR)
8447.541487926605
R2 Score (MLR)
0.6800394244104737
RMSE (MLR) Weighted
8481.274117343766
R2 Score (MLR) Weighted
0.6774789963837355


In [9]:
### Averaging MLR and KNN model
# Equal-weight blend of the full-feature linear-regression and KNN test predictions.
# Y_pred_knn is a plain list; adding it to the ndarray Y_pred_mlr is element-wise.
pred_mlr_knn = (Y_pred_mlr+Y_pred_knn)/2

In [10]:
# Score the equal-weight blend against the test targets.
print("RMSE (Final Result)")
print(rmse(Y_test, pred_mlr_knn ))
print("R2 Score (Final Result)")
print(r2_score(Y_test, pred_mlr_knn ))

RMSE (Final Result)
5696.924111584599
R2 Score (Final Result)
0.8544820153984418


In [11]:
### Averaging MLR and KNN model with feature selection
# Equal-weight blend of the reduced-feature predictions of both models.
# NOTE: this rebinds pred_mlr_knn, replacing any value it held before this cell ran.
pred_mlr_knn = (Y_pred_mlr_w+Y_pred_knn_w)/2

In [12]:
# Score the current value of pred_mlr_knn against the test targets.
print("RMSE (Final Result) with attribute selection on both")
print(rmse(Y_test, pred_mlr_knn ))
print("R2 Score (Final Result) with attribute selection on both")
print(r2_score(Y_test, pred_mlr_knn ))

RMSE (Final Result) with attribute selection on both
6001.785405759958
R2 Score (Final Result) with attribute selection on both
0.8384909997229634


In [13]:
### Blending MLR and KNN with weights mlr=4/14, knn=10/14
# Reduced-feature linear regression is combined with full-feature KNN.
# The weights are applied by plain multiplication; np.asarray is required
# because Y_pred_knn is a Python list and `10 * list` would repeat the list
# instead of scaling its values.
pred_mlr_knn_w = (4 * Y_pred_mlr_w + 10 * np.asarray(Y_pred_knn)) / 14

In [14]:
# Score the current value of pred_mlr_knn_w against the test targets.
print("RMSE (Final Result) weighted with attribute selection on linear regression")
print(rmse(Y_test, pred_mlr_knn_w ))
print("R2 Score (Final Result) weighted with attribute selection on linear regression")
print(r2_score(Y_test, pred_mlr_knn_w ))

RMSE (Final Result) weighted with attribute selection on linear regression
4965.782922108323
R2 Score (Final Result) weighted with attribute selection on linear regression
0.8894366309876126


In [15]:
### Blending MLR and KNN with weights mlr=10/14, knn=4/14
# Full-feature linear regression is combined with reduced-feature KNN.
# The weights are applied by plain multiplication; np.asarray is required
# because Y_pred_knn_w is a Python list and `4 * list` would repeat the list
# instead of scaling its values.
# NOTE: this rebinds pred_mlr_knn_w, replacing any value it held before.
pred_mlr_knn_w = (10 * Y_pred_mlr + 4 * np.asarray(Y_pred_knn_w)) / 14

In [16]:
# Score the current value of pred_mlr_knn_w against the test targets.
print("RMSE (Final Result) weighted with attribute selection on KNN")
print(rmse(Y_test, pred_mlr_knn_w ))
print("R2 Score (Final Result) weighted with attribute selection on KNN")
print(r2_score(Y_test, pred_mlr_knn_w ))

RMSE (Final Result) weighted with attribute selection on KNN
6904.035285030809
R2 Score (Final Result) weighted with attribute selection on KNN
0.7862816526337091
