# machine learning-project1

### load data and split  
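Use `train_test_split` from sklearn to split the data randomly into a training set (80%), a test set (10%) and a validation set (10%), stratified by label.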

In [57]:
import pandas as pd
import numpy as np
import math
import random
from sklearn.model_selection import train_test_split


def loadDataSet():
    """Load ``./train.csv`` and split it into train, test and validation sets.

    The ``price_range`` column is rewritten in place by ``convertLabel``
    before it is separated from the feature columns.  80% of the rows go to
    the training set; the remaining 20% is halved into a test and a
    validation set.  Both splits are stratified on the label and use fixed
    random seeds.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test, x_valid, y_valid)``.
    """
    frame = pd.read_csv("./train.csv")
    convertLabel(frame)
    labels = frame['price_range']
    features = frame.drop('price_range', axis=1)

    # First cut: hold out 20% of the rows.
    x_train, x_rest, y_train, y_rest = train_test_split(
        features, labels, test_size=0.2, random_state=60, stratify=labels)
    # Second cut: halve the hold-out into the test and validation parts.
    x_test, x_valid, y_test, y_valid = train_test_split(
        x_rest, y_rest, test_size=0.5, random_state=100, stratify=y_rest)
    return x_train, y_train, x_test, y_test, x_valid, y_valid

### preprocess data  
convertLabel(): convert the labels into two classes: low (0, 1) and high (2, 3)  
DataLabelSplit(): split the data into two parts, one labeled with a high price_range, the other labeled with a low price_range  
normalize(): used for Logistic Regression; scales the values of every attribute to the same range [0,1]  

In [19]:
def convertLabel(data):
    """Binarise ``data['price_range']`` in place.

    Values <= 1 become 0 and values > 1 become 1.  Nothing is returned; the
    DataFrame passed in is modified.
    """
    low_rows = data['price_range'] <= 1
    data.loc[low_rows, 'price_range'] = 0
    # This mask is computed after the first assignment has been applied.
    high_rows = data['price_range'] > 1
    data.loc[high_rows, 'price_range'] = 1


def DataLabelSplit(x_data,y_data):
    """Split the samples into the rows labelled 1 and the rows labelled 0.

    Features and labels are stacked side by side, so every returned row
    carries its label in the last column.

    Args:
        x_data: DataFrame of features, one row per sample.
        y_data: Series of 0/1 labels, matched to ``x_data`` by position.

    Returns:
        ``(true_data, false_data)``: 2-D arrays holding the rows whose label
        is 1 and 0 respectively.
    """
    data = np.column_stack((x_data.values, y_data.values))
    # The label is always the last stacked column; addressing it relatively
    # (rather than as fixed position 20) keeps this correct for any number
    # of feature columns.
    labels = data[:, -1]
    return data[labels == 1], data[labels == 0]


def normalize(x):
    """Scale every column of ``x`` by that column's maximum value.

    For non-negative data this maps each column into [0, 1].

    Args:
        x: DataFrame of numeric features.

    Returns:
        A new float ndarray with the same shape as ``x``.  ``x`` itself is
        never modified.  Columns whose maximum is 0 are returned unscaled
        instead of being divided by zero.
    """
    # Work on a float copy: writing ratios back into ``x.values`` would
    # truncate them for integer data and could alter the caller's frame.
    scaled = np.array(x.values, dtype=float)
    if scaled.size == 0:
        return scaled
    column_max = scaled.max(axis=0)
    # Substitute 1 for a zero maximum so that column passes through as-is.
    divisor = np.where(column_max == 0, 1.0, column_max)
    return scaled / divisor

### model and algorithm  
Note: the `*_train_and_test` functions below run the entire training and testing procedure for each model.  
1. Naive Bayes  
Instead of discretizing continuous attributes into intervals, I use the Gaussian probability density to measure the probability of continuous attributes.  
getMeanStdLabel(), calcuGaussProb(): used to calculate the Gaussian probability density function  
calPossibilityForDiscrete(): calculate the probability for discrete attributes  
calPossibility(): calculate the overall probability

In [47]:
def getMeanStdLabel(true_data,false_data):
    """Return the column-wise mean and standard deviation of both classes.

    Returns:
        ``(true_mean, true_std, false_mean, false_std)``, each computed
        along axis 0 of the corresponding input array.
    """
    true_mean = np.mean(true_data, axis=0)
    true_std = np.std(true_data, axis=0)
    false_mean = np.mean(false_data, axis=0)
    false_std = np.std(false_data, axis=0)
    return true_mean, true_std, false_mean, false_std


def calPossibilityForDiscrete(true_data,false_data):
    """Return, per column, the fraction of rows whose value is positive.

    The number of columns is taken from ``true_data`` and the same column
    indices are applied to ``false_data``.

    Returns:
        ``(true_data_possibility, false_data_possibility)``: two lists of
        floats, one entry per column.
    """
    def positive_fraction(samples, col):
        # Share of rows in ``samples`` with a value > 0 in column ``col``.
        return np.count_nonzero(samples[:, col] > 0) / samples.shape[0]

    columns = range(true_data.shape[1])
    true_fractions = [positive_fraction(true_data, col) for col in columns]
    false_fractions = [positive_fraction(false_data, col) for col in columns]
    return true_fractions, false_fractions


def calcuGaussProb(x,mean,stdev):
    """Evaluate the Gaussian density with the given mean and stdev at ``x``."""
    squared_deviation = np.power(x - mean, 2)
    twice_variance = 2 * np.power(stdev, 2)
    exponent = np.exp(-squared_deviation / twice_variance)
    # 1 / (sqrt(2*pi) * stdev) is the normalising constant of the density.
    normaliser = 1 / (np.sqrt(2 * np.pi) * stdev)
    return normaliser * exponent


def calPossibility(arr,mean_arr,std_arr,pro_arr,discrete_columns=(1, 3, 5, 17, 18, 19)):
    """Multiply the per-attribute likelihoods of one sample for one class.

    Args:
        arr: attribute values of a single sample.
        mean_arr: per-attribute means, used for the continuous attributes.
        std_arr: per-attribute standard deviations, used for the continuous
            attributes.
        pro_arr: per-attribute probabilities; ``pro_arr[i]`` is applied when
            a discrete attribute equals 1, ``1 - pro_arr[i]`` otherwise.
        discrete_columns: indices of the attributes treated as discrete.
            The default is the set of positions this function has always
            used; pass another collection for a different column layout.

    Returns:
        The product of all per-attribute factors (``1`` for an empty sample).
    """
    possibility = 1
    for i, value in enumerate(arr):
        if i in discrete_columns:
            p_one = pro_arr[i]
            possibility *= p_one if value == 1 else 1 - p_one
        else:
            # Continuous attribute: use the Gaussian density as likelihood.
            possibility *= calcuGaussProb(value, mean_arr[i], std_arr[i])
    return possibility


def naive_bayes_predict(x_train,y_train):
    """Estimate the naive Bayes parameters from the training data.

    Returns:
        ``(true_mean, true_std, false_mean, false_std,
        true_data_possibility, false_data_possibility)``: the per-class
        statistics from ``getMeanStdLabel`` followed by the per-class
        probabilities from ``calPossibilityForDiscrete``.
    """
    positives, negatives = DataLabelSplit(x_train, y_train)
    gaussian_stats = getMeanStdLabel(positives, negatives)
    discrete_stats = calPossibilityForDiscrete(positives, negatives)
    return (*gaussian_stats, *discrete_stats)


def test_bayes(true_mean,true_std,false_mean,false_std,true_data_possibility,false_data_possibility,x_test,y_test):
    """Classify every row of ``x_test`` and return the accuracy.

    A row is predicted as class 1 when its class-1 likelihood is strictly
    greater than its class-0 likelihood, and as class 0 otherwise.

    Args:
        true_mean, true_std, true_data_possibility: class-1 parameters
            passed on to ``calPossibility``.
        false_mean, false_std, false_data_possibility: class-0 parameters
            passed on to ``calPossibility``.
        x_test: DataFrame of samples to classify.
        y_test: Series with the expected label of each sample.

    Returns:
        Fraction of samples whose prediction equals the expected label.
    """
    # Convert the frame once; asking for ``.values`` per row may rebuild the
    # whole array on every iteration.
    samples = x_test.values
    prediction = []
    for arr in samples:
        true_possibility = calPossibility(arr, true_mean, true_std, true_data_possibility)
        false_possibility = calPossibility(arr, false_mean, false_std, false_data_possibility)
        prediction.append(1 if true_possibility > false_possibility else 0)

    result = y_test.values
    count = sum(1 for i, actual in enumerate(result) if prediction[i] == actual)
    return count / len(result)

def bayes_train_and_test(x_train,y_train,x_test,y_test,x_valid,y_valid):
    """Fit naive Bayes on the training set and return its test-set accuracy.

    ``x_valid`` and ``y_valid`` are accepted but not used.
    """
    model = naive_bayes_predict(x_train, y_train)
    return test_bayes(*model, x_test, y_test)

2. Logistic Regression  
logisticRegression(): use gradient descent to train w  
test_logisticRegression(): evaluate accuracy on the validation data and the test data

In [48]:
def sigmoid(gamma):
    """Return the logistic function ``1 / (1 + exp(-gamma))`` of a scalar.

    Both branches only ever exponentiate a non-positive number, so
    ``math.exp`` cannot overflow for large ``|gamma|``.
    """
    if gamma < 0:
        # exp(g) / (1 + exp(g)) keeps tiny results accurate, whereas
        # 1 - 1 / (1 + exp(g)) cancels to exactly 0 for very negative g.
        z = math.exp(gamma)
        return z / (1 + z)
    return 1 / (1 + math.exp(-gamma))

    
def initW(data_num):
    """Return ``data_num`` weights drawn uniformly from [-0.01, 0.01] as an ndarray."""
    return np.array([random.uniform(-0.01, 0.01) for _ in range(data_num)])

def test_logisticRegression(w,x,y):
    """Return the accuracy of the weight vector ``w`` on ``(x, y)``.

    ``x`` is passed through ``normalize``; every row is then prefixed with a
    constant 1 (the bias input) and predicted as class 1 when its dot
    product with ``w`` is positive, as class 0 otherwise.
    """
    features = normalize(x)
    prediction = [
        1 if w.dot(np.append(1, sample)) > 0 else 0
        for sample in features
    ]

    expected = y.values
    hits = sum(1 for idx, actual in enumerate(expected) if prediction[idx] == actual)
    return hits / len(expected)


def logisticRegression(x_train,y_train,x_valid,y_valid,learning_rate=0.002,max_epochs=500,target_accuracy=0.95):
    """Train logistic-regression weights with batch gradient steps.

    Each epoch accumulates ``(label - sigmoid(w . row)) * row`` over all
    training rows, where ``row`` is the normalised sample prefixed with a
    constant 1 for the bias weight, and then moves ``w`` by
    ``learning_rate`` times that sum.  Training stops early once the
    validation accuracy reaches ``target_accuracy``.

    Args:
        x_train, y_train: training features (DataFrame) and labels (Series).
        x_valid, y_valid: validation data used only for early stopping.
        learning_rate: step size applied to the accumulated update.
        max_epochs: upper bound on the number of passes over the data.
        target_accuracy: validation accuracy at which training stops.

    Returns:
        The weight vector, bias weight first, as an ndarray.
    """
    column_num = x_train.shape[1]
    w = initW(column_num + 1)
    data_train = normalize(x_train)
    # Fetch the label array once instead of once per sample per epoch.
    targets = y_train.values
    for _ in range(max_epochs):
        deltaW = np.zeros(column_num + 1)
        for t, features in enumerate(data_train):
            row = np.append(1, features)
            error = targets[t] - sigmoid(w.dot(row))
            deltaW += error * row
        w += learning_rate * deltaW
        if test_logisticRegression(w, x_valid, y_valid) >= target_accuracy:
            break
    return w


def logisticRegression_train_and_test(x_train,y_train,x_test,y_test,x_valid,y_valid):
    """Train logistic regression and return its accuracy on the test set.

    The validation set is handed to ``logisticRegression`` for training.
    """
    weights = logisticRegression(x_train, y_train, x_valid, y_valid)
    accuracy = test_logisticRegression(weights, x_test, y_test)
    return accuracy

3. svm  
use module svm in sklearn

In [49]:
from sklearn import svm


def svmPredict(x_train,y_train):
    """Fit an sklearn ``SVC`` (C=1, 'ovo' decision function shape) and return it."""
    classifier = svm.SVC(decision_function_shape='ovo', C=1)
    classifier.fit(x_train, y_train)
    return classifier


def svm_test(clf,x_test,y_test):
    """Return ``clf.score`` evaluated on the given samples and labels."""
    accuracy = clf.score(x_test, y_test)
    return accuracy


def svm_train_and_test(x_train,y_train,x_test,y_test,x_valid,y_valid):
    """Fit the SVM on the training set and return its test-set score.

    ``x_valid`` and ``y_valid`` are accepted but not used.
    """
    classifier = svmPredict(x_train, y_train)
    return svm_test(classifier, x_test, y_test)

### Empirical Study
1. comparison of accuracy

In [50]:
import matplotlib.pyplot as plt


def figureTableForAccuracy():
    """Plot and tabulate training/test accuracy of the three classifiers.

    Loads the data set, trains naive Bayes, logistic regression and the SVM
    on the training split, evaluates each on the training and test splits,
    shows a grouped bar chart and returns the same numbers as a table.

    Returns:
        DataFrame with one row per model ('naive-bayes',
        'logistic regression', 'svm') and the columns 'training set' and
        'test set'.
    """
    name_list = ['training set', 'test set']
    model_names = ['naive-bayes', 'logistic regression', 'svm']
    x_train, y_train, x_test, y_test, x_valid, y_valid = loadDataSet()

    bayes_params = naive_bayes_predict(x_train, y_train)
    bayes = [
        test_bayes(*bayes_params, x_train, y_train),
        test_bayes(*bayes_params, x_test, y_test),
    ]

    w = logisticRegression(x_train, y_train, x_valid, y_valid)
    rl = [
        test_logisticRegression(w, x_train, y_train),
        test_logisticRegression(w, x_test, y_test),
    ]

    clf = svmPredict(x_train, y_train)
    # Named svm_scores so the list does not shadow the sklearn ``svm`` module.
    svm_scores = [
        svm_test(clf, x_train, y_train),
        svm_test(clf, x_test, y_test),
    ]

    total_width, n = 0.6, 3
    width = total_width / n

    # One bar group per data split; each model is shifted right by one bar width.
    x = list(range(len(name_list)))
    plt.bar(x, bayes, width=width, label='naive-bayes', fc='y')
    x = [pos + width for pos in x]
    # The tick labels are attached to the middle bar so they sit under the group centre.
    plt.bar(x, rl, width=width, label='logistic regression', tick_label=name_list, fc='r')
    x = [pos + width for pos in x]
    plt.bar(x, svm_scores, width=width, label='svm', fc='b')
    plt.legend()
    plt.show()

    # Six scores -> 3 models x 2 splits, in the order the lists were concatenated.
    table = np.reshape(bayes + rl + svm_scores, (len(model_names), len(name_list)))
    return pd.DataFrame(table, index=model_names, columns=name_list)

In [72]:
import time

def displayTime():
    """Time one full train-and-test run of each classifier.

    Returns:
        DataFrame indexed by 'naive-bayes', 'logistic regression' and 'svm'
        with a single 'time' column holding the elapsed seconds.
    """
    splits = loadDataSet()
    pipelines = (
        bayes_train_and_test,
        logisticRegression_train_and_test,
        svm_train_and_test,
    )

    runtime = []
    for run in pipelines:
        # perf_counter is monotonic and high-resolution, so the measured
        # interval is not affected by system clock adjustments.
        start = time.perf_counter()
        run(*splits)
        runtime.append(time.perf_counter() - start)

    return pd.DataFrame(runtime,index=['naive-bayes','logistic regression','svm'],columns=['time'])


# Scratch check: lay six values out as a 2x3 array.
test = [1, 2, 3, 4, 5, 6]
result = np.asarray(test).reshape((2, 3))
print(result)

[[1 2 3]
 [4 5 6]]
