In [66]:
import pandas as pd
import numpy as np
import math
import random
from sklearn.model_selection import train_test_split
from sklearn import model_selection


def loadDataSet():
    data = pd.read_csv("./train.csv")
    convertLabel(data)
    y = data['price_range']
    x = data.drop('price_range', axis = 1)
    x_train, x_tmp, y_train, y_tmp = train_test_split(x, y, test_size = 0.2, random_state = random.randint(0,100), stratify = y)
    x_test, x_valid, y_test, y_valid = train_test_split(x_tmp, y_tmp, test_size = 0.5, random_state = random.randint(0,100), stratify = y_tmp)
    return x_train,y_train,x_test,y_test,x_valid,y_valid

In [67]:
def convertLabel(data):
    data.loc[data.price_range<=1,'price_range']=0
    data.loc[data.price_range>1,'price_range']=1

In [68]:
def DataLabelSplit(x_data,y_data):
    x=x_data.values
    y=y_data.values
    data=np.column_stack((x,y))
    true_data=data[data[:,20]==1]
    false_data=data[data[:,20]==0]
    return true_data,false_data

def getMeanStdLabel(true_data,false_data):
    return np.mean(true_data,0), np.std(true_data,  0 ),np.mean(false_data,0), np.std(false_data,  0 )

def calPossibilityForDiscrete(true_data,false_data):
    column_number=true_data.shape[1]
    true_data_possibility=[]
    false_data_possibility=[]
    for i in range(column_number):
        true_data_factor_num=true_data[true_data[:,i]>0].shape[0]
        true_data_possibility.append(true_data_factor_num/true_data.shape[0])
        false_data_factor_num=false_data[false_data[:,i]>0].shape[0]
        false_data_possibility.append(false_data_factor_num/false_data.shape[0])
    return true_data_possibility,false_data_possibility


def calcuGaussProb(x,mean,stdev):
    exponent = np.exp(-(np.power(x-mean,2))/(2*np.power(stdev,2)))
    GaussProb = (1/(np.sqrt(2*np.pi)*stdev))*exponent
    return GaussProb


def calPossibility(arr,mean_arr,std_arr,pro_arr):
    possibility=1
    for i in range(len(arr)):
        if i==1 or i==3 or i==5 or i==17 or i==18 or i==19:
            if arr[i]==1:
                possibility*=pro_arr[i]
            else:
                possibility*=1-pro_arr[i]
        else:
            possibility*=calcuGaussProb(arr[i],mean_arr[i],std_arr[i])
    return possibility

def native_bayes_predict():
    x_train,y_train,x_test,y_test,x_valid,y_valid=loadDataSet()
    true_data,false_data=DataLabelSplit(x_train,y_train)
    true_mean,true_std,false_mean,false_std=getMeanStdLabel(true_data,false_data)
    true_data_possibility,false_data_possibility=calPossibilityForDiscrete(true_data,false_data)
#     print(true_mean)
#     print(true_std)
    prediction=[]
    for i in range(x_test.shape[0]):
        arr=x_test.values[i]
        true_possibility=calPossibility(arr,true_mean,true_std,true_data_possibility)
        false_possibility=calPossibility(arr,false_mean,false_std,false_data_possibility)
        if true_possibility>false_possibility:
            prediction.append(1)
        else:
            prediction.append(0)
#     print(prediction)
    result=y_test.values
    count=0
    for i in range(len(result)):
        if prediction[i]==result[i]:
            count+=1
    return count/len(result)

native_bayes_predict()

0.91

In [69]:
def sigmoid(inX):  
    return 1.0/(1+math.exp(-inX))

def initW(data_num):
    w=[]
    for i in range(data_num):
        w.append(random.uniform(-0.01,0.01))
    return np.array(w)

def gradientDescend():
    ita=0.001
    x_train,y_train,x_test,y_test,x_valid,y_valid=loadDataSet()
    column_num=x_train.shape[1]
    w=initW(column_num)
    train_num=x_train.shape[0]
    while True:
        deltaW=np.zeros(column_num)
        for j in range(train_num):
            a=0
            a+=w.dot(x_train.values[j])
            y=sigmoid(a)
            error=y_train.values[j]-y
            deltaW+=error*x_train.values[j]
        w+=ita*deltaW
        if checkValid(w,x_valid,y_valid):
            break
    
    test_num=x_test.shape[0]
    prediction=[]
    for i in range(test_num):
        if w.dot(x_test.values[i])>0:
            prediction.append(1)
        else:
            prediction.append(0)
            
    result=y_test.values
    count=0
    for i in range(len(result)):
        if prediction[i]==result[i]:
            count+=1
    return count/len(result)

def checkValid(w,x,y):
    row_num=x.shape[0]
    column_num=x.shape[1]
    count=0
    for i in range(row_num):
        if w.dot(x.values[i])>0:
            count+=1
    return (count/row_num)>0.9

gradientDescend()

0.5