In [27]:
import pandas as pd
import numpy as np
import math
import random
from sklearn.model_selection import train_test_split
from sklearn import model_selection


def loadDataSet(path="./train.csv", seed=None):
    """Load the CSV, binarise the label and split it 80/10/10.

    Parameters
    ----------
    path : str
        CSV file to read; it must contain a ``price_range`` column.
    seed : int or None
        ``random_state`` used for both splits. When ``None`` (the default)
        each split draws its own ``random.randint(0, 100)`` seed, so repeated
        calls give different partitions.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test, x_valid, y_valid)``; the x parts
        are DataFrames without ``price_range`` and the y parts are the
        matching ``price_range`` Series.
    """
    data = pd.read_csv(path)
    convertLabel(data)  # rewrites price_range in place
    y = data['price_range']
    x = data.drop('price_range', axis=1)

    # First cut: 80% train, 20% held out, stratified on the label.
    first_seed = random.randint(0, 100) if seed is None else seed
    x_train, x_tmp, y_train, y_tmp = train_test_split(
        x, y, test_size=0.2, random_state=first_seed, stratify=y)

    # Second cut: the held-out 20% is halved into test and validation.
    second_seed = random.randint(0, 100) if seed is None else seed
    x_test, x_valid, y_test, y_valid = train_test_split(
        x_tmp, y_tmp, test_size=0.5, random_state=second_seed, stratify=y_tmp)

    return x_train, y_train, x_test, y_test, x_valid, y_valid

In [28]:
def convertLabel(data):
    """Binarise ``price_range`` in place: values <= 1 become 0, values > 1 become 1.

    Rows that satisfy neither comparison keep their value. Nothing is
    returned; ``data`` itself is modified.
    """
    # Both masks are taken from the untouched column; the first assignment
    # only writes zeros, so it cannot move a row into the "> 1" group.
    low_mask = data['price_range'] <= 1
    high_mask = data['price_range'] > 1
    data.loc[low_mask, 'price_range'] = 0
    data.loc[high_mask, 'price_range'] = 1

In [29]:
def DataLabelSplit(x_data,y_data):
    """Split the samples into the label-1 rows and the label-0 rows.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Feature table; any number of columns.
    y_data : pandas.Series
        Labels aligned row-for-row with ``x_data``.

    Returns
    -------
    (true_data, false_data) : tuple of numpy.ndarray
        Each array holds the feature columns followed by the label as the
        final column; ``true_data`` has the rows whose label equals 1 and
        ``false_data`` the rows whose label equals 0.
    """
    features = x_data.values
    labels = y_data.values
    data = np.column_stack((features, labels))
    # The label is always the last stacked column, so address it relatively
    # instead of assuming a fixed feature count.
    label_column = data[:, -1]
    true_data = data[label_column == 1]
    false_data = data[label_column == 0]
    return true_data, false_data

def getMeanStdLabel(true_data,false_data):
    """Return per-column statistics for both class arrays.

    The result is the 4-tuple ``(true_mean, true_std, false_mean, false_std)``,
    each computed along axis 0 with ``np.mean`` / ``np.std``.
    """
    stats = []
    for class_rows in (true_data, false_data):
        stats.append(np.mean(class_rows, axis=0))
        stats.append(np.std(class_rows, axis=0))
    return tuple(stats)

def calPossibilityForDiscrete(true_data,false_data):
    """For every column, return the fraction of rows whose value is > 0.

    The column count is taken from ``true_data``. Two lists of floats are
    returned, one for ``true_data`` and one for ``false_data``, indexed by
    column.
    """
    def positive_share(rows, col):
        # Number of rows with a strictly positive entry, over all rows.
        hits = int((rows[:, col] > 0).sum())
        return hits / rows.shape[0]

    true_share = []
    false_share = []
    for col in range(true_data.shape[1]):
        true_share.append(positive_share(true_data, col))
        false_share.append(positive_share(false_data, col))
    return true_share, false_share


def calcuGaussProb(x,mean,stdev):
    """Evaluate the normal density with the given mean and stdev at ``x``.

    Uses numpy element-wise operations, so scalars and arrays are accepted.
    """
    squared_deviation = np.power(x - mean, 2)
    twice_variance = 2 * np.power(stdev, 2)
    exponent = np.exp(-squared_deviation / twice_variance)
    normaliser = 1 / (np.sqrt(2 * np.pi) * stdev)
    return normaliser * exponent


def calPossibility(arr,mean_arr,std_arr,pro_arr,discrete_idx=(1, 3, 5, 17, 18, 19)):
    """Multiply the per-feature likelihoods of one sample for one class.

    Parameters
    ----------
    arr : sequence
        Feature values of the sample.
    mean_arr, std_arr : sequence
        Per-column mean and standard deviation for the class; used for
        every column not listed in ``discrete_idx``.
    pro_arr : sequence
        Per-column probability used for the columns in ``discrete_idx``.
    discrete_idx : iterable of int
        Column positions handled as two-valued features. The default keeps
        the positions this function has always used (presumably the binary
        columns of the training CSV -- TODO confirm against the data).

    Returns
    -------
    The product of the factors; ``1`` when ``arr`` is empty.
    """
    discrete = frozenset(discrete_idx)
    possibility = 1
    for i, value in enumerate(arr):
        if i in discrete:
            # A value of exactly 1 takes pro_arr[i]; anything else takes
            # the complement.
            possibility *= pro_arr[i] if value == 1 else 1 - pro_arr[i]
        else:
            possibility *= calcuGaussProb(value, mean_arr[i], std_arr[i])
    return possibility

def native_bayes_predict():
    """Fit the hand-written Naive Bayes model and return its test accuracy.

    Loads the splits with ``loadDataSet``, derives per-class column means,
    standard deviations and positive-value fractions from the training
    split, then labels each test row 1 when the class-1 likelihood product
    is strictly greater than the class-0 product, and 0 otherwise. The two
    products are compared directly; no class prior is multiplied in.

    Returns
    -------
    float
        Fraction of test rows whose predicted label equals the true label.
    """
    x_train,y_train,x_test,y_test,x_valid,y_valid=loadDataSet()
    true_data,false_data=DataLabelSplit(x_train,y_train)
    true_mean,true_std,false_mean,false_std=getMeanStdLabel(true_data,false_data)
    true_data_possibility,false_data_possibility=calPossibilityForDiscrete(true_data,false_data)

    # Convert the frame to an array once instead of on every iteration.
    test_rows=x_test.values
    prediction=[]
    for arr in test_rows:
        true_possibility=calPossibility(arr,true_mean,true_std,true_data_possibility)
        false_possibility=calPossibility(arr,false_mean,false_std,false_data_possibility)
        prediction.append(1 if true_possibility>false_possibility else 0)

    result=y_test.values
    count=sum(1 for guess,truth in zip(prediction,result) if guess==truth)
    return count/len(result)

# Run the Naive Bayes pipeline; the call evaluates to the accuracy on the test split.
native_bayes_predict()

0.91

In [104]:
def sigmoid(inX):
    """Logistic function of a scalar.

    Inputs below -100 short-circuit to ``0``; everything else evaluates
    ``1 / (1 + e^-inX)`` with ``math.exp``.
    """
    far_negative = inX < -100
    return 0 if far_negative else 1.0 / (1 + math.exp(-inX))

def initW(data_num):
    """Return a numpy array of ``data_num`` weights drawn with ``random.uniform(-0.01, 0.01)``."""
    return np.array([random.uniform(-0.01, 0.01) for _ in range(data_num)])

def gradientDescend(ita=0.1, target_acc=0.8, max_iter=1000):
    """Train logistic-regression weights by full-batch gradient steps.

    Parameters
    ----------
    ita : float
        Step size applied to the mean gradient.
    target_acc : float
        Training stops as soon as validation accuracy is strictly greater
        than this value.
    max_iter : int
        Upper bound on the number of weight updates, so the function
        returns even when validation accuracy never exceeds ``target_acc``.

    Returns
    -------
    float
        Accuracy of the final weights on the test split, as reported by
        ``test``. Note that ``test`` prints the accuracy on every call.
    """
    x_train,y_train,x_test,y_test,x_valid,y_valid=loadDataSet()
    column_num=x_train.shape[1]
    train_num=x_train.shape[0]
    w=initW(column_num+1)

    # Build the bias-augmented design matrix once; converting the frame to
    # an array inside the row loop would redo that work for every sample.
    design=np.column_stack((np.ones(train_num),x_train.values))
    labels=y_train.values

    for _ in range(max_iter):
        deltaW=np.zeros(column_num+1)
        for row,label in zip(design,labels):
            error=label-sigmoid(w.dot(row))
            deltaW+=error*row
        w+=ita*deltaW/train_num
        if test(w,x_valid,y_valid)>target_acc:
            break
    return test(w,x_test,y_test)

def test(w,x,y):
    row_num=x.shape[0]
    column_num=x.shape[1]
    prediction=[]
    for i in range(row_num):
        row=np.append(1,x.values[i])
        if w.dot(row)>0:
            prediction.append(1)
        else:
            prediction.append(0)
    
    result=y.values
    count=0
    for i in range(len(result)):
        if prediction[i]==result[i]:
            count+=1
    print(count/len(result))
    return count/len(result)

0.5
0.5
0.545
0.5
0.75
0.69
0.67
0.66
0.635
0.705
0.745
0.695
0.71
0.69
0.69
0.695
0.715
0.695
0.72
0.7
0.745
0.695
0.71
0.695
0.735
0.695
0.715
0.7
0.725
0.7
0.72
0.7
0.725
0.695
0.72
0.705
0.725
0.705
0.72
0.705
0.72
0.705
0.72
0.705
0.72
0.7
0.72
0.7
0.72
0.7
0.72
0.705
0.74
0.71
0.74
0.705
0.72
0.695
0.715
0.705
0.74
0.7
0.72
0.7
0.72
0.7
0.72
0.7
0.72
0.7
0.72
0.7
0.72
0.7
0.72
0.7
0.725
0.7
0.72
0.7
0.72
0.7
0.725
0.7
0.72
0.7
0.725
0.7
0.725
0.7
0.725
0.7
0.72
0.7
0.72


KeyboardInterrupt: 