In [44]:
import numpy as np
import pandas as pd
from typing import List
def onehot_encoder(ary, columns=None, remove_trap=False):
    """One-hot encode selected columns of a DataFrame, keeping the others as-is.

    Parameters
    ----------
    ary : pd.DataFrame
        Input frame; its columns are addressed by position.
    columns : collection of int, optional
        Positional indices of the columns to expand with ``pd.get_dummies``.
        ``None`` (the default) or an empty collection encodes nothing.
    remove_trap : bool, default False
        When True, the first dummy column of every encoded column is dropped
        (avoids the dummy-variable trap).

    Returns
    -------
    pd.DataFrame
        The columns of ``ary`` in their original order, with each selected
        column replaced by its dummy columns named ``"<column>_<value>"``.
        An input without columns yields an empty DataFrame.
    """
    # A None default avoids sharing one mutable list object between calls.
    if columns is None:
        columns = ()

    pieces = []
    for i in range(ary.shape[1]):
        if i in columns:
            # Dummy column: expand it and prefix each level with the column name.
            base_name = ary.columns[i]
            piece = pd.get_dummies(ary.iloc[:, i])
            piece.columns = ["{}_{}".format(base_name, n) for n in piece.columns]
            if remove_trap:
                # Positional slice: also safe when get_dummies produced no columns.
                piece = piece.iloc[:, 1:]
        else:
            # Normal column: carried over untouched.
            piece = ary.iloc[:, i]
        pieces.append(piece)

    # pd.concat rejects an empty list, so return an empty frame explicitly.
    if not pieces:
        return pd.DataFrame()
    # Concatenate once at the end instead of re-copying a growing result per column.
    return pd.concat(pieces, axis=1)

In [45]:
#A function that calculates the distance between points
def euclid_distance(l1: List[float], l2: List[float]) -> float:
    """Return the Euclidean (L2) distance between two equally long sequences.

    Raises AssertionError when the two sequences differ in length.
    """
    assert len(l1) == len(l2)
    # Square every coordinate-wise gap, add them up, then take the square root.
    squared_gaps = [(a - b) ** 2 for a, b in zip(l1, l2)]
    return sum(squared_gaps) ** 0.5

def max_norm_distance(l1: List[float], l2: List[float]) -> float:
    """Return the max-norm (Chebyshev) distance: the largest coordinate-wise gap.

    Raises AssertionError when the two sequences differ in length.
    """
    assert len(l1) == len(l2)
    gaps = (abs(a - b) for a, b in zip(l1, l2))
    return max(gaps)

def manhattan_distance(l1: List[float], l2: List[float]) -> float:
    """Return the Manhattan (L1) distance: the sum of coordinate-wise gaps.

    Raises AssertionError when the two sequences differ in length.
    """
    assert len(l1) == len(l2)
    gaps = (abs(a - b) for a, b in zip(l1, l2))
    return sum(gaps)

#Search function for the most frequent sample value
def most_frequent(l: List[str]) -> str:
    """Return the value that occurs most often in ``l``.

    When several values share the highest count, the one that appeared first
    in ``l`` wins (the sort is stable). An empty ``l`` raises IndexError.
    """
    tally = {}
    for item in l:
        tally[item] = tally.get(item, 0) + 1

    # Rank (value, count) pairs by count, highest first.
    ranked = list(tally.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    top_value = ranked[0][0]
    return top_value

#Classification function
def classification(data: List, df: pd.DataFrame, k: int, distance:str) -> str:
    """Classify ``data`` by majority vote among its ``k`` nearest rows of ``df``.

    Parameters
    ----------
    data : sequence of numbers
        Feature values of the point to classify; must have as many entries as
        ``df`` has feature columns.
    df : pd.DataFrame
        Training sample. Every column except the last is treated as a feature;
        the last column holds the target value.
    k : int
        Number of nearest rows that vote; must be at least 1.
    distance : str
        Name of the metric: ``'euclid_distance'``, ``'max_norm_distance'`` or
        ``'manhattan_distance'``.

    Returns
    -------
    The most frequent target value among the ``k`` closest rows. Rows at equal
    distance keep their order in ``df`` because the sort is stable.

    Raises
    ------
    ValueError
        If ``distance`` is not a supported name, ``k`` is smaller than 1, or
        ``df`` has no rows. Previously these cases either failed with an
        unrelated IndexError or (for negative ``k``) voted over the wrong rows.
    """
    supported = ('euclid_distance', 'max_norm_distance', 'manhattan_distance')
    if distance not in supported:
        raise ValueError(
            "unknown distance {!r}; expected one of {}".format(distance, supported)
        )
    if k < 1:
        raise ValueError("k must be at least 1, got {!r}".format(k))
    if df.shape[0] == 0:
        raise ValueError("df must contain at least one training row")

    # Resolve the metric once instead of repeating the loop per branch.
    metric = {
        'euclid_distance': euclid_distance,
        'max_norm_distance': max_norm_distance,
        'manhattan_distance': manhattan_distance,
    }[distance]

    # Calculation of distances to each point of the training sample
    dist = [(i, metric(data, df.iloc[i, :-1])) for i in range(df.shape[0])]

    # Search for values of the target variable among the k closest rows
    dist.sort(key=lambda item: item[1])
    values = [df.iloc[row, -1] for row, _ in dist[:k]]

    return most_frequent(values)

In [54]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# Load the preprocessed training data and the test file.
# NOTE(review): test_df is not referenced again in this cell.
train_df = pd.read_csv('./train_preproceed.csv')
test_df = pd.read_csv('./test.csv')

scaler = StandardScaler()

# One-hot encode the columns at positions 1 and 2 (first dummy dropped), then
# keep a 1000-row sample; random_state makes the draw reproducible.
train_df_ = onehot_encoder(train_df, columns=[1, 2], remove_trap=True)
train_df = train_df_.sample(n=1000, random_state=42)
X_train = scaler.fit_transform(train_df.iloc[:, :-1].values)
Y_train = train_df.iloc[:, -1].values.reshape(-1, 1)

# Second 1000-row sample from the same encoded frame, drawn with another seed.
# fit_transform refits the scaler on this sample.
val_df = train_df_.sample(n=1000, random_state=38)
X_test = scaler.fit_transform(val_df.iloc[:, :-1].values)
Y_test = val_df.iloc[:, -1].values.reshape(-1, 1)

# n_splits is the number of cross-validation folds.
kf = KFold(n_splits=2, shuffle=True, random_state=42)
accuracy_results = []

distance = ['euclid_distance', 'max_norm_distance', 'manhattan_distance']

for j, metric_name in enumerate(distance):
    accuracies = []

    for train_index, test_index in kf.split(train_df):
        train_set = train_df.iloc[train_index]
        test_set = train_df.iloc[test_index]

        # Fit the scaler on the training fold and apply the same scaling to the
        # held-out fold.
        # NOTE(review): classification() below receives the unscaled
        # train_set/test_set rows, so these standardized arrays do not affect
        # the reported accuracies -- confirm whether scaling was meant to apply.
        X_train = scaler.fit_transform(train_set.iloc[:, :-1].values)
        Y_train = train_set.iloc[:, -1].values.reshape(-1, 1)
        X_test = scaler.transform(test_set.iloc[:, :-1].values)

        # Predict every held-out row from its 3 nearest training rows.
        my_pred = [
            classification(test_set.iloc[i, :-1], train_set, 3, metric_name)
            for i in range(len(test_set))
        ]
        # Pair each true label with its prediction and score the fold.
        l = [(test_set.iloc[i, -1], my_pred[i]) for i in range(len(test_set))]
        accuracy = sum(actual == predicted for actual, predicted in l) / len(l)
        accuracies.append(accuracy)

    avg_accuracy = sum(accuracies) / len(accuracies)
    accuracy_results.append(avg_accuracy)
    print(avg_accuracy)

# Summary: one line per distance metric.
for j, metric_name in enumerate(distance):
    print('My algorithm\'s average accuracy using', metric_name, ':', accuracy_results[j])


0.556
0.578
0.5609999999999999
My algorithm's average accuracy using euclid_distance : 0.556
My algorithm's average accuracy using max_norm_distance : 0.578
My algorithm's average accuracy using manhattan_distance : 0.5609999999999999


In [55]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# Load the preprocessed training data and the test file.
# NOTE(review): test_df is not referenced again in this cell.
train_df = pd.read_csv('./train_preproceed.csv')
test_df = pd.read_csv('./test.csv')

scaler = StandardScaler()

# One-hot encode the columns at positions 1 and 2 (first dummy dropped), then
# keep a 1000-row sample; random_state makes the draw reproducible.
train_df_ = onehot_encoder(train_df, columns=[1, 2], remove_trap=True)
train_df = train_df_.sample(n=1000, random_state=42)
X_train = scaler.fit_transform(train_df.iloc[:, :-1].values)
Y_train = train_df.iloc[:, -1].values.reshape(-1, 1)

# Second 1000-row sample from the same encoded frame, drawn with another seed.
# fit_transform refits the scaler on this sample.
val_df = train_df_.sample(n=1000, random_state=38)
X_test = scaler.fit_transform(val_df.iloc[:, :-1].values)
Y_test = val_df.iloc[:, -1].values.reshape(-1, 1)

# n_splits is the number of cross-validation folds.
kf = KFold(n_splits=3, shuffle=True, random_state=42)
accuracy_results = []

distance = ['euclid_distance', 'max_norm_distance', 'manhattan_distance']

for j, metric_name in enumerate(distance):
    accuracies = []

    for train_index, test_index in kf.split(train_df):
        train_set = train_df.iloc[train_index]
        test_set = train_df.iloc[test_index]

        # Fit the scaler on the training fold and apply the same scaling to the
        # held-out fold.
        # NOTE(review): classification() below receives the unscaled
        # train_set/test_set rows, so these standardized arrays do not affect
        # the reported accuracies -- confirm whether scaling was meant to apply.
        X_train = scaler.fit_transform(train_set.iloc[:, :-1].values)
        Y_train = train_set.iloc[:, -1].values.reshape(-1, 1)
        X_test = scaler.transform(test_set.iloc[:, :-1].values)

        # Predict every held-out row from its 3 nearest training rows.
        my_pred = [
            classification(test_set.iloc[i, :-1], train_set, 3, metric_name)
            for i in range(len(test_set))
        ]
        # Pair each true label with its prediction and score the fold.
        l = [(test_set.iloc[i, -1], my_pred[i]) for i in range(len(test_set))]
        accuracy = sum(actual == predicted for actual, predicted in l) / len(l)
        accuracies.append(accuracy)

    avg_accuracy = sum(accuracies) / len(accuracies)
    accuracy_results.append(avg_accuracy)
    print(avg_accuracy)

# Summary: one line per distance metric.
for j, metric_name in enumerate(distance):
    print('My algorithm\'s average accuracy using', metric_name, ':', accuracy_results[j])


0.5580340819861779
0.5650350949752148
0.5620231009452566
My algorithm's average accuracy using euclid_distance : 0.5580340819861779
My algorithm's average accuracy using max_norm_distance : 0.5650350949752148
My algorithm's average accuracy using manhattan_distance : 0.5620231009452566


In [52]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# Load the preprocessed training data and the test file.
# NOTE(review): test_df is not referenced again in this cell.
train_df = pd.read_csv('./train_preproceed.csv')
test_df = pd.read_csv('./test.csv')

scaler = StandardScaler()

# One-hot encode the columns at positions 1 and 2 (first dummy dropped), then
# keep a 1000-row sample; random_state makes the draw reproducible.
train_df_ = onehot_encoder(train_df, columns=[1, 2], remove_trap=True)
train_df = train_df_.sample(n=1000, random_state=42)
X_train = scaler.fit_transform(train_df.iloc[:, :-1].values)
Y_train = train_df.iloc[:, -1].values.reshape(-1, 1)

# Second 1000-row sample from the same encoded frame, drawn with another seed.
# fit_transform refits the scaler on this sample.
val_df = train_df_.sample(n=1000, random_state=38)
X_test = scaler.fit_transform(val_df.iloc[:, :-1].values)
Y_test = val_df.iloc[:, -1].values.reshape(-1, 1)

# n_splits is the number of cross-validation folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_results = []

distance = ['euclid_distance', 'max_norm_distance', 'manhattan_distance']

for j, metric_name in enumerate(distance):
    accuracies = []

    for train_index, test_index in kf.split(train_df):
        train_set = train_df.iloc[train_index]
        test_set = train_df.iloc[test_index]

        # Fit the scaler on the training fold and apply the same scaling to the
        # held-out fold.
        # NOTE(review): classification() below receives the unscaled
        # train_set/test_set rows, so these standardized arrays do not affect
        # the reported accuracies -- confirm whether scaling was meant to apply.
        X_train = scaler.fit_transform(train_set.iloc[:, :-1].values)
        Y_train = train_set.iloc[:, -1].values.reshape(-1, 1)
        X_test = scaler.transform(test_set.iloc[:, :-1].values)

        # Predict every held-out row from its 3 nearest training rows.
        my_pred = [
            classification(test_set.iloc[i, :-1], train_set, 3, metric_name)
            for i in range(len(test_set))
        ]
        # Pair each true label with its prediction and score the fold.
        l = [(test_set.iloc[i, -1], my_pred[i]) for i in range(len(test_set))]
        accuracy = sum(actual == predicted for actual, predicted in l) / len(l)
        accuracies.append(accuracy)

    avg_accuracy = sum(accuracies) / len(accuracies)
    accuracy_results.append(avg_accuracy)
    print(avg_accuracy)

# Summary: one line per distance metric.
for j, metric_name in enumerate(distance):
    print('My algorithm\'s average accuracy using', metric_name, ':', accuracy_results[j])


0.583
0.59
0.582
My algorithm's average accuracy using euclid_distance : 0.583
My algorithm's average accuracy using max_norm_distance : 0.59
My algorithm's average accuracy using manhattan_distance : 0.582


In [53]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# Load the preprocessed training data and the test file.
# NOTE(review): test_df is not referenced again in this cell.
train_df = pd.read_csv('./train_preproceed.csv')
test_df = pd.read_csv('./test.csv')

scaler = StandardScaler()

# One-hot encode the columns at positions 1 and 2 (first dummy dropped), then
# keep a 1000-row sample; random_state makes the draw reproducible.
train_df_ = onehot_encoder(train_df, columns=[1, 2], remove_trap=True)
train_df = train_df_.sample(n=1000, random_state=42)
X_train = scaler.fit_transform(train_df.iloc[:, :-1].values)
Y_train = train_df.iloc[:, -1].values.reshape(-1, 1)

# Second 1000-row sample from the same encoded frame, drawn with another seed.
# fit_transform refits the scaler on this sample.
val_df = train_df_.sample(n=1000, random_state=38)
X_test = scaler.fit_transform(val_df.iloc[:, :-1].values)
Y_test = val_df.iloc[:, -1].values.reshape(-1, 1)

# n_splits is the number of cross-validation folds.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
accuracy_results = []

distance = ['euclid_distance', 'max_norm_distance', 'manhattan_distance']

for j, metric_name in enumerate(distance):
    accuracies = []

    for train_index, test_index in kf.split(train_df):
        train_set = train_df.iloc[train_index]
        test_set = train_df.iloc[test_index]

        # Fit the scaler on the training fold and apply the same scaling to the
        # held-out fold.
        # NOTE(review): classification() below receives the unscaled
        # train_set/test_set rows, so these standardized arrays do not affect
        # the reported accuracies -- confirm whether scaling was meant to apply.
        X_train = scaler.fit_transform(train_set.iloc[:, :-1].values)
        Y_train = train_set.iloc[:, -1].values.reshape(-1, 1)
        X_test = scaler.transform(test_set.iloc[:, :-1].values)

        # Predict every held-out row from its 3 nearest training rows.
        my_pred = [
            classification(test_set.iloc[i, :-1], train_set, 3, metric_name)
            for i in range(len(test_set))
        ]
        # Pair each true label with its prediction and score the fold.
        l = [(test_set.iloc[i, -1], my_pred[i]) for i in range(len(test_set))]
        accuracy = sum(actual == predicted for actual, predicted in l) / len(l)
        accuracies.append(accuracy)

    avg_accuracy = sum(accuracies) / len(accuracies)
    accuracy_results.append(avg_accuracy)
    print(avg_accuracy)

# Summary: one line per distance metric.
for j, metric_name in enumerate(distance):
    print('My algorithm\'s average accuracy using', metric_name, ':', accuracy_results[j])


0.5880000000000001
0.599
0.583
My algorithm's average accuracy using euclid_distance : 0.5880000000000001
My algorithm's average accuracy using max_norm_distance : 0.599
My algorithm's average accuracy using manhattan_distance : 0.583
