In [1]:
import pandas as pd
import numpy as np

In [2]:
def custom_train_test_split(X, Y, test_size=0.2, random_state=None):
    if random_state is not None:
        np.random.seed(random_state)

    # Shuffle the data
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X = X[indices]
    Y = Y[indices]

    # Calculate the split index
    split_index = int(X.shape[0] * (1 - test_size))

    # Split the data into train and test/validation sets
    X_train = X[:split_index]
    X_validation = X[split_index:]
    Y_train = Y[:split_index]
    Y_validation = Y[split_index:]

    return X_train, X_validation, Y_train, Y_validation

In [3]:
def pointbiserialr(x, y):
    n = len(x)
    x_mean = np.mean(x)
    x_std = np.std(x)
    y_mean = np.mean(y)
    
    sum_xy = np.sum((x - x_mean) * y)
    
    corr = sum_xy / (n * x_std * np.std(y))
    abs_corr = np.abs(corr)
    
    return corr, abs_corr

def spearmanr(x, y):
    x_rank = np.argsort(np.argsort(x))
    y_rank = np.argsort(np.argsort(y))
    corr, abs_corr = pointbiserialr(x_rank, y_rank)
    return corr, abs_corr

In [6]:
dataset = pd.read_csv("./train.csv")


# Delete missing data
col_names = dataset.columns
num_data = dataset.shape[0]
for c in col_names:
    num_non = dataset[c].isin(["?"]).sum()
    if num_non > 0:
        print (c)
        print (num_non)
        print ("{0:.2f}%".format(float(num_non) / num_data * 100))
        print ("\n")

dataset = dataset[dataset["workclass"] != "?"]
dataset = dataset[dataset["occupation"] != "?"]
dataset = dataset[dataset["native-country"] != "?"]


workclass
2217
5.67%


occupation
2225
5.69%


native-country
687
1.76%




In [7]:
# Deal with categorical columns

category_col =['workclass', 'race', 'education','marital-status', 'occupation',
               'relationship', 'gender', 'native-country', 'income'] 

for col in category_col:
    b, c = np.unique(dataset[col], return_inverse=True) 
    dataset[col] = c

dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,52,2,210736,15,10,2,3,0,4,1,3103,0,55,38,1
1,62,2,209844,15,10,0,0,4,4,0,0,0,30,38,0
2,25,2,410240,11,9,4,2,3,4,1,0,0,40,38,0
3,28,2,90547,11,9,2,7,5,2,0,0,0,23,38,0
4,28,2,132326,11,9,4,0,3,4,1,0,0,40,38,0


In [8]:
col_names = dataset.columns

param=[]
correlation=[]
abs_corr=[]

tuplesAry = []

dataset

for c in col_names:
    #Check if binary or continuous
    if c != "income":
        if len(dataset[c].unique()) <= 2:
            corr = spearmanr(dataset['income'].to_numpy(), dataset[c].to_numpy())
        else:
            corr = pointbiserialr(dataset['income'].to_numpy(), dataset[c].to_numpy())
        param.append(c)
        correlation.append(corr[0])
        abs_corr.append(corr[1])

#Create dataframe for visualization
param_df = pd.DataFrame({'correlation':correlation,'parameter':param, 'abs_corr':abs_corr})

#Sort by absolute correlation
param_df = param_df.sort_values(by=['abs_corr'], ascending=False)

#Set parameter name as index
param_df = param_df.set_index('parameter')

print(param_df)

                 correlation  abs_corr
parameter                             
educational-num     0.330767  0.330767
relationship       -0.250225  0.250225
age                 0.236322  0.236322
gender              0.232375  0.232375
hours-per-week      0.224649  0.224649
capital-gain        0.220880  0.220880
marital-status     -0.192672  0.192672
capital-loss        0.149591  0.149591
education           0.083913  0.083913
race                0.076426  0.076426
occupation          0.047696  0.047696
native-country      0.020730  0.020730
workclass           0.015571  0.015571
fnlwgt             -0.007827  0.007827


In [10]:
best_col = param_df.index[0:4].values

print(best_col)

['educational-num' 'relationship' 'age' 'gender']


In [11]:
###################################################
##################### MODELING #####################
####################################################
# Split-out Validation Dataset and Create Test Variables
trainBestCol = np.append(best_col, np.array(['income']))
testCol = len(best_col)

array = dataset[trainBestCol].values
X = array[:, 0:testCol]
Y = array[:, testCol]
print('Split Data: X')
print(X)
print('Split Data: Y')
print(Y)
validation_size = 0.20
# seed = 7
seed = None
num_folds = 10
scoring = 'accuracy'

X_train, X_validation, Y_train, Y_validation = custom_train_test_split(X, Y, test_size=validation_size, random_state=seed)


Split Data: X
[[10  0 52  1]
 [10  4 62  0]
 [ 9  3 25  1]
 ...
 [10  3 19  0]
 [14  0 29  1]
 [10  0 35  1]]
Split Data: Y
[1 0 0 ... 0 1 0]


In [12]:
# Define custom KNN model
class KNN:
    def __init__(self, k):
        self.k = k
        
    def fit(self, X, y):
        self.X = X
        self.y = y
        
    def predict(self, X_test):
        y_pred = []
        for i in range(len(X_test)):
            distances = np.sqrt(np.sum((self.X - X_test[i].reshape(1, -1))**2, axis=1))
            nearest_indices = np.argsort(distances)[:self.k]
            nearest_labels = self.y[nearest_indices]
            mode_label = np.argmax(np.bincount(nearest_labels))
            y_pred.append(mode_label)
        return np.array(y_pred)

In [13]:
# Read test data and preprocess
test_dataset = pd.read_csv("./test_X.csv")
test_category_col =['workclass', 'race', 'education','marital-status', 'occupation',
               'relationship', 'gender', 'native-country'] 

for col in test_category_col:
    b, c = np.unique(test_dataset[col], return_inverse=True) 
    test_dataset[col] = c

test_dataset = test_dataset[best_col]

In [14]:
# Define and fit custom KNN model
knn = KNN(k=5)
knn.fit(X_train, Y_train)

# Make predictions using custom KNN model
predictions = knn.predict(test_dataset.values)

# Save predictions to file
save_data = np.array(predictions)
save_data_index = list(range(len(save_data)))

df = pd.DataFrame({"id": save_data_index, "income": save_data})
df.to_csv("data.csv", index=False)

print('我好ㄌ')

我好ㄌ
