# In [2]:
import csv
import math

# Function to load CSV file with tab delimiter
def load_csv(filename):
    """Load a tab-delimited file into a list of rows.

    Args:
        filename: Path of the tab-separated file to read.

    Returns:
        A list holding one list of string fields per non-empty line, in file
        order. No type conversion is performed.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are rewritten by universal-newline translation.
    with open(filename, 'r', newline='') as file:
        csv_reader = csv.reader(file, delimiter='\t')
        # csv.reader yields an empty list for a blank line; drop those.
        return [row for row in csv_reader if row]

# Function to convert string column to float, ignore non-numeric
def str_column_to_float(dataset, column):
    """Convert one column of string cells to floats, in place.

    Cells whose stripped text is not a valid float literal are left exactly
    as they were.

    Args:
        dataset: List of rows (lists of strings); modified in place.
        column: Index of the column to convert.

    Returns:
        None.
    """
    for record in dataset:
        text = record[column].strip()
        try:
            number = float(text)
        except ValueError:
            # Not numeric: keep the original cell untouched.
            continue
        record[column] = number

# Function to replace missing values with column mean
def replace_missing_values(dataset, missing_value):
    """Replace every missing-value marker with the mean of its column, in place.

    A column's mean is taken over its numeric cells that are not equal to
    ``missing_value``. Non-numeric cells (for example string labels that were
    never converted to float) are ignored when averaging and left untouched.
    A column without any usable numeric cell keeps a mean of 0, so markers in
    such a column are replaced with 0.

    Args:
        dataset: List of rows (lists); modified in place. Rows may differ in
            length, and an empty dataset is accepted.
        missing_value: Sentinel that marks a missing cell.

    Returns:
        None.
    """
    if not dataset:
        return

    # Size the accumulators by the widest row so ragged rows cannot overflow.
    width = max(len(row) for row in dataset)
    totals = [0] * width
    counts = [0] * width
    for row in dataset:
        for i, value in enumerate(row):
            if value == missing_value:
                continue
            if not isinstance(value, (int, float)):
                continue  # strings and other non-numeric cells cannot be averaged
            totals[i] += value
            counts[i] += 1

    means = [
        total / count if count else total
        for total, count in zip(totals, counts)
    ]

    for row in dataset:
        for i, value in enumerate(row):
            if value == missing_value:
                row[i] = means[i]

# Function to normalize dataset
def minmax_normalize_dataset(dataset):
    """Rescale every numeric column of ``dataset`` to the range [0, 1], in place.

    Each numeric cell becomes ``(value - column_min) / (column_max - column_min)``,
    where the minimum and maximum are taken over the numeric cells of that
    column. Non-numeric cells are ignored and left untouched. A column whose
    numeric cells are all equal has no spread to scale by, so its cells are
    set to 0.0.

    Args:
        dataset: List of rows (lists); modified in place. Rows may differ in
            length, and an empty dataset is accepted.

    Returns:
        None.
    """
    if not dataset:
        return

    width = max(len(row) for row in dataset)
    for i in range(width):
        col_values = [
            row[i] for row in dataset
            if i < len(row) and isinstance(row[i], (int, float))
        ]
        if not col_values:
            continue  # nothing numeric to scale in this column
        value_min = min(col_values)
        span = max(col_values) - value_min
        for row in dataset:
            if i < len(row) and isinstance(row[i], (int, float)):
                # Zero span means a constant column; avoid dividing by zero.
                row[i] = (row[i] - value_min) / span if span else 0.0

# Function to calculate Euclidean distance between two rows
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two rows.

    Only the first ``len(row1) - 1`` positions take part; the final element
    of ``row1`` is never read.

    Args:
        row1: First row; its length decides how many positions are compared.
        row2: Second row; must have at least ``len(row1) - 1`` elements.

    Returns:
        The distance as a float.
    """
    compared = len(row1) - 1
    squared_sum = 0.0
    for position in range(compared):
        gap = row1[position] - row2[position]
        squared_sum += gap ** 2
    return math.sqrt(squared_sum)

# Function to locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
    """Return the training rows closest to ``test_row``, nearest first.

    Distances come from ``euclidean_distance(test_row, train_row)``. Rows at
    equal distance keep their relative order from ``train``.

    Args:
        train: Iterable of training rows.
        test_row: Row to find neighbors for.
        num_neighbors: Maximum number of rows to return. When it exceeds the
            number of training rows, all rows are returned instead of raising
            IndexError; zero or a negative value yields an empty list.

    Returns:
        A list of at most ``num_neighbors`` rows from ``train``.
    """
    # sorted() is stable and evaluates the key once per row.
    ranked = sorted(
        train,
        key=lambda train_row: euclidean_distance(test_row, train_row),
    )
    # max(..., 0) keeps a negative count from turning into a "drop the tail" slice.
    return ranked[:max(num_neighbors, 0)]

# Function to make a classification prediction with neighbors
def predict_classification(train, test_row, num_neighbors):
    """Predict a class for ``test_row`` by majority vote among its neighbors.

    The class of a neighbor is its last element. When several classes share
    the highest vote count, the one seen first in the neighbor list wins;
    ``get_neighbors`` returns rows nearest first, so that is the class of the
    closest neighbor among the tied ones. This keeps the result the same from
    run to run.

    Args:
        train: Training rows, class value in the last position.
        test_row: Row to classify.
        num_neighbors: Number of neighbors that vote.

    Returns:
        The winning class value.

    Raises:
        ValueError: If there are no neighbors to vote (empty training set or
            ``num_neighbors`` below 1).
    """
    from collections import Counter  # function-scope import keeps the block self-contained

    neighbors = get_neighbors(train, test_row, num_neighbors)
    if not neighbors:
        raise ValueError(
            "cannot predict: no neighbors available "
            "(empty training set or num_neighbors < 1)"
        )
    votes = Counter(row[-1] for row in neighbors)
    # most_common() orders equal counts by first occurrence.
    return votes.most_common(1)[0][0]

# Function to write predictions to a file
def write_predictions(predictions, filename):
    """Write each prediction on its own line of a text file.

    The file is created or truncated before writing.

    Args:
        predictions: Iterable of values to write, one per line.
        filename: Path of the output file.

    Returns:
        None.
    """
    with open(filename, 'w') as file:
        file.writelines("%s\n" % pred for pred in predictions)

# k-Nearest Neighbors Algorithm
def k_nearest_neighbors(train, test, num_neighbors):
    """Classify every row of ``test`` with ``predict_classification``.

    Args:
        train: Training rows passed through to ``predict_classification``.
        test: Iterable of rows to classify.
        num_neighbors: Number of neighbors used for each prediction.

    Returns:
        A list with one prediction per test row, in the order of ``test``.
    """
    return [
        predict_classification(train, sample, num_neighbors)
        for sample in test
    ]

# Example usage:
# Load and prepare data
filename_train = 'TrainData1.csv'
filename_test = 'TestData1.csv'
train = load_csv(filename_train)
test = load_csv(filename_test)

# Convert string columns to float. Each dataset is walked over its own width:
# indexing the test rows with the training width raises IndexError when the
# test file has fewer columns, and leaves columns unconverted when it has more.
for i in range(len(train[0])):
    str_column_to_float(train, i)
for i in range(len(test[0])):
    str_column_to_float(test, i)

# Replace missing values
# presumably 1e+99 is the marker the data files use for a missing cell -- TODO confirm
missing_value = 1e+99
replace_missing_values(train, missing_value)
replace_missing_values(test, missing_value)

# Normalize data
# NOTE(review): the test set is scaled with its own min/max rather than the
# training set's, so the two sets may not share a scale -- confirm intent.
# NOTE(review): every column is imputed and normalized, including the last
# one, which predict_classification reads as the class -- confirm intent.
minmax_normalize_dataset(train)
minmax_normalize_dataset(test)

# Make predictions
num_neighbors = 5
predictions = k_nearest_neighbors(train, test, num_neighbors)

# Write predictions to a file
write_predictions(predictions, 'predictions.txt')
