In [4]:
import requests  # Import requests module to download data

In [2]:
# URL of the dataset
url = "https://goo.gl/BDYgh5"

In [5]:
# Download dataset from the given URL
response = requests.get(url)
dataset_content = response.text  # Read content of the dataset

In [6]:
# Save the dataset to a CSV file
filename = "abalone.csv"
with open(filename, "w") as file:
    file.write(dataset_content)

print(f"Dataset downloaded and saved as {filename}")

Dataset downloaded and saved as abalone.csv


In [8]:
from csv import reader  # Import the CSV reader module to read CSV files

def load_csv(filename):
    """Load a CSV file into a dataset (list of lists)"""

    dataset = []  # Initialize an empty list to store dataset rows

    with open(filename, 'r') as file:  # Open the specified file in read mode
        csv_reader = reader(file)  # Create a CSV reader object to read the file

        for row in csv_reader:  # Iterate over each row in the CSV file
            if not row:  # Skip empty rows to avoid processing blank lines
                continue
            dataset.append(row)  # Append the non-empty row to the dataset list

    return dataset  # Return the final dataset as a list of lists


In [9]:
def str_column_to_float(dataset, column):
    """Convert a column in the dataset from string to float."""

    for row in dataset:  # Iterate through each row in the dataset
        row[column] = float(row[column].strip())  # Strip whitespace and convert the value to float


In [10]:
def str_column_to_int(dataset, column):
    """Convert a categorical column in the dataset to integer values."""

    # Extract all values from the specified column
    class_values = [row[column] for row in dataset]

    # Get unique values from the column (to assign unique integers)
    unique = set(class_values)

    # Create a dictionary to map each unique category to an integer
    lookup = {value: i for i, value in enumerate(unique)}

    # Replace categorical values in the dataset with their corresponding integer values
    for row in dataset:
        row[column] = lookup[row[column]]

    # Return the mapping dictionary for reference
    return lookup


In [11]:
def dataset_minmax(dataset):
    """Calculate the minimum and maximum values for each column in the dataset."""

    # Transpose the dataset using zip(*dataset) to get columns instead of rows
    # Find the min and max for each column and store them in a list
    minmax = [[min(column), max(column)] for column in zip(*dataset)]

    return minmax  # Return the list of [min, max] pairs for each column

def normalize_dataset(dataset, minmax):
    """Normalize the dataset by scaling values between 0 and 1 using min-max scaling."""

    for row in dataset:  # Iterate through each row in the dataset
        for i in range(len(row)):  # Iterate through each column in the row
            # Apply min-max normalization formula: (value - min) / (max - min)
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])


In [12]:
from random import randrange  # Import randrange to randomly select indices

def cross_validation_split(dataset, n_folds):
    """Split dataset into k folds for cross-validation."""

    dataset_split = []  # List to store the resulting folds
    dataset_copy = list(dataset)  # Create a copy of the dataset to modify safely
    fold_size = int(len(dataset) / n_folds)  # Determine the size of each fold

    for _ in range(n_folds):  # Repeat for the number of folds
        fold = []  # Initialize an empty list for the current fold

        while len(fold) < fold_size:  # Keep adding data until fold reaches the required size
            index = randrange(len(dataset_copy))  # Randomly select an index from the remaining dataset
            fold.append(dataset_copy.pop(index))  # Remove the selected row and add it to the fold

        dataset_split.append(fold)  # Add the completed fold to the list of folds

    return dataset_split  # Return the list of dataset folds


In [13]:
from math import sqrt  # Import the sqrt function to compute square roots

def euclidean_distance(row1, row2):
    """Calculate the Euclidean distance between two data points (rows)."""

    return sqrt(sum((row1[i] - row2[i]) ** 2 for i in range(len(row1) - 1)))
    # 1. Subtract corresponding elements in row1 and row2.
    # 2. Square the differences.
    # 3. Sum up all squared differences.
    # 4. Take the square root to get the final Euclidean distance.


In [14]:
def get_neighbors(train, test_row, num_neighbors):
    """Find the k nearest neighbors of a test row in the training dataset."""

    # Compute the Euclidean distance between the test_row and every row in the training dataset
    distances = [(train_row, euclidean_distance(test_row, train_row)) for train_row in train]

    # Sort the list of (train_row, distance) tuples based on the distance (ascending order)
    distances.sort(key=lambda tup: tup[1])

    # Extract the first 'num_neighbors' rows (i.e., the closest neighbors)
    return [distances[i][0] for i in range(num_neighbors)]


In [15]:
def predict_classification(train, test_row, num_neighbors):
    """Predict the class label for a given test row using the k-Nearest Neighbors algorithm."""

    # Find the k nearest neighbors of the test_row in the training dataset
    neighbors = get_neighbors(train, test_row, num_neighbors)

    # Extract the class labels (last column) from the k nearest neighbors
    output_values = [row[-1] for row in neighbors]

    # Return the most common class label among the neighbors
    return max(set(output_values), key=output_values.count)


In [16]:
def k_nearest_neighbors(train, test, num_neighbors):
    """Make predictions for each test instance using the k-Nearest Neighbors algorithm."""

    # Iterate over each test row and predict its class using k-NN
    return [predict_classification(train, row, num_neighbors) for row in test]


In [17]:
def accuracy_metric(actual, predicted):
    """Calculate the accuracy percentage of predictions compared to actual values."""

    # Count the number of correct predictions (where actual == predicted)
    correct = sum(1 for i in range(len(actual)) if actual[i] == predicted[i])

    # Compute accuracy as (correct predictions / total predictions) * 100
    return (correct / float(len(actual))) * 100.0


In [18]:
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Evaluate a machine learning algorithm using k-fold cross-validation."""

    # Split the dataset into k folds
    folds = cross_validation_split(dataset, n_folds)

    scores = []  # List to store accuracy scores for each fold

    for fold in folds:  # Iterate over each fold

        # Create the training set by combining all folds except the current one
        train_set = sum([f for f in folds if f != fold], [])

        # Create the test set as a copy of the current fold
        test_set = [row[:] for row in fold]

        # Remove class labels from test set (simulating unknown predictions)
        for row in test_set:
            row[-1] = None  # Setting the last column (class label) to None

        # Run the provided algorithm to get predictions
        predicted = algorithm(train_set, test_set, *args)

        # Extract actual class labels from the original fold
        actual = [row[-1] for row in fold]

        # Compute accuracy of predictions
        accuracy = accuracy_metric(actual, predicted)

        # Store the accuracy score
        scores.append(accuracy)

    return scores  # Return the list of accuracy scores for each fold


In [19]:
from random import seed  # Import seed function to control randomness

# Set the random seed for reproducibility
seed(1)

# Define the dataset file name
filename = "abalone.csv"

# Load the dataset from the CSV file
dataset = load_csv(filename)

# Convert all columns (except the first) from strings to floating-point numbers
for i in range(1, len(dataset[0])):
    str_column_to_float(dataset, i)

# Convert the first column (categorical feature) to integer values
str_column_to_int(dataset, 0)

# Define parameters for k-fold cross-validation and k-NN
n_folds = 5  # Number of folds for cross-validation
num_neighbors = 5  # Number of neighbors for k-NN

# Evaluate the k-Nearest Neighbors algorithm using cross-validation
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors)

# Print the accuracy scores for each fold
print('Scores:', scores)

# Calculate and print the mean accuracy across all folds
print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))


Scores: [24.790419161676645, 21.79640718562874, 23.592814371257482, 21.676646706586826, 23.353293413173652]
Mean Accuracy: 23.042%
