In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
import matplotlib.pyplot as plt

# Load the Titanic dataset (adjust the path if the CSV lives elsewhere)
df = pd.read_csv('titanic3.csv')

# Candidate feature columns and the column to predict
features = ['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest']
target = 'sex'

# Rows without a target label can neither be trained on nor scored
labelled = df.dropna(subset=[target])

# The distance functions subtract feature vectors, so only numeric columns are
# usable. The target is removed from the features: keeping it would leak the
# label into the inputs. Columns with no values at all carry no information.
numeric_features = (
    labelled[features]
    .drop(columns=[target], errors='ignore')
    .select_dtypes(include='number')
    .dropna(axis=1, how='all')
)

# A single NaN would turn every distance involving that row into NaN, so fill
# gaps with the column median.
# NOTE: medians are computed on the full dataset for simplicity; for a strict
# evaluation compute them on the training split only.
X = numeric_features.fillna(numeric_features.median()).values

# Encode the labels as integer codes (classes in sorted order). f1_score's
# default binary mode expects a positive class labelled 1.
y, target_classes = pd.factorize(labelled[target], sort=True)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two numeric vectors.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays).
    """
    delta = x1 - x2
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

def manhattan_distance(x1, x2):
    """Return the Manhattan (L1) distance between two numeric vectors.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays).
    """
    absolute_offsets = np.abs(x1 - x2)
    return np.sum(absolute_offsets)

def kNN_classifier(test_point, X_train, y_train, k, distance_metric):
    """Predict the class of ``test_point`` by majority vote of its k nearest neighbours.

    Parameters
    ----------
    test_point : the sample to classify, in a form ``distance_metric`` accepts.
    X_train : sequence of training samples (iterated sample by sample).
    y_train : sequence of labels, positionally aligned with ``X_train``.
    k : number of neighbours that vote; must be at least 1.
    distance_metric : callable ``(a, b) -> distance``.

    Returns
    -------
    The most frequent label among the k nearest training samples. When several
    labels share the highest count, the one whose closest neighbour is nearest
    to ``test_point`` wins, so the result is deterministic.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, the training set is empty, or ``X_train``
        and ``y_train`` differ in length.
    """
    # Imported here so the function works even when the defining cell is run
    # before any cell that imports Counter.
    from collections import Counter

    if k < 1:
        raise ValueError("k must be at least 1.")
    if len(X_train) == 0:
        raise ValueError("X_train must contain at least one sample.")
    if len(X_train) != len(y_train):
        raise ValueError("X_train and y_train must have the same length.")

    # Pair samples with labels positionally: indexing y_train[i] would be
    # label-based (not positional) if y_train were a pandas Series.
    distances = [
        (distance_metric(test_point, train_point), label)
        for train_point, label in zip(X_train, y_train)
    ]

    # list.sort is stable, so equidistant neighbours keep their training order
    distances.sort(key=lambda pair: pair[0])

    nearest_labels = [label for _, label in distances[:k]]

    # Counter keeps first-seen order among equal counts, and labels are fed
    # nearest-first, so ties resolve towards the closest neighbour instead of
    # depending on set iteration order.
    return Counter(nearest_labels).most_common(1)[0][0]

def evaluate_classifier(X_test, y_test, X_train, y_train, k, distance_metric):
    """Classify every sample in ``X_test`` with k-NN and score the predictions.

    Each test sample is passed to ``kNN_classifier`` together with the training
    data, ``k`` and ``distance_metric``.

    Returns a tuple ``(accuracy, f1, confusion_matrix)`` computed against
    ``y_test`` with scikit-learn's default metric settings.
    """
    predictions = []
    for sample in X_test:
        predicted_label = kNN_classifier(sample, X_train, y_train, k, distance_metric)
        predictions.append(predicted_label)

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    cm = confusion_matrix(y_test, predictions)
    return accuracy, f1, cm

# Sweep k from 1 to 7 with both distance metrics and keep each run's F1 score
k_values = list(range(1, 8))
euclidean_f1_scores, manhattan_f1_scores = [], []

for k in k_values:
    euclidean_accuracy, euclidean_f1, _ = evaluate_classifier(
        X_test, y_test, X_train, y_train, k, euclidean_distance
    )
    manhattan_accuracy, manhattan_f1, _ = evaluate_classifier(
        X_test, y_test, X_train, y_train, k, manhattan_distance
    )

    # Scores are stored in the same order as k_values so they can be plotted against it
    euclidean_f1_scores += [euclidean_f1]
    manhattan_f1_scores += [manhattan_f1]

# Draw F1 score against k, one line per distance metric
fig, ax = plt.subplots()
ax.plot(k_values, euclidean_f1_scores, label='Euclidean Distance')
ax.plot(k_values, manhattan_f1_scores, label='Manhattan Distance')
ax.set_xlabel('k Values')
ax.set_ylabel('F1 Score')
ax.set_title('Comparison of k-NN with Different Distance Metrics')
ax.legend()
plt.show()


In [7]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
import matplotlib.pyplot as plt


# Load the Titanic dataset (adjust the path if the CSV lives elsewhere)
df = pd.read_csv('titanic3.csv')

# Candidate feature columns and the column to predict
features = ['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest']
target = 'sex'

# Rows without a target label can neither be trained on nor scored
labelled = df.dropna(subset=[target])

# The distance functions subtract feature vectors, so only numeric columns are
# usable. The target is removed from the features: keeping it would leak the
# label into the inputs. Columns with no values at all carry no information.
numeric_features = (
    labelled[features]
    .drop(columns=[target], errors='ignore')
    .select_dtypes(include='number')
    .dropna(axis=1, how='all')
)

# A single NaN would turn every distance involving that row into NaN, so fill
# gaps with the column median.
# NOTE: medians are computed on the full dataset for simplicity; for a strict
# evaluation compute them on the training split only.
X = numeric_features.fillna(numeric_features.median())

# Encode the labels as integer codes (classes in sorted order). f1_score's
# default binary mode expects a positive class labelled 1.
target_codes, target_classes = pd.factorize(labelled[target], sort=True)
y = pd.Series(target_codes, index=labelled.index, name=target)

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two numeric vectors.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays).
    The function has no side effects: it is called once per test/train pair,
    so any printing here would flood the notebook output.
    """
    return np.sqrt(np.sum((x1 - x2) ** 2))

def manhattan_distance(x1, x2):
    """Return the Manhattan (L1) distance: the sum of absolute coordinate differences.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays).
    """
    difference = x1 - x2
    return np.sum(np.abs(difference))

def knn_classifier(test_point, X_train, y_train, k, distance_metric):
    """Predict the class of ``test_point`` by majority vote of its k nearest neighbours.

    Parameters
    ----------
    test_point : one sample (array-like or pandas Series of numeric features).
    X_train : training samples, one per row (array-like or pandas DataFrame).
    y_train : labels positionally aligned with the rows of ``X_train``
        (array-like or pandas Series).
    k : number of neighbours that vote; must be at least 1.
    distance_metric : ``'euclidean'`` or ``'manhattan'``.

    Returns
    -------
    The most frequent label among the k nearest training samples. On a tie,
    the tied label whose closest neighbour is nearest to ``test_point`` wins.

    Raises
    ------
    ValueError
        If ``distance_metric`` is not recognised, ``k`` is smaller than 1, or
        the training set is empty.
    """
    # Validate the metric once, up front, rather than on every loop iteration
    if distance_metric == 'euclidean':
        metric = euclidean_distance
    elif distance_metric == 'manhattan':
        metric = manhattan_distance
    else:
        raise ValueError("Invalid distance metric. Use 'euclidean' or 'manhattan'.")

    if k < 1:
        raise ValueError("k must be at least 1.")

    # Convert to arrays so iteration is row-wise and label lookup is positional:
    # iterating a DataFrame yields column labels, and indexing a Series with an
    # integer is label-based, which breaks after a shuffled train/test split.
    train_rows = np.asarray(X_train)
    train_labels = np.asarray(y_train)
    query = np.asarray(test_point)

    if len(train_rows) == 0:
        raise ValueError("X_train must contain at least one sample.")

    distances = [
        (metric(query, row), label)
        for row, label in zip(train_rows, train_labels)
    ]

    # Stable sort: equidistant neighbours keep their training order
    distances.sort(key=lambda pair: pair[0])
    neighbors = distances[:k]

    # Labels are counted nearest-first; Counter keeps first-seen order among
    # equal counts, so ties go to the closest neighbour.
    most_frequent_class = Counter(label for _, label in neighbors).most_common(1)[0][0]

    return most_frequent_class


In [3]:
def evaluate_classifier(y_true, y_pred):
    """Score predictions against the true labels.

    Returns a tuple ``(confusion_matrix, accuracy, f1)`` computed with
    scikit-learn's default metric settings.
    """
    return (
        confusion_matrix(y_true, y_pred),
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )


In [None]:
k_values = [1, 2, 3, 4, 5, 6, 7]
distance_metrics = ['euclidean', 'manhattan']

# A single test sample kept for ad-hoc inspection; the sweep below does not use it
test_point_index = 0
test_point = X_test.iloc[test_point_index]

# Convert once to arrays. Iterating a DataFrame yields its column labels, not
# its rows, and integer indexing on a Series is label-based; arrays give
# row-wise iteration and positional labels.
X_train_rows = np.asarray(X_train)
y_train_labels = np.asarray(y_train)
X_test_rows = np.asarray(X_test)

# Maps each distance metric name to its list of F1 scores, ordered like k_values
results = {}

for distance_metric in distance_metrics:
    f1_scores = []
    for k in k_values:
        # One prediction per test row
        y_pred = [
            knn_classifier(test_row, X_train_rows, y_train_labels, k, distance_metric)
            for test_row in X_test_rows
        ]
        _, _, f1 = evaluate_classifier(y_test, y_pred)
        f1_scores.append(f1)
    results[distance_metric] = f1_scores


In [None]:
# Draw F1 score against k, one line per distance metric
fig, ax = plt.subplots(figsize=(10, 6))

for distance_metric, f1_scores in results.items():
    ax.plot(k_values, f1_scores, label=distance_metric)

ax.set_title('k-NN Classifier Performance')
ax.set_xlabel('k values')
ax.set_ylabel('F1 Score')
ax.legend()
plt.show()
