# Assignment

### Ans1)

In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the Iris dataset bundled with scikit-learn.
# X receives the feature matrix (iris.data) and y the class labels (iris.target).
iris = load_iris()
X = iris.data
y = iris.target

In [3]:
# Split the dataset into a training set and a testing set (80% training, 20% testing).
# random_state=42 pins the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Create a KNN classifier with a specified number of neighbors (e.g., 3).
# k is kept in its own variable because the final print statement reports it.
k = 3
knn_classifier = KNeighborsClassifier(n_neighbors=k)

In [5]:
# Train the KNN classifier on the training split only; the test split is
# not shown to the model until evaluation.
knn_classifier.fit(X_train, y_train)

In [6]:
# Predict a class label for every sample in the held-out test split.
y_pred = knn_classifier.predict(X_test)


In [7]:
# Calculate the accuracy of the classifier by comparing the predicted labels
# (y_pred) against the true test labels (y_test).
accuracy = accuracy_score(y_test, y_pred)

In [8]:
# Report the test accuracy (two decimal places) together with the k that produced it.
print(f"Accuracy of KNN Classifier with k={k}: {accuracy:.2f}")

Accuracy of KNN Classifier with k=3: 1.00


### Ans2)

In [9]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [10]:
# Load the Boston Housing dataset: X receives boston.data, y receives boston.target.
# NOTE: scikit-learn prints a warning for this loader (see this cell's output):
# the maintainers strongly discourage using this dataset and point to
# alternatives such as fetch_california_housing.
# NOTE(review): load_boston is reportedly removed from scikit-learn 1.2+ —
# confirm the installed version before re-running this cell.
boston = load_boston()
X = boston.data
y = boston.target


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [11]:
# Split the dataset into a training set and a testing set (80% training, 20% testing).
# random_state=42 pins the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Create a KNN regressor with a specified number of neighbors (e.g., 3).
# k is kept in its own variable because the final print statement reports it.
k = 3
knn_regressor = KNeighborsRegressor(n_neighbors=k)

In [13]:
# Train the KNN regressor on the training split only. The features are used
# as loaded here, without any scaling step.
knn_regressor.fit(X_train, y_train)

In [14]:
# Predict a target value for every sample in the held-out test split.
y_pred = knn_regressor.predict(X_test)

In [16]:
# Calculate the Mean Squared Error (MSE) between the true test targets (y_test)
# and the predictions (y_pred) to evaluate the model's performance; lower is better.
mse = mean_squared_error(y_test, y_pred)

In [17]:
# Report the test-set MSE (two decimal places) together with the k that produced it.
print(f"Mean Squared Error of KNN Regressor with k={k}: {mse:.2f}")

Mean Squared Error of KNN Regressor with k=3: 21.66


### Ans3)

In [18]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier

In [19]:
# Load the Iris dataset again so this answer does not depend on variables
# left over from the earlier answers (X and y were last bound to the Boston data).
iris = load_iris()
X = iris.data
y = iris.target

In [20]:
# Define the parameter grid for K.
# np.arange excludes its upper bound, so arange(1, 21) yields the 20 candidates 1..20.
param_grid = {'n_neighbors': np.arange(1, 21)}  # Try K values from 1 to 20

In [21]:
# Create a KNN classifier without fixing n_neighbors here: the grid search
# below supplies each candidate value from param_grid.
knn_classifier = KNeighborsClassifier()

In [22]:
# Set up grid search with 5-fold cross-validation.
# The folds are shuffled with a fixed seed (random_state=42) so the fold
# assignment is reproducible; candidates are ranked by accuracy.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(knn_classifier, param_grid, cv=kf, scoring='accuracy')

In [23]:
# Fit the grid search to the full dataset. This answer makes no separate
# train/test split; every candidate K is scored through the cross-validation
# splitter passed as cv above.
grid_search.fit(X, y)

In [24]:
# Get the best K value and its corresponding score from the fitted search.
# best_score_ is a cross-validated accuracy (scoring='accuracy'), not a
# score on a separate held-out test set.
best_k = grid_search.best_params_['n_neighbors']
best_accuracy = grid_search.best_score_

In [25]:
# Report the selected K and its cross-validated accuracy (two decimal places).
print(f"Optimal K: {best_k}")
print(f"Cross-Validation Accuracy with Optimal K: {best_accuracy:.2f}")

Optimal K: 13
Cross-Validation Accuracy with Optimal K: 0.98


### Ans4)

In [26]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [27]:
# Load the Boston Housing dataset: X receives boston.data, y receives boston.target.
# NOTE: scikit-learn prints a warning for this loader (see this cell's output):
# the maintainers strongly discourage using this dataset and point to
# alternatives such as fetch_california_housing.
# NOTE(review): load_boston is reportedly removed from scikit-learn 1.2+ —
# confirm the installed version before re-running this cell.
boston = load_boston()
X = boston.data
y = boston.target


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [28]:
# Split the dataset into a training set and a testing set (80% training, 20% testing).
# Same test_size and random_state as the unscaled regression in Ans2, so the
# two MSE values are computed on the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# Create a StandardScaler to scale the features; it is fitted in the next cell.
scaler = StandardScaler()

In [30]:
# Fit the scaler on the training data and transform that data in one step.
# Only the training split is used for fitting, so the test split does not
# influence the scaling parameters.
X_train_scaled = scaler.fit_transform(X_train)

In [31]:
# Transform the testing data using the same, already-fitted scaler
# (transform only — the scaler is not re-fitted on the test split).
X_test_scaled = scaler.transform(X_test)

In [32]:
# Create a KNN regressor with a specified number of neighbors (e.g., 3).
# k matches the value used in Ans2, so only the feature scaling differs.
k = 3
knn_regressor = KNeighborsRegressor(n_neighbors=k)

In [33]:
# Train the KNN regressor on the scaled training features.
knn_regressor.fit(X_train_scaled, y_train)

In [34]:
# Make predictions on the scaled testing data; the model was trained on
# scaled features, so it must be queried with scaled features as well.
y_pred = knn_regressor.predict(X_test_scaled)

In [35]:
# Calculate the Mean Squared Error (MSE) between the true test targets (y_test)
# and the predictions (y_pred) to evaluate the model's performance; lower is better.
mse = mean_squared_error(y_test, y_pred)

In [36]:
# Report the test-set MSE (two decimal places) for the scaled-feature model.
print(f"Mean Squared Error of KNN Regressor with feature scaling and k={k}: {mse:.2f}")

Mean Squared Error of KNN Regressor with feature scaling and k=3: 19.40


### Ans5)

In [37]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [38]:
# Load the Iris dataset again so this answer does not depend on variables
# left over from the earlier answers (X and y were last bound to the Boston data).
iris = load_iris()
X = iris.data
y = iris.target

In [39]:
# Split the dataset into a training set and a testing set (80% training, 20% testing).
# Same test_size and random_state as in Ans1, so the weighted and unweighted
# classifiers are evaluated on the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
# Create a KNN classifier with weighted voting (using 'distance' as the weight option),
# so neighbors do not all contribute equally to the vote.
# k matches the value used in Ans1, so only the weighting differs.
k = 3
knn_classifier = KNeighborsClassifier(n_neighbors=k, weights='distance')

In [41]:
# Train the KNN classifier on the training split only; the test split is
# not shown to the model until evaluation.
knn_classifier.fit(X_train, y_train)

In [42]:
# Predict a class label for every sample in the held-out test split.
y_pred = knn_classifier.predict(X_test)

In [43]:
# Calculate the accuracy of the classifier by comparing the predicted labels
# (y_pred) against the true test labels (y_test).
accuracy = accuracy_score(y_test, y_pred)

In [44]:
# Report the test accuracy (two decimal places) for the distance-weighted classifier.
print(f"Accuracy of KNN Classifier with weighted voting (k={k}, weights='distance'): {accuracy:.2f}")

Accuracy of KNN Classifier with weighted voting (k=3, weights='distance'): 1.00


### Ans6)

In [6]:
import numpy as np
from sklearn.preprocessing import StandardScaler

def standardize_features(X_train, X_test):
    """
    Scale training and testing features with a scaler fitted on the training data.

    A scikit-learn StandardScaler is fitted on X_train only and then used to
    transform both X_train and X_test, so the test data never influences the
    scaling parameters.

    Parameters:
    - X_train: Training data features (numpy array or pandas DataFrame).
    - X_test: Testing data features (numpy array or pandas DataFrame).

    Returns:
    - X_train_std: Standardized training data features.
    - X_test_std: Standardized testing data features.
    """

    # Learn the scaling parameters from the training split alone
    fitted_scaler = StandardScaler().fit(X_train)

    # Apply the training-derived scaling to each split
    X_train_std = fitted_scaler.transform(X_train)
    X_test_std = fitted_scaler.transform(X_test)

    return X_train_std, X_test_std

# Example usage:
# X_train_std, X_test_std = standardize_features(X_train, X_test)


### Ans7)

In [47]:
import numpy as np

def standardize_features(X_train, X_test):
    """
    Standardize the features of the training and testing data.

    A StandardScaler is fitted on the training data only and then applied to
    both splits, so the test data does not influence the scaling parameters.

    Note: this definition has the same name and body as the one in Ans6 and
    replaces it in the notebook namespace when this cell runs.

    Parameters:
    - X_train: Training data features (numpy array or pandas DataFrame).
    - X_test: Testing data features (numpy array or pandas DataFrame).

    Returns:
    - X_train_std: Standardized training data features.
    - X_test_std: Standardized testing data features.
    """

    # Imported locally because this cell itself only imports numpy; without
    # this the function works only if an earlier cell already imported
    # StandardScaler into the kernel namespace.
    from sklearn.preprocessing import StandardScaler

    # Create a StandardScaler object
    scaler = StandardScaler()

    # Fit the scaler on the training data and transform it in one step
    X_train_std = scaler.fit_transform(X_train)

    # Transform the testing data using the same, already-fitted scaler
    X_test_std = scaler.transform(X_test)

    return X_train_std, X_test_std


### Ans8)

In [46]:
import numpy as np

def manhattan_distance(point1, point2):
    """
    Calculate the Manhattan distance between two points in n-dimensional space.

    Parameters:
    - point1: List or array representing the coordinates of the first point.
    - point2: List or array representing the coordinates of the second point.

    Returns:
    - distance: Manhattan distance between the two points (sum of the
      absolute coordinate differences), as a NumPy scalar.

    Raises:
    - ValueError: If the two points do not have the same shape. Without this
      check NumPy would broadcast e.g. a 3-d point against a 1-d point and
      silently return a meaningless number.
    """

    # Convert the input points to NumPy arrays for ease of computation
    point1 = np.asarray(point1)
    point2 = np.asarray(point2)

    if point1.shape != point2.shape:
        raise ValueError(
            f"points must have the same shape, got {point1.shape} and {point2.shape}"
        )

    if point1.dtype.kind == "u" or point2.dtype.kind == "u":
        # Unsigned subtraction wraps around instead of going negative
        # (uint8: 1 - 4 == 253), so take larger-minus-smaller per coordinate.
        abs_diff = np.maximum(point1, point2) - np.minimum(point1, point2)
    else:
        abs_diff = np.abs(point1 - point2)

    # Sum the per-dimension absolute differences
    distance = np.sum(abs_diff)

    return distance