### Read data

In [1]:
from sklearn.datasets import load_svmlight_file
import pandas as pd


def get_data(path='data/farm-ads-vect'):
    """
    Load an SVMlight-format dataset and return its features and labels.

    Parameters:
    - path: str or path-like, location of the SVMlight file
      (default is 'data/farm-ads-vect', the location this notebook uses)

    Returns:
    - df_vectors: DataFrame built from the sparse feature matrix
      (one row per sample, one column per feature index)
    - y: labels as returned by load_svmlight_file
    """
    # load_svmlight_file returns the feature matrix in sparse form plus the labels.
    X, y = load_svmlight_file(path)

    # from_spmatrix wraps the sparse matrix in a DataFrame without densifying it.
    df_vectors = pd.DataFrame.sparse.from_spmatrix(X)

    return df_vectors, y

In [2]:
# Load the feature table and labels once; the cells below reuse `df` and `labels`.
df, labels = get_data()

In [3]:
# Preview the first rows of the feature table.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54867,54868,54869,54870,54871,54872,54873,54874,54875,54876
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# (number of rows, number of feature columns)
df.shape

(4143, 54877)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

def evaluate_model(model, X_test, y_test):
    """
    Score a fitted classifier on held-out data.

    Parameters:
    - model: fitted estimator exposing predict()
    - X_test: DataFrame, feature vectors for testing
    - y_test: Series, true labels for testing

    Returns:
    - accuracy: float, accuracy_score of the predictions on the test set
    - f1: float, f1_score of the predictions on the test set
    """
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions), f1_score(y_test, predictions)


def train_knn(X_train, y_train, n_neighbors=5):
    """
    Fit a k-nearest neighbors classifier on the given training data.

    Parameters:
    - X_train: DataFrame, feature vectors for training
    - y_train: Series, labels for training
    - n_neighbors: int, how many neighbors the classifier consults (default is 5)

    Returns:
    - the fitted KNeighborsClassifier instance
    """
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    return knn

In [6]:
from typing import List


def evaluate_attribute_set(attribute_names: List[str], df, labels):
    """
    Fit and score a KNN classifier using only the given columns of df.

    Parameters:
    - attribute_names: column labels of df to use as features
    - df: DataFrame holding all candidate feature columns
    - labels: labels aligned with the rows of df

    Returns:
    - accuracy: float, accuracy on the held-out 20% of the rows
    - f1: float, F1 score on the same held-out rows
    """
    subset = df[attribute_names]

    # random_state=42 keeps the 80/20 split reproducible from call to call.
    splits = train_test_split(subset, labels, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = splits

    # Fit KNN (default neighbor count) on the training part, score on the rest.
    knn = train_knn(X_train, y_train)
    acc, f1 = evaluate_model(knn, X_test, y_test)
    return acc, f1

In [7]:
def sequential_forward_selection(df, labels, max_features=None):
    """
    Perform Sequential Forward Selection (SFS) for feature selection.

    Starting from an empty selection, each round scores every not-yet-selected
    column added to the current selection (via evaluate_attribute_set) and
    keeps the candidate with the highest accuracy, provided it strictly beats
    the best accuracy seen so far. The search stops when max_features columns
    have been selected, when every column has been selected, or when a round
    finds no candidate that improves the accuracy.

    Parameters:
    - df: DataFrame, candidate feature columns
    - labels: labels aligned with the rows of df
    - max_features: int, maximum number of features to select
      (default is None, meaning no cap)

    Returns:
    - selected_features: list, column labels in the order they were selected
    - best_accuracy: float, accuracy reached with selected_features
      (0.0 when nothing was selected)
    """
    all_features = list(df.columns)
    # Never select more columns than exist; None means "no explicit cap".
    limit = len(all_features)
    if max_features is not None:
        limit = min(limit, max_features)

    selected_features = []
    already_selected = set()  # O(1) membership test alongside the ordered list
    best_accuracy = 0.0

    while len(selected_features) < limit:
        current_best_feature = None

        for feature in all_features:
            if feature in already_selected:
                continue

            accuracy, _ = evaluate_attribute_set(selected_features + [feature], df, labels)

            # Strict '>' against the running best: a candidate must improve on
            # everything seen so far, not merely be the best of this round.
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                current_best_feature = feature

        if current_best_feature is None:
            # No candidate improved the accuracy. Without this exit the loop
            # would repeat the identical round forever, because the selection
            # (and therefore the loop condition) never changes.
            break

        selected_features.append(current_best_feature)
        already_selected.add(current_best_feature)
        print(f"Selected Features: {selected_features}")
        print(f"Accuracy with Selected Features: {best_accuracy:.2f}")

    return selected_features, best_accuracy

In [8]:
# Look at the first 10 feature columns across all rows.
df.iloc[:,:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
4138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Run forward selection on the first 100 columns only, keeping at most 5 features.
# NOTE(review): the 100-column slice is presumably there to bound runtime, since every
# candidate column costs one KNN fit per round — confirm before widening it.
selected_features, best_accuracy = sequential_forward_selection(df.iloc[:,:100], labels, max_features=5)
# Display the chosen columns and the accuracy they reached.
selected_features, best_accuracy

Selected Features: [38]
Accuracy with Selected Features: 0.72
Selected Features: [38, 15]
Accuracy with Selected Features: 0.75
Selected Features: [38, 15, 87]
Accuracy with Selected Features: 0.76
Selected Features: [38, 15, 87, 76]
Accuracy with Selected Features: 0.76
Selected Features: [38, 15, 87, 76, 78]
Accuracy with Selected Features: 0.77


([38, 15, 87, 76, 78], 0.767189384800965)

In [None]:
# Re-display the selection result; relies on the selection cell having been run first.
selected_features, best_accuracy