# Data Preprocessing and Modeling Notebook

This notebook demonstrates our complete workflow: removing outliers from a dataset, splitting and standardizing the data (without data leakage), and finally training a linear classifier. Our custom functions from `functions.py` are used throughout.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import our custom functions and classes
from functions import (
    to_pm1_labels,
    remove_outliers_from_file,
    split_and_standardize_data,
    Perceptron
    # (other functions/classes are available as well)
)

%matplotlib inline

In [None]:
# ------------------------------
# Create a synthetic dataset
# ------------------------------
# Builds a two-feature, binary-labelled dataset, appends a handful of
# hand-placed extreme points, and writes everything to a CSV file that the
# later cells read back.

# Seed NumPy's global RNG so the generated points are identical on every run.
np.random.seed(42)
n_samples = 200

# Generate two features from a standard normal distribution
X = np.random.randn(n_samples, 2)

# Create labels based on a linear decision boundary through the origin
# (label = 1 if x1 + x2 > 0, else 0)
y = (X[:, 0] + X[:, 1] > 0).astype(int)

# Introduce some outliers manually: far from the bulk of the data, but labelled
# consistently with the same x1 + x2 > 0 rule.
outliers = np.array([[10, 10], [10, 9.5], [9.5, 10], [-10, -10], [-9.5, -10]])
outlier_labels = np.array([1, 1, 1, 0, 0])

# Derive the count from the array itself so the message printed below cannot
# drift out of sync if outlier rows are added or removed.
n_outliers = outliers.shape[0]

# Append the outliers
X_all = np.vstack([X, outliers])
y_all = np.concatenate([y, outlier_labels])

# Create a DataFrame and save to CSV (no index column is written)
df = pd.DataFrame(X_all, columns=["feature1", "feature2"])
df["label"] = y_all
csv_file = "data.csv"
df.to_csv(csv_file, index=False)

print(f"Dataset created with {X_all.shape[0]} samples (including {n_outliers} outliers) and saved to {csv_file}.")

In [None]:
# ------------------------------
# Remove outliers from the CSV file
# ------------------------------
# Only the two feature columns are passed for screening, with a z-score
# cut-off of 3.0.
# NOTE(review): the next cell reloads the same path as the "cleaned" data, so
# this call presumably rewrites csv_file in place — confirm in functions.py.
# NOTE(review): if the z-scores are computed over the whole file, this step
# runs before the train/test split; confirm that is acceptable for the
# notebook's "no data leakage" claim.

outlier_columns = ["feature1", "feature2"]
z_threshold = 3.0

remove_outliers_from_file(csv_file, threshold=z_threshold, columns=outlier_columns)

In [None]:
# ------------------------------
# Load and inspect the cleaned data
# ------------------------------
# Re-read the CSV after the outlier-removal step, report its dimensions, and
# preview the first rows as the cell's rich output.

df_clean = pd.read_csv(csv_file)

clean_shape = df_clean.shape
print("Cleaned dataset shape:", clean_shape)

# Trailing expression: rendered by the notebook as a table (first 5 rows).
df_clean.head(5)

In [None]:
# ------------------------------
# Preprocess the data
# ------------------------------
# Turns the cleaned frame into NumPy arrays, re-encodes the labels, and
# produces the train/test partitions used by every later cell.

# Feature matrix (two columns) and label vector, with labels passed through
# to_pm1_labels so they end up in {-1, +1}.
feature_columns = ["feature1", "feature2"]
X = df_clean[feature_columns].values
y = to_pm1_labels(df_clean["label"].values)

# 80/20 split with a fixed seed.
# NOTE(review): the helper presumably derives the standardization statistics
# from the training portion only (the notebook intro promises no leakage) —
# confirm in functions.py.
X_train, X_test, y_train, y_test = split_and_standardize_data(
    X,
    y,
    test_ratio=0.2,
    random_state=42,
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

In [None]:
# ------------------------------
# Visualize the training and test sets
# ------------------------------
# One scatter layer per split, coloured by label: circles for the training
# points, black-edged squares for the test points.

fig, ax = plt.subplots(figsize=(8, 6))

split_layers = (
    (X_train, y_train, {"marker": 'o', "label": 'Train'}),
    (X_test, y_test, {"marker": 's', "edgecolors": 'k', "label": 'Test'}),
)
for points, point_labels, layer_style in split_layers:
    ax.scatter(points[:, 0], points[:, 1], c=point_labels, cmap='coolwarm', **layer_style)

ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('Training and Test Sets (Standardized)')
ax.legend()
plt.show()

In [None]:
# ------------------------------
# Train a Perceptron model
# ------------------------------
# Fits the project's Perceptron on the training split, then reports its score
# on the held-out split.

# Hyperparameters gathered in one place; the fixed random_state keeps the
# shuffled run repeatable.
perceptron_params = {"max_iter": 20, "shuffle": True, "random_state": 42}

model = Perceptron(**perceptron_params)
model.fit(X_train, y_train)

# Evaluate the model on the test set.
# NOTE(review): score() is treated as a fraction (it is scaled by 100 for
# display) — confirm against functions.py.
test_accuracy = model.score(X_test, y_test)
print(f"Test Accuracy of Perceptron: {test_accuracy * 100:.2f}%")

In [None]:
# ------------------------------
# Plot the decision boundary
# ------------------------------

# For a linear classifier in 2D (without explicit bias) the decision boundary is defined by:
#    w1*x1 + w2*x2 = 0  =>  x2 = -(w1/w2)*x1
# When w2 is numerically zero the boundary is the vertical line x1 = 0. It
# cannot be written as x2 = f(x1), but it can still be drawn directly, so that
# case is handled separately instead of giving up on the plot.
# NOTE(review): assumes model.w holds exactly the two feature weights (no bias
# entry) — confirm against functions.py.

w = model.w
eps = 1e-6  # magnitudes below this are treated as zero

has_finite_slope = np.abs(w[1]) > eps
is_vertical = (not has_finite_slope) and np.abs(w[0]) > eps

if has_finite_slope or is_vertical:
    plt.figure(figsize=(8, 6))

    # Plot training and test points
    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap='coolwarm', marker='o', label='Train')
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap='coolwarm', marker='s', edgecolors='k', label='Test')

    # Plot decision boundary
    if has_finite_slope:
        # Sample the line over the training x-range, padded by one unit on each side.
        x_vals = np.linspace(X_train[:, 0].min()-1, X_train[:, 0].max()+1, 100)
        y_vals = -(w[0] * x_vals) / w[1]
        plt.plot(x_vals, y_vals, 'k--', label='Decision Boundary')
    else:
        # w1*x1 = 0 with w1 != 0  =>  x1 = 0
        plt.axvline(0.0, color='k', linestyle='--', label='Decision Boundary')

    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Perceptron Decision Boundary')
    plt.legend()
    plt.show()
else:
    # Both weights are ~0: every point lies on the "boundary", so there is no line to draw.
    print("Cannot plot decision boundary: the weight vector is too close to zero.")

## Summary

In this notebook we demonstrated how to preprocess a dataset by removing outliers, splitting and standardizing the data while avoiding data leakage, and finally training and visualizing a linear Perceptron classifier. You can extend this notebook by using other models from your code (e.g. PegasosSVM, KernelPerceptron) and by incorporating additional visualizations as needed.