### Basic SVC with Synthetic Data

In [1]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt # Optional for visualization

In [4]:
# --- 1. Generate Synthetic Data ---
# Two features keep the dataset easy to plot in 2-D (plotting itself is optional).
# Argument meanings:
#   n_samples            - number of data points
#   n_features           - number of input features
#   n_informative        - features that actually carry class information
#   n_redundant          - features that are linear combinations of informative ones
#   n_clusters_per_class - number of clusters generated for each class
#   n_classes            - number of output classes (2 = binary classification)
#   random_state         - fixed seed for reproducibility
X, y = make_classification(
    n_samples=200,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    n_classes=2,
    random_state=42,
)

In [6]:
# Display the generated feature matrix (200 samples x 2 features).
# NOTE(review): this echoes the whole array; a slice such as X[:5] would give a shorter preview.
X

array([[-0.87292898,  0.013042  ],
       [ 1.31293463,  2.77053357],
       [ 2.34042818,  2.42099601],
       [ 2.29454774, -0.40438019],
       [ 0.94410516,  0.4772409 ],
       [-0.11959689,  0.50891314],
       [ 0.1510847 ,  0.81007677],
       [-0.00745441, -0.45284256],
       [-1.25396925,  0.06769236],
       [-0.24392415,  1.19979806],
       [-1.9208928 ,  2.9189499 ],
       [ 0.48806269,  3.50578584],
       [ 2.25751204,  1.81515089],
       [-3.12190908,  0.62220145],
       [ 1.87768113,  0.51095984],
       [ 0.46785542,  0.7871928 ],
       [ 0.48404809,  0.04843842],
       [-0.54235518,  3.49536942],
       [ 1.04235818,  1.10204918],
       [ 1.63367133,  0.59219328],
       [ 1.85756778,  1.7382647 ],
       [-2.41167954, -0.80620914],
       [ 0.87481918,  0.48815664],
       [ 0.09529002,  0.57526555],
       [-1.64994223, -0.06695917],
       [-0.65245349, -0.19158949],
       [ 2.10252081,  2.34684404],
       [ 2.9574463 ,  2.01418031],
       [-1.70056309,

In [8]:
# Display the generated class labels (one 0/1 label per sample).
y

array([1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0])

In [10]:
# --- Optional: Visualize the generated data ---
# (disabled; uncomment to draw a scatter plot of the two features coloured by class)
# plt.figure(figsize=(8, 6))
# plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k')
# plt.title("Synthetic Classification Data")
# plt.xlabel("Feature 1")
# plt.ylabel("Feature 2")
# plt.show()

# --- 2. Split Data into Training and Testing Sets ---
# Hold out 30% of the samples for evaluation (test_size=0.3); the fixed
# random_state makes the partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [12]:
# Report how many rows ended up in each split (first dimension of each array).
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

Training set size: 140 samples
Test set size: 60 samples


In [14]:
# --- 3. Feature Scaling ---
# SVC is sensitive to the scale of its inputs, so standardize first:
# StandardScaler removes the mean and scales each feature to unit variance.
# Learn the mean/variance from the training split ONLY (avoids data leakage) ...
scaler = StandardScaler().fit(X_train)
# ... then apply that one fitted transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
# --- 4. Initialize and Train the SVC Model ---
# Hyperparameters set here:
# kernel: kernel function ('linear', 'poly', 'rbf', 'sigmoid'); 'rbf' is often a good default.
# C: regularization parameter; regularization strength is inversely proportional to C.
#    A smaller C favours a wider margin and tolerates more misclassified training points;
#    a larger C pushes to classify every training point correctly and can overfit.
# gamma: kernel coefficient for 'rbf', 'poly' and 'sigmoid'; governs how far the influence
#        of a single training example reaches. 'scale' is 1 / (n_features * X.var()),
#        'auto' is 1 / n_features.
# probability: True enables predict_proba; it has to be set before fit() and makes
#              fitting slower because of an internal cross-validation.
# random_state: seeds the data shuffling used for the probability estimates.
svc_model = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    probability=True,
    random_state=42,
)

In [18]:
# Progress message shown before the model is fitted (leading "\n" adds a blank line).
print("\nTraining SVC model...")


Training SVC model...


In [20]:
# Train the model using the scaled training data
# (features: X_train_scaled, labels: y_train).
svc_model.fit(X_train_scaled, y_train)

In [22]:
# Progress message shown once fitting has finished.
print("Training complete.")

Training complete.


In [24]:
# --- 5. Make Predictions ---
# Predict class labels for the scaled test data.
# The test features must be the scaled version, matching what the model was trained on.
y_pred = svc_model.predict(X_test_scaled)

In [26]:
# Predict probabilities (optional, requires probability=True during initialization)
# y_pred_proba = svc_model.predict_proba(X_test_scaled)

# --- 6. Evaluate the Model ---
# All three metrics compare the true test labels (y_test) with the predictions (y_pred).
# accuracy: fraction of test samples whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
# conf_matrix: counts of samples per (true class, predicted class) pair.
conf_matrix = confusion_matrix(y_test, y_pred)
# class_report: text summary of precision, recall, f1-score and support per class.
class_report = classification_report(y_test, y_pred)

In [28]:
# Print the headline metrics: accuracy (4 decimal places) followed by the confusion matrix.
print("\n--- Model Evaluation ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)


--- Model Evaluation ---
Accuracy: 0.8500

Confusion Matrix:
[[26  8]
 [ 1 25]]


In [30]:
# Rows: True Class, Columns: Predicted Class
# [[True Negative (TN), False Positive (FP)],
#  [False Negative (FN), True Positive (TP)]]

In [32]:
# Print the per-class report computed earlier; the notes below explain its columns.
print("\nClassification Report:")
print(class_report)
# precision: TP / (TP + FP) - Accuracy of positive predictions.
# recall: TP / (TP + FN) - Sensitivity, ability to find all positive samples.
# f1-score: Harmonic mean of precision and recall.
# support: Number of actual occurrences of the class in the test set.


Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.76      0.85        34
           1       0.76      0.96      0.85        26

    accuracy                           0.85        60
   macro avg       0.86      0.86      0.85        60
weighted avg       0.87      0.85      0.85        60



In [34]:
# --- Optional: Print some predictions vs actual ---
print("\n--- Sample Predictions (Test Set) ---")
# Walk the true and predicted labels side by side; slicing caps the listing at
# the first 10 test samples (fewer if the test set is smaller).
for i, (actual_label, predicted_label) in enumerate(zip(y_test[:10], y_pred[:10])):
    # predicted_prob = y_pred_proba[i] # Uncomment if using predict_proba
    print(f"Sample {i}: Actual={actual_label}, Predicted={predicted_label}")


--- Sample Predictions (Test Set) ---
Sample 0: Actual=0, Predicted=0
Sample 1: Actual=0, Predicted=1
Sample 2: Actual=0, Predicted=1
Sample 3: Actual=1, Predicted=1
Sample 4: Actual=1, Predicted=1
Sample 5: Actual=1, Predicted=1
Sample 6: Actual=0, Predicted=0
Sample 7: Actual=0, Predicted=0
Sample 8: Actual=0, Predicted=0
Sample 9: Actual=0, Predicted=0


In [36]:
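# To also show class probabilities in the sample printout, uncomment the predict_proba
# lines in the earlier cells and append ", Probabilities={predicted_prob}" inside the f-string.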