# Classification on Structured Data

## Implementation of six different classifiers

In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.feature_selection import VarianceThreshold

In [32]:

# Base settings
n_samples = 1000
n_features = 100
n_informative = 10
n_redundant = 10
n_classes = 5
random_state = 42

# Generate a synthetic dataset.
# NOTE: make_classification already returns `n_features` columns; the columns
# that are neither informative nor redundant are filled with random noise.
X_base, y = make_classification(n_samples=n_samples, n_features=n_features, n_informative=n_informative,
                                n_redundant=n_redundant, n_clusters_per_class=1, n_classes=n_classes,
                                random_state=random_state)

# Add extra noise features on top of the generated ones.
# The final matrix therefore has n_features + n_extra_noise columns (100 + 80 = 180),
# not n_features.
np.random.seed(random_state)
n_extra_noise = n_features - n_informative - n_redundant
noise = np.random.normal(size=(n_samples, n_extra_noise))
X = np.hstack((X_base, noise))

# Feature names: one per actual column of X (previously only n_features names
# were generated, which did not match the width of X).
feature_names = [f'feature_{i}' for i in range(1, X.shape[1] + 1)]
assert len(feature_names) == X.shape[1], "feature_names must cover every column of X"

In [33]:
# Classifiers
# Estimators wrapped in a pipeline standardise the features before fitting.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier())
log_reg = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))
svm = make_pipeline(StandardScaler(), SVC(kernel='linear', random_state=42))

# Estimators used on the raw (unscaled) features.
nb = GaussianNB()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)


In [34]:
# Evaluate models using cross-validation
models = [knn, nb, log_reg, svm, dt, rf]
model_names = ['KNN', 'Naive Bayes', 'Logistic Regression', 'SVM', 'Decision Tree', 'Random Forest']

# One array of 5 fold accuracies per model, keyed by display name.
scores = {}
for name, model in zip(model_names, models):
    scores[name] = cross_val_score(model, X, y, cv=5)

# Report mean ± standard deviation across folds.
for name, score in scores.items():
    mean_acc = np.mean(score)
    std_acc = np.std(score)
    print(f"{name} Accuracy: {mean_acc:.2f} ± {std_acc:.2f}")


KNN Accuracy: 0.49 ± 0.04
Naive Bayes Accuracy: 0.67 ± 0.01
Logistic Regression Accuracy: 0.56 ± 0.02
SVM Accuracy: 0.59 ± 0.03
Decision Tree Accuracy: 0.56 ± 0.04
Random Forest Accuracy: 0.73 ± 0.02


## Feature Selection

### Why do we need to select features? (How do you justify your method in research?)

Let's discuss.

In [35]:
# 1. KNN with VarianceThreshold + SelectKBest
def select_features_knn(X, y, k=10):
    """Select features for KNN and return them with a fresh KNN pipeline.

    Features are first filtered with ``VarianceThreshold(threshold=0.1)``,
    then the ``k`` best of the remaining ones are kept according to the
    ``f_classif`` score.

    Parameters:
        X: feature matrix.
        y: target labels.
        k: number of features kept by ``SelectKBest``.

    Returns:
        (X_selected, model): the reduced matrix and an unfitted
        ``StandardScaler`` + ``KNeighborsClassifier`` pipeline.

    Note:
        Both selectors are fitted on every row of ``X``/``y`` passed in. If the
        returned matrix is cross-validated afterwards, the held-out folds have
        already influenced which features were kept.
    """
    reducer = make_pipeline(
        VarianceThreshold(threshold=0.1),
        SelectKBest(f_classif, k=k),
    )
    X_selected = reducer.fit_transform(X, y)
    return X_selected, make_pipeline(StandardScaler(), KNeighborsClassifier())

# 2. Naive Bayes with SelectKBest
def select_features_nb(X, y, k=10):
    """Keep the ``k`` best features by ``f_classif`` and return a fresh GaussianNB.

    Parameters:
        X: feature matrix.
        y: target labels.
        k: number of features kept by ``SelectKBest``.

    Returns:
        (X_selected, model): the reduced matrix and an unfitted ``GaussianNB``.

    Note:
        The selector is fitted on every row of ``X``/``y`` passed in, so a later
        cross-validation on the returned matrix reuses rows that already
        influenced the selection.
    """
    top_k = SelectKBest(score_func=f_classif, k=k)
    top_k.fit(X, y)
    return top_k.transform(X), GaussianNB()

# 3. Logistic Regression with RFE
def select_features_logreg(X, y, n_features_to_select=10):
    """Select features with RFE driven by logistic regression.

    ``X`` is standardised, then ``RFE`` with a ``LogisticRegression``
    estimator reduces it to ``n_features_to_select`` columns.

    Parameters:
        X: feature matrix.
        y: target labels.
        n_features_to_select: number of features RFE keeps.

    Returns:
        (X_selected, model): the standardised, reduced matrix and an unfitted
        ``LogisticRegression``. The returned model has no scaler of its own;
        the returned matrix is already scaled.

    Note:
        Scaler and RFE are fitted on every row of ``X``/``y`` passed in, so a
        later cross-validation on the returned matrix reuses rows that already
        influenced the scaling and the selection.
    """
    X_std = StandardScaler().fit_transform(X)
    eliminator = RFE(
        estimator=LogisticRegression(random_state=42),
        n_features_to_select=n_features_to_select,
    )
    X_selected = eliminator.fit(X_std, y).transform(X_std)
    return X_selected, LogisticRegression(random_state=42)

# 4. SVM with RFE
def select_features_svm(X, y, n_features_to_select=10):
    """Select features with RFE driven by a linear-kernel SVC.

    ``X`` is standardised, then ``RFE`` with ``SVC(kernel='linear')`` reduces
    it to ``n_features_to_select`` columns.

    Parameters:
        X: feature matrix.
        y: target labels.
        n_features_to_select: number of features RFE keeps.

    Returns:
        (X_selected, model): the standardised, reduced matrix and an unfitted
        linear ``SVC``. The returned model has no scaler of its own; the
        returned matrix is already scaled.

    Note:
        Scaler and RFE are fitted on every row of ``X``/``y`` passed in, so a
        later cross-validation on the returned matrix reuses rows that already
        influenced the scaling and the selection.
    """
    X_std = StandardScaler().fit_transform(X)
    eliminator = RFE(
        estimator=SVC(kernel='linear', random_state=42),
        n_features_to_select=n_features_to_select,
    )
    X_selected = eliminator.fit(X_std, y).transform(X_std)
    return X_selected, SVC(kernel='linear', random_state=42)

# 5. Decision Tree with built-in feature importance
def select_features_dt(X, y, threshold=0.01):
    """Keep the columns a decision tree rates above ``threshold``.

    A ``DecisionTreeClassifier`` is fitted on ``X``/``y`` and every column
    whose ``feature_importances_`` value is strictly greater than
    ``threshold`` is retained.

    Parameters:
        X: feature matrix, indexed as ``X[:, mask]``.
        y: target labels.
        threshold: importance cut-off (exclusive).

    Returns:
        (X_selected, model): the reduced matrix and an unfitted
        ``DecisionTreeClassifier``.

    Note:
        The ranking tree is fitted on every row of ``X``/``y`` passed in, so a
        later cross-validation on the returned matrix reuses rows that already
        influenced the selection.
    """
    ranking_tree = DecisionTreeClassifier(random_state=42).fit(X, y)
    keep = ranking_tree.feature_importances_ > threshold
    return X[:, keep], DecisionTreeClassifier(random_state=42)

# 6. Random Forest with built-in feature importance
def select_features_rf(X, y, threshold=0.01):
    """Keep the columns a random forest rates above ``threshold``.

    A ``RandomForestClassifier`` is fitted on ``X``/``y`` and every column
    whose ``feature_importances_`` value is strictly greater than
    ``threshold`` is retained.

    Parameters:
        X: feature matrix, indexed as ``X[:, mask]``.
        y: target labels.
        threshold: importance cut-off (exclusive).

    Returns:
        (X_selected, model): the reduced matrix and an unfitted
        ``RandomForestClassifier``.

    Note:
        The ranking forest is fitted on every row of ``X``/``y`` passed in, so
        a later cross-validation on the returned matrix reuses rows that
        already influenced the selection.
    """
    ranking_forest = RandomForestClassifier(random_state=42).fit(X, y)
    keep = ranking_forest.feature_importances_ > threshold
    return X[:, keep], RandomForestClassifier(random_state=42)

### Explanation of `f_classif` and `RFE`

#### `f_classif` (F-Classification)
`f_classif` is a scoring function used in feature selection methods like `SelectKBest`. It performs an ANOVA F-test to evaluate the relationship between each feature and the target variable. This method assumes that higher F-values indicate that the feature provides more significant discriminatory power for the target classes. Essentially, `f_classif` helps to select the features that have the strongest relationship with the target.

- **How it works**: 
  - It computes the variance between groups (based on the target classes) and compares it to the variance within groups (i.e., within the same class).
  - Features with a larger ratio of between-group to within-group variance (i.e., a higher F-value) are considered more useful for classification tasks.
  
#### `RFE` (Recursive Feature Elimination)
`RFE` is a feature selection method that recursively removes less important features to reduce the feature set size. The algorithm starts by training a model (such as Logistic Regression or SVM) on the full feature set. It then ranks the importance of each feature according to the model's coefficients or importance scores. The least important feature is removed, and the process is repeated until the desired number of features is left.

- **How it works**:
  1. The model is trained on all features.
  2. Features are ranked based on their importance (e.g., the magnitude of their coefficients in Logistic Regression).
  3. The least important feature is removed, and the model is retrained on the remaining features.
  4. Steps 2-3 are repeated until the desired number of features is selected.

- **Use case**: `RFE` is particularly useful when you want to optimize model performance by retaining only the most relevant features and reducing the computational cost.


In [39]:
# Define the original models
knn = make_pipeline(StandardScaler(), KNeighborsClassifier())
nb = GaussianNB()
log_reg = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))
svm = make_pipeline(StandardScaler(), SVC(kernel='linear', random_state=42))
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)

models = [knn, nb, log_reg, svm, dt, rf]
model_names = ['KNN', 'Naive Bayes', 'Logistic Regression', 'SVM', 'Decision Tree', 'Random Forest']

# Evaluate the original models.
# A comprehension is used so no loop temporary overwrites the `scores`
# dictionary created by the baseline evaluation cell.
original_scores = {
    name: cross_val_score(model, X, y, cv=5)
    for name, model in zip(model_names, models)
}

# Apply feature selection and create new models.
# The selection helpers are dispatched by name, so the baseline estimators
# bound to `knn`, `nb`, ... above (and stored in `models`) are not rebound to
# the post-selection estimators.
feature_selectors = {
    'KNN': select_features_knn,
    'Naive Bayes': select_features_nb,
    'Logistic Regression': select_features_logreg,
    'SVM': select_features_svm,
    'Decision Tree': select_features_dt,
    'Random Forest': select_features_rf,
}

# Dictionary of model name -> (selected features, corresponding model).
# CAVEAT: each helper fits its selector on the full X and y before
# cross-validation, so the held-out folds have already influenced which
# features were kept; the scores below are therefore optimistic.
selected_data = {name: select(X, y) for name, select in feature_selectors.items()}

# Evaluate models after feature selection using cross-validation
scores_after_selection = {
    name: cross_val_score(model, X_selected, y, cv=5)
    for name, (X_selected, model) in selected_data.items()
}

# Print the results
print("Results after feature selection:")
for name, score in scores_after_selection.items():
    print(f"{name} Accuracy: {np.mean(score):.2f} ± {np.std(score):.2f}")

# Compare with the original results
print("\nComparison with original results:")
for name in model_names:
    original_score = original_scores[name]
    new_score = scores_after_selection[name]
    print(f"{name}:")
    print(f"  Original: {np.mean(original_score):.2f} ± {np.std(original_score):.2f}")
    print(f"  After feature selection: {np.mean(new_score):.2f} ± {np.std(new_score):.2f}")
    print(f"  Difference: {np.mean(new_score) - np.mean(original_score):.2f}")
    print()


Results after feature selection:
KNN Accuracy: 0.77 ± 0.03
Naive Bayes Accuracy: 0.64 ± 0.02
Logistic Regression Accuracy: 0.73 ± 0.04
SVM Accuracy: 0.81 ± 0.01
Decision Tree Accuracy: 0.63 ± 0.03
Random Forest Accuracy: 0.81 ± 0.02

Comparison with original results:
KNN:
  Original: 0.49 ± 0.04
  After feature selection: 0.77 ± 0.03
  Difference: 0.28

Naive Bayes:
  Original: 0.67 ± 0.01
  After feature selection: 0.64 ± 0.02
  Difference: -0.03

Logistic Regression:
  Original: 0.56 ± 0.02
  After feature selection: 0.73 ± 0.04
  Difference: 0.17

SVM:
  Original: 0.59 ± 0.03
  After feature selection: 0.81 ± 0.01
  Difference: 0.21

Decision Tree:
  Original: 0.56 ± 0.04
  After feature selection: 0.63 ± 0.03
  Difference: 0.07

Random Forest:
  Original: 0.73 ± 0.02
  After feature selection: 0.81 ± 0.02
  Difference: 0.09



In [41]:
from sklearn.decomposition import PCA

# Define the PCA transformation function
def apply_pca(X, n_components=10, random_state=None):
    """Fit PCA on ``X`` and return the projected data with the fitted transformer.

    Parameters:
        X: feature matrix to fit and project.
        n_components: number of principal components to keep.
        random_state: forwarded to ``PCA``; pass an int for reproducible
            results when a randomised SVD solver is used. Defaults to ``None``,
            which matches the previous behaviour.

    Returns:
        (X_pca, pca): the projection of ``X`` and the fitted ``PCA`` object.

    Note:
        The PCA is fitted on every row of ``X`` passed in. For cross-validated
        evaluation, place an unfitted ``PCA`` inside the model pipeline instead
        of reusing ``X_pca``.
    """
    pca = PCA(n_components=n_components, random_state=random_state)
    X_pca = pca.fit_transform(X)
    return X_pca, pca

# Choose the number of principal components
n_components = 10

# Apply PCA to each model and evaluate
pca_models = []
pca_scores = {}

for name, model in zip(model_names, models):
    # Use an unfitted PCA as the first pipeline step: cross_val_score clones
    # the pipeline and fits it on each training split, so fitting PCA on the
    # full X beforehand is wasted work. The PCA is seeded so the result does
    # not depend on the global NumPy random state if a randomised solver is
    # chosen.
    model_pca = make_pipeline(PCA(n_components=n_components, random_state=random_state), model)
    pca_models.append(model_pca)
    pca_scores[name] = cross_val_score(model_pca, X, y, cv=5)

# Print the results after applying PCA to the models
print("Results after applying PCA:")
for name, fold_scores in pca_scores.items():
    print(f"{name} Accuracy: {np.mean(fold_scores):.2f} ± {np.std(fold_scores):.2f}")

# Compare with original results
print("\nComparison with original results:")
for name in model_names:
    original_score = original_scores[name]
    pca_score = pca_scores[name]
    print(f"{name}:")
    print(f"  Original: {np.mean(original_score):.2f} ± {np.std(original_score):.2f}")
    print(f"  After PCA: {np.mean(pca_score):.2f} ± {np.std(pca_score):.2f}")
    print(f"  Difference: {np.mean(pca_score) - np.mean(original_score):.2f}")
    print()


Results after applying PCA:
KNN Accuracy: 0.84 ± 0.01
Naive Bayes Accuracy: 0.78 ± 0.01
Logistic Regression Accuracy: 0.72 ± 0.04
SVM Accuracy: 0.80 ± 0.01
Decision Tree Accuracy: 0.67 ± 0.02
Random Forest Accuracy: 0.83 ± 0.03

Comparison with original results:
KNN:
  Original: 0.49 ± 0.04
  After PCA: 0.84 ± 0.01
  Difference: 0.35

Naive Bayes:
  Original: 0.67 ± 0.01
  After PCA: 0.78 ± 0.01
  Difference: 0.11

Logistic Regression:
  Original: 0.56 ± 0.02
  After PCA: 0.72 ± 0.04
  Difference: 0.16

SVM:
  Original: 0.59 ± 0.03
  After PCA: 0.80 ± 0.01
  Difference: 0.21

Decision Tree:
  Original: 0.56 ± 0.04
  After PCA: 0.67 ± 0.02
  Difference: 0.11

Random Forest:
  Original: 0.73 ± 0.02
  After PCA: 0.83 ± 0.03
  Difference: 0.10



### My next question: which method is better, and how do you justify it?

### Common tips to justify your method in research