---
title: segmentation
---

---
title: Implémentation de classifieurs binaires 
---

## Imports

In [None]:
from joblib import parallel_backend
# Request joblib's "loky" backend with n_jobs=-1.
# NOTE(review): this is called as a plain statement rather than inside a
# `with` block — confirm the setting is meant to stay active for the cells below.
parallel_backend("loky", n_jobs=-1)

In [None]:
from get_dataset import dataset_loaders
# Default to the first dataset name exposed by the loader mapping.
dataset = list(dataset_loaders)[0]

In [None]:
# Parameters
# Overrides the default dataset name selected in the previous cell.
# NOTE(review): this looks like an automatically injected parameters cell —
# confirm before editing it by hand.
dataset = "segmentation"


In [None]:
from get_dataset import load_dataset

# Load the selected dataset: X is used below as a 2-D feature table
# (X.shape[0] samples, X.shape[1] features) and y as the target vector.
X, y = load_dataset(dataset)

# Registry of tuned models, keyed by a human-readable model name.
models = {}


def store_results(name, grid):
    """Record the outcome of a fitted search object in ``models``.

    Args:
        name: Key under which the entry is stored; an existing entry with
            the same name is overwritten.
        grid: Object exposing ``best_params_`` and ``best_estimator_``
            (a fitted ``GridSearchCV`` everywhere in this notebook).

    Returns:
        None. The function only updates the module-level ``models`` dict.
    """
    entry = {}
    entry["best_params"] = grid.best_params_
    entry["best_estimator"] = grid.best_estimator_
    models[name] = entry
    

## Data presentation

The **{eval}`dataset`** dataset contains `n` = {eval}`X.shape[0]` samples and `p` = {eval}`X.shape[1]` features.

The target variable is binary and {eval}`round(y.mean() * 100, 2)`% of the samples are positive.

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 30 % of the samples for testing; the seed is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Standardize the features. The scaling statistics are learned on the
# training split only and then applied to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Entraînement des classifieurs

### Classifieurs non paramétriques

#### K-Nearest Neighbors

In [36]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Plain KNN with uniform votes; only the neighbourhood size is tuned.
model = KNeighborsClassifier(algorithm='auto', weights='uniform')
param_grid = {'n_neighbors': [3, 5, 7, 9]}

# 5-fold cross-validated search on accuracy; the best configuration is
# refitted on the whole training split.
grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, refit=True
)
grid_search.fit(X_train, y_train)

store_results('KNN', grid_search)

#### Distance-Weighted KNN

In [37]:
# KNN whose votes are weighted by distance; only the neighbourhood size is tuned.
model = KNeighborsClassifier(algorithm='auto', weights='distance')
param_grid = {'n_neighbors': [3, 5, 7, 9]}

# 5-fold cross-validated search on accuracy; the best configuration is
# refitted on the whole training split.
grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, refit=True
)
grid_search.fit(X_train, y_train)

store_results('KNN Distance Weighted', grid_search)

#### Condensed Nearest Neighbor

In [38]:
from imblearn.under_sampling import CondensedNearestNeighbour
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_X_y
from sklearn.utils.validation import validate_data

class CondensedNearestNeighbourTransformer(BaseEstimator, TransformerMixin):
    """Transformer-style wrapper around ``CondensedNearestNeighbour``.

    The constructor only stores its arguments; they are forwarded unchanged
    to a fresh ``CondensedNearestNeighbour`` each time ``transform`` receives
    a target vector.

    NOTE(review): ``transform`` only resamples when ``y`` is passed to it.
    Confirm that the pipeline using this class actually forwards ``y`` to
    ``transform``; if it does not, this step returns ``X`` untouched.
    """

    def __init__(self, sampling_strategy="auto", random_state=42, n_neighbors=None, n_seeds_S=1):
        self.sampling_strategy = sampling_strategy
        self.random_state = random_state
        self.n_neighbors = n_neighbors
        self.n_seeds_S = n_seeds_S

    def fit(self, X, y=None):
        """Record the number of input columns and return ``self``.

        No input validation is performed here.
        """
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X, y=None):
        """Return ``X`` as-is without ``y``, else the resampled ``(X, y)``.

        With a target vector, the result is whatever
        ``CondensedNearestNeighbour.fit_resample(X, y)`` returns.
        """
        # Without targets there is nothing to condense.
        if y is None:
            return X

        sampler = CondensedNearestNeighbour(
            sampling_strategy=self.sampling_strategy,
            random_state=self.random_state,
            n_neighbors=self.n_neighbors,
            n_seeds_S=self.n_seeds_S,
        )
        return sampler.fit_resample(X, y)

from sklearn.utils.estimator_checks import check_estimator
# check_estimator(CondensedNearestNeighbourTransformer())

In [39]:
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline

# Condensed Nearest Neighbour followed by a uniform-vote KNN.
#
# The sampler is used directly inside an imbalanced-learn pipeline. The custom
# CondensedNearestNeighbourTransformer defined above only resamples when
# transform() receives y, and a scikit-learn Pipeline calls transform(X)
# without y, so with that wrapper the condensing step silently did nothing.
# The imbalanced-learn pipeline calls fit_resample(X, y) on sampler steps
# during fit only, and skips them at prediction time.
model = ImbPipeline([
    ('cnn', CondensedNearestNeighbour(sampling_strategy='auto', random_state=42, n_neighbors=3, n_seeds_S=1)),
    ('knn', KNeighborsClassifier(weights='uniform', algorithm='auto'))
])

# Tune the neighbourhood size of both the condensing step and the classifier.
param_grid = {
    'cnn__n_neighbors': [3, 5, 7, 9],
    'knn__n_neighbors': [3, 5, 7, 9],
}

# 5-fold cross-validated search on accuracy; the best pipeline is refitted
# on the whole training split.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    refit=True
    )

grid_search.fit(X_train, y_train)
store_results('KNN Condensed Nearest Neighbor', grid_search)

#### Locally Adaptive KNN

In [None]:
class LocallyAdaptiveKNN(KNeighborsClassifier):
    """KNN variant that re-votes inside each query point's neighbourhood.

    For every sample, the ``n_neighbors`` nearest training points are
    retrieved, then a second KNN using half as many neighbours (at least one)
    is fitted on those points only and used to classify the sample.
    """

    def predict(self, X):
        """Predict one class label per row of ``X``.

        Args:
            X: Query samples, in any form accepted by ``kneighbors``.

        Returns:
            A NumPy array with one predicted label per sample, expressed in
            the original label space (``classes_``).
        """
        # Local import: numpy is not imported at file level in the visible source.
        import numpy as np

        _, indices = self.kneighbors(X)
        # Row access below needs positional indexing.
        rows = np.asarray(X)

        predictions = []
        for row, neighbors in zip(rows, indices):
            # Never ask for 0 neighbours (n_neighbors=1 would give 1 // 2 == 0).
            local_k = max(1, len(neighbors) // 2)
            local_knn = KNeighborsClassifier(n_neighbors=local_k)
            # `_y` stores positions into `classes_`, not the labels themselves;
            # map back so predictions are real labels.
            # NOTE(review): relies on scikit-learn private attributes and on a
            # single-output target — confirm if multi-output is ever needed.
            local_labels = self.classes_[self._y[neighbors]]
            local_knn.fit(self._fit_X[neighbors], local_labels)
            predictions.append(local_knn.predict(row.reshape(1, -1))[0])
        return np.asarray(predictions)

# Locally adaptive KNN; only the outer neighbourhood size is tuned.
model = LocallyAdaptiveKNN(algorithm='auto', weights='uniform')
param_grid = {'n_neighbors': [3, 5, 7, 9]}

# 5-fold cross-validated search on accuracy; the best configuration is
# refitted on the whole training split.
grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, refit=True
)
grid_search.fit(X_train, y_train)

store_results('KNN Locally Adaptive', grid_search)

### Classifieurs binaires non linéaires

#### DecisionTrees

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with a fixed seed; depth and minimum split size are tuned.
model = DecisionTreeClassifier(random_state=42)
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
}

# 5-fold cross-validated search on accuracy; the best tree is refitted on
# the whole training split.
grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, refit=True
)
grid_search.fit(X_train, y_train)

store_results('Decision Tree', grid_search)

#### RandomForest

**Variantes:**
- simple
- cost-sensitive learning

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed. Forest size, tree depth, minimum split
# size and the class-weighting scheme are tuned.
model = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'class_weight': ['balanced', 'balanced_subsample'],
}

# 5-fold cross-validated search on accuracy; the best forest is refitted on
# the whole training split.
grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, refit=True
)
grid_search.fit(X_train, y_train)

store_results('Random Forest', grid_search)

#### AdaBoost

**Variantes:**
- simple
- early stopping

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with a fixed seed; number of boosting rounds and learning rate are tuned.
model = AdaBoostClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
}

# 5-fold cross-validated search on accuracy; the best ensemble is refitted
# on the whole training split.
grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, refit=True
)
grid_search.fit(X_train, y_train)

store_results('AdaBoost', grid_search)

### Classifieurs binaires paramétriques

#### SVM linéaire
- One-class SVM pour gérer le déséquilibre des classes
- Combiner avec des méthodes de sous/sur-échantillonnage (SMOTE, RandomUnderSampling)
- Cost-sensitive SVM : Différentes pénalités C pour chaque classe
- Ensemble de SVMs avec bagging

In [None]:
from sklearn.svm import OneClassSVM, SVC
from sklearn.ensemble import BaggingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# One-class SVM for handling class imbalance (defined here but not part of
# the pipeline searched below).
one_class_svm = OneClassSVM(gamma='auto')

# SMOTE for oversampling
smote = SMOTE(random_state=42)

# RandomUnderSampler for undersampling
under_sampler = RandomUnderSampler(random_state=42)

# Cost-sensitive SVM: class_weight='balanced' gives each class its own penalty
cost_sensitive_svm = SVC(class_weight='balanced', random_state=42)

# Bagging ensemble of SVMs.
# The wrapped model is passed as `estimator`: the former `base_estimator`
# keyword is not accepted by the scikit-learn releases this notebook targets
# (it imports `validate_data`, which only exists in recent releases).
bagging_svm = BaggingClassifier(estimator=cost_sensitive_svm, n_estimators=10, random_state=42)

# Pipeline: SMOTE oversampling, then random undersampling, then the bagged SVM
model = ImbPipeline([
    ('smote', smote),
    ('under', under_sampler),
    ('svm', bagging_svm)
])

# Nested parameter names follow the `estimator` keyword used above.
param_grid = {
    'svm__estimator__C': [0.1, 1, 10],
    'svm__estimator__gamma': ['scale', 'auto']
}

# 5-fold cross-validated search on accuracy; the best pipeline is refitted
# on the whole training split.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    refit=True
)

grid_search.fit(X_train, y_train)
# Stored under its own key: reusing 'AdaBoost' here overwrote the AdaBoost results.
store_results('SVM Bagging', grid_search)

#### Régression logistique
- Régression logistique avec pénalisation élastique (combinaison L1/L2)
- Cost-sensitive avec pondération des classes
- Régression logistique polynomiale
- Régression logistique avec sélection de features

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

# Logistic Regression with elastic net penalty (L1/L2 combination)
elastic_net_lr = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, random_state=42)

# Cost-sensitive Logistic Regression with class weighting
cost_sensitive_lr = LogisticRegression(class_weight='balanced', random_state=42)

# Polynomial Logistic Regression: degree-2 feature expansion, then the classifier
polynomial_features = PolynomialFeatures(degree=2, include_bias=False)
polynomial_lr = Pipeline([
    ('poly', polynomial_features),
    ('logistic', LogisticRegression(random_state=42))
])

# Logistic Regression with feature selection driven by an L1-penalized model
feature_selector = SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear', random_state=42))
selected_features_lr = Pipeline([
    ('feature_selection', feature_selector),
    ('logistic', LogisticRegression(random_state=42))
])

# Parameter grid for each model.
# The first two models are bare estimators, so their parameters are addressed
# directly; the "<step>__" prefix is only valid for the two pipelines.
param_grids = {
    'Elastic Net Logistic Regression': {
        'C': [0.1, 1, 10],
        'l1_ratio': [0.1, 0.5, 0.9]
    },
    'Cost Sensitive Logistic Regression': {
        'C': [0.1, 1, 10]
    },
    'Polynomial Logistic Regression': {
        'logistic__C': [0.1, 1, 10]
    },
    'Selected Features Logistic Regression': {
        'feature_selection__max_features': [5, 10, 15],
        'logistic__C': [0.1, 1, 10]
    }
}

# Estimator tuned for each entry of `param_grids` (same keys).
estimators = {
    'Elastic Net Logistic Regression': elastic_net_lr,
    'Cost Sensitive Logistic Regression': cost_sensitive_lr,
    'Polynomial Logistic Regression': polynomial_lr,
    'Selected Features Logistic Regression': selected_features_lr,
}

# Run one 5-fold accuracy grid search per model and record the best result.
for name, param_grid in param_grids.items():
    model = estimators[name]

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        refit=True
    )

    grid_search.fit(X_train, y_train)
    store_results(name, grid_search)

### Évaluation des classifieurs binaires

In [41]:
# NOTE(review): the recorded output of this cell is an ImportError saying that
# `roc_plot` cannot be imported from `utils` — confirm the helper names.
from utils import roc_plot, precision_recall_plot, table_report

# Evaluate the tuned distance-weighted KNN recorded by store_results().
# `model` cannot be used here: it is only the unfitted template handed to the
# most recent GridSearchCV, which fits clones and leaves the template untouched.
weighted_knn = models['KNN Distance Weighted']['best_estimator']

# Hard predictions and positive-class probabilities (column 1) on the test split.
y_pred_weighted_knn = weighted_knn.predict(X_test)
y_pred_proba_weighted_knn = weighted_knn.predict_proba(X_test)[:, 1]

ImportError: cannot import name 'roc_plot' from 'utils' (/Users/mathisderenne/Documents/02 - Scolaire/M1 MIASHS/02 - Guillaume Mezler/Projet/src/utils.py)

La [](#table_report_LR1) montre les résultats de la classification par le modèle de régression logistique. On observe que :

- $83{,}04 \%$ des *spams* sont correctement identifiés
- $99{,}59 \%$ des *hams* sont correctement identifiés
- $96{,}88 \%$ des observations classifiées en tant que *spam* sont des *spams*
- $97{,}43 \%$ des observations classifiées en tant que *ham* sont des *hams*
- Le score F1 moyen pondéré est de $97{,}98 \%$