#***`Step-by-Step Implementation`***

>* ***`1) Load and prepare the dataset`***

>* ***`2) Define the classification algorithms`***

>* ***`3) Train, predict and evaluate metrics`***

>* ***`4) Determine the best algorithm`***

#***`1) Load and Prepare the Dataset`***

In [None]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score,confusion_matrix

# Load the Iris dataset (swap in any other dataset here)
data = load_iris()
x = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# Optional: standardize features (fit the scaler on the training split only).
# NOTE(review): scaled features can be negative, which MultinomialNB is not
# expected to accept -- confirm before enabling.
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [None]:
# Display the raw dataset object returned by load_iris()
data

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [None]:
# Display the feature names that are used as the DataFrame column labels
data.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

#***`2) Define the Classification Algorithms`***

In [None]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Perceptron, PassiveAggressiveClassifier, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

# Note: XGBoost, LightGBM, CatBoost, CNN, RNN, LSTM require additional libraries
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Shared seed for every estimator that has a source of randomness, so the
# scores (and therefore the model ranking) are repeatable from run to run.
# Same value as the seed used for the train/test split.
RANDOM_STATE = 42

# Models dictionary: display name -> unfitted classifier instance.
# Estimators without a random component (LDA, QDA, KNN, Naive Bayes, and the
# default deterministic solvers of LogisticRegression / RidgeClassifier) are
# left with their defaults.
models = {
    'Logistic Regression': LogisticRegression(),
    'LDA': LinearDiscriminantAnalysis(),
    'Ridge Classifier': RidgeClassifier(),
    'Perceptron': Perceptron(random_state=RANDOM_STATE),
    'Passive Aggressive': PassiveAggressiveClassifier(random_state=RANDOM_STATE),
    'SGD Classifier': SGDClassifier(random_state=RANDOM_STATE),
    # probability=True enables predict_proba for SVC
    'SVC': SVC(probability=True, random_state=RANDOM_STATE),
    'Linear SVC': LinearSVC(random_state=RANDOM_STATE),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Extra Trees': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'XGBoost': xgb.XGBClassifier(random_state=RANDOM_STATE),
    'LightGBM': lgb.LGBMClassifier(random_state=RANDOM_STATE),
    # verbose=0 silences CatBoost's per-iteration training log
    'CatBoost': CatBoostClassifier(verbose=0, random_seed=RANDOM_STATE),
    'Gaussian NB': GaussianNB(),
    'Multinomial NB': MultinomialNB(),
    'Bernoulli NB': BernoulliNB(),
    'MLP': MLPClassifier(random_state=RANDOM_STATE),
    'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis(),
    'Gaussian Process Classifier': GaussianProcessClassifier(random_state=RANDOM_STATE),
    # Voting Classifier and Stacking Classifier can be added later using multiple of these base classifiers
}


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



#***`3) Train, Predict and Evaluate Metrics`***

Train each model, predict on the test set, and evaluate performance using various metrics.

In [None]:
from sklearn.metrics import classification_report

# One dict of metrics per model, in the iteration order of `models`.
results = []

for model_name, model in models.items():
    # Fit on the training split, then score on the held-out test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC AUC is intentionally left disabled. If it is re-enabled, multiclass
    # AUC needs the full (n_samples, n_classes) probability matrix rather than
    # a single column, and only for models that expose predict_proba:
    # y_prob = model.predict_proba(X_test) if hasattr(model, 'predict_proba') else None
    # auc = roc_auc_score(y_test, y_prob, multi_class='ovr') if y_prob is not None else 'N/A'

    # zero_division=0 reports the same 0.0 sklearn already uses for a class
    # that is never predicted, but without raising UndefinedMetricWarning.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, zero_division=0)

    results.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        # 'ROC AUC': auc,
        'Confusion Matrix': cm,
        'Classification Report': report,
    })

# Display the results. The loop variable is deliberately NOT named `results`:
# rebinding that name would replace the list with its last entry and break any
# later cell that needs the full list.
for result in results:
    print(f"Model : {result['Model']}")
    print(f"Accuracy : {result['Accuracy']:.4f}")
    print(f"Precision : {result['Precision']:.4f}")
    print(f"Recall : {result['Recall']:.4f}")
    print(f"F1 Score : {result['F1 Score']:.4f}")
    # print(f"ROC AUC : {result['ROC AUC']}")
    print("Confusion Matrix")
    print(result['Confusion Matrix'])
    print("Classification Report : ")
    print(result['Classification Report'])
    print("_" * 60)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000823 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 86
[LightGBM] [Info] Number of data points in the train set: 105, number of used features: 4
[LightGBM] [Info] Start training from score -1.219973
[LightGBM] [Info] Start training from score -1.043042
[LightGBM] [Info] Start training from score -1.043042


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model : Logistic Regression
Accuracy : 1.0000
Precision : 1.0000
Recall : 1.0000
F1 Score : 1.0000
Confusion Matrix
[[19  0  0]
 [ 0 13  0]
 [ 0  0 13]]
Classification Report : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

____________________________________________________________
Model : LDA
Accuracy : 1.0000
Precision : 1.0000
Recall : 1.0000
F1 Score : 1.0000
Confusion Matrix
[[19  0  0]
 [ 0 13  0]
 [ 0  0 13]]
Classification Report : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

