# ML Classification Problems

In [4]:
# Import Libraries
from sklearn.datasets import load_iris, load_wine, load_breast_cancer, make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [6]:
# Load datasets, keyed by the display name used in the results tables.
# The sklearn loaders return Bunch objects, whereas make_classification
# returns an (X, y) tuple; the evaluation loop below handles both forms.
datasets = dict(
    Iris=load_iris(),
    Wine=load_wine(),
    Breast_Cancer=load_breast_cancer(),
    Synthetic_Binary=make_classification(
        n_samples=200,
        n_features=5,
        n_classes=2,
        random_state=42,
    ),
    Synthetic_Multi=make_classification(
        n_samples=200,
        n_features=5,
        n_classes=3,
        n_clusters_per_class=1,
        random_state=42,
    ),
)

In [8]:
# Classification models, keyed by the name shown in the results tables.
# The tree-based estimators are randomized, so they get a fixed random_state
# (the same seed used for the train/test split and the synthetic datasets)
# to make the reported accuracies reproducible across runs.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "SVC": SVC(),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42)
}


In [10]:
# Accumulator mapping dataset name -> per-model results; filled by the evaluation loop
all_results = dict()

In [12]:
# Loop through datasets: split, scale, fit every model, and record its metrics
for name, dataset in datasets.items():
    # Synthetic datasets are (X, y) tuples; the sklearn loaders return Bunch objects
    if isinstance(dataset, tuple):
        X, y = dataset
        feature_names = [f"Feature_{i}" for i in range(X.shape[1])]
    else:
        X = dataset.data
        y = dataset.target
        feature_names = dataset.feature_names

    # Split BEFORE scaling so the held-out rows never influence the scaler
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training split only, then apply the same
    # training-set mean/variance to the test split (avoids data leakage)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Store results per dataset
    dataset_results = {}

    # Run models (fit() re-trains each estimator from scratch for this dataset)
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        report = classification_report(y_test, preds, output_dict=True)
        dataset_results[model_name] = {
            "Accuracy": acc,
            # Transposed so each class/average is a row and each metric a column
            "Classification Report": pd.DataFrame(report).transpose()
        }

    all_results[name] = dataset_results

list(all_results.keys())  # Show dataset names processed


['Iris', 'Wine', 'Breast_Cancer', 'Synthetic_Binary', 'Synthetic_Multi']

In [14]:
# Flatten the nested results into one record per (dataset, model) pair
summary_data = [
    {
        "Dataset": ds_label,
        "Model": clf_label,
        "Accuracy": metrics["Accuracy"],
    }
    for ds_label, per_model in all_results.items()
    for clf_label, metrics in per_model.items()
]

# Order rows by dataset name, then by accuracy from highest to lowest,
# and renumber the index to follow that order
summary_df = (
    pd.DataFrame(summary_data)
    .sort_values(by=["Dataset", "Accuracy"], ascending=[True, False])
    .reset_index(drop=True)
)
summary_df

Unnamed: 0,Dataset,Model,Accuracy
0,Breast_Cancer,LogisticRegression,0.973684
1,Breast_Cancer,SVC,0.973684
2,Breast_Cancer,RandomForest,0.964912
3,Breast_Cancer,DecisionTree,0.929825
4,Iris,LogisticRegression,1.0
5,Iris,SVC,1.0
6,Iris,DecisionTree,1.0
7,Iris,RandomForest,1.0
8,Synthetic_Binary,LogisticRegression,0.875
9,Synthetic_Binary,RandomForest,0.875


# Automated EDA for Iris Dataset

In [20]:
# Load Iris as a sklearn Bunch (data, target, feature_names, ...)
Iris = load_iris(return_X_y=False)

In [22]:
import dtale

# D-Tale displays tabular data, so build a DataFrame from the Bunch
# (features as columns plus the class label) instead of passing the Bunch itself
iris_dtale_df = pd.DataFrame(Iris.data, columns=Iris.feature_names)
iris_dtale_df["target"] = Iris.target
dtale.show(iris_dtale_df)



# YData Profiling for Iris Dataset

In [27]:
import pandas as pd
from sklearn.datasets import load_iris
from ydata_profiling import ProfileReport

# Build one DataFrame holding the Iris features plus the class label column
iris_bunch = load_iris()
iris_df = pd.DataFrame(
    data=iris_bunch.data,
    columns=iris_bunch.feature_names,
).assign(target=iris_bunch.target)

# Create the profiling report in explorative mode and export it as HTML
profile = ProfileReport(iris_df, explorative=True)
profile.to_file('Iris.html')


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|█████████████████████████████████████████████| 5/5 [00:00<00:00, 52.16it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# SweetViz for Iris Dataset

In [32]:
import pandas as pd
from sklearn.datasets import load_iris
import sweetviz as sv

# Assemble the Iris features and class labels into a single DataFrame
iris_bunch = load_iris()
iris_df = pd.DataFrame(
    data=iris_bunch.data,
    columns=iris_bunch.feature_names,
).assign(target=iris_bunch.target)

# Analyze the frame under the label "Iris Dataset" and save the HTML report
report = sv.analyze([iris_df, "Iris Dataset"])
report.show_html("Iris_SweetViz.html")


                                             |      | [  0%]   00:00 -> (? left)

Report Iris_SweetViz.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Autoviz for Iris Dataset

In [41]:
import pandas as pd
from sklearn.datasets import load_iris
from autoviz.AutoViz_Class import AutoViz_Class

# Iris features plus the label, stored under "Class" to match depVar below
iris_bunch = load_iris()
iris_df = pd.DataFrame(
    data=iris_bunch.data,
    columns=iris_bunch.feature_names,
).assign(Class=iris_bunch.target)

AV = AutoViz_Class()

# The DataFrame is handed over through dfte; "Class" is the dependent variable
dftc = AV.AutoViz(
    filename="None",
    sep=",",
    depVar="Class",
    dfte=iris_df,
    header=0,
    verbose=1,
    lowess=False,
    chart_format="png",
    max_rows_analyzed=300000,
    max_cols_analyzed=30,
)

Shape of your Data Set loaded: (150, 5)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    Number of Numeric Columns =  4
    Number of Integer-Categorical Columns =  0
    Number of String-Categorical Columns =  0
    Number of Factor-Categorical Columns =  0
    Number of String-Boolean Columns =  0
    Number of Numeric-Boolean Columns =  0
    Number of Discrete String Columns =  0
    Number of NLP String Columns =  0
    Number of Date Time Columns =  0
    Number of ID Columns =  0
    Number of Columns to Delete =  0
    4 Predictors classified...
        No variables removed since no ID or low-information variables found in data set

################ Multi_Classification problem #####################
To fix these data quality i

Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
sepal length (cm),float64,0.0,,4.3,7.9,No issue
sepal width (cm),float64,0.0,,2.0,4.4,Column has 4 outliers greater than upper bound (4.05) or lower than lower bound(2.05). Cap them or remove them.
petal length (cm),float64,0.0,,1.0,6.9,Column has a high correlation with ['sepal length (cm)']. Consider dropping one of them.
petal width (cm),float64,0.0,,0.1,2.5,"Column has a high correlation with ['sepal length (cm)', 'petal length (cm)']. Consider dropping one of them."
Class,int64,0.0,2.0,0.0,2.0,Target column


2025-06-08 22:39:28,297 - INFO     - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-06-08 22:39:28,300 - INFO     - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-06-08 22:39:28,331 - INFO     - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-06-08 22:39:28,336 - INFO     - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-06-08 22:39:28,395 - INFO     - Using categorical units to plot a list of strings that 

Total Number of Scatter Plots = 10


2025-06-08 22:39:28,873 - INFO     - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-06-08 22:39:28,880 - INFO     - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-06-08 22:39:28,908 - INFO     - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-06-08 22:39:28,919 - INFO     - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-06-08 22:39:28,955 - INFO     - Using categorical units to plot a list of strings that 

All Plots done
Time to run AutoViz = 1 seconds 

 ###################### AUTO VISUALIZATION Completed ########################


# Automated EDA for Wine Dataset

In [50]:
# Load Wine as a sklearn Bunch (data, target, feature_names, ...)
wine = load_wine(return_X_y=False)

# Dtale

In [55]:
import dtale

# D-Tale displays tabular data, so build a DataFrame from the Bunch
# (features as columns plus the class label) instead of passing the Bunch itself
wine_dtale_df = pd.DataFrame(wine.data, columns=wine.feature_names)
wine_dtale_df["target"] = wine.target
dtale.show(wine_dtale_df)



# YData Profiling

In [58]:
from sklearn.datasets import load_wine
from ydata_profiling import ProfileReport

# Build one DataFrame holding the Wine features plus the class label column
wine_bunch = load_wine()
wine_df = pd.DataFrame(
    data=wine_bunch.data,
    columns=wine_bunch.feature_names,
).assign(target=wine_bunch.target)

# Create the profiling report in explorative mode and export it as HTML
profile1 = ProfileReport(wine_df, explorative=True)
profile1.to_file('Wine.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|███████████████████████████████████████| 14/14 [00:00<00:00, 319131.83it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# SweetViz

In [63]:
from sklearn.datasets import load_wine
import sweetviz as sv

In [65]:
# Wine features as columns, with the class label appended as "target"
wine_bunch = load_wine()
wine_df = pd.DataFrame(
    data=wine_bunch.data,
    columns=wine_bunch.feature_names,
).assign(target=wine_bunch.target)

In [75]:
# Analyze the Wine frame under the label "Wine Dataset" and save the HTML report
wine_source = [wine_df, "Wine Dataset"]
r1 = sv.analyze(wine_source)
r1.show_html('WineSweetviz.html')

                                             |      | [  0%]   00:00 -> (? left)

Report WineSweetviz.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# AutoViz

In [78]:
from sklearn.datasets import load_wine
from autoviz.AutoViz_Class import AutoViz_Class

# Wine features as columns, with the class label appended as "target"
wine_bunch = load_wine()
wine_df = pd.DataFrame(
    data=wine_bunch.data,
    columns=wine_bunch.feature_names,
).assign(target=wine_bunch.target)

In [94]:
from autoviz.AutoViz_Class import AutoViz_Class
from sklearn.datasets import load_wine
import pandas as pd

# Wine features plus the label, stored under "Class" to match depVar below
wine = load_wine()
wine_df = pd.DataFrame(
    data=wine.data,
    columns=wine.feature_names,
).assign(Class=wine.target)

# Create the AutoViz driver object
AV = AutoViz_Class()

# The DataFrame is handed over through dfte; "Class" is the dependent variable.
# NOTE(review): the filename is intended as a placeholder only — confirm that
# AutoViz ignores it when dfte is supplied.
dftc = AV.AutoViz(
    filename="dummy.csv",
    sep=",",
    depVar="Class",
    dfte=wine_df,
    header=0,
    verbose=1,
    lowess=False,
    chart_format="png",
    max_rows_analyzed=300000,
    max_cols_analyzed=30,
)


Shape of your Data Set loaded: (178, 14)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    Number of Numeric Columns =  13
    Number of Integer-Categorical Columns =  0
    Number of String-Categorical Columns =  0
    Number of Factor-Categorical Columns =  0
    Number of String-Boolean Columns =  0
    Number of Numeric-Boolean Columns =  0
    Number of Discrete String Columns =  0
    Number of NLP String Columns =  0
    Number of Date Time Columns =  0
    Number of ID Columns =  0
    Number of Columns to Delete =  0
    13 Predictors classified...
        No variables removed since no ID or low-information variables found in data set

################ Multi_Classification problem #####################
To fix these data qualit

Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
alcohol,float64,0.0,,11.03,14.83,No issue
malic_acid,float64,0.0,,0.74,5.8,Column has 3 outliers greater than upper bound (5.30) or lower than lower bound(-0.62). Cap them or remove them.
ash,float64,0.0,,1.36,3.23,Column has 3 outliers greater than upper bound (3.08) or lower than lower bound(1.69). Cap them or remove them.
alcalinity_of_ash,float64,0.0,,10.6,30.0,Column has 4 outliers greater than upper bound (27.95) or lower than lower bound(10.75). Cap them or remove them.
magnesium,float64,0.0,,70.0,162.0,Column has 4 outliers greater than upper bound (135.50) or lower than lower bound(59.50). Cap them or remove them.
total_phenols,float64,0.0,,0.98,3.88,No issue
flavanoids,float64,0.0,,0.34,5.08,Column has a high correlation with ['total_phenols']. Consider dropping one of them.
nonflavanoid_phenols,float64,0.0,,0.13,0.66,No issue
proanthocyanins,float64,0.0,,0.41,3.58,Column has 2 outliers greater than upper bound (3.00) or lower than lower bound(0.20). Cap them or remove them.
color_intensity,float64,0.0,,1.28,13.0,Column has 4 outliers greater than upper bound (10.67) or lower than lower bound(-1.25). Cap them or remove them.


2025-06-08 22:57:22,763 - INFO     - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-06-08 22:57:22,766 - INFO     - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-06-08 22:57:22,797 - INFO     - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-06-08 22:57:22,811 - INFO     - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-06-08 22:57:22,870 - INFO     - Using categorical units to plot a list of strings that 

Total Number of Scatter Plots = 91


2025-06-08 22:57:24,502 - INFO     - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-06-08 22:57:24,511 - INFO     - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-06-08 22:57:24,543 - INFO     - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-06-08 22:57:24,546 - INFO     - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-06-08 22:57:24,563 - INFO     - Using categorical units to plot a list of strings that 

All Plots done
Time to run AutoViz = 3 seconds 

 ###################### AUTO VISUALIZATION Completed ########################
