# Using Lazy Predict to compare the accuracy of all models

In [None]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
import numpy as np
from lazypredict.Supervised import LazyClassifier
import pandas as pd
import random

# Fixed seed so the train/test split -- and therefore the model ranking printed
# below -- is the same on every Restart & Run All. 9687 is the value the
# previous random draw printed in the recorded run of this cell.
seed = 9687
print(seed)

# Load the dataset (the CSV is expected in the notebook's working directory).
df = pd.read_csv("ACME-HappinessSurvey2020.csv")

# Features are the six columns X1..X6; the target is the Y column.
X = df[['X1', 'X2', 'X3', 'X4', 'X5', 'X6']]
y = df['Y']

# Hold out half of the rows for scoring the candidate models.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.5, random_state=seed)

# LazyClassifier fits its whole catalogue of classifiers with default settings.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Note the argument order expected by LazyClassifier.fit:
# (X_train, X_test, y_train, y_test), not the usual (X_train, y_train, ...).
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# One row per model with its scores on the held-out half.
print(models)


9687


100%|██████████| 31/31 [00:00<00:00, 55.56it/s]

[LightGBM] [Info] Number of positive: 35, number of negative: 28
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000019 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24
[LightGBM] [Info] Number of data points in the train set: 63, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.555556 -> initscore=0.223144
[LightGBM] [Info] Start training from score 0.223144
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
ExtraTreeClassifier                0.60               0.59     0.59      0.59   
AdaBoostClassifier                 0.59               0.58     0.58      0.58   
RandomForestClassifier             0.59               0.57     0.57      0.57   
GaussianNB                         0.59            




# Observation

Based on the Lazy Predict analysis, the top 3 models are:

PassiveAggressiveClassifier        0.63               0.61     0.61      0.61   
XGBClassifier                      0.62               0.60     0.60      0.61   
KNeighborsClassifier               0.60               0.59     0.59      0.60  

# Maximizing accuracy of the top 3 models

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import PassiveAggressiveClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

# Single seed shared by the split and by every estimator that accepts one, so
# the selected hyperparameters and scores are identical across runs.
RANDOM_STATE = 42

# Candidate models and the hyperparameter grid searched for each of them.
models = {
    'PassiveAggressiveClassifier': {
        # Shuffles the training data each epoch by default; without a fixed
        # random_state its results change from run to run.
        'model': PassiveAggressiveClassifier(random_state=RANDOM_STATE),
        'params': {
            'C': [0.1, 1, 10],
            'max_iter': [1000, 2000, 3000]
        }
    },
    'XGBClassifier': {
        'model': XGBClassifier(random_state=RANDOM_STATE),
        'params': {
            'max_depth': [3, 5, 7],
            'learning_rate': [0.1, 0.5, 1],
            'n_estimators': [50, 100, 200]
        }
    },
    'KNeighborsClassifier': {
        # KNeighborsClassifier has no random_state parameter.
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree']
        }
    }
}

# 80/20 train/test split of the X and y defined in the earlier cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE)

# Tune each candidate with 5-fold cross-validated grid search on the training
# split, then score the winner once on the held-out test split.
for model_name, model in models.items():
    # `model` is the spec dict for this candidate: {'model': ..., 'params': ...}.
    grid_search = GridSearchCV(
        model['model'], model['params'], cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # best_score_ is the mean cross-validated accuracy of the best combination.
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best score for {model_name}: {grid_search.best_score_}")

    # No extra training step is needed: with the default refit=True,
    # GridSearchCV has already refit best_estimator_ on all of X_train.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    print(
        f"Accuracy on test data for {model_name}: {accuracy_score(y_test, y_pred)}")


Best parameters for PassiveAggressiveClassifier: {'C': 1, 'max_iter': 1000}
Best score for PassiveAggressiveClassifier: 0.5800000000000001
Accuracy on test data for PassiveAggressiveClassifier: 0.46153846153846156
Best parameters for XGBClassifier: {'learning_rate': 1, 'max_depth': 7, 'n_estimators': 50}
Best score for XGBClassifier: 0.5900000000000001
Accuracy on test data for XGBClassifier: 0.5769230769230769
Best parameters for KNeighborsClassifier: {'algorithm': 'auto', 'n_neighbors': 7, 'weights': 'distance'}
Best score for KNeighborsClassifier: 0.58
Accuracy on test data for KNeighborsClassifier: 0.46153846153846156


### Test 2: removing the X2 feature, since it has very little impact on the result


In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import PassiveAggressiveClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Single seed shared by the split and by every estimator that accepts one, so
# the selected hyperparameters and scores are identical across runs.
RANDOM_STATE = 42

# Same experiment as before, with the X2 column left out of the features.
X = df[['X1', 'X3', 'X4', 'X5', 'X6']]
y = df['Y']

# 80/20 train/test split on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE)

# Candidate models and the hyperparameter grid searched for each of them.
models = {
    'PassiveAggressiveClassifier': {
        # Shuffles the training data each epoch by default; without a fixed
        # random_state its results change from run to run.
        'model': PassiveAggressiveClassifier(random_state=RANDOM_STATE),
        'params': {
            'C': [0.1, 1, 10],
            'max_iter': [1000, 2000, 3000]
        }
    },
    'XGBClassifier': {
        'model': XGBClassifier(random_state=RANDOM_STATE),
        'params': {
            'max_depth': [3, 5, 7],
            'learning_rate': [0.1, 0.5, 1],
            'n_estimators': [50, 100, 200]
        }
    },
    'KNeighborsClassifier': {
        # KNeighborsClassifier has no random_state parameter.
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree']
        }
    }
}

# Tune each candidate with 5-fold cross-validated grid search on the training
# split, then score the winner once on the held-out test split.
for model_name, model in models.items():
    # `model` is the spec dict for this candidate: {'model': ..., 'params': ...}.
    grid_search = GridSearchCV(
        model['model'], model['params'], cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # best_score_ is the mean cross-validated accuracy of the best combination.
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best score for {model_name}: {grid_search.best_score_}")

    # No extra training step is needed: with the default refit=True,
    # GridSearchCV has already refit best_estimator_ on all of X_train.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    print(
        f"Accuracy on test data for {model_name}: {accuracy_score(y_test, y_pred)}")


Best parameters for PassiveAggressiveClassifier: {'C': 10, 'max_iter': 2000}
Best score for PassiveAggressiveClassifier: 0.64
Accuracy on test data for PassiveAggressiveClassifier: 0.38461538461538464
Best parameters for XGBClassifier: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}
Best score for XGBClassifier: 0.66
Accuracy on test data for XGBClassifier: 0.6538461538461539
Best parameters for KNeighborsClassifier: {'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'distance'}
Best score for KNeighborsClassifier: 0.61
Accuracy on test data for KNeighborsClassifier: 0.6153846153846154
