
<a href="https://colab.research.google.com/github/petuch03/data-science-things/blob/master/ml_with_python/binary_classification_problem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://www.kaggle.com/competitions/nup-ml-1-2023-competition/leaderboard" target="_parent">Kaggle Competition</a>

## Model without the least important feature (submission acc = 0.942, local hold-out acc = 0.92)

In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Load the training set and replace missing values with the mean of their column.
data = pd.read_csv('binary_classification_datasets/train.csv')
data = data.fillna(data.mean())

# Feature selection based on importance (previous launches):
# 7    feature8    0.187670
# 0    feature1    0.113971
# 2    feature3    0.101499
# 4    feature5    0.098769
# 6    feature7    0.092911
# 3    feature4    0.069674
# 9   feature10    0.064359
# 10  feature11    0.060138
# 1    feature2    0.054287
# 11  feature12    0.050472
# 5    feature6    0.046922
# 8    feature9    0.046358
# 12  feature13    0.012970
# feature13 has the lowest importance in the table above and is left out.
important_features = [
    'feature8', 'feature1', 'feature3', 'feature5', 'feature7', 'feature4',
    'feature10', 'feature11', 'feature2', 'feature12', 'feature6', 'feature9',
]
X = data[important_features]
y = data['target']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

# Standardise using statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Candidate hyperparameter values explored by the grid search
# (4 * 5 * 3 * 3 = 180 combinations).
param_grid = dict(
    n_estimators=[175, 200, 250, 300],
    max_depth=[10, 15, 20, 25, 30],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.8, 0.9, 1.0],
)

# Base estimator: XGBoost classifier with library defaults.
model = xgb.XGBClassifier()

# Exhaustive search scored by accuracy with 5-fold cross-validation on the training split.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Keep the winning estimator for the evaluation and submission steps.
best_model = grid_search.best_estimator_

# Recorded result of a previous run:
# Best Parameters: {'learning_rate': 0.05, 'max_depth': 15, 'n_estimators': 200, 'subsample': 0.9}
print("Best Parameters:", grid_search.best_params_)

# Pair every input column with the importance reported by the tuned model
# and show the table from most to least important.
feature_names = X.columns
feature_importances = best_model.feature_importances_
feature_importances_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
)
print(feature_importances_df.sort_values('Importance', ascending=False))

# Accuracy of the tuned model on the held-out 20% split.
predictions = best_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

Fitting 5 folds for each of 180 candidates, totalling 900 fits
Best Parameters: {'learning_rate': 0.05, 'max_depth': 15, 'n_estimators': 200, 'subsample': 0.9}
      Feature  Importance
9   feature12    0.190522
1    feature1    0.165431
0    feature8    0.110811
2    feature3    0.098326
5    feature4    0.091025
4    feature7    0.083082
6   feature10    0.075563
3    feature5    0.068044
7   feature11    0.048532
11   feature9    0.025337
10   feature6    0.022957
8    feature2    0.020371
Accuracy: 0.92


In [40]:
# Build the submission file for the 12-feature model trained in the previous cell.
new_data = pd.read_csv('binary_classification_datasets/test.csv')

# Impute missing test values with the TRAINING means: the model was fitted on data
# imputed with those values, so inference must use the same preprocessing rather
# than statistics derived from the test set. `Id` is carried along untouched.
train_feature_means = data[important_features].mean()
new_data_selected_features = new_data[important_features + ['Id']].fillna(train_feature_means)

# Apply the scaler fitted on the training split (transform only, never refit here).
new_data_scaled = scaler.transform(new_data_selected_features[important_features])
new_predictions = best_model.predict(new_data_scaled)

# One row per test Id with its predicted class; Id is written as the first column.
submission = pd.DataFrame({'Id': new_data_selected_features['Id'], 'target': new_predictions})
# The originally submitted file scored acc = 0.942.
submission.to_csv('binary_classification_datasets/final_submission.csv', index=False)
print("done")

done


## Theoretically, a slightly better version
Backward feature elimination: repeatedly drop the least important feature and keep the model that reaches the best hold-out accuracy across the different numbers of features.
Acc = 0.92; with the refined grid, 0.918.

In [33]:
# Reload the training data and mean-impute missing values.
data = pd.read_csv('binary_classification_datasets/train.csv')
data = data.fillna(data.mean())

# Begin with the full set: feature1 .. feature13.
all_features = ['feature%d' % idx for idx in range(1, 14)]
best_features = list(all_features)
X, y = data[all_features], data['target']

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

# Standardise with statistics from the training split; X_train / X_test themselves
# stay unscaled DataFrames so that column subsets can be taken from them later.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Running record of the best configuration found by the search below.
best_model = None
best_accuracy = 0
best_feature_set = list(best_features)

# Function to perform grid search and return the best model in grid and its accuracy
def perform_grid_search(X_train_grid, X_test_grid, y_train_grid, y_test_grid, param_grid=None):
    """Tune an XGBoost classifier by grid search and score the winner on a hold-out set.

    Parameters
    ----------
    X_train_grid, y_train_grid
        Training features and labels passed to ``GridSearchCV.fit``
        (5-fold cross-validation, accuracy scoring).
    X_test_grid, y_test_grid
        Hold-out features and labels; used only to compute the returned accuracy.
    param_grid : dict, optional
        Hyperparameter grid handed to ``GridSearchCV``. When omitted, the default
        grid defined in the body is used. A finer grid that was also tried::

            {
                'n_estimators': [175, 200, 225, 250, 275, 300],
                'max_depth': [8, 10, 12, 15, 18, 20],
                'learning_rate': [0.03, 0.05, 0.07, 0.1],
                'subsample': [0.8, 0.85, 0.9, 0.95, 1.0],
            }

    Returns
    -------
    tuple
        ``(best_grid_model, test_accuracy)``: the search's ``best_estimator_`` and
        its accuracy on ``(X_test_grid, y_test_grid)``.

    Notes
    -----
    Results noted from an earlier run (apparently with the default grid):

    ========  ========  =============  =========  ============  =========
    features  accuracy  learning_rate  max_depth  n_estimators  subsample
    ========  ========  =============  =========  ============  =========
    13        0.91      0.05           15         200           0.9
    12        0.905     0.05           15         300           0.9
    11        0.93      0.1            10         200           0.9
    10        0.915     0.05           15         200           1.0
    9         0.905     0.05           10         300           1.0
    8         0.91      0.05           15         200           0.8
    ========  ========  =============  =========  ============  =========
    """
    if param_grid is None:
        # Default search space: 4 * 5 * 4 * 3 = 240 candidates.
        param_grid = {
            'n_estimators': [150, 200, 250, 300],
            'max_depth': [5, 10, 15, 20, 25],
            'learning_rate': [0.01, 0.05, 0.1, 0.15],
            'subsample': [0.8, 0.9, 1.0]
        }

    search = GridSearchCV(xgb.XGBClassifier(), param_grid, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')
    search.fit(X_train_grid, y_train_grid)

    best_grid_model = search.best_estimator_
    test_accuracy = accuracy_score(y_test_grid, best_grid_model.predict(X_test_grid))

    print("Best Parameters:", search.best_params_)
    return best_grid_model, test_accuracy

# Backward feature elimination: tune a model on the current feature subset, record it
# if it beats the best hold-out accuracy so far, then drop its least important feature.
# With less than 8 features -- much worse accuracy
# NOTE: the winner is chosen on the same hold-out split that its accuracy is reported
# on, so `best_accuracy` is an optimistic estimate.
best_scaler = None
while len(all_features) > 7:
    # Fit a dedicated scaler for this subset so that it can be kept together with the
    # model trained on it (a single shared scaler would end the loop fitted on a
    # subset that matches no stored model).
    current_scaler = StandardScaler()
    X_train_scaled = current_scaler.fit_transform(X_train[all_features])
    X_test_scaled = current_scaler.transform(X_test[all_features])

    # Train and evaluate model
    current_model, accuracy = perform_grid_search(X_train_scaled, X_test_scaled, y_train, y_test)
    print(f"Using {len(all_features)} features, Accuracy: {accuracy}")

    # Strict '>' keeps the earlier (larger) feature set when accuracies tie.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_feature_set = all_features.copy()
        best_model = current_model
        best_scaler = current_scaler

    # Importances are ordered like the columns of X_train[all_features], so the
    # argmin position indexes directly into all_features.
    feature_importances = current_model.feature_importances_
    least_important_feature = all_features[feature_importances.argmin()]
    all_features.remove(least_important_feature)

# Leave `scaler` consistent with `best_model` (same feature set, training statistics).
if best_scaler is not None:
    scaler = best_scaler

# Output the best feature set and corresponding accuracy
print("Best feature set:", best_feature_set)
print("Best accuracy:", best_accuracy)
print("Model with best accuracy stored in variable 'best_model'")

Fitting 5 folds for each of 720 candidates, totalling 3600 fits
Best Parameters: {'learning_rate': 0.07, 'max_depth': 8, 'n_estimators': 175, 'subsample': 0.85}
Using 13 features, Accuracy: 0.905
Fitting 5 folds for each of 720 candidates, totalling 3600 fits
Best Parameters: {'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 225, 'subsample': 0.95}
Using 12 features, Accuracy: 0.92
Fitting 5 folds for each of 720 candidates, totalling 3600 fits
Best Parameters: {'learning_rate': 0.07, 'max_depth': 10, 'n_estimators': 175, 'subsample': 0.9}
Using 11 features, Accuracy: 0.91
Fitting 5 folds for each of 720 candidates, totalling 3600 fits
Best Parameters: {'learning_rate': 0.07, 'max_depth': 10, 'n_estimators': 200, 'subsample': 1.0}
Using 10 features, Accuracy: 0.92
Fitting 5 folds for each of 720 candidates, totalling 3600 fits
Best Parameters: {'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 275, 'subsample': 1.0}
Using 9 features, Accuracy: 0.905
Fitting 5 folds for each

In [34]:
# Read the test data
new_data = pd.read_csv('binary_classification_datasets/test.csv')

# Select the same features that were selected in the final model
selected_features = best_feature_set

# Impute missing test values with the TRAINING means, i.e. the same fill values the
# model saw during training. `Id` is carried along untouched.
train_feature_means = data[selected_features].mean()
new_data_selected_features = new_data[selected_features + ['Id']].fillna(train_feature_means)

# Scale with statistics learned from the training split for exactly these features.
# Refitting the scaler on the test data (fit_transform) would standardise the inputs
# differently from what best_model was trained on.
scaler = StandardScaler().fit(X_train[selected_features])
new_data_scaled = scaler.transform(new_data_selected_features[selected_features])
new_predictions = best_model.predict(new_data_scaled)

# Prepare the submission file
submission = pd.DataFrame({'Id': new_data_selected_features['Id'], 'target': new_predictions})
FILENAME = 'submission2_refined_grid'
submission.to_csv(f'binary_classification_datasets/{FILENAME}.csv', index=False)
print("done")

done
