In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    StratifiedKFold,
    RepeatedStratifiedKFold,
    RandomizedSearchCV,
    GridSearchCV,
    RepeatedKFold
)

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

from sklearn.metrics import balanced_accuracy_score, classification_report, accuracy_score

from scipy.stats import loguniform, uniform, randint

import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#Prep data

X = pd.read_csv('train.csv')
X_final= pd.read_csv('test.csv')

y = X['label']

X.drop(['label'], axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [34]:
# Baseline Logistic Regression

# Track time
start = time.time()

# define model
baseline_logistic_regression = LogisticRegression() # leave max_iter at default (100) and ignore warning - increasing takes way too long

# fit model
baseline_logistic_regression.fit(X_train, y_train)

# predict on test set and generate classification report
baseline_logistic_regression_preds = baseline_logistic_regression.predict(X_test)
baseline_logistic_regression_report = classification_report(y_test, baseline_logistic_regression_preds)

# get accuracy scores
baseline_logistic_regression_accs = cross_val_score(baseline_logistic_regression, X_train, y_train, scoring='accuracy', cv=RepeatedKFold(n_splits=5, n_repeats=5), n_jobs=-1)

end = time.time()

# print outputs
print("\nLogistic Regression")
print("Execution Time:", time.strftime("%H:%M:%S", time.gmtime(end-start)))
print(f"Average cross-validation accuracy: {baseline_logistic_regression_accs.mean()}")
print(baseline_logistic_regression_report)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression
Execution Time: 00:01:14
Average cross-validation accuracy: 0.9120535714285716
              precision    recall  f1-score   support

           0       0.96      0.95      0.95       814
           1       0.95      0.97      0.96       919
           2       0.90      0.90      0.90       845
           3       0.90      0.89      0.89       889
           4       0.90      0.93      0.92       816
           5       0.88      0.85      0.86       760
           6       0.95      0.94      0.94       838
           7       0.93      0.92      0.93       867
           8       0.87      0.86      0.86       814
           9       0.89      0.90      0.90       838

    accuracy                           0.91      8400
   macro avg       0.91      0.91      0.91      8400
weighted avg       0.91      0.91      0.91      8400



In [35]:
# Baseline Linear Support Vector Classification

# Track time
start = time.time()

# define model
baseline_lsvc = LinearSVC()

# fit model
baseline_lsvc.fit(X_train, y_train)

# predict on test set and generate classification report
baseline_lsvc_preds = baseline_lsvc.predict(X_test)
baseline_lsvc_report = classification_report(y_test, baseline_lsvc_preds)

# get accuracy scores
baseline_lsvc_accs = cross_val_score(baseline_lsvc, X_train, y_train, scoring='accuracy', cv=RepeatedKFold(n_splits=5, n_repeats=2), n_jobs=-1) # only 2 repeats to save time

end = time.time()

# print outputs
print("Linear Support Vector Classification")
print("Execution Time:", time.strftime("%H:%M:%S", time.gmtime(end-start)))
print(f"Average cross-validation accuracy: {baseline_lsvc_accs.mean()}")
print(baseline_lsvc_report)

Linear Support Vector Classification
Execution Time: 00:26:44
Average cross-validation accuracy: 0.8985416666666668
              precision    recall  f1-score   support

           0       0.95      0.96      0.95       814
           1       0.94      0.97      0.96       919
           2       0.90      0.88      0.89       845
           3       0.89      0.88      0.89       889
           4       0.89      0.93      0.91       816
           5       0.87      0.84      0.86       760
           6       0.94      0.94      0.94       838
           7       0.92      0.90      0.91       867
           8       0.87      0.86      0.86       814
           9       0.87      0.88      0.88       838

    accuracy                           0.91      8400
   macro avg       0.90      0.90      0.90      8400
weighted avg       0.91      0.91      0.91      8400



In [36]:
# Baseline Decision Tree

# Track time
start = time.time()

# define model
baseline_decision_tree = DecisionTreeClassifier()

# fit model
baseline_decision_tree.fit(X_train, y_train)

# predict on test set and generate classification report
baseline_decision_tree_preds = baseline_decision_tree.predict(X_test)
baseline_decision_tree_report = classification_report(y_test, baseline_decision_tree_preds)

# get accuracy scores
baseline_decision_tree_accs = cross_val_score(baseline_decision_tree, X_train, y_train, scoring='accuracy', cv=RepeatedKFold(n_splits=5, n_repeats=5), n_jobs=-1)

end = time.time()

# print outputs
print("Decision Tree")
print("Execution Time:", time.strftime("%H:%M:%S", time.gmtime(end-start)))
print(f"Average cross-validation accuracy: {baseline_decision_tree_accs.mean()}")
print(baseline_decision_tree_report)

Decision Tree
Execution Time: 00:00:39
Average cross-validation accuracy: 0.8477321428571429
              precision    recall  f1-score   support

           0       0.90      0.91      0.91       814
           1       0.93      0.95      0.94       919
           2       0.84      0.84      0.84       845
           3       0.83      0.80      0.81       889
           4       0.86      0.86      0.86       816
           5       0.82      0.80      0.81       760
           6       0.88      0.86      0.87       838
           7       0.89      0.90      0.89       867
           8       0.80      0.80      0.80       814
           9       0.81      0.83      0.82       838

    accuracy                           0.86      8400
   macro avg       0.86      0.86      0.86      8400
weighted avg       0.86      0.86      0.86      8400



In [37]:
# Baseline Random Forest

# Track time
start = time.time()

# define model
baseline_random_forest = RandomForestClassifier()

# fit model
baseline_random_forest.fit(X_train, y_train)

# predict on test set and generate classification report
baseline_random_forest_preds = baseline_random_forest.predict(X_test)
baseline_random_forest_report = classification_report(y_test, baseline_random_forest_preds)

# get accuracy scores
baseline_random_forest_accs = cross_val_score(baseline_random_forest, X_train, y_train, scoring='accuracy', cv=RepeatedKFold(n_splits=5, n_repeats=5), n_jobs=-1)

end = time.time()

# print outputs
print("Random Forest")
print("Execution Time:", time.strftime("%H:%M:%S", time.gmtime(end-start)))
print(f"Average cross-validation accuracy: {baseline_random_forest_accs.mean()}")
print(baseline_random_forest_report)

Random Forest
Execution Time: 00:01:26
Average cross-validation accuracy: 0.9620714285714286
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       814
           1       0.99      0.99      0.99       919
           2       0.95      0.97      0.96       845
           3       0.96      0.94      0.95       889
           4       0.96      0.98      0.97       816
           5       0.97      0.95      0.96       760
           6       0.98      0.98      0.98       838
           7       0.98      0.96      0.97       867
           8       0.95      0.96      0.95       814
           9       0.94      0.95      0.94       838

    accuracy                           0.96      8400
   macro avg       0.96      0.96      0.96      8400
weighted avg       0.96      0.96      0.96      8400



In [38]:
# Baseline Histogram Gradient Boosting Classifier

# Track time
start = time.time()

# define model
baseline_hgbc = HistGradientBoostingClassifier()

# fit model
baseline_hgbc.fit(X_train, y_train)

# predict on test set and generate classification report
baseline_hgbc_preds = baseline_hgbc.predict(X_test)
baseline_hgbc_report = classification_report(y_test, baseline_hgbc_preds)

# get accuracy scores
baseline_hgbc_accs = cross_val_score(baseline_hgbc, X_train, y_train, scoring='accuracy', cv=RepeatedKFold(n_splits=5, n_repeats=2), n_jobs=-1) # only 2 repeats to save time

end = time.time()

# print outputs
print("Histogram Gradient Boosting Classifier")
print("Execution Time:", time.strftime("%H:%M:%S", time.gmtime(end-start)))
print(f"Average cross-validation accuracy: {baseline_hgbc_accs.mean()}")
print(baseline_hgbc_report)

Histogram Gradient Boosting Classifier
Execution Time: 00:08:33
Average cross-validation accuracy: 0.9710267857142856
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       814
           1       0.99      0.99      0.99       919
           2       0.96      0.98      0.97       845
           3       0.97      0.96      0.97       889
           4       0.97      0.98      0.97       816
           5       0.98      0.97      0.97       760
           6       0.98      0.98      0.98       838
           7       0.99      0.97      0.98       867
           8       0.97      0.97      0.97       814
           9       0.95      0.97      0.96       838

    accuracy                           0.97      8400
   macro avg       0.97      0.97      0.97      8400
weighted avg       0.97      0.97      0.97      8400



In [39]:
# Baseline K-Nearest Neighbors Classifier

# Track time
start = time.time()

# define model
baseline_knn = KNeighborsClassifier()

# fit model
baseline_knn.fit(X_train, y_train)

# predict on test set and generate classification report
baseline_knn_preds = baseline_knn.predict(X_test)
baseline_knn_report = classification_report(y_test, baseline_knn_preds)

# get accuracy scores
baseline_knn_accs = cross_val_score(baseline_knn, X_train, y_train, scoring='accuracy', cv=RepeatedKFold(n_splits=100, n_repeats=5), n_jobs=-1, error_score='raise') # - increase splits to 100 because of insufficient memory

end = time.time()

# print outputs
print("K-Nearest Neighbors Classifier")
print("Execution Time:", time.strftime("%H:%M:%S", time.gmtime(end-start)))
print(f"Average cross-validation accuracy: {baseline_knn_accs.mean()}")
print(baseline_knn_report)

K-Nearest Neighbors Classifier
Execution Time: 00:02:06
Average cross-validation accuracy: 0.9662261904761904
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       814
           1       0.96      1.00      0.98       919
           2       0.98      0.97      0.97       845
           3       0.97      0.97      0.97       889
           4       0.97      0.96      0.97       816
           5       0.96      0.96      0.96       760
           6       0.98      0.99      0.99       838
           7       0.96      0.96      0.96       867
           8       0.99      0.92      0.95       814
           9       0.94      0.95      0.95       838

    accuracy                           0.97      8400
   macro avg       0.97      0.97      0.97      8400
weighted avg       0.97      0.97      0.97      8400



In [1]:
# Summarize baseline results

print("Baseline results summary:")

print(f"Logistic regression best cross-validation score: {baseline_logistic_regression_accs.mean()}")
print(f"Linear support vector classifier best cross-validation score: {baseline_lsvc_accs.mean()}")
print(f"Decision tree best cross-validation score: {baseline_decision_tree_accs.mean()}")
print(f"Random forest best cross-validation score: {baseline_random_forest_accs.mean()}")
print(f"Histogram Gradient boosting classifier best cross-validation score: {baseline_hgbc_accs.mean()}")
print(f"K-nearest neighbors classifier best cross-validation score: {baseline_knn_accs.mean()}")


Baseline results summary:


NameError: name 'baseline_logistic_regression_accs' is not defined

In [6]:
#define models
lr_model = LogisticRegression()
lsvc_model = LinearSVC()
dt_model = DecisionTreeClassifier()
rf_model = RandomForestClassifier()
hgbc_model = HistGradientBoostingClassifier()
knn_model = KNeighborsClassifier()

#fit models
lr_model.fit(X_train, y_train)
lsvc_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
hgbc_model.fit(X_train, y_train)
knn_model.fit(X_train, y_train)

#predict
lr_pred = lr_model.predict(X_final)
lsvc_pred = lsvc_model.predict(X_final)
dt_pred = dt_model.predict(X_final)
rf_pred = rf_model.predict(X_final)
hgbc_pred = hgbc_model.predict(X_final)
knn_pred = knn_model.predict(X_final)

#create voting models
lr_voting_model = LogisticRegression()
lsvc_voting_model = LinearSVC()
dt_voting_model = DecisionTreeClassifier()
rf_voting_model = RandomForestClassifier()
gbc_voting_model = HistGradientBoostingClassifier()
knn_voting_model = KNeighborsClassifier()

#create voting classifiers
hard_voter = VotingClassifier(estimators=[('lr', lr_voting_model), 
                                     ('dt', dt_voting_model), 
                                     ('rf', rf_voting_model), 
                                     ('gbc', gbc_voting_model), 
                                     ('knn', knn_voting_model)], voting='hard')
hard_voter.fit(X_train, y_train)

soft_voter = VotingClassifier(estimators=[('lr', lr_voting_model),
                                     ('dt', dt_voting_model), 
                                     ('rf', rf_voting_model), 
                                     ('gbc', gbc_voting_model), 
                                     ('knn', knn_voting_model)], voting='soft')
soft_voter.fit(X_train, y_train)

#use voters to predict
hard_pred = hard_voter.predict(X_final)
soft_pred = soft_voter.predict(X_final)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

In [13]:
#create submission files
ImageId = list(range(1, 28001))

lr_data = {'ImageId' : ImageId, 'Label' : lr_pred}
lsvc_data = {'ImageId' : ImageId, 'Label' : lsvc_pred}
dt_data = {'ImageId' : ImageId, 'Label' : dt_pred}
rf_data = {'ImageId' : ImageId, 'Label' : rf_pred}
hgbc_data = {'ImageId' : ImageId, 'Label' : hgbc_pred}
knn_data = {'ImageId' : ImageId, 'Label' : knn_pred}
hard_data = {'ImageId' : ImageId, 'Label' : hard_pred}
soft_data = {'ImageId' : ImageId, 'Label' : soft_pred}

lr_submission = pd.DataFrame(lr_data)
lsvc_submission = pd.DataFrame(lsvc_data)
dt_submission = pd.DataFrame(dt_data)
rf_submission = pd.DataFrame(rf_data)
hgbc_submission = pd.DataFrame(hgbc_data)
knn_submission = pd.DataFrame(knn_data)
hard_submission = pd.DataFrame(hard_data)
soft_submission = pd.DataFrame(soft_data)

lr_submission.to_csv('logistic_regression_submission.csv', index=False)
lsvc_submission.to_csv('linear_svc_submission.csv', index=False)
dt_submission.to_csv('decision_tree_submission.csv', index=False)
rf_submission.to_csv('random_forest_submission.csv', index=False)
hgbc_submission.to_csv('gradient_boosting_classifier_submission.csv', index=False)
knn_submission.to_csv('knearest_neighbors_submission.csv', index=False)
hard_submission.to_csv('hard_voting_classifier_submission.csv', index=False)
soft_submission.to_csv('soft_voting_classifier_submission.csv', index=False)