## Train Existing Best Model With Augmented Data

In [1]:
from sklearn.datasets import fetch_openml

# Fetch the "mnist_784" dataset from OpenML as arrays rather than a DataFrame.
mnist_data = fetch_openml(name="mnist_784", as_frame=False, parser="auto")

data_orig, labels = mnist_data.data, mnist_data.target

# Split data into train and test sets: the first 60,000 rows are used for
# training and every row after them for testing.
X_train, X_test = data_orig[:60000], data_orig[60000:]
y_train, y_test = labels[:60000], labels[60000:]

In [2]:
# Load Augmented Dataset
import pandas as pd

# The path is relative, so the CSV is looked up in the current working directory.
aug_data = pd.read_csv(filepath_or_buffer="augmented.csv")

In [3]:
# Preview the first five rows of the augmented data.
aug_data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59990,59991,59992,59993,59994,59995,59996,59997,59998,59999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# The preview of aug_data shows an "Unnamed: 0" column: the row index that was
# written into the CSV and read back as ordinary data. It is not a sample, so
# drop it before transposing. Otherwise the transpose turns it into an extra
# row and X_new ends up with one more row than there are labels in y_train.
# errors="ignore" keeps this cell re-runnable when the column is already gone.
#
# The remaining columns (0 .. 59999) each hold one sample, so the transpose
# puts samples on rows, matching the layout of X_train.
# NOTE(review): assumes the CSV has one row per pixel (784) - confirm.
X_new = aug_data.drop(columns="Unnamed: 0", errors="ignore").to_numpy().T

In [7]:
import numpy as np

# Each augmented row reuses the label of the training row at the same
# position, so both sets must have the same number of rows. Check this up
# front: a mismatch would otherwise leave X_combined and y_combined with
# different lengths.
if X_new.shape[0] != y_train.shape[0]:
    raise ValueError(
        f"Augmented data has {X_new.shape[0]} rows but there are "
        f"{y_train.shape[0]} training labels; they must line up one-to-one."
    )

# Stack the original and augmented samples row-wise, and duplicate the
# training labels in the same order.
X_combined = np.concatenate((X_train, X_new), axis=0)
y_combined = np.concatenate((y_train, y_train), axis=0)

In [9]:
# Define the preprocessing stage (feature standardisation) up front; it is
# only fitted later, as part of the full model pipeline.
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline

# Spelled out with Pipeline; the step name is the one make_pipeline would
# derive from the lower-cased class name.
preprocessing = Pipeline(steps=[("standardscaler", StandardScaler())])

In [10]:
# Train the model on the combined data.
from xgboost import XGBClassifier

# Full model: the preprocessing pipeline followed by an XGBoost classifier
# with fixed hyperparameters and a fixed seed.
xgb_clf = make_pipeline(
    preprocessing,
    XGBClassifier(
        eta=0.4,
        max_depth=6,
        min_child_weight=0.75,
        reg_lambda=0.75,
        random_state=42,
    ),
)

# The labels are cast to integers before fitting.
xgb_clf.fit(X_combined, y_combined.astype(int))

In [13]:
# Sanity test.
# Predict one row of the training matrix (index 75); slicing keeps it 2-D,
# which is the shape the pipeline expects. Print the predicted class and the
# highest class probability.
print(xgb_clf.predict(X_combined[75:76]))
print(xgb_clf.predict_proba(X_combined[75:76]).max())

[0]
0.9996903


In [25]:
# Test accuracy
from sklearn.metrics import accuracy_score

# Score the fitted pipeline on the held-out test split. The true labels are
# cast to integers so they are comparable with the model's predictions.
test_preds = xgb_clf.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test.astype(int), y_pred=test_preds)

print(f"Test accuracy: {test_accuracy * 100}%")

Test accuracy: 98.17%


### There is an improvement of 0.2% in test accuracy!

Save the model.

In [27]:
import os

from joblib import dump

# dump() does not create missing directories, so make sure the target folder
# exists first; exist_ok=True makes this a no-op when it is already there.
os.makedirs("models", exist_ok=True)

# Persist the whole fitted pipeline (preprocessing + classifier). This
# overwrites any file already stored at this path.
dump(xgb_clf, "models/best_model.pkl")

['models/best_model.pkl']