In [130]:
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

ON_KAGGLE = False
MODELS_PATH = Path("models")
MODELS_PATH.mkdir(exist_ok=True)  # If directory exists, do nothing.

# Default settings for matplotlib.
DARK_BLUE = "#03002e"
LIGHT_GRAY = "#8f8f99"

plt.rc("font", size=14, family="serif")
plt.rc("legend", fontsize=14)
plt.rc("text", color=DARK_BLUE)

plt.rc("axes", labelsize=14)
plt.rc("axes", titlesize=14)
plt.rc("axes", labelpad=10)
plt.rc("axes", labelcolor=DARK_BLUE)
plt.rc("axes", grid=True)

plt.rc("xtick", labelsize=12, color=DARK_BLUE)
plt.rc("ytick", labelsize=12, color=DARK_BLUE)
plt.rc("xtick.major", pad=10)
plt.rc("ytick.major", pad=10)

plt.rc("grid", color=LIGHT_GRAY)
plt.rc("grid", linestyle="dashed")
plt.rc("grid", linewidth=0.5)
plt.rc("grid", alpha=0.5)

In [131]:
train_set = pd.read_csv("data/train.csv")
test_set = pd.read_csv("data/test.csv")

In [132]:
train_labels = train_set["label"]
train_set = train_set.drop("label", axis=1).values
test_set = test_set.values

In [133]:
from scipy.ndimage import shift


def shift_image(digit, dx, dy):
    digit = digit.reshape(28, 28)
    shifted_image = shift(digit, [dy, dx])
    return shifted_image.reshape(-1)  # Restore the original shape.


train_set_augmented = [digit for digit in train_set]
train_labels_augmented = [label for label in train_labels]

for dx, dy in ((1, 1), (-1, -1), (-1, 1), (1, -1)):
    for digit, label in zip(train_set, train_labels):
        train_set_augmented.append(shift_image(digit, dx, dy))
        train_labels_augmented.append(label)

train_set_augmented = np.array(train_set_augmented)
train_labels_augmented = np.array(train_labels_augmented)

shuffled_ids = np.random.permutation(len(train_set_augmented))
train_set_augmented = train_set_augmented[shuffled_ids]
train_labels_augmented = train_labels_augmented[shuffled_ids]

In [134]:
from sklearn.model_selection import StratifiedShuffleSplit

splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
train_valid_ids, test_ids = next(splitter.split(train_set_augmented, train_labels_augmented))

X_train_valid = train_set_augmented[train_valid_ids]
y_train_valid = train_labels_augmented[train_valid_ids]

X_test = train_set_augmented[test_ids]
y_test = train_labels_augmented[test_ids]

splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_ids, valid_ids = next(splitter.split(X_train_valid, y_train_valid))

X_train = X_train_valid[train_ids]
y_train = y_train_valid[train_ids]

X_valid = X_train_valid[valid_ids]
y_valid = y_train_valid[valid_ids]


In [135]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)


In [137]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=200, random_state=42)
forest_clf.fit(X_valid_scaled, y_valid)
forest_clf.score(X_test_scaled, y_test)

0.9521428571428572

In [138]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=4)
knn_clf.fit(X_valid_scaled, y_valid)
knn_clf.score(X_test_scaled, y_test)


0.9526666666666667

In [139]:
from sklearn.ensemble import ExtraTreesClassifier

extra_trees_clf = ExtraTreesClassifier(n_estimators=200, random_state=42)
extra_trees_clf.fit(X_valid_scaled, y_valid)
extra_trees_clf.score(X_test_scaled, y_test)

0.956

In [140]:
X_valid_predictions = np.empty((len(X_valid_scaled), 3))
for index, estimator in enumerate((forest_clf, knn_clf, extra_trees_clf)):
    X_valid_predictions[:, index] = estimator.predict(X_valid_scaled)

X_test_predictions = np.empty((len(X_test_scaled), 3))
for index, estimator in enumerate((forest_clf, knn_clf, extra_trees_clf)):
    X_test_predictions[:, index] = estimator.predict(X_test_scaled)

In [141]:
from sklearn.ensemble import GradientBoostingClassifier

gbc_blender = GradientBoostingClassifier(random_state=42)
extra_trees_clf.fit(X_valid_predictions, y_valid)
extra_trees_clf.score(X_test_predictions, y_test)

0.9595238095238096

In [142]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
train_set_augmented_scaled = scaler.fit_transform(train_set_augmented)
test_set_scaled = scaler.transform(test_set)

In [143]:
from sklearn.ensemble import StackingClassifier

stack_clf = StackingClassifier(
    [
        ("random_forest_clf", forest_clf),
        ("knn_clf", knn_clf),
        ("extra_trees_clf", extra_trees_clf),
    ],
    final_estimator=gbc_blender,
    n_jobs=-1,
    cv=3,
)

stack_clf.fit(train_set_augmented_scaled, train_labels_augmented);


In [None]:
results = stack_clf.predict(test_set_scaled)

sumbission = pd.DataFrame({"ImageID": range(1, len(test_set) + 1), "Label": results})
sumbission.set_index("ImageID").to_csv("data/submission.csv")
