## Load the SVHN Dataset

In [None]:
import wget
from scipy.io import loadmat
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Source locations of the cropped-digit SVHN archives (train / test).
train_url, test_url = (
    "http://ufldl.stanford.edu/housenumbers/train_32x32.mat",
    "http://ufldl.stanford.edu/housenumbers/test_32x32.mat",
)

# Fetch both archives; wget.download's return value for each URL is kept so
# the next cell can pass it straight to loadmat.
train_data, test_data = (wget.download(url) for url in (train_url, test_url))

In [None]:
# The downloaded archives are MATLAB .mat files, so parse them with scipy's loadmat.
data_train, data_test = (loadmat(path) for path in (train_data, test_data))

## EDA

In [None]:
# Check what loadmat returned for each split. The cell displays a 2-tuple
# holding the container type of data_train and of data_test (the containers
# themselves are indexed by key in the cells below).
type(data_train), type(data_test)

In [None]:
# List the top-level keys available in the training container.
data_train.keys()

In [None]:
# Same for the test container, to confirm both splits expose the same keys.
data_test.keys()

In [None]:
# Image arrays for each split, stored under the 'X' key.
X_train = data_train['X']
X_test = data_test['X']

# Label arrays for each split, stored under the 'y' key.
# The test labels must come from the test file: reading them from data_train
# pairs test images with training labels, which makes every test-set metric
# meaningless (or fails outright on a sample-count mismatch).
y_train = data_train['y']
y_test = data_test['y']

In [None]:
# Container type of the training images.
type(X_train)

In [None]:
# Dimensions of the training image array.
X_train.shape

In [None]:
def image_viz(X, y):
    """Show up to the first ten images of X in a 2x5 grid, titled with their labels.

    Args:
        X: image array indexed as X[:, :, :, i], i.e. the sample index is the
            last of four axes.
        y: labels aligned with the samples of X. It is flattened first, so a
            column vector of shape (n, 1) produces scalar titles ("Label: 1")
            rather than one-element arrays ("Label: [1]").

    Returns:
        None. The figure is rendered with plt.show().
    """
    labels = np.ravel(y)
    _, axes = plt.subplots(2, 5, figsize=(12, 6))

    # Flatten the 2x5 axes grid for easy indexing.
    axes = axes.ravel()

    # Remove the ticks on every panel up front so unused panels stay blank.
    for ax in axes:
        ax.axis('off')

    # Never index past the available samples/labels when fewer than ten exist.
    n_show = min(len(axes), X.shape[3], labels.size)
    for i in range(n_show):
        axes[i].imshow(X[:, :, :, i])
        axes[i].set_title(f'Label: {labels[i]}')

    plt.tight_layout()
    plt.show()

In [None]:
# Preview training images, titled with the corresponding entries of y_train.
image_viz(X_train, y_train)

In [None]:
# Preview test images, titled with the corresponding entries of y_test.
image_viz(X_test, y_test)

In [None]:
# Tally how often each label occurs (quick check for class imbalance).
def label_dist(y):
    """Return a dict mapping each distinct value in y to its number of occurrences."""
    values, frequencies = np.unique(y, return_counts=True)
    return {value: frequency for value, frequency in zip(values, frequencies)}

In [None]:
# Label frequency tables for the training and test splits.
train_dist, test_dist = (label_dist(labels) for labels in (y_train, y_test))

In [None]:
# Side-by-side bar charts of the label frequencies in each split.
plt.figure(figsize=(10, 5), dpi=100)
panels = (("train label dist", train_dist), ("test label dist", test_dist))
for position, (panel_title, dist) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.bar(dist.keys(), dist.values())
    plt.title(panel_title)
    plt.xlabel("Label")

plt.tight_layout()

In [None]:
# Check the raw pixel value range before scaling (displays a (max, min) tuple).
X_train.max(), X_train.min()

## Preprocessing

In [None]:
# Normalization
def normalizer(x):
    """Return x divided by 255 (the input itself is left unmodified)."""
    return x / 255

In [None]:
# Scale both image splits with the same normaliser.
x_train_norm, x_test_norm = (normalizer(split) for split in (X_train, X_test))

In [None]:
# Shape check after normalisation. Expected layout (TODO confirm against the
# displayed value): (height, width, channels, samples), i.e. 32x32 images with
# 3 colour channels and 73257 training samples on the last axis.
np.shape(x_train_norm)

In [None]:
# RGB to grayscale (3 channels down to 1 channel).
def convert_gray(x):
    """Average x over axis 2, keeping that axis with length 1.

    For the arrays used in this notebook axis 2 is treated as the colour
    channel, so the result is an unweighted mean of the channels.
    """
    channel_axis = 2
    gray = np.mean(x, axis=channel_axis, keepdims=True)
    return gray

In [None]:
# Collapse the colour channels of both normalised splits, then display the
# resulting training shape.
x_train_norm_gray, x_test_norm_gray = (
    convert_gray(split) for split in (x_train_norm, x_test_norm)
)
np.shape(x_train_norm_gray)

In [None]:
# Reshape to prepare inputs for ML models.
# The sample index lives on the fourth axis (index 3, since Python is
# zero-based); a samples-by-features matrix needs it first, so that axis is
# moved to the front and the remaining three axes are flattened into columns.
def reshape(x):
    """Return a 2-D array with one row per sample of the 4-D array x.

    Row i contains x[:, :, :, i] flattened in C order.
    """
    n_samples = np.shape(x)[3]
    samples_first = x.transpose(3, 0, 1, 2)
    # -1 lets NumPy infer the column count as the product of axes 0, 1 and 2.
    return samples_first.reshape(n_samples, -1)

In [None]:
# Flatten the grayscale splits into samples-by-features matrices.
x_tr, x_ts = (reshape(split) for split in (x_train_norm_gray, x_test_norm_gray))

In [None]:
# Flatten the full-colour splits into samples-by-features matrices.
x_tr_rgb, x_ts_rgb = (reshape(split) for split in (x_train_norm, x_test_norm))

In [None]:
# Shape of the flattened grayscale training matrix (one row per sample).
x_tr.shape

In [None]:
# Shape of the flattened RGB training matrix (one row per sample).
x_tr_rgb.shape

In [None]:
# train phase
# Random forest baseline on the flattened grayscale pixels.
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(max_depth=20, n_estimators=100)
# scikit-learn expects a 1-D target. Flattening avoids the DataConversionWarning
# raised when y_train is a column vector of shape (n, 1), and is a no-op if it
# is already 1-D.
rf_clf.fit(x_tr, np.ravel(y_train))

In [None]:
# test phase
# Score the baseline forest on the held-out grayscale features.
from sklearn.metrics import accuracy_score
y_pred = rf_clf.predict(x_ts)
score = accuracy_score(y_test, y_pred)
# Echo the metric so the cell actually reports it (a bare assignment shows nothing).
score

In [None]:
# Shape of the test label array, for comparison with the predictions below.
y_test.shape

In [None]:
# Shape and container type of the predicted labels.
print(y_pred.shape, type(y_pred))

In [None]:
# dim reduction
# Fit PCA without specifying n_components, so the explained-variance
# ratios can be inspected before choosing how many components to retain.
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(x_tr)

In [None]:
# Project both splits with the PCA that was fitted on the training matrix.
x_tr_pca, x_ts_pca = (pca.transform(split) for split in (x_tr, x_ts))

In [None]:
# Fraction of the total variance explained by each principal component.
variances = pca.explained_variance_ratio_

In [None]:
# Running total of the explained-variance ratios across components.
cum_variances = np.cumsum(variances)

In [None]:
# Plot the cumulative curve to see how quickly the variance is captured.
plt.plot(cum_variances, marker = "o")
plt.grid()

In [None]:
# Smallest number of components whose cumulative explained variance exceeds
# 0.9: argmax returns the index of the first True in the boolean mask, and +1
# converts that zero-based index into a component count.
component_90 = np.argmax(cum_variances > 0.9) + 1

In [None]:
# Refit PCA keeping only that many components. It is fitted on the training
# matrix only, and the same fitted projection is then applied to the test matrix.
pca_90 = PCA(n_components=component_90)
x_tr_90 = pca_90.fit_transform(x_tr)
x_ts_90 = pca_90.transform(x_ts)

In [None]:
# Same random forest configuration, now trained on the PCA-reduced features.
rf_clf = RandomForestClassifier(max_depth=20, n_estimators=100)
# scikit-learn expects a 1-D target. Flattening avoids the DataConversionWarning
# raised when y_train is a column vector of shape (n, 1), and is a no-op if it
# is already 1-D.
rf_clf.fit(x_tr_90, np.ravel(y_train))

In [None]:
# test phase
# Score the forest trained on PCA-reduced features against the reduced test set.
y_pred = rf_clf.predict(x_ts_90)
score = accuracy_score(y_test, y_pred)
# Echo the metric so the cell actually reports it (a bare assignment shows nothing).
score

In [None]:
# Relabel class 10 as 0 in both splits, leaving every other label unchanged
# (presumably so the classes run 0-9 for the XGBoost model below; confirm).
y_train_0, y_test_0 = (
    np.where(labels == 10, 0, labels) for labels in (y_train, y_test)
)

In [None]:
# Gradient-boosted trees on the PCA-reduced features, trained on the labels
# in which class 10 has been remapped to 0.
from xgboost import XGBClassifier
xgb_clf = XGBClassifier(n_estimators=200)
xgb_clf.fit(x_tr_90, y_train_0)

In [None]:
# Evaluate on the PCA-reduced test set; the trailing bare expression makes the
# notebook display the accuracy.
y_pred = xgb_clf.predict(x_ts_90)
accuracy_score(y_test_0, y_pred)