In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.multiclass import OneVsOneClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC

from skimage.feature import hog
from skimage import exposure
from skimage.transform import rotate
from skimage import draw
%matplotlib inline

In [23]:
# Load the raw train and test tables from the CSV files in the working directory.
pre_train, pre_test = (
    pd.read_csv(csv_name)
    for csv_name in ('sign_mnist_train.csv', 'sign_mnist_test.csv')
)



**TAKE A LOOK**

In [24]:
# Preview the first five rows of the raw training frame.
pre_train.head(n=5)


Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,3,107,118,127,134,139,143,146,150,153,...,207,207,207,207,206,206,206,204,203,202
1,6,155,157,156,156,156,157,156,158,158,...,69,149,128,87,94,163,175,103,135,149
2,2,187,188,188,187,187,186,187,188,187,...,202,201,200,199,198,199,198,195,194,195
3,2,211,211,212,212,211,210,211,210,210,...,235,234,233,231,230,226,225,222,229,163
4,13,164,167,170,172,176,179,180,184,185,...,92,105,105,108,133,163,157,163,164,179


In [25]:
# Count missing values per column, first for the training frame and then for the test frame.
for raw_frame in (pre_train, pre_test):
    print(raw_frame.isnull().sum())



label       0
pixel1      0
pixel2      0
pixel3      0
pixel4      0
           ..
pixel780    0
pixel781    0
pixel782    0
pixel783    0
pixel784    0
Length: 785, dtype: int64
label       0
pixel1      0
pixel2      0
pixel3      0
pixel4      0
           ..
pixel780    0
pixel781    0
pixel782    0
pixel783    0
pixel784    0
Length: 785, dtype: int64


In [26]:
# Hold out 10% of the training rows as a validation split (fixed seed for reproducibility).
train_set, val_set = train_test_split(
    pre_train,
    test_size=0.1,
    random_state=42,
)

for caption, split_frame in (("train shape", train_set), ("val shape", val_set)):
    print(caption, split_frame.shape)


train shape (24709, 785)
val shape (2746, 785)


In [27]:
# Share of each label in the training split, in percent, ordered by label.
label_counts = train_set["label"].value_counts().sort_index(ascending=True)
print(label_counts * 100 / len(train_set))

label
0     4.156380
1     3.690963
2     4.152333
3     4.330406
4     3.529078
5     4.326359
6     3.913554
7     3.707151
8     4.281841
10    4.071391
11    4.488243
12    3.767858
13    4.273746
14    4.415395
15    3.917601
16    4.621798
17    4.690599
18    4.302076
19    4.338500
20    4.269699
21    3.966166
22    4.468008
23    4.241370
24    4.079485
Name: count, dtype: float64


In [28]:
# Column-wise maxima: shows the largest label and the largest value of each pixel column.
train_set.max(axis=0)

label        24
pixel1      255
pixel2      255
pixel3      255
pixel4      255
           ... 
pixel780    255
pixel781    255
pixel782    255
pixel783    255
pixel784    255
Length: 785, dtype: int64

**PREPARE SOME FUNCTIONS TO SEE WHAT THE IMAGES LOOK LIKE**

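process_raw_file_into_histogram_and_label_or_dataset_and_label: the function has two modes. `hist=False` returns the pixel columns of the dataframe with the label column removed; `hist=True` turns each image (row) into a 256-bin histogram of its pixel values. In both modes the labels are returned separately.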

In [29]:
def process_raw_file_into_histogram_and_label_or_dataset_and_label(df_link, hist=False):
    """Split a labelled pixel table into a feature matrix and a label vector.

    Parameters
    ----------
    df_link : str or pandas.DataFrame
        Path to a CSV file, or an already-loaded DataFrame. It must contain a
        ``'label'`` column; every other column is treated as a pixel value.
        A DataFrame passed in is not modified.
    hist : bool, default False
        If False, the features are the raw pixel columns.
        If True, each row is replaced by a 256-bin histogram that counts how
        often each intensity 0..255 occurs among that row's pixels. Values
        outside 0..255 are ignored.

    Returns
    -------
    pixels : numpy.ndarray of float64
        Shape ``(n_rows, n_pixel_columns)`` when ``hist`` is False,
        ``(n_rows, 256)`` when ``hist`` is True.
    labels : numpy.ndarray
        The values of the ``'label'`` column, one per row.

    Raises
    ------
    ValueError
        If ``df_link`` is neither a string nor a DataFrame.
    """
    # Accept either a file name or an in-memory DataFrame.
    if isinstance(df_link, str):
        df = pd.read_csv(df_link)
    elif isinstance(df_link, pd.DataFrame):
        df = df_link
    else:
        raise ValueError("Input must be a file path or a pandas DataFrame")

    # Separate the labels from the features.
    labels = df['label'].values

    # DataFrame.drop returns a new frame, so its result must be kept. The
    # histogram branch used to discard it, which made the label value count
    # as one extra "pixel" in every histogram.
    pixel_values = df.drop(columns=['label']).values

    if hist:
        # One row per image, one column per possible intensity (0..255).
        intensity_bins = range(0, 256)
        pixels = np.empty((pixel_values.shape[0], 256))
        for i, row in enumerate(pixel_values):
            # Count how often each intensity occurs in this image ...
            counts = pd.Series(row).value_counts(sort=False)
            # ... and spread the counts over all 256 bins, with 0 for
            # intensities that do not occur.
            pixels[i] = counts.reindex(intensity_bins, fill_value=0).to_numpy()
    else:
        # hist is False: the features are simply the pixel columns.
        pixels = pixel_values

    pixels = pixels.astype(np.float64)

    return pixels, labels


In [30]:
# Split each frame into a pixel matrix (X) and a label vector (y).
X_train, y_train = process_raw_file_into_histogram_and_label_or_dataset_and_label(train_set)
X_val, y_val = process_raw_file_into_histogram_and_label_or_dataset_and_label(val_set)
X_test, y_test = process_raw_file_into_histogram_and_label_or_dataset_and_label(pre_test)

# Rescale pixel intensities from 0..255 to the 0..1 range.
X_train, X_val, X_test = (
    features / 255 for features in (X_train, X_val, X_test)
)



In [31]:
#show first n images
def show_first_n_images(pixels, labels, n, img_size=28):
    """Plot the first ``n`` images in a 6-column grid, each titled with its label.

    Parameters
    ----------
    pixels : numpy.ndarray
        Image data that can be reshaped to ``(-1, img_size, img_size, 1)``.
    labels : sequence
        Label of each image, indexed in the same order as ``pixels``.
    n : int
        Number of images to show. Capped at the number of images/labels
        available; nothing is drawn when the result is zero or negative.
    img_size : int, default 28
        Side length in pixels of the (square) images.
    """
    pixels_tmp = pixels.reshape(-1, img_size, img_size, 1)

    # Never index past the data that is actually available.
    n = min(n, pixels_tmp.shape[0], len(labels))
    if n <= 0:
        return

    # Size the figure from the number of grid rows. The previous height,
    # int(n / 3), collapsed to 0 for n < 3; for full rows of six this gives
    # the same height as before (2 per row).
    num_rows = (n - 1) // 6 + 1
    plt.figure(figsize=(18, num_rows * 2))

    for i in range(n):
        plt.subplot(num_rows, 6, i + 1)  # 6 columns for images
        plt.axis('off')
        plt.title(f'Label: {labels[i]}')
        plt.imshow(pixels_tmp[i, :, :, 0], cmap='gray')

    plt.show()



import matplotlib.pyplot as plt



#show random n image in the chosen label that we want to see
def show_images_by_condition(pixels, labels, chosen_label, img_size=28, n = 5):
    """Plot up to ``n`` randomly chosen images that carry ``chosen_label``.

    Parameters
    ----------
    pixels : numpy.ndarray
        One image per row; each selected row is reshaped to
        ``(img_size, img_size)``.
    labels : numpy.ndarray
        Label of each row of ``pixels``.
    chosen_label
        Only images whose label equals this value are considered.
    img_size : int, default 28
        Side length in pixels of the (square) images.
    n : int, default 5
        Number of images to draw without replacement. Capped at the number
        of matching images; nothing is drawn when it is zero or negative.

    Raises
    ------
    ValueError
        If no image has ``chosen_label``.
    """
    # Filter the pixels and labels for the chosen label.
    matches = labels == chosen_label
    filtered_pixels = pixels[matches]
    filtered_labels = labels[matches]

    available = filtered_pixels.shape[0]
    if available == 0:
        raise ValueError(f"No images found with label {chosen_label!r}")

    # Sampling without replacement cannot return more items than exist.
    n = min(n, available)
    if n <= 0:
        return

    # Choose n random indices.
    random_indices = np.random.choice(available, n, replace=False)
    filtered_pixels = filtered_pixels[random_indices].reshape(-1, img_size, img_size)
    filtered_labels = filtered_labels[random_indices]

    # Set up the subplot dimensions: 6 images per row.
    num_rows = (n - 1) // 6 + 1

    plt.figure(figsize=(18, num_rows * 3))

    for i in range(n):
        plt.subplot(num_rows, 6, i + 1)
        plt.axis('off')
        plt.title(f'Label: {filtered_labels[i]}')
        plt.imshow(filtered_pixels[i], cmap='gray')

    plt.show()


KNN

In [32]:
# Tune the number of neighbours of a KNN classifier with 5-fold cross-validation.
knn = KNeighborsClassifier()

# Candidate neighbourhood sizes: the odd values 3, 5, 7, 9, 11.
param_grid_KNN = {'n_neighbors': list(range(3, 12, 2))}

grid_search_KNN = GridSearchCV(
    estimator=knn,
    param_grid=param_grid_KNN,
    cv=5,
    scoring='accuracy',
)
grid_search_KNN.fit(X_train, y_train)

In [33]:
# Selected KNN configuration and its cross-validation score.
print(grid_search_KNN.best_estimator_, grid_search_KNN.best_score_, sep='\n')

# Per-class report on the validation split.
predicionts_KNN = grid_search_KNN.predict(X_val)
print(classification_report(y_true=y_val, y_pred=predicionts_KNN))

# Per-class report on the test set.
predicionts_KNN_test = grid_search_KNN.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predicionts_KNN_test))


KNeighborsClassifier(n_neighbors=3)
0.9974502856900417
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        99
           1       1.00      1.00      1.00        98
           2       1.00      1.00      1.00       118
           3       1.00      1.00      1.00       126
           4       1.00      1.00      1.00        85
           5       1.00      1.00      1.00       135
           6       1.00      1.00      1.00       123
           7       1.00      1.00      1.00        97
           8       1.00      0.99      1.00       104
          10       1.00      1.00      1.00       108
          11       1.00      1.00      1.00       132
          12       1.00      0.99      1.00       124
          13       0.99      1.00      0.99        95
          14       1.00      1.00      1.00       105
          15       1.00      1.00      1.00       120
          16       1.00      1.00      1.00       137
          17       0.99   

SVM - linear kernel

In [34]:
# Linear-kernel SVM whose regularization strength C is tuned below.
svm = SVC(kernel="linear")

# Candidate values for the regularization parameter C.
param_grid_SVM_linear = {'C': [0.1, 1, 10, 100]}

# 5-fold cross-validated grid search, scored by accuracy, using all CPU cores.
grid_search_SVM = GridSearchCV(
    estimator=svm,
    param_grid=param_grid_SVM_linear,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
)

grid_search_SVM.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [35]:
# Selected linear-SVM configuration and its cross-validation score.
print(grid_search_SVM.best_estimator_)
print(grid_search_SVM.best_score_)

# Per-class report on the validation split.
predicionts_SVM = grid_search_SVM.predict(X_val)
print(classification_report(y_val, predicionts_SVM))

# Per-class report on the test set. The predictions must come from the
# linear-SVM search; this cell previously called grid_search_KNN here, so the
# "SVM" test report actually showed the KNN model's results.
predicionts_SVM_test = grid_search_SVM.predict(X_test)
print(classification_report(y_test, predicionts_SVM_test))

SVC(C=0.1, kernel='linear')
1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        99
           1       1.00      1.00      1.00        98
           2       1.00      1.00      1.00       118
           3       1.00      1.00      1.00       126
           4       1.00      1.00      1.00        85
           5       1.00      1.00      1.00       135
           6       1.00      1.00      1.00       123
           7       1.00      1.00      1.00        97
           8       1.00      1.00      1.00       104
          10       1.00      1.00      1.00       108
          11       1.00      1.00      1.00       132
          12       1.00      1.00      1.00       124
          13       1.00      1.00      1.00        95
          14       1.00      1.00      1.00       105
          15       1.00      1.00      1.00       120
          16       1.00      1.00      1.00       137
          17       1.00      1.00      1.00      

SVM - polynomial kernel

In [36]:
# Polynomial-kernel SVM; C and the polynomial degree are tuned below.
svm = SVC(kernel="poly")

# Search space: 3 regularization strengths x 5 polynomial degrees.
param_grid_SVM_poly = {
    'C': [0.1, 1, 10],
    'degree': [2, 3, 4, 5, 7],
}

# 5-fold cross-validated grid search, scored by accuracy, using all CPU cores.
grid_search_SVM_poly = GridSearchCV(
    estimator=svm,
    param_grid=param_grid_SVM_poly,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
)

grid_search_SVM_poly.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [37]:
# Selected polynomial-SVM configuration and its cross-validation score.
print(grid_search_SVM_poly.best_estimator_)
print(grid_search_SVM_poly.best_score_)

# Per-class report on the validation split.
predicionts_SVM_poly = grid_search_SVM_poly.predict(X_val)
print(classification_report(y_val, predicionts_SVM_poly))

# Per-class report on the test set. The predictions must come from the
# polynomial-SVM search; this cell previously called grid_search_KNN here, so
# the test report actually showed the KNN model's results.
predicionts_SVM_poly_test = grid_search_SVM_poly.predict(X_test)
print(classification_report(y_test, predicionts_SVM_poly_test))

SVC(C=0.1, kernel='poly')
1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        99
           1       1.00      1.00      1.00        98
           2       1.00      1.00      1.00       118
           3       1.00      1.00      1.00       126
           4       1.00      1.00      1.00        85
           5       1.00      1.00      1.00       135
           6       1.00      1.00      1.00       123
           7       1.00      1.00      1.00        97
           8       1.00      1.00      1.00       104
          10       1.00      1.00      1.00       108
          11       1.00      1.00      1.00       132
          12       1.00      1.00      1.00       124
          13       1.00      1.00      1.00        95
          14       1.00      1.00      1.00       105
          15       1.00      1.00      1.00       120
          16       1.00      1.00      1.00       137
          17       1.00      1.00      1.00       1

SVM - rbf kernel

In [38]:
# RBF-kernel SVM; only the kernel coefficient gamma is tuned below.
svm = SVC(kernel="rbf")

# Candidate gamma values.
param_grid_SVM_rbf = {'gamma': [1, 0.1, 0.01, 0.001]}

# 5-fold cross-validated grid search, scored by accuracy, using all CPU cores.
grid_search_SVM_rbf = GridSearchCV(
    estimator=svm,
    param_grid=param_grid_SVM_rbf,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
)

grid_search_SVM_rbf.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [40]:
# Selected RBF-SVM configuration and its cross-validation score.
print(grid_search_SVM_rbf.best_estimator_, grid_search_SVM_rbf.best_score_, sep='\n')

# Per-class report on the validation split.
predicionts_SVM_rbf = grid_search_SVM_rbf.predict(X_val)
print(classification_report(y_true=y_val, y_pred=predicionts_SVM_rbf))

# Per-class report on the test set.
# NOTE(review): the same name is rebound here, so the validation predictions
# are no longer available after this cell has run.
predicionts_SVM_rbf = grid_search_SVM_rbf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predicionts_SVM_rbf))

SVC(gamma=0.1)
0.9995143502721019
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        99
           1       1.00      1.00      1.00        98
           2       1.00      1.00      1.00       118
           3       1.00      1.00      1.00       126
           4       1.00      1.00      1.00        85
           5       1.00      1.00      1.00       135
           6       1.00      1.00      1.00       123
           7       1.00      1.00      1.00        97
           8       1.00      0.99      1.00       104
          10       1.00      1.00      1.00       108
          11       1.00      1.00      1.00       132
          12       1.00      1.00      1.00       124
          13       1.00      1.00      1.00        95
          14       1.00      1.00      1.00       105
          15       1.00      1.00      1.00       120
          16       1.00      1.00      1.00       137
          17       1.00      1.00      1.00    

Logistic regression

In [41]:
# Logistic regression over all classes, fitted with the lbfgs solver.
# `multi_class='multinomial'` is intentionally no longer passed: the parameter
# is deprecated as of scikit-learn 1.5, and with lbfgs on a problem with more
# than two classes the default already fits a multinomial model.
# max_iter is raised well above the default to give lbfgs room to converge.
model_logistic = LogisticRegression(solver='lbfgs', max_iter=10000)

# Train the model
model_logistic.fit(X_train, y_train)

In [42]:
# Per-class report on the validation split.
predicionts_logistic = model_logistic.predict(X_val)
print(classification_report(y_true=y_val, y_pred=predicionts_logistic))

# Per-class report on the test set.
# NOTE(review): the same name is rebound here, so the validation predictions
# are no longer available after this cell has run.
predicionts_logistic = model_logistic.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predicionts_logistic))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        99
           1       1.00      1.00      1.00        98
           2       1.00      1.00      1.00       118
           3       1.00      1.00      1.00       126
           4       1.00      1.00      1.00        85
           5       1.00      1.00      1.00       135
           6       1.00      1.00      1.00       123
           7       1.00      1.00      1.00        97
           8       1.00      1.00      1.00       104
          10       1.00      1.00      1.00       108
          11       1.00      1.00      1.00       132
          12       1.00      1.00      1.00       124
          13       1.00      1.00      1.00        95
          14       1.00      1.00      1.00       105
          15       0.99      1.00      1.00       120
          16       1.00      1.00      1.00       137
          17       1.00      1.00      1.00       135
          18       1.00    

Random Forest

In [43]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed.
rf_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

# Fit the forest on the training split.
rf_classifier.fit(X_train, y_train)


In [45]:
# Per-class report on the validation split.
# The random-forest predictions get their own names: this cell previously
# stored them in `predicionts_logistic`, which overwrote the logistic
# regression results under a misleading name.
predicionts_rf = rf_classifier.predict(X_val)
print(classification_report(y_val, predicionts_rf))

# Per-class report on the test set.
predicionts_rf_test = rf_classifier.predict(X_test)
print(classification_report(y_test, predicionts_rf_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        99
           1       1.00      1.00      1.00        98
           2       1.00      1.00      1.00       118
           3       1.00      1.00      1.00       126
           4       1.00      1.00      1.00        85
           5       1.00      1.00      1.00       135
           6       1.00      1.00      1.00       123
           7       1.00      1.00      1.00        97
           8       1.00      0.98      0.99       104
          10       1.00      1.00      1.00       108
          11       1.00      1.00      1.00       132
          12       1.00      0.99      1.00       124
          13       0.99      1.00      0.99        95
          14       1.00      1.00      1.00       105
          15       1.00      1.00      1.00       120
          16       0.99      1.00      1.00       137
          17       0.99      1.00      1.00       135
          18       0.99    