In [1]:
import numpy as np
import cv2
img = cv2.imread("dataset/#/0.jpg")
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img.shape

(28, 28)

In [2]:
import re
def sorted_alphanumeric(data):
    convert = lambda text: int(text) if text.isdigit() else text.lower()
    alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] 
    return sorted(data, key=alphanum_key)

In [3]:
import os
char_dict = {"char": [], "matrix": []}
char_set = dict()
src = r"dataset"
for i in range(250):
    for folder in os.listdir(src):
        char_dict["char"].append(folder)
        if folder not in char_set:
            char_set[folder] = sorted_alphanumeric(os.listdir(src + '/' + folder))
        img = cv2.imread(f"{src}/{folder}/{char_set[folder][i]}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        if not img.shape == (32, 32):
            img = cv2.resize(img, (32, 32))
        img = img.flatten()
        char_dict["matrix"].append(img)

In [4]:
len(char_dict["char"]), len(char_dict["matrix"])

(9750, 9750)

In [5]:
X = char_dict["matrix"]
X = np.array(X)
y = char_dict["char"]
y = np.array(y)

In [6]:
X[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [7]:
len(X)*0.9

8775.0

In [8]:
train_X = X[:8775]
train_y = y[:8775]
test_X = X[8775:]
test_y = y[8775:]

In [9]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier(n_estimators=100)

In [10]:
clf_rf.fit(train_X, train_y)

RandomForestClassifier()

In [11]:
y_pred = clf_rf.predict(test_X)

In [12]:
from sklearn.metrics import accuracy_score
#print(f"Accuracy: {accuracy_score(test_y, y_pred)}")
score = accuracy_score(test_y, y_pred)
score

0.8256410256410256

In [13]:
# import pickle
# with open('modelRandomForest.dat', 'wb') as file:
#     pickle.dump(clf_rf, file)

### trying for seeds

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
best = 0.
best_seed = 0
for i in range(100):
    clf_rf = RandomForestClassifier(n_estimators=100, random_state=i)
    clf_rf.fit(train_X, train_y)
    y_pred = clf_rf.predict(test_X)
    score = accuracy_score(test_y, y_pred)
    if score > best:
        best = score
        best_seed = i
    print(f"Seed: {i}, Score: {round(score, 4)}")
print(f"Best accuracy: {best}, best seed: {best_seed}")

Seed: 0, Score: 0.8318
Seed: 1, Score: 0.8236
Seed: 2, Score: 0.8215
Seed: 3, Score: 0.8308
Seed: 4, Score: 0.8297
Seed: 5, Score: 0.8277
Seed: 6, Score: 0.8215
Seed: 7, Score: 0.8267
Seed: 8, Score: 0.8256
Seed: 9, Score: 0.8164
Seed: 10, Score: 0.8164
Seed: 11, Score: 0.8226
Seed: 12, Score: 0.8164
Seed: 13, Score: 0.8256
Seed: 14, Score: 0.8318
Seed: 15, Score: 0.8287
Seed: 16, Score: 0.8287
Seed: 17, Score: 0.8256
Seed: 18, Score: 0.8277
Seed: 19, Score: 0.8297
Seed: 20, Score: 0.8359
Seed: 21, Score: 0.8277
Seed: 22, Score: 0.8236
Seed: 23, Score: 0.8308
Seed: 24, Score: 0.8236
Seed: 25, Score: 0.8215
Seed: 26, Score: 0.8164
Seed: 27, Score: 0.8195
Seed: 28, Score: 0.8318
Seed: 29, Score: 0.8246
Seed: 30, Score: 0.8174
Seed: 31, Score: 0.8215
Seed: 32, Score: 0.8215
Seed: 33, Score: 0.8174
Seed: 34, Score: 0.8349
Seed: 35, Score: 0.8113
Seed: 36, Score: 0.8164
Seed: 37, Score: 0.8256
Seed: 38, Score: 0.8308
Seed: 39, Score: 0.8277
Seed: 40, Score: 0.8267
Seed: 41, Score: 0.8215
Se