In [1]:
import numpy as np
import cv2
# Load a single sample image and show its grayscale shape.
sample_path = "dataset/#/0.jpg"
img = cv2.imread(sample_path)
if img is None:
    # cv2.imread reports a missing or unreadable file by returning None rather
    # than raising, which would otherwise surface as an obscure cvtColor error.
    raise FileNotFoundError(f"Could not read image: {sample_path}")
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img.shape

(28, 28)

In [2]:
import re
def sorted_alphanumeric(data):
    """Return the strings in ``data`` sorted in natural (human) order.

    Runs of ASCII digits are compared by their numeric value and all other
    text is compared case-insensitively, so ``"2.jpg"`` sorts before
    ``"10.jpg"``.

    Args:
        data: Iterable of strings (e.g. file names).

    Returns:
        A new list with the same items in natural order.
    """
    def alphanum_key(key):
        # re.split with a capturing group yields text and digit runs
        # alternately, with the captured digit runs always at odd positions.
        # Deciding by position rather than str.isdigit() keeps every key
        # shaped [str, int, str, ...]; isdigit() is also true for characters
        # such as "²" that int() rejects, and for non-ASCII digits that the
        # pattern never captures, which would make keys incomparable.
        parts = re.split('([0-9]+)', key)
        return [
            int(part) if position % 2 else part.lower()
            for position, part in enumerate(parts)
        ]

    return sorted(data, key=alphanum_key)

In [3]:
import os
# Build the labelled dataset: for every character folder under `src` (except
# the ignored ones) take its first `images_per_class` images in natural
# filename order and store each as a flattened 32x32 grayscale vector.
char_dict = {"char": [], "matrix": []}
char_set = dict()
ignore = ["#", "$", "&", "@"]
src = r"dataset"
images_per_class = 250

# List the class folders once instead of on every pass of the outer loop.
# Entries that are not directories (stray files next to the class folders)
# are skipped because they cannot contain images.
folders = [
    name for name in os.listdir(src)
    if name not in ignore and os.path.isdir(os.path.join(src, name))
]
for folder in folders:
    char_set[folder] = sorted_alphanumeric(os.listdir(src + '/' + folder))

# Outer loop over the image index, inner loop over the classes: samples are
# stored index-major, i.e. interleaved across classes.
# Indexing raises IndexError if a folder holds fewer than `images_per_class` files.
for i in range(images_per_class):
    for folder in folders:
        path = f"{src}/{folder}/{char_set[folder][i]}"
        img = cv2.imread(path)
        if img is None:
            # cv2.imread returns None for a missing/unreadable file.
            raise FileNotFoundError(f"Could not read image: {path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        if img.shape != (32, 32):
            img = cv2.resize(img, (32, 32))
        # Record pixels and label together, only after a successful load, so
        # the two lists can never get out of step.
        char_dict["matrix"].append(img.flatten())
        char_dict["char"].append(folder)

In [4]:
# Sanity check: labels and feature vectors must have the same count.
tuple(len(char_dict[column]) for column in ("char", "matrix"))

(8750, 8750)

In [5]:
# X holds the flattened image vectors, y the matching folder-name labels.
X, y = char_dict["matrix"], char_dict["char"]

In [6]:
# Inspect the first sample: its container type and the shape of the vector.
type(X[0]), X[0].shape

(numpy.ndarray, (1024,))

In [7]:
# Number of samples that make up 90% of the dataset (the training share).
0.9 * len(X)

7875.0

In [8]:
# Positional 90/10 split: the first 90% of the samples train the model and
# the remainder is held out for testing. The boundary is derived from the
# data instead of being hard-coded, so it stays correct if the dataset size
# changes.
split = int(len(X) * 0.9)
train_X, test_X = X[:split], X[split:]
train_y, test_y = y[:split], y[split:]

In [9]:
from sklearn.ensemble import RandomForestClassifier
# Fix the seed so the forest, and therefore the reported accuracy, is
# reproducible on a fresh Restart & Run All.
clf_rf = RandomForestClassifier(n_estimators=100, random_state=0)

In [10]:
# Train the forest on the training split (the fitted estimator is displayed).
clf_rf.fit(X=train_X, y=train_y)

RandomForestClassifier()

In [11]:
# Predict a character label for every held-out sample.
y_pred = clf_rf.predict(X=test_X)

In [12]:
from sklearn.metrics import accuracy_score
#print(f"Accuracy: {accuracy_score(test_y, y_pred)}")
# Fraction of held-out samples whose predicted label matches the true one.
score = accuracy_score(y_true=test_y, y_pred=y_pred)
score

0.8011428571428572

In [13]:
# import pickle
# with open('modelRandomForest.dat', 'wb') as file:
#     pickle.dump(clf_rf, file)

In [14]:
# Spot-check: the label predicted for the held-out sample at index 12.
y_pred[12]

'C'

### Trying different random seeds

In [15]:
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.metrics import accuracy_score
#best = 0.
#best_seed = 0
#for i in range(100):
#    clf_rf = RandomForestClassifier(n_estimators=100, random_state=i)
#    clf_rf.fit(train_X, train_y)
#    y_pred = clf_rf.predict(test_X)
#    score = accuracy_score(test_y, y_pred)
#    if score > best:
#        best = score
#        best_seed = i
#    print(f"Seed: {i}, Score: {round(score, 4)}")
#print(f"Best accuracy: {best}, best seed: {best_seed}")

# Best accuracy: 0.8217142857142857, best seed: 9

In [16]:
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.metrics import accuracy_score
#best = 0.
#best_tree = 0
#for i in range(100, 250, 10):
#    clf_rf = RandomForestClassifier(n_estimators=i, random_state=1)
#    clf_rf.fit(train_X, train_y)
#    y_pred = clf_rf.predict(test_X)
#    score = accuracy_score(test_y, y_pred)
#    if score > best:
#        best = score
#        best_tree = i
#    print(f"Nr of trees: {i}, Score: {round(score, 4)}")
#print(f"Best accuracy: {best}, best nr of trees: {best_tree}")

# Best accuracy: 0.8091428571428572, best nr of trees: 170

In [17]:
import os
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Start the wall-clock timer for the whole load + search run.
start = time.time()

# Rebuild the labelled dataset with more samples: for every character folder
# under `src` (except the ignored ones) take its first `images_per_class`
# images in natural filename order, each stored as a flattened 32x32
# grayscale vector.
char_dict = {"char": [], "matrix": []}
char_set = dict()
ignore = ["#", "$", "&", "@"]
src = r"dataset"
images_per_class = 1000

# List the class folders once instead of on every pass of the outer loop.
# Entries that are not directories (stray files next to the class folders)
# are skipped because they cannot contain images.
folders = [
    name for name in os.listdir(src)
    if name not in ignore and os.path.isdir(os.path.join(src, name))
]
for folder in folders:
    char_set[folder] = sorted_alphanumeric(os.listdir(src + '/' + folder))

# Outer loop over the image index, inner loop over the classes: samples are
# stored index-major, i.e. interleaved across classes.
# Indexing raises IndexError if a folder holds fewer than `images_per_class` files.
for i in range(images_per_class):
    for folder in folders:
        path = f"{src}/{folder}/{char_set[folder][i]}"
        img = cv2.imread(path)
        if img is None:
            # cv2.imread returns None for a missing/unreadable file.
            raise FileNotFoundError(f"Could not read image: {path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        if img.shape != (32, 32):
            img = cv2.resize(img, (32, 32))
        char_dict["matrix"].append(img.flatten())
        char_dict["char"].append(folder)
        
# Feature vectors and their labels, in load order.
X, y = char_dict["matrix"], char_dict["char"]

# Positional 90/10 split: leading 90% for training, trailing 10% for testing.
split = int(len(X)*0.9)
train_X, test_X = X[:split], X[split:]
train_y, test_y = y[:split], y[split:]
        
# Exhaustive sweep over forest size (100..240 in steps of 10) and random seed
# (0..99): fit one forest per combination and remember the configuration with
# the highest accuracy on the held-out split.
# NOTE(review): the winner is chosen on the same split that is reported, so
# the "best" accuracy is an optimistic estimate.
best, best_tree, best_seed = 0., 0, 0
for n_trees in range(100, 250, 10):
    for seed in range(100):
        clf_rf = RandomForestClassifier(n_estimators=n_trees, random_state=seed)
        clf_rf.fit(train_X, train_y)
        y_pred = clf_rf.predict(test_X)
        score = accuracy_score(test_y, y_pred)
        is_new_best = score > best
        if is_new_best:
            best, best_tree, best_seed = score, n_trees, seed
        print(f"Nr of trees: {n_trees}, Seed: {seed}, Score: {round(score, 4)}")
print(f"Best accuracy: {best}, best nr of trees: {best_tree}, best seed: {best_seed}")

# Stop the wall-clock timer started before the data was loaded.
end = time.time()

Nr of trees: 100, Seed: 0, Score: 0.8649
Nr of trees: 100, Seed: 1, Score: 0.8646
Nr of trees: 100, Seed: 2, Score: 0.8677
Nr of trees: 100, Seed: 3, Score: 0.8674
Nr of trees: 100, Seed: 4, Score: 0.8686
Nr of trees: 100, Seed: 5, Score: 0.8689
Nr of trees: 100, Seed: 6, Score: 0.8689
Nr of trees: 100, Seed: 7, Score: 0.8703
Nr of trees: 100, Seed: 8, Score: 0.866
Nr of trees: 100, Seed: 9, Score: 0.8643
Nr of trees: 100, Seed: 10, Score: 0.8651
Nr of trees: 100, Seed: 11, Score: 0.8643
Nr of trees: 100, Seed: 12, Score: 0.8626
Nr of trees: 100, Seed: 13, Score: 0.8609
Nr of trees: 100, Seed: 14, Score: 0.868
Nr of trees: 100, Seed: 15, Score: 0.868
Nr of trees: 100, Seed: 16, Score: 0.8663
Nr of trees: 100, Seed: 17, Score: 0.8691
Nr of trees: 100, Seed: 18, Score: 0.87
Nr of trees: 100, Seed: 19, Score: 0.8709
Nr of trees: 100, Seed: 20, Score: 0.8643
Nr of trees: 100, Seed: 21, Score: 0.8683
Nr of trees: 100, Seed: 22, Score: 0.8617
Nr of trees: 100, Seed: 23, Score: 0.8674
Nr of t

In [20]:
# Wall-clock duration, in seconds, between the `start` and `end` timestamps.
final_time = end - start
# divmod yields whole minutes plus the leftover seconds; format them as an
# integer and a two-decimal value instead of raw floats ("1599.0m;55.4793...s").
minutes, seconds = divmod(final_time, 60)
print(f"Best accuracy: {round(best, 5)}, best nr of trees: {best_tree}, best seed: {best_seed}, time: {int(minutes)}m;{seconds:.2f}s")
# 1000 -> 0.8685714285714285

Best accuracy: 0.87657, best nr of trees: 230, best seed: 90, time: 1599.0m;55.479313135147095s


In [21]:
# Total elapsed time of the run, in seconds.
final_time

95995.47931313515