# Optical Character Recognition In Python

This is a study of a topic that's been addressed many times


$\S 1: \textbf{Object Recognition}$

The first step is to figure out what English characters look like

Part 1:
1. Resize, grayscale, then binarize images
2. Create "nudged" dataset that accounts for small variations in letter locations

Part 2:
1. Connected components/feature recognition
2. Export character sub-pics as 20x20 blobs
3. Feed these results into the model from Part 1 for prediction, producing real text output


In [1]:
# sklearn models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import scale

# sklearn metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve 
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn.learning_curve import learning_curve
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split

# graphs
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# images
from scipy.ndimage import convolve
from skimage import data, io, segmentation, color
from skimage.measure import regionprops
from skimage import draw
from skimage.filters import threshold_otsu
from skimage.transform import resize 
from skimage.transform import warp 
from PIL import Image

# basics
import pandas as pd
import numpy as np
from pprint import pprint

ImportError: No module named seaborn

How to make a SQLite DB from a CSV

In [2]:
def show_img(img):
    """
    Display `img` with matplotlib on a new figure whose figsize width is
    10.0 and whose height keeps the image's rows-to-columns ratio.
    """
    fig_width = 10.0
    n_rows, n_cols = img.shape[0], img.shape[1]
    plt.figure(figsize=(fig_width, n_rows * fig_width / n_cols))
    plt.imshow(img)

In [3]:
import math
import cv2

def img_round(x, base=75):
    """
    Round `x` down to the nearest multiple of `base`.

    The floor is taken toward negative infinity, so negative values move
    away from zero. No longer used for flattening image data (binarization
    replaced it).
    """
    whole_steps = math.floor(float(x) / base)
    return base * whole_steps

# Element-wise wrapper around img_round so it can be applied to whole arrays.
vround = np.vectorize(img_round) 

def get_img(i):
    """
    Load one training image and return it as a binarized 50x50 array.

    The file ``<i + 1>.Bmp`` is read from the hard-coded training directory,
    converted to single-channel grayscale (PIL mode "L"), resized to 50x50
    and thresholded with Otsu's method.

    Parameters
    ----------
    i : int
        Index of the image; the file opened is numbered ``i + 1``.

    Returns
    -------
    numpy.ndarray
        Boolean array that is True where a pixel is brighter than the Otsu
        threshold.
    """
    img = Image.open('/users/derekjanni/pyocr/train/' + str(i + 1) + '.Bmp')
    img = img.convert("L")
    img = img.resize((50, 50))
    # np.array() copies the pixels into a fresh array that owns its memory
    # and is therefore writable. The previous np.asarray() +
    # setflags(write=True) pair raises ValueError on current NumPy because
    # the array only borrows PIL's read-only buffer.
    image = np.array(img)
    thresh = threshold_otsu(image)
    return image > thresh

def nudge_dataset(X, Y):
    """
    Augment flattened (50 x 50) images with one-pixel translations.

    Each row of X is reshaped to (50, 50) and convolved with four 3 x 3
    kernels that each hold a single 1 directly beside the centre cell; this
    moves the image by one pixel, filling the vacated edge with zeros.
    The returned X stacks the original rows followed by the four shifted
    copies (5 times as many rows), and Y is repeated 5 times to match.
    """
    # (row, col) of the single non-zero weight of each kernel, in the order
    # the shifted copies are appended to the output.
    hot_cells = [(0, 1), (1, 0), (1, 2), (2, 1)]
    kernels = []
    for row, col in hot_cells:
        kernel = np.zeros((3, 3), dtype=int)
        kernel[row, col] = 1
        kernels.append(kernel)

    def shift_row(flat, kernel):
        moved = convolve(flat.reshape((50, 50)), weights=kernel,
                         mode='constant')
        return moved.ravel()

    pieces = [X]
    for kernel in kernels:
        pieces.append(np.apply_along_axis(shift_row, 1, X, kernel))

    X = np.concatenate(pieces)
    Y = np.concatenate([Y] * 5, axis=0)
    return X, Y

def show_img(img):
    """
    Display `img` with matplotlib on a new figure whose figsize width is
    10.0 and whose height keeps the image's rows-to-columns ratio.

    This redefines the show_img from the earlier cell with the same
    behaviour.
    """
    fig_width = 10.0
    n_rows, n_cols = img.shape[0], img.shape[1]
    plt.figure(figsize=(fig_width, n_rows * fig_width / n_cols))
    plt.imshow(img)

In [6]:
# Candidate classifiers, keyed by the display name printed above each report.
# Every value is a newly constructed, not-yet-fitted estimator; fitting
# happens later in precision_recall_by_class().
models = {'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5), 
          'Gaussian Naive Bayes': GaussianNB(),
          'Random Forest Classifier': RandomForestClassifier(),
          'Bernoulli Naive Bayes': BernoulliNB(),
          'Support Vector Machine': SVC()
         }

In [7]:
# Labels come from the 'Class' column of trainLabels.csv; the matching image
# for each row is loaded by index through get_img() and stored as floats.
df = pd.read_csv('trainLabels.csv', header=0)
raw_y = np.asarray(df.loc[:, 'Class'])
raw_x = np.asarray(list(map(get_img, df.index)), dtype=float)

In [9]:
# Flatten every image into one feature row, then add the shifted copies
# produced by nudge_dataset() and report the resulting array shapes.
x = np.asarray(list(map(np.ravel, raw_x)))
y = raw_y
x, y = nudge_dataset(x, y)
print('%s %s' % (x.shape, y.shape))

(31415, 2500) (31415,)


In [10]:
# Split the nudged dataset into training and held-out parts.
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and fall back to the old module on older installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x, y)

In [11]:
def precision_recall_by_class(model):
    """
    Fit `model` on the current training split and return sklearn's
    classification_report for its predictions on the test split.

    Reads the module-level X_train, Y_train, X_test and Y_test; `model` is
    fitted in place.
    """
    model.fit(X_train, Y_train)
    return classification_report(Y_test, model.predict(X_test))

In [None]:
# Fit each candidate in turn and print its per-class report under its name.
for name, classifier in models.items():
    print(name + ':\n' + str(precision_recall_by_class(classifier)))

K-Nearest Neighbors:
             precision    recall  f1-score   support

          0       0.81      0.84      0.83        90
          1       0.87      0.95      0.91        83
          2       1.00      0.97      0.98        67
          3       0.96      0.96      0.96        50
          4       0.98      1.00      0.99        41
          5       0.93      0.96      0.95        56
          6       0.96      1.00      0.98        50
          7       0.94      0.91      0.92        33
          8       0.96      0.87      0.92        31
          9       1.00      0.97      0.99        35
          A       0.97      0.99      0.98       581
          B       0.96      0.95      0.96       106
          C       0.90      0.96      0.92       225
          D       0.90      0.89      0.90       199
          E       0.96      0.95      0.95       474
          F       0.86      0.93      0.90        92
          G       0.98      0.94      0.96       162
          H       0.95  

In [184]:
import datetime
import pickle

# Snapshot the current labels (y) and features (x) to timestamped pickle
# files in the working directory.
# NOTE: str(now) contains a space and colons, which some filesystems
# (e.g. Windows) do not accept in file names.
now = datetime.datetime.now()
# Pickle data is a byte stream, so the files must be opened in binary mode
# ('wb'); text mode fails on Python 3 and can corrupt the data on Windows.
# HIGHEST_PROTOCOL stores the large arrays compactly instead of as text.
with open('imagesY' + str(now) + '.pkl', 'wb') as picklefile:
    pickle.dump(y, picklefile, pickle.HIGHEST_PROTOCOL)
with open('imagesX' + str(now) + '.pkl', 'wb') as picklefile:
    pickle.dump(x, picklefile, pickle.HIGHEST_PROTOCOL)

In [104]:
# Label simplification: fold the capitals C, M, O, P, S, U, V, W, X, Y, Z
# into their lowercase forms so that each pair becomes a single class.
# NOTE(review): presumably chosen because these letters look alike in upper
# and lower case -- confirm.
try:
    from string import maketrans   # Python 2
except ImportError:
    maketrans = str.maketrans      # Python 3 removed string.maketrans
intab = "CMOPSUVWXYZ"
outtab = "cmopsuvwxyz"
trantab = maketrans(intab, outtab)
simplified_y = np.asarray([label.translate(trantab) for label in y])

In [105]:
# Candidate classifiers for the simplified-label run, keyed by display name.
# Rebinding `models` replaces the earlier dict; this set has no
# 'Support Vector Machine' entry.
models = {'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5), 
          'Gaussian Naive Bayes': GaussianNB(),
          'Random Forest Classifier': RandomForestClassifier(),
          'Bernoulli Naive Bayes': BernoulliNB(),  
         }

In [106]:
# Re-split the data, this time against the simplified labels.
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and fall back to the old module on older installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x, simplified_y)

In [None]:
# Fit each candidate in turn and print its per-class report under its name.
for name, classifier in models.items():
    print(name + ':\n' + str(precision_recall_by_class(classifier)))

KNN is an interesting (and intuitive) solution to this problem. Let's see how the algorithm performs for varying K.

In [None]:
def accuracy_knn(k):
    """
    Return the test-split accuracy of a K-Nearest Neighbors classifier that
    uses `k` neighbors and is fitted on the current training split.

    Reads the module-level X_train, Y_train, X_test and Y_test.
    """
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train)
    predicted = knn.predict(X_test)
    return accuracy_score(Y_test, predicted)

# Score K = 1..9 and plot accuracy against K on a titled 10 x 7 figure.
k = list(range(1, 10))
acc_knn = list(map(accuracy_knn, k))
plt.figure(figsize=(10, 7))
plt.suptitle("Accuracy Score vs. K in KNN for OCR", fontsize='15')
plt.plot(k, acc_knn, label='K-Nearest Neighbors')

This is the optimal model (psych, it's not, but I'm going to work on that).

In [122]:
# Fit the K-Nearest Neighbors classifier (5 neighbors) on the current
# training split; `model` is what the submission cell predicts with.
model = KNeighborsClassifier(n_neighbors=5).fit(X_train, Y_train)

In [130]:
def get_test_img(i):
    """
    Load one test image and return it as a binarized 50x50 array.

    The file ``<i + 6283>.Bmp`` is read from the hard-coded test directory,
    converted to single-channel grayscale (PIL mode "L"), resized to 50x50
    and thresholded with Otsu's method.

    Parameters
    ----------
    i : int
        Offset of the image; the file opened is numbered ``i + 6283``.

    Returns
    -------
    numpy.ndarray
        Boolean array that is True where a pixel is brighter than the Otsu
        threshold.
    """
    img = Image.open('/users/derekjanni/pyocr/test/' + str(i + 6283) + '.Bmp')
    img = img.convert("L")
    img = img.resize((50, 50))
    # np.array() copies the pixels into a fresh array that owns its memory
    # and is therefore writable. The previous np.asarray() +
    # setflags(write=True) pair raises ValueError on current NumPy because
    # the array only borrows PIL's read-only buffer.
    image = np.array(img)
    thresh = threshold_otsu(image)
    return image > thresh


In [None]:
# Predict a label for every test image and write one "<id>,<label>" row per
# image to the submission file.
# NOTE(review): range(6284, 12503) stops at id 12502 -- confirm whether 12503
# should be included, and whether the file needs a header row.
with open('submission73115.csv', 'w') as outfile:
    for i in range(6284, 12503):
        # get_test_img() adds 6283 to its argument to build the file name,
        # so pass the offset rather than the file id itself.
        pixels = get_test_img(i - 6283).ravel().astype(float)
        # predict() expects a 2-D (n_samples, n_features) array; reshape the
        # single flattened image into one row.
        pre = model.predict(pixels.reshape(1, -1))
        outfile.write('%d,%s\n' % (i, pre[0]))