## Prerequisites to running the notebook



1.   Create a directory named 'ofl' and paste the font directories into it. The directory structure should be: **/content/ofl/'font_directories'/'(.ttf or .otf) font_files'**
2.   Alternatively, mount your Google Drive, store the .ttf or .otf files there using the same /ofl/'font_directories'/'(.ttf or .otf) font_files' structure, and replace the '/content/ofl' directory path with the path of the 'ofl' directory on your Google Drive throughout this notebook
3.   Please read all the cell titles and the comments in each cell carefully before executing



## Import necessary library modules

In [10]:
import os, sys, time, random, re, optparse, datetime, shutil
import re
import pandas as pd                 
from PIL import Image, ImageDraw, ImageFont, ImageOps
import string
from numpy import asarray
import numpy as np
import cv2
import matplotlib.pyplot as plt
import math
from scipy import ndimage
import glob
import csv
import json
import unicodedata

In [77]:
!pip install numpy_indexed
!pip install fontTools



In [78]:
import numpy_indexed as npi
from fontTools.ttLib import TTFont

## Loading a character list

**Run the following cell if the characters are stored in a text file**

In [None]:
# Path of the text file that holds one character per line.
# Replace 'insert text file name' with the real file name before running this cell.
character_file = '/content/' + 'insert text file name'
# 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark, so the
# first character does not silently become a two-code-point string.
with open(character_file, encoding='utf-8-sig') as f:
    characters = f.readlines()
print(characters)
# Lines read from a file keep their trailing newline (ex: 'A\n'); strip it, and
# drop lines that are empty afterwards because an empty string is not a character.
characters = [line.strip() for line in characters]
characters = [character for character in characters if character]
print(characters)
print('Total number of characters are: ',len(characters))

**Run the following cell if you want to directly paste the characters**

In [103]:
def split_into_characters(text):
  """Split `text` into display characters, keeping combining marks with their base.

  A code point whose Unicode general category starts with 'M' (Mn, Mc, Me --
  e.g. DEVANAGARI SIGN ANUSVARA or a vowel sign) is appended to the entry that
  precedes it. Every other code point starts a new entry, so symbols whose
  Unicode name merely contains the word SIGN ('#', '$', '%', '+', '<', '=', '>')
  stay separate characters.

  Parameters:
    text: string of characters to split.

  Returns:
    list of strings; each entry is one base character followed by any
    combining marks that came directly after it. A mark at the very start of
    `text` has no base and becomes its own entry.
  """
  grouped = []
  for code_point in text:
    if grouped and unicodedata.category(code_point).startswith('M'):
      # Attach to the most recent entry; indexing by position in `text` would
      # drift as soon as an earlier mark has been merged.
      grouped[-1] += code_point
    else:
      grouped.append(code_point)
  return grouped


unicode_text=u"Paste all characters here"
# Example: unicode_text=u"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"
# Raw code points, printed so the grouping below can be compared against them
characters_temp=list(unicode_text)
print(characters_temp)
characters=split_into_characters(unicode_text)
for new_char in characters:
  if len(new_char)>1:
    print('new_char is: ',new_char)

print('the final character list is: ',characters)
print('Total number of characters are: ',len(characters))

['अ', 'ं', 'B', '1', 'A', 'प', 'फ', 'ब', 'भ', 'य', 'र', 'व', 'ळ', 'श', 'ष', 'स', 'ह', '०', '१', '२', '३', '४', '५', '६', '७', '८', '९', 'ក', 'ខ', 'គ', 'ឃ', 'ង', 'ច', 'ឆ']
new_char is:  अं
the final character list is:  ['अं', 'B', '1', 'A', 'प', 'फ', 'ब', 'भ', 'य', 'र', 'व', 'ळ', 'श', 'ष', 'स', 'ह', '०', '१', '२', '३', '४', '५', '६', '७', '८', '९', 'ក', 'ខ', 'គ', 'ឃ', 'ង', 'ច', 'ឆ']
Total number of characters are:  33


## Important functions to generate a character image on an empty canvas and modify the image to an MNIST-style image 

In [6]:
# FUNCTION TO DISPLAY IMAGE ON BLANK CANVAS
def display_image(unicode_text_1,text_width,text_height):
  """Draw `unicode_text_1` in black on a new white grayscale canvas.

  The canvas is 10 pixels wider and taller than the given text size and the
  text is drawn at offset (5, 5). The typeface is not a parameter: the
  function reads the module-level variable `font`, which must be set before
  the call.

  Parameters:
    unicode_text_1: text to draw.
    text_width: width of the text in pixels.
    text_height: height of the text in pixels.

  Returns:
    the PIL image (mode 'L') holding the drawn text.
  """
  canvas_size = (text_width+10, text_height+10)
  canvas = Image.new('L', canvas_size, "white")
  # positional arguments after the text are the fill colour and the font
  ImageDraw.Draw(canvas).text((5,5), unicode_text_1, 'black', font)
  return canvas

In [7]:
# FUNCTION TO MAKE MNIST-STYLE IMAGE
def preprocess(canvas):
  """Convert a rendered glyph image into a 28x28 MNIST-style array.

  Steps: convert to grayscale and invert (light glyph on dark background),
  crop to the glyph's bounding box, scale so the longer side is 20 pixels
  while keeping the aspect ratio, zero-pad to 28x28, then translate so the
  intensity centre of mass lies at the image centre.

  Parameters:
    canvas: PIL image containing the drawn glyph on a light background.

  Returns:
    2-D uint8 numpy array of shape (28, 28). A canvas on which nothing was
    drawn yields an all-zero array.
  """
  # convert image to grayscale and invert the pixels so the background is 0
  inverted = ImageOps.invert(canvas.convert('L'))
  # getbbox() is None when every pixel is 0, i.e. nothing was drawn
  bbox = inverted.getbbox()
  if bbox is None:
    return np.zeros((28, 28), dtype=np.uint8)
  glyph = np.asarray(inverted.crop(bbox))
  # numpy shape is (rows, cols) = (height, width)
  rows, cols = glyph.shape
  # Resize so the longer side is 20 px without losing aspect ratio; the shorter
  # side is kept at least 1 px because cv2.resize rejects an empty target size
  if rows > cols:
    factor = 20.0/rows
    new_rows = 20
    new_cols = max(1, int(round(cols*factor)))
  else:
    factor = 20.0/cols
    new_cols = 20
    new_rows = max(1, int(round(rows*factor)))
  # cv2.resize takes the target size as (width, height)
  gray = cv2.resize(glyph, (new_cols, new_rows))
  # Add zero padding to make the image 28x28; any odd pixel goes to the top/left
  pad_rows = 28-new_rows
  pad_cols = 28-new_cols
  gray = np.pad(
      gray,
      (((pad_rows+1)//2, pad_rows//2), ((pad_cols+1)//2, pad_cols//2)),
      mode='constant',
  )
  # Down-scaling can wipe out a very faint glyph; the centre of mass of an
  # all-zero image is NaN, so return the blank image as it is
  if not gray.any():
    return gray
  # Calculate center of mass
  cy,cx = ndimage.center_of_mass(gray)
  # use affine transform to translate the image so that the center of mass is at 14,14
  rows, cols = gray.shape
  shiftx = int(np.round(cols/2.0-cx))
  shifty = int(np.round(rows/2.0-cy))
  M = np.float32([[1,0,shiftx],[0,1,shifty]])
  shifted = cv2.warpAffine(gray,M,(cols,rows))
  return shifted

In [81]:
## Function to check if a font file can create the image of a character
def char_in_font(unicode_char, font):
    """Report whether `font` maps every code point of `unicode_char` to a glyph.

    Example:
        font = TTFont('/content/ofl/abhayalibre/AbhayaLibre-Bold.ttf')
        char_in_font('अ', font)

    Parameters:
        unicode_char: a single character, or a short string such as a base
            letter followed by combining marks.
        font: object whose font['cmap'].tables entries provide isUnicode()
            and a `cmap` mapping keyed by integer code point (for example a
            fontTools TTFont).

    Returns:
        True when each code point of `unicode_char` is a key of at least one
        Unicode cmap table; False otherwise, including for an empty string.
    """
    if not unicode_char:
        return False
    unicode_maps = [table.cmap for table in font['cmap'].tables if table.isUnicode()]
    return all(
        any(ord(code_point) in mapping for mapping in unicode_maps)
        for code_point in unicode_char
    )

## Final block of cells: generate the images and write a CSV file with the following columns


1.   'font_name'
2.   'character_name'
3.   'label'
4.   784 grayscale values for each image

**Only images for characters included in the map table for the .ttf font file will be created**



In [104]:
# Root folder that holds one sub-directory per font family
# (change the path to wherever you've stored the 'ofl' directory if necessary)
font_dir = os.path.abspath('/content/ofl')
print(font_dir)
# Names of all entries found in the root folder, in alphabetical order.
# The name `dir` is read by the image-generation cell further down.
dir = os.listdir(font_dir)
dir.sort()
print(np.shape(dir))

/content/ofl
(2,)


In [105]:
def describe_character(text):
  """Return the Unicode name of every code point in `text`, joined by ' + '.

  A code point that has no Unicode name (for example a control character) is
  described by its 'U+XXXX' code instead of raising ValueError.
  """
  return ' + '.join(
      unicodedata.name(code_point, 'U+%04X' % ord(code_point)) for code_point in text
  )


# store an array of unicodedata names of characters (same order as `characters`)
Character_name_array=[]
for character in characters:
  # Remove stray surrounding whitespace/newlines, but keep a character that is
  # itself whitespace: stripping it would leave an empty string with no name.
  ltr = character.strip() or character
  Character_name_array.append(describe_character(ltr))
print('length of Character_name_array is ',len(Character_name_array))
print('Character names are ', Character_name_array)
print('The characters are :', characters)

length of Character_name_array is  33
Character names are  ['DEVANAGARI LETTER A + DEVANAGARI SIGN ANUSVARA', 'LATIN CAPITAL LETTER B', 'DIGIT ONE', 'LATIN CAPITAL LETTER A', 'DEVANAGARI LETTER PA', 'DEVANAGARI LETTER PHA', 'DEVANAGARI LETTER BA', 'DEVANAGARI LETTER BHA', 'DEVANAGARI LETTER YA', 'DEVANAGARI LETTER RA', 'DEVANAGARI LETTER VA', 'DEVANAGARI LETTER LLA', 'DEVANAGARI LETTER SHA', 'DEVANAGARI LETTER SSA', 'DEVANAGARI LETTER SA', 'DEVANAGARI LETTER HA', 'DEVANAGARI DIGIT ZERO', 'DEVANAGARI DIGIT ONE', 'DEVANAGARI DIGIT TWO', 'DEVANAGARI DIGIT THREE', 'DEVANAGARI DIGIT FOUR', 'DEVANAGARI DIGIT FIVE', 'DEVANAGARI DIGIT SIX', 'DEVANAGARI DIGIT SEVEN', 'DEVANAGARI DIGIT EIGHT', 'DEVANAGARI DIGIT NINE', 'KHMER LETTER KA', 'KHMER LETTER KHA', 'KHMER LETTER KO', 'KHMER LETTER KHO', 'KHMER LETTER NGO', 'KHMER LETTER CA', 'KHMER LETTER CHA']
The characters are : ['अं', 'B', '1', 'A', 'प', 'फ', 'ब', 'भ', 'य', 'र', 'व', 'ळ', 'श', 'ष', 'स', 'ह', '०', '१', '२', '३', '४', '५', '६', '७', '८

In [106]:
# Create a directory to store all images created.
# exist_ok=True keeps this cell re-runnable when the directory is already there.
os.makedirs('/content/TMNIST_Images', exist_ok=True)

In [107]:
# Locations used by this cell. Change them here if the notebook does not run under /content.
OFL_ROOT = '/content/ofl'                  # folder holding one sub-directory per font family
IMAGES_ROOT = '/content/TMNIST_Images'     # generated PNG files go below this folder
CSV_PATH = '/content/TMNIST_data.csv'      # one row per generated image


def image_file_name(text):
  """Return the PNG file name used for the character `text`.

  The name is the character itself plus '.png'. A character containing '/'
  or NUL cannot be part of a file name, so it is spelled as its code points
  instead (for example 'U+002F.png').
  """
  if '/' in text or '\0' in text:
    return '_'.join('U+%04X' % ord(code_point) for code_point in text) + '.png'
  return text + '.png'


def measure_text(image_font, text):
  """Return the (width, height) in pixels needed to draw `text` from the origin.

  Uses getbbox() when the font object offers it, because getsize() no longer
  exists in recent Pillow releases; the right and bottom edges of the bounding
  box are the values getsize() used to report. Falls back to getsize() on
  older releases.
  """
  if hasattr(image_font, 'getbbox'):
    _, _, right, bottom = image_font.getbbox(text)
    return int(math.ceil(right)), int(math.ceil(bottom))
  return image_font.getsize(text)


# One row per generated image: [font_name, character_name, label, 784 pixel values]
characters_image_array=[]
ttf_count=0

# `dir` is the sorted listing of the font root built earlier in the notebook
for b in dir:
  dir_1 = os.path.join(OFL_ROOT, str(b))
  # Skip hidden entries such as .ipynb_checkpoints and anything that is not a folder
  if str(b).startswith('.') or not os.path.isdir(dir_1):
    continue
  print(b)
  # Create a new directory in TMNIST_Images folder to store images for individual font files
  os.makedirs(os.path.join(IMAGES_ROOT, str(b)), exist_ok=True)
  # 2nd For loop to iterate over all .ttf or .otf files (sorted for a reproducible row order)
  for file in sorted(os.listdir(dir_1)):
    (root, ext) = os.path.splitext(file)
    if ext.lower() not in ('.otf', '.ttf'):
      continue
    ttf_count = ttf_count+1
    print('font_style name is: ',file)
    font_path = os.path.join(dir_1, file)
    # Create a new directory in TMNIST_Images/'font_file'/ to store images of individual font_styles
    style_dir = os.path.join(IMAGES_ROOT, str(b), file)
    os.makedirs(style_dir, exist_ok=True)
    # Parse the font once per file instead of once per character
    cmap_font = TTFont(font_path)
    # display_image() reads the module-level name `font`, so it has to be assigned here
    font = ImageFont.truetype(font_path, 28, encoding="unic")
    try:
      for index, label in enumerate(characters):
        label = str(label)
        # Only characters whose code points are all present in the font's
        # map table get an image; checking code points one at a time keeps
        # this independent of how char_in_font treats longer strings
        if not label or not all(char_in_font(code_point, cmap_font) for code_point in label):
          continue
        text_width, text_height = measure_text(font, label)
        canvas = display_image(label, text_width, text_height)
        shifted = preprocess(canvas)
        # SAVE SHIFTED IMAGES IN A font_style DIRECTORY (full path, no os.chdir needed)
        Image.fromarray(shifted).save(os.path.join(style_dir, image_file_name(label)))
        characters_image_array.append(
            [root, Character_name_array[index], label] + shifted.flatten().tolist()
        )
    finally:
      cmap_font.close()

# save all rows as a csv file
header_list = ['font_name','character_name','label'] + list(range(1,785))
df = pd.DataFrame(characters_image_array, columns=header_list)
df.to_csv(CSV_PATH, index=False)




.ipynb_checkpoints
abhayalibre
font_style name is:  AbhayaLibre-Bold.ttf
font_style name is:  AbhayaLibre-ExtraBold.ttf
