In [1]:
import numpy as np
import csv
import math
from tqdm import tqdm
import random
import copy
import os
import pickle
import sys

In [2]:
import PIL
from PIL import ImageFont
from PIL import Image
from PIL import ImageDraw

## Sample Images

In [4]:
entries = os.listdir('Fonts/')
fonts = []
for e in entries:
    if '.otf' in e or '.ttf' in e:
        fonts.append(e)
        
        
img = Image.new('RGB', (150, 30), color = (235, 235, 235))

fnt = ImageFont.truetype(f'Fonts/{np.random.choice(fonts)}', 17)
d = ImageDraw.Draw(img)
w, h = d.textsize("qwqwertreftgfsu", font=fnt)
d.text(((150-w)/2,(30-h)/2), "qwqwertreftgfsu", font=fnt, fill="black")
 
img.save('text4.png')


### Creating Datasets

#### words

In [13]:
words = []
prob = []
sum_prob = 0
with open("Datasets/unigram_freq.csv", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        if len(row[0]) > 20: 
            continue
        words.append([row[0], 'word'])
        prob.append(int(row[1]))
        sum_prob += int(row[1])
        
for i in range(len(prob)):
    prob[i] /= sum_prob
print(words[:20])

[['the', 'word'], ['of', 'word'], ['and', 'word'], ['to', 'word'], ['a', 'word'], ['in', 'word'], ['for', 'word'], ['is', 'word'], ['on', 'word'], ['that', 'word'], ['by', 'word'], ['this', 'word'], ['with', 'word'], ['i', 'word'], ['you', 'word'], ['it', 'word'], ['not', 'word'], ['or', 'word'], ['be', 'word'], ['are', 'word']]


In [14]:
size = len(words)
print(size)

332873


### API - Extract Meaningful words

In [19]:
import requests


In [24]:
URL = 'https://api.dictionaryapi.dev/api/v2/entries/en/'

In [30]:
meaningful_words = []
res = []
for w, t in tqdm(words):
    # sending get request and saving the response as response object
    new_URL = URL + w
    r = requests.get(url = new_URL)

    # extracting data in json format
    data = r.json()
    res.append([w, data])


In [29]:
print(res)

[['the', [{'word': 'the', 'phonetic': 'ðə', 'phonetics': [{'text': 'ðə', 'audio': '//ssl.gstatic.com/dictionary/static/sounds/20200429/the--_gb_1.mp3'}, {'text': 'ðɪ'}, {'text': 'ðiː'}], 'origin': 'Old English se, sēo, thæt, ultimately superseded by forms from Northumbrian and North Mercian thē, of Germanic origin; related to Dutch de, dat, and German der, die, das .', 'meanings': [{'partOfSpeech': 'determiner', 'definitions': [{'definition': 'denoting one or more people or things already mentioned or assumed to be common knowledge.', 'example': "what's the matter?", 'synonyms': [], 'antonyms': []}, {'definition': 'used to point forward to a following qualifying or defining clause or phrase.', 'example': 'the fuss that he made of her', 'synonyms': [], 'antonyms': []}, {'definition': 'used to make a generalized reference to something rather than identifying a particular instance.', 'example': 'he taught himself to play the violin', 'synonyms': [], 'antonyms': []}, {'definition': 'enough

### DIC- Extract Meaningful words

In [79]:
dictionary = []
with open("Datasets/engmix.txt", 'r', encoding='unicode_escape') as file:
    for line in file:
        w = line.strip()
        dictionary.append(w)

In [80]:
print(dictionary)



In [81]:
meaningful_words = []

for w, t in tqdm(words):
    if w in dictionary:
        meaningful_words.append([w, t])

100%|█████████████████████████████████| 332873/332873 [04:45<00:00, 1167.72it/s]


###  elexicon - Extract Meaningful words

In [77]:
elexicon = []
with open("Datasets/unigram_freq.csv", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        elexicon.append(row[0])

print(len(elexicon))
print(elexicon[:10])

333333
['the', 'of', 'and', 'to', 'a', 'in', 'for', 'is', 'on', 'that']


In [78]:
meaningful_words = []

for w, t in tqdm(words):
    if w in elexicon:
        meaningful_words.append([w, t])

100%|██████████████████████████████████| 332873/332873 [12:59<00:00, 427.19it/s]


In [58]:
print(len(meaningful_words))

64949


## Create New Dataset for Meaningful words with Frequency

In [86]:
freq = {}
with open("Datasets/unigram_freq.csv", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        if len(row[0]) > 20: 
            continue
        freq[row[0]] = row[1]

In [87]:
print(freq['car'])

264720374


In [88]:
with open("Datasets/meaningful_words_with_frq.csv", 'w', newline='') as file:
    writer = csv.writer(file)
    for k in tqdm(range(len(meaningful_words))):
        writer.writerow([meaningful_words[k][0], freq[meaningful_words[k][0]]])

100%|█████████████████████████████████| 64949/64949 [00:00<00:00, 700886.99it/s]


## Read Dataset


In [93]:
words = []
prob = []
sum_prob = 0
with open("Datasets/meaningful_words_with_frq.csv", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        if len(row[0]) > 20: 
            continue
        words.append([row[0], 'word'])
        prob.append(int(row[1]))
        sum_prob += int(row[1])
        
for i in range(len(prob)):
    prob[i] /= sum_prob
print(words[:20])

[['the', 'word'], ['of', 'word'], ['and', 'word'], ['to', 'word'], ['a', 'word'], ['in', 'word'], ['for', 'word'], ['is', 'word'], ['on', 'word'], ['that', 'word'], ['by', 'word'], ['this', 'word'], ['with', 'word'], ['i', 'word'], ['you', 'word'], ['it', 'word'], ['not', 'word'], ['or', 'word'], ['be', 'word'], ['are', 'word']]


In [94]:
print(len(words))

64949


#### nonwordrs

In [95]:
non_words = []
with open("Datasets/nonword.csv", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        if len(row[0]) > 20: 
            continue
        non_words.append(row[0])
        if len(non_words) >= len(meaningful_words):
            break
print(non_words[:20])

['breighf', 'gnelcked', 'dwefed', 'threuve', 'gwoughth', 'proarnds', 'stryphts', 'sckirch', 'glaummed', 'muics', 'whirph', 'sploarm', 'phraubb', 'luizzed', 'scwowdged', 'ghink', 'squeebbed', 'psoapth', 'ghleuphth', 'kloffth']


In [96]:
print(len(non_words))

64949


In [97]:
word_terms = [w[0] for w in meaningful_words]
cnt = len(non_words)
final = len(meaningful_words)
pbar = tqdm(total = final-len(non_words))
while cnt <= final:
    new_word = str(np.random.choice(word_terms, size=1, p=prob, replace=False)[0])
    non_w = ''.join(random.sample(new_word,len(new_word)))
    if non_w not in word_terms and non_w not in non_words:
        non_words.append(non_w)
        cnt += 1
        pbar.update(1)


1it [00:33, 33.39s/it]


In [98]:
print(len(non_words))

64950


In [99]:
nonwords = []
for n in non_words:
    nonwords.append([n, 'nonword'])

In [100]:
print(nonwords[-1])

['ilelnaca', 'nonword']


In [101]:
string_dataset =copy.deepcopy(meaningful_words + nonwords) 

In [102]:
random.shuffle(string_dataset)

In [103]:
print(string_dataset[:20])

[['ghryed', 'nonword'], ['jesters', 'word'], ['torshavn', 'word'], ['failure', 'word'], ['toargue', 'nonword'], ['snekked', 'nonword'], ['prirced', 'nonword'], ['motherland', 'word'], ['kroarmb', 'nonword'], ['fogging', 'word'], ['brawlds', 'nonword'], ['quoarsh', 'nonword'], ['tenths', 'word'], ['pages', 'word'], ['loves', 'word'], ['denise', 'word'], ['phorg', 'nonword'], ['doinn', 'nonword'], ['tovvs', 'nonword'], ['jelps', 'nonword']]


In [104]:
with open("Datasets/word_nonword_dataset.csv", 'w', newline='') as file:
    writer = csv.writer(file)
    for k in tqdm(range(len(string_dataset))):
        writer.writerow(string_dataset[k])

100%|███████████████████████████████| 129899/129899 [00:00<00:00, 801643.05it/s]


## Reading Word-Nonword Dataset


In [105]:
string_dataset = []
with open("Datasets/word_nonword_dataset.csv", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        string_dataset.append([row[0], row[1]])


## Creating Image Dataset

In [106]:
max_len = 0
l = ''
for item in string_dataset:
    if len(item[0]) > max_len:
        max_len = len(item[0])
        l = item[0]
print(max_len)
print(l)

20
institutionalization


In [107]:
entries = os.listdir('Fonts/')
fonts = []
for e in entries:
    if '.otf' in e or '.ttf' in e:
        fonts.append(e)


In [108]:
image_dataset = []
for item in tqdm(string_dataset):
    img = Image.new('RGB', (150, 30), color = (240, 240, 240))
    font_size = int(np.random.uniform(12, 17))
    if len(item[0]) > 15:
        font_size = 12
    fnt = ImageFont.truetype(f'Fonts/{np.random.choice(fonts)}' , font_size)
    d = ImageDraw.Draw(img)
    w, h = d.textsize(item[0], font=fnt)
    d.text(((150-w)/2,(30-h)/2), item[0], font=fnt, fill="black")
    if item[1] == 'word':
        img.save(f'All_Images/word/{string_dataset.index(item)}_{item[0]}.png', "PNG")
    if item[1] == 'nonword':
        img.save(f'All_Images/nonword/{string_dataset.index(item)}_{item[0]}.png', "PNG")

100%|██████████████████████████████████| 129899/129899 [09:22<00:00, 230.77it/s]
