In [1]:
import os
import cv2 as cv
import numpy as np
from tqdm import tqdm
import gc
import matplotlib.pyplot as plt
import json

# Functions

In [28]:
def rgb2gray(rgb):
    return np.dot(rgb[...,:3], [0.299, 0.587, 0.114])


def convert_image_batch_to_array(batch, folder_path):
        
    batch_image_matrix_list = []  # Image matrix array
    batch_image_label_list  = []  # Image label array

    # For each image of the current batch
    for image in tqdm(batch, desc='Batch'):
        
        # Get the current image label thanks to his name
        image_label = image.split("_")[0]

        # Create the complete path of the current image
        image_path = os.path.join(folder_path, image)

        # Image process
        image_charged  = cv.imread(image_path) 
        image_resized  = cv.resize(image_charged, (128,128))
        B, G, R        = cv.split(image_resized)
        image_matrix   = np.stack([R, G, B], axis=-1)
            
        # Add the current image matrix and label to their array
        batch_image_matrix_list.append(image_matrix)
        batch_image_label_list.append(image_label)

        # Free memory
        del image_charged
        del image_resized
        

    # Return both matrix and label array
    yield np.array(batch_image_matrix_list)
    yield np.array(batch_image_label_list)

    del batch_image_matrix_list
    del batch_image_label_list
    gc.collect


# Define the quantization function
def convert_to_8_shades(image):
    thresholds = [18.23, 54.70, 91.16, 127.63, 164.09, 200.56, 237.02]
    levels = [0, 36, 72, 109, 145, 182, 218, 255]

    # Vectorized approach to quantize grayscale image
    quantized_image = np.digitize(image, thresholds, right=True)
    
    # Map the quantized values to the corresponding levels
    quantized_image = np.array([levels[i] for i in quantized_image.ravel()]).reshape(image.shape)
    
    # Convert the result back to np.uint8 to reduce the file size
    return quantized_image.astype(np.uint8)

# Variables

In [31]:
imagenet_folder_path = 'imagenet_images'
batch_size = 20000

# Create batches
folder = os.listdir(imagenet_folder_path)
num_file = len(folder)
batches = np.array_split(folder, len(folder) // batch_size + 1)

# Load images and convert them into array

In [4]:
cur_batch_num = 1
# For each batches 
for batch in batches:
    
    # Matrix and label arrays
    image_matrix_list = []  
    image_label_list = []  

    # Get all matrix and label from this batch
    image, label = convert_image_batch_to_array(batch, imagenet_folder_path)

    # Add to the final array
    image_matrix_list.extend(image)
    image_label_list.extend(label)

    # Save array into .npy files 
    np.save(f'RGB_Image_Matrix/ImageNet_Matrix_{cur_batch_num}.npy', image_matrix_list)
    np.save(f'Image_Labels/ImageNet_Label_{cur_batch_num}.npy', image_label_list)

    # Free memory
    del image
    del label
    del image_matrix_list
    del image_label_list
    gc.collect

    cur_batch_num += 1

Batch: 100%|██████████| 19711/19711 [00:30<00:00, 645.78it/s]
Batch: 100%|██████████| 19711/19711 [00:29<00:00, 664.38it/s]
Batch: 100%|██████████| 19711/19711 [00:31<00:00, 633.78it/s]
Batch: 100%|██████████| 19711/19711 [00:30<00:00, 656.93it/s]
Batch: 100%|██████████| 19711/19711 [00:29<00:00, 669.65it/s]
Batch: 100%|██████████| 19711/19711 [00:29<00:00, 674.30it/s]
Batch: 100%|██████████| 19711/19711 [00:29<00:00, 662.96it/s]
Batch: 100%|██████████| 19711/19711 [00:28<00:00, 687.42it/s]
Batch: 100%|██████████| 19711/19711 [00:28<00:00, 698.56it/s]
Batch: 100%|██████████| 19711/19711 [00:29<00:00, 664.50it/s]
Batch: 100%|██████████| 19711/19711 [00:29<00:00, 661.43it/s]
Batch: 100%|██████████| 19711/19711 [00:29<00:00, 658.54it/s]
Batch: 100%|██████████| 19711/19711 [00:29<00:00, 664.56it/s]
Batch: 100%|██████████| 19711/19711 [00:29<00:00, 675.91it/s]
Batch: 100%|██████████| 19711/19711 [00:29<00:00, 672.19it/s]
Batch: 100%|██████████| 19711/19711 [00:29<00:00, 677.57it/s]
Batch: 1

# Convert to 8 Shades gray scale

In [5]:
# For each batch in the 'RGB_Image_Matrix_Label' folder
for batch in tqdm(range(1,len(batches)+1,1), desc='batch'):

    # Load the current batch
    x_train = np.load(f'RGB_Image_Matrix/ImageNet_Matrix_{batch}.npy')
    # Convert all rgb images into gray scale
    x_train_gray = rgb2gray(x_train)
    # Convert all gray scale images into 8 shades
    x_train_gray_8s = convert_to_8_shades(x_train_gray)

    # Save in a new file
    np.save(f'8Shades_Gray_Image_Matrix/ImageNet_Gray8ShadesMatrix_{batch}.npy', x_train_gray_8s)

    #Free memory
    del x_train
    del x_train_gray
    del x_train_gray_8s
    gc.collect

batch: 100%|██████████| 65/65 [32:09<00:00, 29.69s/it]


# Convert to tuples

In [6]:
# For each batch
for batch_num in range(1, len(batches)+1, 1):

    # Load the current batch
    gray_8_shades = np.load(f'8Shades_Gray_Image_Matrix/ImageNet_Gray8ShadesMatrix_{batch_num}.npy')

    char_lst = []  

    for outer_list in tqdm(gray_8_shades, desc = 'batch'):
        for i in range(0, len(outer_list), 2):
            for j in range(128):
                char_lst.append((int(outer_list[i][j]), int(outer_list[i+1][j])))

    with open(f'Image_Tuples/ImageNet_Tuples_{batch_num}.json', 'w') as file:
        json.dump(char_lst, file)
    
    # Free memory
    del char_lst
    del gray_8_shades
    gc.collect

batch: 100%|██████████| 19711/19711 [00:55<00:00, 357.76it/s]
batch: 100%|██████████| 19711/19711 [00:55<00:00, 355.82it/s]
batch: 100%|██████████| 19711/19711 [00:54<00:00, 359.73it/s]
batch: 100%|██████████| 19711/19711 [00:54<00:00, 363.75it/s]
batch: 100%|██████████| 19711/19711 [00:54<00:00, 359.38it/s]
batch: 100%|██████████| 19711/19711 [00:56<00:00, 348.24it/s]
batch: 100%|██████████| 19711/19711 [00:54<00:00, 359.74it/s]
batch: 100%|██████████| 19711/19711 [00:54<00:00, 361.91it/s]
batch: 100%|██████████| 19711/19711 [00:55<00:00, 356.92it/s]
batch: 100%|██████████| 19711/19711 [00:54<00:00, 358.63it/s]
batch: 100%|██████████| 19711/19711 [00:56<00:00, 351.18it/s]
batch: 100%|██████████| 19711/19711 [00:55<00:00, 357.47it/s]
batch: 100%|██████████| 19711/19711 [00:55<00:00, 357.61it/s]
batch: 100%|██████████| 19711/19711 [00:55<00:00, 356.85it/s]
batch: 100%|██████████| 19711/19711 [00:54<00:00, 362.31it/s]
batch: 100%|██████████| 19711/19711 [00:55<00:00, 352.26it/s]
batch: 1

# Encoding to Alphabets

## Tuple to alphabet

In [7]:
tuple_to_alphabet = {(0, 36): 'A',
 (145, 36): 'B',
 (218, 72): 'C',
 (182, 0): 'D',
 (0, 109): 'E',
 (72, 36): 'F',
 (218, 145): 'G',
 (0, 182): 'H',
 (145, 109): 'I',
 (255, 36): 'J',
 (72, 109): 'K',
 (0, 72): 'L',
 (145, 182): 'M',
 (145, 72): 'N',
 (182, 36): 'O',
 (255, 109): 'P',
 (145, 255): 'Q',
 (0, 145): 'R',
 (72, 182): 'S',
 (109, 0): 'T',
 (182, 109): 'U',
 (72, 255): 'V',
 (0, 218): 'W',
 (36, 0): 'X',
 (255, 182): 'Y',
 (72, 145): 'Z',
 (255, 72): 'a',
 (145, 218): 'b',
 (255, 145): 'c',
 (182, 72): 'd',
 (72, 218): 'e',
 (182, 255): 'f',
 (109, 36): 'g',
 (255, 218): 'h',
 (182, 218): 'i',
 (218, 0): 'j',
 (36, 109): 'k',
 (109, 182): 'l',
 (109, 72): 'm',
 (109, 255): 'n',
 (36, 182): 'o',
 (36, 255): 'p',
 (109, 145): 'q',
 (36, 72): 'r',
 (218, 36): 's',
 (36, 145): 't',
 (109, 218): 'u',
 (145, 0): 'v',
 (218, 109): 'w',
 (36, 218): 'x',
 (72, 0): 'y',
 (218, 182): 'z',
 (0, 0): '0',
 (36, 36): '1',
 (72, 72): '2',
 (109, 109): '3',
 (145, 145): '4',
 (182, 182): '5',
 (218, 218): '6',
 (255, 255): '7',
 (218, 255): '8',
 (182, 145): '9',
 (0, 255): '@',
 (255, 0): '!'}

In [8]:
# For each batch
for batch_num in range(1, len(batches)+1, 1):

    # Load the json file
    with open(f'Image_Tuples/ImageNet_Tuples_{batch_num}.json', 'r') as file:
        char_lst = json.load(file)

    print(f'file_{batch_num} loaded')

    # Convert to a list of tuples
    char_lst = [tuple(item) for item in tqdm(char_lst, desc=f'ToTuple_{batch_num}')]

    # Convert the list of tuples to an alphabet list 
    for key, value in tqdm(tuple_to_alphabet.items(), desc=f'ToAlphabet_{batch_num}'):
      for i in range(len(char_lst)):
        if char_lst[i] == key:
          char_lst[i] = value

    # Join all elements in the list into a single string
    result_string_test = ''.join(char_lst)

    # Write the string to a text file
    with open(f'Image_Strings/ImageNet_Strings_{batch_num}.txt', 'w') as file:
        file.write(result_string_test)

    del char_lst, result_string_test
    gc.collect
    

file_1 loaded


ToTuple_1: 100%|██████████| 161472512/161472512 [00:21<00:00, 7421061.25it/s]
ToAlphabet_1: 100%|██████████| 64/64 [07:50<00:00,  7.35s/it]


file_2 loaded


ToTuple_2: 100%|██████████| 161472512/161472512 [00:21<00:00, 7406231.67it/s]
ToAlphabet_2: 100%|██████████| 64/64 [07:46<00:00,  7.29s/it]


file_3 loaded


ToTuple_3: 100%|██████████| 161472512/161472512 [00:21<00:00, 7648382.30it/s]
ToAlphabet_3: 100%|██████████| 64/64 [08:04<00:00,  7.56s/it]


file_4 loaded


ToTuple_4: 100%|██████████| 161472512/161472512 [00:21<00:00, 7493297.73it/s]
ToAlphabet_4: 100%|██████████| 64/64 [07:49<00:00,  7.33s/it]


file_5 loaded


ToTuple_5: 100%|██████████| 161472512/161472512 [00:22<00:00, 7252240.51it/s]
ToAlphabet_5: 100%|██████████| 64/64 [07:56<00:00,  7.44s/it]


file_6 loaded


ToTuple_6: 100%|██████████| 161472512/161472512 [00:21<00:00, 7540418.83it/s]
ToAlphabet_6: 100%|██████████| 64/64 [07:48<00:00,  7.33s/it]


file_7 loaded


ToTuple_7: 100%|██████████| 161472512/161472512 [00:21<00:00, 7528231.31it/s]
ToAlphabet_7: 100%|██████████| 64/64 [07:46<00:00,  7.29s/it]


file_8 loaded


ToTuple_8: 100%|██████████| 161472512/161472512 [00:21<00:00, 7547175.14it/s]
ToAlphabet_8: 100%|██████████| 64/64 [07:47<00:00,  7.31s/it]


file_9 loaded


ToTuple_9: 100%|██████████| 161472512/161472512 [00:21<00:00, 7539755.83it/s]
ToAlphabet_9: 100%|██████████| 64/64 [07:52<00:00,  7.38s/it]


file_10 loaded


ToTuple_10: 100%|██████████| 161472512/161472512 [00:21<00:00, 7543848.24it/s]
ToAlphabet_10: 100%|██████████| 64/64 [07:53<00:00,  7.40s/it]


file_11 loaded


ToTuple_11: 100%|██████████| 161472512/161472512 [00:21<00:00, 7550561.04it/s]
ToAlphabet_11: 100%|██████████| 64/64 [07:42<00:00,  7.23s/it]


file_12 loaded


ToTuple_12: 100%|██████████| 161472512/161472512 [00:21<00:00, 7613390.90it/s]
ToAlphabet_12: 100%|██████████| 64/64 [08:05<00:00,  7.59s/it]


file_13 loaded


ToTuple_13: 100%|██████████| 161472512/161472512 [00:21<00:00, 7567473.63it/s]
ToAlphabet_13: 100%|██████████| 64/64 [07:53<00:00,  7.39s/it]


file_14 loaded


ToTuple_14: 100%|██████████| 161472512/161472512 [00:20<00:00, 7702937.75it/s]
ToAlphabet_14: 100%|██████████| 64/64 [07:49<00:00,  7.33s/it]


file_15 loaded


ToTuple_15: 100%|██████████| 161472512/161472512 [00:20<00:00, 7762587.71it/s]
ToAlphabet_15: 100%|██████████| 64/64 [07:55<00:00,  7.43s/it]


file_16 loaded


ToTuple_16: 100%|██████████| 161472512/161472512 [00:21<00:00, 7620303.80it/s]
ToAlphabet_16: 100%|██████████| 64/64 [07:54<00:00,  7.41s/it]


file_17 loaded


ToTuple_17: 100%|██████████| 161472512/161472512 [00:21<00:00, 7563164.51it/s]
ToAlphabet_17: 100%|██████████| 64/64 [07:52<00:00,  7.38s/it]


file_18 loaded


ToTuple_18: 100%|██████████| 161464320/161464320 [00:21<00:00, 7496591.19it/s]
ToAlphabet_18: 100%|██████████| 64/64 [08:02<00:00,  7.54s/it]


file_19 loaded


ToTuple_19: 100%|██████████| 161464320/161464320 [00:21<00:00, 7376307.95it/s]
ToAlphabet_19: 100%|██████████| 64/64 [07:57<00:00,  7.46s/it]


file_20 loaded


ToTuple_20: 100%|██████████| 161464320/161464320 [00:27<00:00, 5899871.80it/s]
ToAlphabet_20: 100%|██████████| 64/64 [07:47<00:00,  7.31s/it]


file_21 loaded


ToTuple_21: 100%|██████████| 161464320/161464320 [00:21<00:00, 7532440.01it/s]
ToAlphabet_21: 100%|██████████| 64/64 [07:51<00:00,  7.37s/it]


file_22 loaded


ToTuple_22: 100%|██████████| 161464320/161464320 [00:21<00:00, 7548059.74it/s]
ToAlphabet_22: 100%|██████████| 64/64 [07:56<00:00,  7.45s/it]


file_23 loaded


ToTuple_23: 100%|██████████| 161464320/161464320 [00:21<00:00, 7401807.43it/s]
ToAlphabet_23: 100%|██████████| 64/64 [07:49<00:00,  7.34s/it]


file_24 loaded


ToTuple_24: 100%|██████████| 161464320/161464320 [00:21<00:00, 7520013.26it/s]
ToAlphabet_24: 100%|██████████| 64/64 [07:56<00:00,  7.45s/it]


file_25 loaded


ToTuple_25: 100%|██████████| 161464320/161464320 [00:21<00:00, 7537434.37it/s]
ToAlphabet_25: 100%|██████████| 64/64 [07:42<00:00,  7.23s/it]


file_26 loaded


ToTuple_26: 100%|██████████| 161464320/161464320 [00:27<00:00, 5850247.86it/s]
ToAlphabet_26: 100%|██████████| 64/64 [07:48<00:00,  7.33s/it]


file_27 loaded


ToTuple_27: 100%|██████████| 161464320/161464320 [00:20<00:00, 7719285.01it/s]
ToAlphabet_27: 100%|██████████| 64/64 [07:46<00:00,  7.30s/it]


file_28 loaded


ToTuple_28: 100%|██████████| 161464320/161464320 [00:22<00:00, 7188297.99it/s]
ToAlphabet_28: 100%|██████████| 64/64 [07:49<00:00,  7.34s/it]


file_29 loaded


ToTuple_29: 100%|██████████| 161464320/161464320 [00:20<00:00, 7928496.73it/s]
ToAlphabet_29: 100%|██████████| 64/64 [07:43<00:00,  7.24s/it]


file_30 loaded


ToTuple_30: 100%|██████████| 161464320/161464320 [00:20<00:00, 7791647.67it/s]
ToAlphabet_30: 100%|██████████| 64/64 [07:44<00:00,  7.25s/it]


file_31 loaded


ToTuple_31: 100%|██████████| 161464320/161464320 [00:21<00:00, 7664631.05it/s]
ToAlphabet_31: 100%|██████████| 64/64 [07:58<00:00,  7.48s/it]


file_32 loaded


ToTuple_32: 100%|██████████| 161464320/161464320 [00:20<00:00, 7701386.97it/s]
ToAlphabet_32: 100%|██████████| 64/64 [08:05<00:00,  7.58s/it]


file_33 loaded


ToTuple_33: 100%|██████████| 161464320/161464320 [00:21<00:00, 7542881.93it/s]
ToAlphabet_33: 100%|██████████| 64/64 [07:52<00:00,  7.38s/it]


file_34 loaded


ToTuple_34: 100%|██████████| 161464320/161464320 [00:20<00:00, 7762841.72it/s]
ToAlphabet_34: 100%|██████████| 64/64 [07:51<00:00,  7.37s/it]


file_35 loaded


ToTuple_35: 100%|██████████| 161464320/161464320 [00:20<00:00, 7734571.70it/s]
ToAlphabet_35: 100%|██████████| 64/64 [08:04<00:00,  7.58s/it]


file_36 loaded


ToTuple_36: 100%|██████████| 161464320/161464320 [00:21<00:00, 7560276.61it/s]
ToAlphabet_36: 100%|██████████| 64/64 [07:47<00:00,  7.31s/it]


file_37 loaded


ToTuple_37: 100%|██████████| 161464320/161464320 [00:21<00:00, 7535549.33it/s]
ToAlphabet_37: 100%|██████████| 64/64 [08:00<00:00,  7.51s/it]


file_38 loaded


ToTuple_38: 100%|██████████| 161464320/161464320 [00:20<00:00, 7729247.23it/s]
ToAlphabet_38: 100%|██████████| 64/64 [07:57<00:00,  7.46s/it]


file_39 loaded


ToTuple_39: 100%|██████████| 161464320/161464320 [00:20<00:00, 7739216.00it/s]
ToAlphabet_39: 100%|██████████| 64/64 [08:00<00:00,  7.51s/it]


file_40 loaded


ToTuple_40: 100%|██████████| 161464320/161464320 [00:20<00:00, 7750506.92it/s]
ToAlphabet_40: 100%|██████████| 64/64 [07:55<00:00,  7.43s/it]


file_41 loaded


ToTuple_41: 100%|██████████| 161464320/161464320 [00:21<00:00, 7546539.46it/s]
ToAlphabet_41: 100%|██████████| 64/64 [08:13<00:00,  7.70s/it]


file_42 loaded


ToTuple_42: 100%|██████████| 161464320/161464320 [00:21<00:00, 7561809.95it/s]
ToAlphabet_42: 100%|██████████| 64/64 [07:47<00:00,  7.31s/it]


file_43 loaded


ToTuple_43: 100%|██████████| 161464320/161464320 [00:20<00:00, 7717110.77it/s]
ToAlphabet_43: 100%|██████████| 64/64 [07:43<00:00,  7.24s/it]


file_44 loaded


ToTuple_44: 100%|██████████| 161464320/161464320 [00:20<00:00, 7817967.70it/s]
ToAlphabet_44: 100%|██████████| 64/64 [07:46<00:00,  7.28s/it]


file_45 loaded


ToTuple_45: 100%|██████████| 161464320/161464320 [00:20<00:00, 7887362.75it/s]
ToAlphabet_45: 100%|██████████| 64/64 [07:47<00:00,  7.30s/it]


file_46 loaded


ToTuple_46: 100%|██████████| 161464320/161464320 [00:20<00:00, 7730040.62it/s]
ToAlphabet_46: 100%|██████████| 64/64 [07:52<00:00,  7.39s/it]


file_47 loaded


ToTuple_47: 100%|██████████| 161464320/161464320 [00:20<00:00, 7726109.93it/s]
ToAlphabet_47: 100%|██████████| 64/64 [07:54<00:00,  7.42s/it]


file_48 loaded


ToTuple_48: 100%|██████████| 161464320/161464320 [00:20<00:00, 7900099.21it/s]
ToAlphabet_48: 100%|██████████| 64/64 [07:43<00:00,  7.24s/it]


file_49 loaded


ToTuple_49: 100%|██████████| 161464320/161464320 [00:20<00:00, 7761119.57it/s]
ToAlphabet_49: 100%|██████████| 64/64 [07:52<00:00,  7.38s/it]


file_50 loaded


ToTuple_50: 100%|██████████| 161464320/161464320 [00:20<00:00, 7931492.54it/s]
ToAlphabet_50: 100%|██████████| 64/64 [07:45<00:00,  7.27s/it]


file_51 loaded


ToTuple_51: 100%|██████████| 161464320/161464320 [00:21<00:00, 7629024.25it/s]
ToAlphabet_51: 100%|██████████| 64/64 [07:59<00:00,  7.49s/it]


file_52 loaded


ToTuple_52: 100%|██████████| 161464320/161464320 [00:20<00:00, 7727105.63it/s]
ToAlphabet_52: 100%|██████████| 64/64 [07:49<00:00,  7.33s/it]


file_53 loaded


ToTuple_53: 100%|██████████| 161464320/161464320 [00:20<00:00, 7719687.22it/s]
ToAlphabet_53: 100%|██████████| 64/64 [07:50<00:00,  7.36s/it]


file_54 loaded


ToTuple_54: 100%|██████████| 161464320/161464320 [00:20<00:00, 7752878.77it/s]
ToAlphabet_54: 100%|██████████| 64/64 [07:38<00:00,  7.16s/it]


file_55 loaded


ToTuple_55: 100%|██████████| 161464320/161464320 [00:20<00:00, 7732739.26it/s]
ToAlphabet_55: 100%|██████████| 64/64 [07:45<00:00,  7.28s/it]


file_56 loaded


ToTuple_56: 100%|██████████| 161464320/161464320 [00:20<00:00, 7895238.43it/s]
ToAlphabet_56: 100%|██████████| 64/64 [07:49<00:00,  7.33s/it]


file_57 loaded


ToTuple_57: 100%|██████████| 161464320/161464320 [00:20<00:00, 7880464.40it/s]
ToAlphabet_57: 100%|██████████| 64/64 [07:56<00:00,  7.44s/it]


file_58 loaded


ToTuple_58: 100%|██████████| 161464320/161464320 [00:20<00:00, 7723377.42it/s]
ToAlphabet_58: 100%|██████████| 64/64 [07:54<00:00,  7.42s/it]


file_59 loaded


ToTuple_59: 100%|██████████| 161464320/161464320 [00:20<00:00, 7719110.80it/s]
ToAlphabet_59: 100%|██████████| 64/64 [08:17<00:00,  7.78s/it]


file_60 loaded


ToTuple_60: 100%|██████████| 161464320/161464320 [00:20<00:00, 7756210.09it/s]
ToAlphabet_60: 100%|██████████| 64/64 [07:50<00:00,  7.35s/it]


file_61 loaded


ToTuple_61: 100%|██████████| 161464320/161464320 [00:20<00:00, 7706277.35it/s]
ToAlphabet_61: 100%|██████████| 64/64 [07:49<00:00,  7.33s/it]


file_62 loaded


ToTuple_62: 100%|██████████| 161464320/161464320 [00:20<00:00, 7904880.63it/s]
ToAlphabet_62: 100%|██████████| 64/64 [07:43<00:00,  7.23s/it]


file_63 loaded


ToTuple_63: 100%|██████████| 161464320/161464320 [00:21<00:00, 7677581.76it/s]
ToAlphabet_63: 100%|██████████| 64/64 [07:52<00:00,  7.38s/it]


file_64 loaded


ToTuple_64: 100%|██████████| 161464320/161464320 [00:21<00:00, 7586849.11it/s]
ToAlphabet_64: 100%|██████████| 64/64 [07:56<00:00,  7.45s/it]


file_65 loaded


ToTuple_65: 100%|██████████| 161464320/161464320 [00:20<00:00, 7744876.87it/s]
ToAlphabet_65: 100%|██████████| 64/64 [09:09<00:00,  8.58s/it]


# Vocabulary Creation

In [57]:
def update_dictionary(my_dict, key):
    if key in my_dict:
        my_dict[key] += 1
    else:
        my_dict[key] = 1
    return my_dict


# This function create a dictionary of n-gram and their frequencythanks to a .txt
def n_gram(gram_value, document_set, tmp_words_freq, batch_num):
    for i in tqdm(range(len(document_set)), desc=f'{gram_value}-gram_for_batch_{batch_num}'):
        for j in range(len(document_set[i])):
            if j + gram_value > len(document_set[i]):
                break
            else:
                ngram = document_set[i][j:j + gram_value]
                update_dictionary(tmp_words_freq, ngram)

    
    return tmp_words_freq


def add_value_to_dict(tmp_dict, final_dict):
    # For each key
    for key, value in tqdm(tmp_dict.items(), desc = f'add_value_to_final_dict_for_batch_{batch_num}'):

        # If the key already exists in the final dict
        if key in final_dict:
            final_dict[key] += value  # Add the value to the key corresponding in the final dict
        else:
            final_dict[key] = value  # Create a new key with his value
    return final_dict


# This function sort a dictionary depending on values
def sorting(dictionary):
  sorted_dict = dict(sorted(dictionary.items(), key=lambda item: item[1], reverse=True))
  return sorted_dict

In [62]:
bi_words_freq   = {} 
tri_words_freq  = {}
four_words_freq = {}
five_words_freq = {}

for batch_num in range(1, len(batches)+1):
    
    with open(f"Image_Strings/ImageNet_Strings_{batch_num}.txt", "r") as file:
        
        # Read the entire file content
        content = file.read()

        document_set = []
        for i in range(0, len(content), 128):
          chunk = content[i:i+128]
          document_set.append(chunk)

        tmp_bi_words_freq   = {}
        tmp_tri_words_freq  = {}
        tmp_four_words_freq = {}
        tmp_five_words_freq = {}
        
        # This function create a dictionary of n-gram and their frequency thanks to a .txt
        tmp_bi_words_freq   = n_gram(2, document_set, tmp_bi_words_freq, batch_num)
        tmp_tri_words_freq  = n_gram(3, document_set, tmp_tri_words_freq, batch_num)
        tmp_four_words_freq = n_gram(4, document_set, tmp_four_words_freq, batch_num)
        tmp_five_words_freq = n_gram(5, document_set, tmp_five_words_freq, batch_num)

        # This function add all values of the temp dict to the final dict 
        #(add value if the key already exist in the final dict and create the key if it's not exist)
        bi_words_freq   = add_value_to_dict(tmp_bi_words_freq, bi_words_freq)
        tri_words_freq  = add_value_to_dict(tmp_tri_words_freq, tri_words_freq)
        four_words_freq = add_value_to_dict(tmp_four_words_freq, four_words_freq)
        five_words_freq = add_value_to_dict(tmp_five_words_freq, five_words_freq)
    
        del content
        del tmp_bi_words_freq
        del document_set
        gc.collect

# Sort all the dictionaries
sorted_bigram   = sorting(bi_words_freq)
sorted_trigram  = sorting(tri_words_freq)
sorted_fourgram = sorting(four_words_freq)
sorted_fivegram = sorting(five_words_freq)

with open(f'Image_N-Gram/ImageNet_Bigram_.json', 'w') as file:
    json.dump(sorted_bigram, file)

with open(f'Image_N-Gram/ImageNet_Trigram_.json', 'w') as file:
    json.dump(sorted_trigram, file)

with open(f'Image_N-Gram/ImageNet_Fourgram_.json', 'w') as file:
    json.dump(sorted_fourgram, file)

with open(f'Image_N-Gram/ImageNet_Fivegram_.json', 'w') as file:
    json.dump(sorted_fivegram, file)

del sorted_bigram
del sorted_trigram
del sorted_fourgram
del sorted_fivegram
gc.collect

2-gram_for_batch_1: 100%|██████████| 1261504/1261504 [00:29<00:00, 42516.93it/s]
3-gram_for_batch_1: 100%|██████████| 1261504/1261504 [00:31<00:00, 39939.67it/s]
4-gram_for_batch_1: 100%|██████████| 1261504/1261504 [00:34<00:00, 36284.97it/s]
5-gram_for_batch_1: 100%|██████████| 1261504/1261504 [00:39<00:00, 31900.59it/s]
add_value_to_final_dict_for_batch_1: 100%|██████████| 4096/4096 [00:00<00:00, 5165324.47it/s]
add_value_to_final_dict_for_batch_1: 100%|██████████| 208563/208563 [00:00<00:00, 6050718.84it/s]
add_value_to_final_dict_for_batch_1: 100%|██████████| 2605306/2605306 [00:00<00:00, 5124758.36it/s]
add_value_to_final_dict_for_batch_1: 100%|██████████| 9791549/9791549 [00:02<00:00, 3866268.83it/s]
2-gram_for_batch_2: 100%|██████████| 1261504/1261504 [00:28<00:00, 43641.80it/s]
3-gram_for_batch_2: 100%|██████████| 1261504/1261504 [00:31<00:00, 40478.45it/s]
4-gram_for_batch_2: 100%|██████████| 1261504/1261504 [00:35<00:00, 35651.45it/s]
5-gram_for_batch_2: 100%|██████████| 1261

<function gc.collect(generation=2)>

In [19]:
# Load the json file
with open(f'Image_N-Gram/ImageNet_Bigram_.json', 'r') as file:
    sorted_bigram = json.load(file)

'''test = 0
for key in sorted_bigram:
    
    test += sorted_bigram[key]

print(test)'''

print(list(sorted_bigram.items())[4095])

('S!', 775)


In [26]:
# Load the json file
with open(f'Image_N-Gram/ImageNet_Trigram_.json', 'r') as file:
    sorted_trigram = json.load(file)
print(len(sorted_trigram))
list(sorted_trigram.items())[230000]

262131


('R8k', 23)

In [36]:
with open(f'Image_N-Gram/ImageNet_Fourgram_.json', 'r') as file:
    sorted_fourgram = json.load(file)

print(len(sorted_fourgram))
list(sorted_fourgram.items())[3501150]

11301150


('1vNw', 15)

In [37]:
with open(f'Image_N-Gram/ImageNet_Fivegram_.json', 'r') as file:
    sorted_fivegram = json.load(file)

116886330


('2Z321', 2346)

In [50]:
list(sorted_fivegram.items())[17001150]

('ALq4k', 8)

In [51]:
bigram = dict(list(sorted_bigram.items()))
trigram = dict(list(sorted_trigram.items())[:230000])
fourgram = dict(list(sorted_fourgram.items())[:3501150])
fivegram = dict(list(sorted_fivegram.items())[:17001150])

In [52]:
list(fivegram.items())[:19]

[('11111', 491908771),
 ('44444', 479443222),
 ('33333', 437563921),
 ('55555', 417072887),
 ('22222', 413866638),
 ('77777', 348533937),
 ('66666', 336282730),
 ('00000', 330577143),
 ('F1111', 14970017),
 ('1111F', 14901242),
 ('1111r', 14702808),
 ('r1111', 14661956),
 ('11112', 13046359),
 ('21111', 13040109),
 ('m2222', 12406113),
 ('2222m', 12351319),
 ('4444I', 12217132),
 ('2222K', 12176809),
 ('I4444', 12171953)]

In [53]:
del fivegram['11111']
del fourgram['1111']
del trigram['111']
del bigram['11']

del fivegram['22222']
del fourgram['2222']
del trigram['222']
del bigram['22']

del fivegram['33333']
del fourgram['3333']
del trigram['333']
del bigram['33']

del fivegram['44444']
del fourgram['4444']
del trigram['444']
del bigram['44']

del fivegram['55555']
del fourgram['5555']
del trigram['555']
del bigram['55']

In [54]:
list_of_vocabs = list(fivegram.keys()) + list(fourgram.keys()) + list(trigram.keys()) + list(bigram.keys())

In [55]:
with open(f'Image_N-Gram/ImageNet_ListVocabs_.json', 'w') as file:
    json.dump(list_of_vocabs, file)