In [12]:
import numpy as np
import pandas as pd
import scipy.io

import torch

## Extract data

In [4]:
import os 
import zipfile

def extract_password_protected_zip(zip_file_path, extract_dir_path, password=None):
    """Extracts a password-protected zip file to a specified directory."""
    try:
        if not os.path.exists(extract_dir_path):
            with zipfile.ZipFile(zip_file_path, 'r') as zObject:
                if password is not None:
                    zObject.setpassword(password)
                zObject.extractall(path=extract_dir_path)
            print(
                f'Successfully extracted {zip_file_path} to {extract_dir_path}.')
        else:
            print(f'{extract_dir_path} already exists, skipping extraction.')
    except zipfile.BadZipFile:
        print(f'{zip_file_path} is not a valid zip file.')
    except RuntimeError as e:
        print(
            f'An error occurred while extracting {zip_file_path} to {extract_dir_path}: {e}.')
        

def extract_all_password_protected_zips(zip_file_paths, extract_dir_path, password=None):
    """Extracts all password-protected zip files in a list to a specified directory."""
    for zip_file_path in zip_file_paths:
        extract_dir_name = os.path.splitext(os.path.basename(zip_file_path))[0]
        extract_dir_path_zip = os.path.join(extract_dir_path, extract_dir_name)
        if os.path.exists(extract_dir_path_zip):
            print(f'{extract_dir_path_zip} already exists, skipping extraction.')
        else:
            extract_password_protected_zip(
                zip_file_path, extract_dir_path_zip, password)


In [8]:
# extract_password_protected_zip('data1.zip', 'data1')
# extract_password_protected_zip('data2.zip', 'data2')


data1 already exists, skipping extraction.
data2 already exists, skipping extraction.


## Data1: 

##### Mat files

In [10]:
categories_mat = scipy.io.loadmat('data1/27 higher-level categories/categories.mat')
category_mat_bottom_up = scipy.io.loadmat('data1/27 higher-level categories/category_mat_bottom_up.mat')
category_mat_top_down = scipy.io.loadmat('data1/27 higher-level categories/category_mat_top_down.mat')
category_mat_manual = scipy.io.loadmat('data1/27 higher-level categories/category_mat_manual.mat')

In [33]:
print(categories_mat.keys())
# explore the data categories_mat
# print(categories_mat['categories'])
print(categories_mat['categories'].shape)
categories_list = [categories_mat['categories'][0][i][0] for i in range(27)]
print(categories_list)


dict_keys(['__header__', '__version__', '__globals__', 'categories'])
(1, 27)
['animal', 'bird', 'body part', 'clothing', 'clothing accessory', 'container', 'dessert', 'drink', 'electronic device', 'food', 'fruit', 'furniture', 'home decor', 'insect', 'kitchen appliance', 'kitchen tool', 'medical equipment', 'musical instrument', 'office supply', 'part of car', 'plant', 'sports equipment', 'tool', 'toy', 'vegetable', 'vehicle', 'weapon']


In [39]:
# explore the data category_mat_bottom_up
print(category_mat_bottom_up.keys())
print(category_mat_bottom_up['category_mat_bottom_up'].shape)
print(category_mat_top_down['category_mat_top_down'].shape)
print(category_mat_manual['category_mat_manual'].shape)

# print(category_mat_bottom_up['category_mat_bottom_up'])

dict_keys(['__header__', '__version__', '__globals__', 'category_mat_bottom_up'])
(1854, 27)
(1854, 27)
(1854, 27)


In [77]:
category_mat_bottom_up_list = [category_mat_bottom_up['category_mat_bottom_up'][i][0] for i in range(27)]
print(category_mat_bottom_up_list)

print(category_mat_bottom_up['category_mat_bottom_up'][:, 0])
# index at which the category is present
print()
print("Category mat bottom up: ")
print('Animal: ')
print(np.where(category_mat_bottom_up['category_mat_bottom_up'][:, 0] == 1)[0])
print(len(np.where(category_mat_bottom_up['category_mat_bottom_up'][:, 0] == 1)[0]))

print()
print('Bird: ')
print(np.where(category_mat_bottom_up['category_mat_bottom_up'][:, 1] == 1)[0])
print(len(np.where(category_mat_bottom_up['category_mat_bottom_up'][:, 1] == 1)[0]))
print()
print('Body part: ')
print(np.where(category_mat_bottom_up['category_mat_bottom_up'][:, 2] == 1)[0])
print(len(np.where(category_mat_bottom_up['category_mat_bottom_up'][:, 2] == 1)[0]))
print()

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
[1 0 0 ... 1 0 0]

Category mat bottom up: 
Animal: 
[   0   12   15   25   26   53   87  105  107  128  134  151  214  241
  245  284  315  321  323  332  334  423  426  427  476  495  504  525
  554  578  638  666  675  684  694  695  718  722  743  774  781  798
  815  845  862  875  884  889  909  917  933  938  992 1014 1016 1017
 1018 1026 1080 1084 1085 1105 1106 1145 1170 1173 1198 1211 1220 1226
 1228 1258 1265 1266 1267 1282 1301 1306 1384 1397 1409 1414 1459 1471
 1472 1476 1529 1659 1736 1779 1781 1797 1831 1845 1851]
95

Bird: 
[ 274  321  323  372  446  525  526  537  608  693  760  811 1084 1091
 1121 1130 1140 1145 1160 1171 1248 1330 1395 1591 1690 1733 1768]
27

Body part: 
[  22   36  161  242  319  330  538  552  564  569  588  628  733  744
  780  877  911 1031 1052 1053 1063 1431 1456 1458 1560 1653 1668 1675
 1678 1685 1842]
31



In [76]:
print("Category mat top down: ")
print('Animal: ')
print(np.where(category_mat_top_down['category_mat_top_down'][:, 0] == 1)[0])
print(len(np.where(category_mat_top_down['category_mat_top_down'][:, 0] == 1)[0]))
print()
print('Bird: ')
print(np.where(category_mat_top_down['category_mat_top_down'][:, 1] == 1)[0])
print(len(np.where(category_mat_top_down['category_mat_top_down'][:, 1] == 1)[0]))
print()
print('Body part: ')
print(np.where(category_mat_top_down['category_mat_top_down'][:, 2] == 1)[0])
print(len(np.where(category_mat_top_down['category_mat_top_down'][:, 2] == 1)[0]))
print()


Category mat top down: 
Animal: 
[   0   12   15   24   25   26   53   75   87  105  107  111  115  128
  134  146  151  212  214  229  241  245  274  284  286  287  315  321
  324  326  332  334  349  371  372  373  406  423  426  427  428  434
  446  471  476  495  502  504  514  525  526  537  541  545  554  578
  600  608  621  638  644  666  675  684  688  693  694  695  706  718
  722  743  760  774  781  798  811  815  825  845  862  875  884  888
  889  909  917  929  933  937  938  939  955  964  992 1014 1016 1017
 1018 1020 1023 1026 1042 1070 1080 1084 1085 1091 1092 1105 1106 1121
 1130 1140 1145 1160 1170 1171 1173 1198 1211 1220 1221 1226 1228 1238
 1248 1249 1258 1265 1266 1267 1282 1285 1288 1301 1306 1330 1384 1393
 1395 1396 1397 1409 1414 1459 1471 1472 1476 1477 1513 1528 1529 1541
 1556 1591 1603 1614 1626 1656 1659 1690 1733 1736 1768 1779 1781 1785
 1797 1804 1831 1836 1845 1851]
174

Bird: 
[ 274  321  324  372  446  525  526  537  608  693  760  811 1084 1091


In [78]:
print("Category mat manual: ")
print('Animal: ')
print(np.where(category_mat_manual['category_mat_manual'][:, 0] == 1)[0])
print(len(np.where(category_mat_manual['category_mat_manual'][:, 0] == 1)[0]))
print()
print('Bird: ')
print(np.where(category_mat_manual['category_mat_manual'][:, 1] == 1)[0])
print(len(np.where(category_mat_manual['category_mat_manual'][:, 1] == 1)[0]))
print()
print('Body part: ')
print(np.where(category_mat_manual['category_mat_manual'][:, 2] == 1)[0])
print(len(np.where(category_mat_manual['category_mat_manual'][:, 2] == 1)[0]))

Category mat manual: 
Animal: 
[   0   12   15   24   25   26   53   75   87  105  107  111  115  128
  134  146  151  212  214  229  241  245  274  284  286  287  315  321
  324  326  332  334  349  371  372  373  406  423  426  427  428  434
  446  471  476  495  502  504  514  525  526  537  541  545  554  578
  600  608  621  638  644  666  675  684  688  693  694  695  706  718
  722  743  760  774  781  798  811  815  825  845  862  875  884  888
  889  909  917  929  933  937  938  939  955  964  992 1014 1016 1017
 1018 1020 1023 1026 1042 1070 1080 1084 1085 1091 1092 1105 1106 1121
 1130 1140 1145 1160 1170 1171 1173 1198 1211 1220 1221 1226 1228 1238
 1248 1249 1258 1265 1266 1282 1285 1288 1301 1306 1330 1359 1373 1384
 1393 1395 1396 1397 1409 1414 1438 1459 1471 1472 1476 1477 1513 1528
 1529 1541 1556 1591 1603 1614 1626 1656 1659 1664 1690 1733 1736 1768
 1779 1781 1785 1797 1804 1831 1836 1845 1851]
177

Bird: 
[ 128  274  321  372  446  525  526  537  608  693  760  8

In [79]:
# category_mat_bottom_up, category_mat_top_down, category_mat_manual all provide information about the 1854 object concepts... 
# in terms of 27 categories
# there are 27 categories for 1854 object concepts
# 3 files provide different information about the categories


##### CSV Files:

In [59]:
categorization_tsv = pd.read_csv("./data1/27 higher-level categories/categorization.tsv", sep='\t', header=None)
category_mat_bottom_up_tsv = pd.read_csv("./data1/27 higher-level categories/category_mat_bottom_up.tsv", sep='\t')
category_mat_top_down_tsv = pd.read_csv("./data1/27 higher-level categories/category_mat_top_down.tsv", sep='\t')
category_mat_manual_tsv = pd.read_csv("./data1/27 higher-level categories/category_mat_manual.tsv", sep='\t')

In [60]:
print(categorization_tsv.shape)
categorization_tsv.head()

# categorization_tsv provides the 20 labels for all the  images

(1854, 21)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,aardvark,aardvark,animal,animal,animal,animal,animal,animal,animal,animal,...,animal,animal,animal,animal,animal,animal,animal,animal,anteater,mammal
1,abacus,academics,calculating tool,calculator,calculator,calculators,counting,counting device,game,instrument,...,math tool,math tool,math tool,mathematic,mathematic tool,mathematical device,mathematical device,mathematics,science instrument,tool
2,accordion,music,music,music equipment,musical instrument,musical instrument,musical instrument,musical instrument,musical instrument,musical instrument,...,musical instrument,musical instrument,musical instrument,musical instrument,musical instrument,musical instrument,musical instrument,musical instrument,musical instrument,musical instrument
3,acorn,acorn,fruit,fruit,nut,nut,nut,nut,nut,nut,...,nut,plant,seed,seed,seed,seed,squirrel meal,tree nut,tree seed,wooden
4,air conditioner,air,appliance,appliance,appliance,appliance,appliance,appliance,cooling,cooling system,...,home,home appliance,home appliance,home appliance,home furnishing,household appliance,household climate control,hvac,portable cooling units,temperature controller


Index([   0,   12,   15,   25,   26,   53,   87,  105,  107,  128,  134,  151,
        214,  241,  245,  284,  315,  321,  323,  332,  334,  423,  426,  427,
        476,  495,  504,  525,  554,  578,  638,  666,  675,  684,  694,  695,
        718,  722,  743,  774,  781,  798,  815,  845,  862,  875,  884,  889,
        909,  917,  933,  938,  992, 1014, 1016, 1017, 1018, 1026, 1080, 1084,
       1085, 1105, 1106, 1145, 1170, 1173, 1198, 1211, 1220, 1226, 1228, 1258,
       1265, 1266, 1267, 1282, 1301, 1306, 1384, 1397, 1409, 1414, 1459, 1471,
       1472, 1476, 1529, 1659, 1736, 1779, 1781, 1797, 1831, 1845, 1851],
      dtype='int64')
95


In [91]:
category_mat_bottom_up_tsv.head()

Unnamed: 0,animal,bird,body part,clothing,clothing accessory,container,dessert,drink,electronic device,food,...,musical instrument,office supply,part of car,plant,sports equipment,tool,toy,vegetable,vehicle,weapon
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
print(category_mat_bottom_up_tsv.shape)

# find the column numbers which are labelled 1 for animal, bird, body part
animal_col = category_mat_bottom_up_tsv[category_mat_bottom_up_tsv['animal'] == 1].index
bird_col = category_mat_bottom_up_tsv[category_mat_bottom_up_tsv['bird'] == 1].index
body_part_col = category_mat_bottom_up_tsv[category_mat_bottom_up_tsv['body part'] == 1].index

print("category_mat_bottom_up_tsv:")
print("Animal column numbers length:", len(animal_col))
print("Bird column numbers length:", len(bird_col))
print("Body part column numbers length:", len(body_part_col))

(1854, 27)
category_mat_bottom_up_tsv:
Animal column numbers length: 95
Bird column numbers length: 27
Body part column numbers length: 31


In [94]:
# find the column numbers which are labelled 1 for animal, bird, body part
animal_col = category_mat_top_down_tsv[category_mat_top_down_tsv['animal'] == 1].index
bird_col = category_mat_top_down_tsv[category_mat_top_down_tsv['bird'] == 1].index
body_part_col = category_mat_top_down_tsv[category_mat_top_down_tsv['body part'] == 1].index

print("category_mat_top_down_tsv:")
print("Animal column numbers length:", len(animal_col))
print("Bird column numbers length:", len(bird_col))
print("Body part column numbers length:", len(body_part_col))

category_mat_top_down_tsv:
Animal column numbers length: 174
Bird column numbers length: 27
Body part column numbers length: 31


In [95]:
# find the column numbers which are labelled 1 for animal, bird, body part
animal_col = category_mat_manual_tsv[category_mat_manual_tsv['animal'] == 1].index
bird_col = category_mat_manual_tsv[category_mat_manual_tsv['bird'] == 1].index
body_part_col = category_mat_manual_tsv[category_mat_manual_tsv['body part'] == 1].index

print("category_mat_manual_tsv:")
print("Animal column numbers length:", len(animal_col))
print("Bird column numbers length:", len(bird_col))
print("Body part column numbers length:", len(body_part_col))

category_mat_manual_tsv:
Animal column numbers length: 177
Bird column numbers length: 27
Body part column numbers length: 34


In [15]:
print(len(categories_tsv))
categories_tsv[0:2]


1853


Unnamed: 0,aardvark,aardvark.1,animal,animal.1,animal.2,animal.3,animal.4,animal.5,animal.6,animal.7,...,animal.9,animal.10,animal.11,animal.12,animal.13,animal.14,animal.15,animal.16,anteater,mammal
0,abacus,academics,calculating tool,calculator,calculator,calculators,counting,counting device,game,instrument,...,math tool,math tool,math tool,mathematic,mathematic tool,mathematical device,mathematical device,mathematics,science instrument,tool
1,accordion,music,music,music equipment,musical instrument,musical instrument,musical instrument,musical instrument,musical instrument,musical instrument,...,musical instrument,musical instrument,musical instrument,musical instrument,musical instrument,musical instrument,musical instrument,musical instrument,musical instrument,musical instrument


In [16]:
# find unique elements in a dataframe column
len(categories_tsv['mammal'].unique())


1013

In [17]:

# Replace these with the actual values
zip_file_paths = ["./data1/Images_zip/object_images_A-C.zip",
                  "./data1/Images_zip/object_images_D-K.zip",
                  "./data1/Images_zip/object_images_L-Q.zip",
                  "./data1/Images_zip/object_images_R-S.zip",
                  "./data1/Images_zip/object_images_T-Z.zip"]
extract_dir_path = "./data1/Images"
password = 'things4all'.encode('utf-8')

extract_all_password_protected_zips(zip_file_paths, extract_dir_path, password)


./data1/Images/object_images_A-C already exists, skipping extraction.
./data1/Images/object_images_D-K already exists, skipping extraction.
./data1/Images/object_images_L-Q already exists, skipping extraction.
./data1/Images/object_images_R-S already exists, skipping extraction.
./data1/Images/object_images_T-Z already exists, skipping extraction.


## Data 2: 

In [11]:
! ls -l ./data2/

total 1150788
-rw-rw-r-- 1 mehul mehul       6138 Apr 16 11:16 description.txt
-rw-rw-r-- 1 mehul mehul       1817 Mar 22 05:56 LICENSE_THINGSplus.txt
drwxrwxr-x 4 mehul mehul       4096 Mar 22 05:56 Metadata
-rw-rw-r-- 1 mehul mehul 1178383468 Mar 22 05:56 object_images_CC0.zip


In [12]:
# unzip object_images_CCO.zip
from zipfile import ZipFile
zip_file_path = "data2/object_images_CC0.zip"
extract_dir_path = "./data2/Images"
# unzip(zip_file_path, extract_dir_path)
# unzpip(zip_file_path, extract_dir_path)

# loading the temp.zip and creating a zip object
with ZipFile(zip_file_path, 'r') as zObject:

	# Extracting all the members of the zip
	# into a specific location.
	zObject.extractall(
		path=extract_dir_path)


In [13]:
# count images inside ./data2/Images/images_resized using terminal
! ls -l ./data2/Images/images_resized | wc -l


1855
