In [1]:
import sys

# Fail fast on interpreters older than the minimum this notebook targets.
MIN_PYTHON = (3, 7)
print(sys.version_info)
assert sys.version_info >= MIN_PYTHON

from packaging import version
import sklearn

# Same gate for scikit-learn: compare parsed versions, not raw strings.
MIN_SKLEARN = "1.0.1"
print("Scikit-Learn version: ", sklearn.__version__)
sklearn_installed = version.parse(sklearn.__version__)
assert sklearn_installed >= version.parse(MIN_SKLEARN)

# Common imports
import numpy as np
import os

# Seed NumPy's global RNG so the notebook's output is stable across runs.
SEED = 42
np.random.seed(SEED)

sys.version_info(major=3, minor=11, micro=4, releaselevel='final', serial=0)
Scikit-Learn version:  1.3.0


## data preparation

In [2]:
from pathlib import Path

# Directory (relative to the notebook's working directory) that holds the
# input .npz files and receives the merged output.
path = Path("npz")
print(path)

npz


### data check
**Optional sanity check — you can skip this cell if you already know the input files have the expected shapes.**

In [18]:
dataSize = 4

# Sanity-check every numbered num/sym pair: print the image and label shapes
# and make sure each file carries exactly one label per image.
for i in range(1, dataSize+1):
    num_curr = "num%d.npz"%i
    sym_curr = "sym%d.npz"%i

    # np.load on an .npz returns an NpzFile that keeps the file open until it
    # is closed; the context manager releases both handles as soon as the
    # arrays have been read into memory.
    with np.load(path / num_curr) as num, np.load(path / sym_curr) as sym:
        num_data, num_label = num['img'], num['label']
        sym_data, sym_label = sym['img'], sym['label']

    # The merge step pairs images with labels by position, so a count
    # mismatch here would silently corrupt the merged dataset.
    assert len(num_data) == len(num_label), f"{num_curr}: image/label count mismatch"
    assert len(sym_data) == len(sym_label), f"{sym_curr}: image/label count mismatch"

    print(f"{i}번쨰 데이터 체크: ")
    print(num_data.shape,num_label.shape,sym_data.shape,sym_label.shape)

1번쨰 데이터 체크: 
(400, 28, 28) (400,) (400, 28, 28) (400,)
2번쨰 데이터 체크: 
(400, 28, 28) (400,) (400, 28, 28) (400,)
3번쨰 데이터 체크: 
(400, 28, 28) (400,) (400, 28, 28) (400,)
4번쨰 데이터 체크: 
(400, 28, 28) (400,) (400, 28, 28) (400,)


## merge

In [12]:
dataSize = 4


def _load_flat_split(npz_path):
    """Load one .npz split and return ``(pixels, labels)`` ready for merging.

    Parameters
    ----------
    npz_path : path-like
        Archive containing an ``img`` array (one image per entry along the
        first axis) and a ``label`` array.

    Returns
    -------
    pixels : np.ndarray of int64, shape (n_images, pixels_per_image)
        Each image flattened to one row, with every value equal to 1 replaced
        by 255 before the integer cast.
    labels : np.ndarray of str
        The ``label`` array converted to strings.
    """
    # NpzFile keeps the underlying file open until closed, so read both
    # arrays inside a context manager.
    with np.load(npz_path) as archive:
        images = archive['img']
        labels = archive['label'].astype(str)

    # Derive the flattened shape from the data instead of hard-coding
    # (400, 784), so files with a different number of images still load.
    flat = images.reshape(len(images), int(np.prod(images.shape[1:])))

    # set 1. to int value 255
    # NOTE(review): assumes pixels are binary 0/1 — any fractional value is
    # truncated towards zero by the integer cast below; confirm against the
    # data source.
    flat[flat == 1.] = 255.
    return flat.astype(np.int64), labels


# Collect the per-file pieces and concatenate once at the end; this avoids
# re-copying the growing arrays on every iteration and the first-iteration
# special case.
num_parts, sym_parts = [], []
for i in range(1, dataSize+1):
    num_parts.append(_load_flat_split(path / f"num{i}.npz"))
    sym_parts.append(_load_flat_split(path / f"sym{i}.npz"))

num_final_data = np.concatenate([data for data, _ in num_parts], axis=0)
num_final_label = np.concatenate([label for _, label in num_parts], axis=0)

sym_final_data = np.concatenate([data for data, _ in sym_parts], axis=0)
sym_final_label = np.concatenate([label for _, label in sym_parts], axis=0)

print(num_final_data.shape, num_final_label.shape,sym_final_data.shape,sym_final_label.shape)

(1600, 784) (1600,) (1600, 784) (1600,)


## save

In [13]:
# Write each merged dataset to its own compressed archive under `path`,
# storing the pixel matrix as `img` and the string labels as `label`.
for out_name, merged_img, merged_label in (
    ('fixed2_final_num.npz', num_final_data, num_final_label),
    ('fixed2_final_sym.npz', sym_final_data, sym_final_label),
):
    np.savez_compressed(path / out_name, img=merged_img, label=merged_label)

## check

In [14]:
# Reload the saved digit archive and report the shape and dtype of both
# arrays. `test_data` is intentionally left open: the next cell reads from it.
test_data = np.load(path / 'fixed2_final_num.npz')
saved_img, saved_label = test_data['img'], test_data['label']
print(saved_img.shape, saved_img.dtype, saved_label.shape, saved_label.dtype)

(1600, 784) int64 (1600,) <U11


In [16]:
# Display the label array read back from the reloaded archive.
test_data['label']

array(['0', '1', '2', ..., '7', '8', '9'], dtype='<U11')