In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up without restarting the kernel.
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before each execution.
%autoreload 2
# Change the kernel's working directory; the resulting directory is echoed as the cell output.
# NOTE(review): the argument is literally `...` -- confirm it resolves to the intended folder.
%cd ...

C:\Users\Minjae\Desktop\EEG_Project\notebook\nii_test


-----

## 환경 구성

In [2]:
# Load some packages
import os
import re
import copy
import glob
import shutil
import pprint
import datetime
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython import display

import nibabel as nib

In [3]:
# Other settings
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # cleaner text

plt.style.use('default') 
# ['Solarize_Light2', '_classic_test_patch', 'bmh', 'classic', 'dark_background', 'fast', 
#  'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 
#  'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 
#  'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 
#  'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']

plt.rcParams['image.interpolation'] = 'bicubic'
plt.rcParams["font.family"] = 'Arial' # 'NanumGothic' # for Hangul in Windows
plt.rcParams["animation.html"] = "jshtml"
plt.rcParams['animation.ffmpeg_path'] = 'c:/ffmpeg/bin/ffmpeg.exe'

-----

## 파일 구조 파악

In [4]:
folder_path1 = r'H:\Other_DB\CAU_EEG\230314_nii_file'
folder_path2 = r'H:\Other_DB\CAU_EEG\230315_MRI_3DT1'


def extract_subject_id(file_name):
    """Return the subject ID embedded in a file name, or None if there is none.

    The name is split on '-', '_', '.', '(', ')' and spaces; the first chunk
    that is purely numeric and at least 5 characters long is taken as the ID.
    """
    for chunk in re.split('[-_.() ]', file_name):
        if chunk.isnumeric() and len(chunk) >= 5:
            return chunk
    return None


def collect_nii_files(folder_path, file_dict):
    """Add every .nii file directly inside folder_path to file_dict, keyed by subject ID.

    Args:
        folder_path: Directory to scan (not recursive).
        file_dict: Mapping of subject ID -> list of file paths; updated in place.

    Returns:
        The same file_dict object, for convenience.

    Files whose name contains no subject ID are grouped under the key None.
    The ID is computed fresh for every file, so a file without an ID can never
    inherit the ID of the file processed before it.
    """
    # glob is called without recursive=True, so '**' behaves like a plain '*'.
    for file_path in glob.glob(os.path.join(folder_path, '**.nii')):
        subject_id = extract_subject_id(os.path.basename(file_path))
        file_dict.setdefault(subject_id, []).append(file_path)
    return file_dict


file_dict = {}
for folder_path in (folder_path1, folder_path2):
    collect_nii_files(folder_path, file_dict)

pprint.pprint(file_dict)

{'00000381': ['H:\\Other_DB\\CAU_EEG\\230315_MRI_3DT1\\00000381.nii'],
 '00000613': ['H:\\Other_DB\\CAU_EEG\\230315_MRI_3DT1\\00000613.nii'],
 '00001233': ['H:\\Other_DB\\CAU_EEG\\230315_MRI_3DT1\\00001233.nii'],
 '00001809': ['H:\\Other_DB\\CAU_EEG\\230314_nii_file\\00001809_T1_3DVOL_1.nii',
              'H:\\Other_DB\\CAU_EEG\\230314_nii_file\\00001809_T1_3DVOL_2.nii',
              'H:\\Other_DB\\CAU_EEG\\230315_MRI_3DT1\\00001809.nii'],
 '00002281': ['H:\\Other_DB\\CAU_EEG\\230315_MRI_3DT1\\00002281.nii'],
 '00003197': ['H:\\Other_DB\\CAU_EEG\\230314_nii_file\\00003197_MPRAGE_1.nii',
              'H:\\Other_DB\\CAU_EEG\\230314_nii_file\\00003197_MPRAGE_2.nii',
              'H:\\Other_DB\\CAU_EEG\\230315_MRI_3DT1\\00003197.nii'],
 '000058064': ['H:\\Other_DB\\CAU_EEG\\230314_nii_file\\000058064_T1_3DOVL_2.nii',
               'H:\\Other_DB\\CAU_EEG\\230314_nii_file\\000058064_T1_3DVOL_1.nii',
               'H:\\Other_DB\\CAU_EEG\\230314_nii_file\\000058064_T1_3DVOL_3.nii',
     

### 파일 중복 검사

In [5]:
def group_identical_volumes(file_path_list):
    """Partition NIfTI files into sets whose voxel data are element-wise identical.

    Args:
        file_path_list: Paths of the image files to compare (one subject's files).

    Returns:
        A list of sets of paths. Every input path appears in exactly one set;
        a file with no identical counterpart forms a singleton set.

    Each not-yet-grouped file acts as a pivot and is compared against the
    not-yet-grouped files that follow it. Voxel data are decoded lazily: a
    pivot with nothing left to compare against is never read, and candidates
    whose header shape differs from the pivot's are rejected without decoding.
    """
    groups = []
    checker = [False] * len(file_path_list)  # True once a file belongs to a group

    for i, file_path in enumerate(file_path_list):
        if checker[i]:
            continue
        checker[i] = True
        current_set = {file_path}

        candidates = [k for k in range(i + 1, len(file_path_list)) if not checker[k]]
        if candidates:
            pivot_img = nib.load(file_path)
            pivot = None  # decoded only when a candidate with the same shape shows up
            for k in candidates:
                latter_img = nib.load(file_path_list[k])
                if latter_img.shape != pivot_img.shape:
                    continue
                if pivot is None:
                    pivot = pivot_img.get_fdata()
                if (pivot == latter_img.get_fdata()).all():
                    current_set.add(file_path_list[k])
                    checker[k] = True
        groups.append(current_set)
    return groups


# subject ID -> list of sets of mutually identical files
duplication = {
    subject_id: group_identical_volumes(file_path_list)
    for subject_id, file_path_list in tqdm(file_dict.items())
}

  0%|          | 0/1917 [00:00<?, ?it/s]

In [7]:
target_folder = r"H:\Other_DB\CAU_EEG\230403_MRI_Integration"
# shutil.copy2 does not create missing directories, so make sure the target exists.
os.makedirs(target_folder, exist_ok=True)

# For every group of identical files, keep a single copy: the file with the longest name.
# NOTE: the destination is target_folder + base name, so two groups whose chosen
# base names coincide would overwrite each other.
for dup_list in tqdm(duplication.values()):
    for dup_set in dup_list:
        # sorted() fixes the iteration order so that ties in name length are
        # resolved the same way on every run (set order is not stable).
        longest_path = max(sorted(dup_set), key=lambda path: len(os.path.basename(path)))
        longest_name = os.path.basename(longest_path)
        shutil.copy2(longest_path, os.path.join(target_folder, longest_name))

  0%|          | 0/1917 [00:00<?, ?it/s]