[markdown記法](https://qiita.com/tbpgr/items/989c6badefff69377da7)

[米国データサイエンティストがやさしく教えるデータサイエンスのためのPython講座](https://www.udemy.com/course/ds_for_python/learn/lecture/21614968#overview)

# その他データサイエンスに使えるライブラリ

In [1]:
from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## glob
ワイルドカードを使ってファイル（パス）名のリストを取得する

In [2]:
glob('public-covid-data/*')

['public-covid-data/rp_msk', 'public-covid-data/rp_im']

## osとpathlib
- os.path.split()
- os.path.join()
- os.path.exists()
- os.path.mkdirs()

In [3]:
import os
from pathlib import Path

### 練習

In [4]:
p = Path('public-covid-data/')

In [52]:
p

PosixPath('public-covid-data')

In [31]:
list(p.iterdir())

[PosixPath('public-covid-data/rp_msk'), PosixPath('public-covid-data/rp_im')]

In [32]:
sub_p = list(p.iterdir())[0]

In [34]:
list(sub_p.iterdir())

[PosixPath('public-covid-data/rp_msk/3.nii.gz'),
 PosixPath('public-covid-data/rp_msk/1.nii.gz'),
 PosixPath('public-covid-data/rp_msk/9.nii.gz'),
 PosixPath('public-covid-data/rp_msk/2.nii.gz'),
 PosixPath('public-covid-data/rp_msk/6.nii.gz'),
 PosixPath('public-covid-data/rp_msk/8.nii.gz'),
 PosixPath('public-covid-data/rp_msk/7.nii.gz'),
 PosixPath('public-covid-data/rp_msk/4.nii.gz'),
 PosixPath('public-covid-data/rp_msk/5.nii.gz')]

In [37]:
target_file = list(sub_p.glob('*[6-9]*'))[0]

In [40]:
folder_p, file_name = os.path.split(target_file)

In [41]:
folder_p

'public-covid-data/rp_msk'

In [42]:
file_name

'9.nii.gz'

### os.path.join()
pathの連結

In [43]:
os.path.join(folder_p, file_name)

'public-covid-data/rp_msk/9.nii.gz'

### フォルダの作成

In [47]:
# public-covid-data/new_folder を作成

p = Path('public-covid-data/')
new_folder_name = 'new_folder'
new_folder_path = os.path.join(p, new_folder_name)
new_folder_path

'public-covid-data/new_folder'

### os.path.exists()
ファイルまたはディレクトリの存在確認

In [49]:
os.path.exists(new_folder_path)

False

### os.makedirs()

In [50]:
if not os.path.exists(new_folder_path):
    os.makedirs(new_folder_path)

## tqdm
プログレスバーを表示する

In [53]:
from tqdm import tqdm

In [55]:
tqdm()

0it [00:00, ?it/s]

<tqdm.std.tqdm at 0x7fd057421b20>

In [59]:
import time
sum_ = 0

before = time.time()
for i in tqdm(range(int(1e7))):
    sum_ += 1
print(sum_)
after = time.time()

print('it took {}'.format(after - before))

100%|██████████| 10000000/10000000 [00:01<00:00, 6373126.99it/s]

10000000
it took 1.5717971324920654





In [131]:
# データフレームを作成
# columns: path_im, filename, path_msk
# path_im -> 'public-covid-data/rp_im/5.nii.gz'
# filename -> '5.nii.gz'
# path_msk -> 'public-covid-data/rp_msk/5.nii.gz'

p = Path('public-covid-data')
sub_p_msk, sub_p_im = list(p.glob('rp_*'))
filename_list = []
path_msk_list = []
path_im_list = []
for _ in list(sub_p_msk.iterdir()):
    _, file_name = os.path.split(_)
    path_msk_list.append(os.path.join(sub_p_msk, file_name))
    path_im_list.append(os.path.join(sub_p_im, file_name))
    filename_list.append(file_name)

data = {'path_im': path_im_list, 'filename': filename_list, 'path_msk': path_msk_list}
df = pd.DataFrame(data)
df

Unnamed: 0,path_im,filename,path_msk
0,public-covid-data/rp_im/3.nii.gz,3.nii.gz,public-covid-data/rp_msk/3.nii.gz
1,public-covid-data/rp_im/1.nii.gz,1.nii.gz,public-covid-data/rp_msk/1.nii.gz
2,public-covid-data/rp_im/9.nii.gz,9.nii.gz,public-covid-data/rp_msk/9.nii.gz
3,public-covid-data/rp_im/2.nii.gz,2.nii.gz,public-covid-data/rp_msk/2.nii.gz
4,public-covid-data/rp_im/6.nii.gz,6.nii.gz,public-covid-data/rp_msk/6.nii.gz
5,public-covid-data/rp_im/8.nii.gz,8.nii.gz,public-covid-data/rp_msk/8.nii.gz
6,public-covid-data/rp_im/7.nii.gz,7.nii.gz,public-covid-data/rp_msk/7.nii.gz
7,public-covid-data/rp_im/4.nii.gz,4.nii.gz,public-covid-data/rp_msk/4.nii.gz
8,public-covid-data/rp_im/5.nii.gz,5.nii.gz,public-covid-data/rp_msk/5.nii.gz


In [130]:
# 別解

p = Path('public-covid-data')
df_list = []
for folder in p.iterdir():
    file_list = [os.path.split(p)[1] for p in list(folder.iterdir())]
    path_list = [p.as_posix() for p in list(folder.iterdir())]
    df_list.append(pd.DataFrame({'path':path_list, 'filename': file_list}))

df = df_list[0].merge(df_list[2], on='filename',suffixes=('_msk', '_im'))
df

Unnamed: 0,path_msk,filename,path_im
0,public-covid-data/rp_msk/3.nii.gz,3.nii.gz,public-covid-data/rp_im/3.nii.gz
1,public-covid-data/rp_msk/1.nii.gz,1.nii.gz,public-covid-data/rp_im/1.nii.gz
2,public-covid-data/rp_msk/9.nii.gz,9.nii.gz,public-covid-data/rp_im/9.nii.gz
3,public-covid-data/rp_msk/2.nii.gz,2.nii.gz,public-covid-data/rp_im/2.nii.gz
4,public-covid-data/rp_msk/6.nii.gz,6.nii.gz,public-covid-data/rp_im/6.nii.gz
5,public-covid-data/rp_msk/8.nii.gz,8.nii.gz,public-covid-data/rp_im/8.nii.gz
6,public-covid-data/rp_msk/7.nii.gz,7.nii.gz,public-covid-data/rp_im/7.nii.gz
7,public-covid-data/rp_msk/4.nii.gz,4.nii.gz,public-covid-data/rp_im/4.nii.gz
8,public-covid-data/rp_msk/5.nii.gz,5.nii.gz,public-covid-data/rp_im/5.nii.gz


In [134]:
for idx, row in tqdm(df.iterrows(), total=len(df)):
    print('image path for {} is here {}'.format(row['path_im'], row['filename']))

100%|██████████| 9/9 [00:00<00:00, 6194.41it/s]

image path for public-covid-data/rp_im/3.nii.gz is here 3.nii.gz
image path for public-covid-data/rp_im/1.nii.gz is here 1.nii.gz
image path for public-covid-data/rp_im/9.nii.gz is here 9.nii.gz
image path for public-covid-data/rp_im/2.nii.gz is here 2.nii.gz
image path for public-covid-data/rp_im/6.nii.gz is here 6.nii.gz
image path for public-covid-data/rp_im/8.nii.gz is here 8.nii.gz
image path for public-covid-data/rp_im/7.nii.gz is here 7.nii.gz
image path for public-covid-data/rp_im/4.nii.gz is here 4.nii.gz
image path for public-covid-data/rp_im/5.nii.gz is here 5.nii.gz



