In [4]:
from pathlib import Path
import pandas as pd 
from tqdm import tqdm
import os

In [5]:
# Every data path in this notebook is relative ('lsc/...'), so the working
# directory must be the folder that contains 'lsc'. Step up one level only when
# 'lsc' is not already visible from here: an unconditional chdir to the parent
# is not idempotent, and re-running this cell would climb one directory further
# each time and break all later relative paths.
if not Path('lsc').is_dir():
    os.chdir(Path(os.getcwd()).parent)

In [6]:
# Input list files, resolved relative to the current working directory.
image_patch_path = Path('lsc/image_list_patch.txt')
image_contracted_path = Path('lsc/image_list_contracted.txt')

# Maps an image filename to the path given for it in the patch file.
# Each patch line must hold exactly two whitespace-separated fields:
# "<image_filename> <image_path>".
fix_dict = {}
with image_patch_path.open() as f:
    for line in f.readlines():
        columns = line.split()
        assert len(columns) == 2, 'image.txt should have 2 columns'
        image_filename, image_path = columns
        image_path = Path(image_path)
        assert image_path.exists(), f'{image_path} does not exist'
        # A repeated filename is reported only when its path differs from the
        # one already stored; the most recent entry wins in either case.
        if fix_dict.get(image_filename, image_path) != image_path:
            print(f'Warning: {image_filename} has multiple paths')
        fix_dict[image_filename] = image_path

In [7]:
def create_lsc_image_path(image_filename):
    """Resolve an image filename to the path of its file on disk.

    The text before the first newline in ``image_filename`` is used as the
    name. If that name has an entry in the module-level ``fix_dict``, the
    stored path is used. Otherwise the path is built as
    ``lsc/extracted/<yyyymm>/<dd>/<image_filename>.jpg``, where ``yyyymm`` is
    the first six characters of the name and ``dd`` the following two.

    Args:
        image_filename: Image name, optionally followed by a newline.

    Returns:
        A ``Path`` pointing at the image.

    Raises:
        AssertionError: If the resolved path does not exist. The message
            states the path, the filename, and whether the filename was found
            in the patch dictionary.
    """
    # Drop the newline (and anything after it) left over from reading a line.
    image_filename, _, _ = image_filename.partition('\n')

    indict = image_filename in fix_dict
    if indict:
        p = Path(fix_dict[image_filename])
    else:
        yyyymm, dd = image_filename[:6], image_filename[6:8]
        p = Path(f'lsc/extracted/{yyyymm}/{dd}/{image_filename}.jpg')

    assert Path(p).exists(), f'{p} does not exist, the image_filename is {image_filename}, the filename exists in patch file: {indict}'
    return p

In [8]:
# Parallel accumulators, one entry per image; each name is bound to its own
# fresh empty list.
image_filenames, image_paths, years, months, days = ([] for _ in range(5))

In [9]:
assert image_contracted_path.exists(), f'{image_contracted_path} does not exist'

# Start from empty columns so that re-running this cell does not append a
# second copy of every row to lists left over from a previous run.
image_filenames = []
image_paths = []
years = []
months = []
days = []

with image_contracted_path.open() as f:
    lines = f.readlines()

for line in tqdm(lines):
    image_filename = line.strip()
    try:
        # Resolve using the stripped name so the recorded filename and the
        # name used for the lookup are always the same string.
        image_path = create_lsc_image_path(image_filename)
    except AssertionError as e:
        # Record the unresolved image for manual fixing. A separate handle name
        # avoids shadowing the list-file handle `f`.
        with open('lsc/fix_image_path.txt', 'a') as log_file:
            log_file.write(str(e) + '\n')
        # Skip this image: falling through would reuse the path resolved for
        # the previous image (or raise NameError on the first iteration).
        continue

    image_filenames.append(image_filename)
    image_paths.append(image_path)

    # Date parts come from the directory layout .../<yyyymm>/<dd>/<file>,
    # not from the filename itself.
    day = image_path.parent.name
    year_month = image_path.parent.parent.name
    year = year_month[:4]
    month = year_month[4:6]

    years.append(year)
    months.append(month)
    days.append(day)

100%|██████████| 297532/297532 [00:05<00:00, 51532.93it/s]


In [10]:
# Assemble one row per image from the parallel lists, then show summary
# statistics for each column.
df = pd.DataFrame(
    data={
        'image_filename': image_filenames,
        'image_path': image_paths,
        'year': years,
        'month': months,
        'day': days,
    }
)
df.describe()

Unnamed: 0,image_filename,image_path,year,month,day
count,297532,297532,297532,297532,297532
unique,297532,297532,2,12,31
top,20000101_000113_000,lsc/extracted/202001/16/20000101_000113_000.jpg,2019,3,6
freq,1,1,214468,34319,11202


In [12]:
# Preview the first five rows of the assembled table.
df.head(n=5)

Unnamed: 0,image_filename,image_path,year,month,day
0,20000101_000113_000,lsc/extracted/202001/16/20000101_000113_000.jpg,2020,1,16
1,20000101_000145_000,lsc/extracted/202001/16/20000101_000145_000.jpg,2020,1,16
2,20000101_001405_000,lsc/extracted/202001/16/20000101_001405_000.jpg,2020,1,16
3,20000101_002639_000,lsc/extracted/202001/16/20000101_002639_000.jpg,2020,1,16
4,20000101_011926_000,lsc/extracted/202001/16/20000101_011926_000.jpg,2020,1,16


In [11]:
# Write the table to CSV, leaving out the positional index column.
df.to_csv(
    path_or_buf='lsc/fixed_image_list_contracted.csv',
    index=False,
)