-
Notifications
You must be signed in to change notification settings - Fork 1
/
parq2folder.py
82 lines (67 loc) · 3.41 KB
/
parq2folder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import argparse
import os
from pathlib import Path
import pandas as pd
# Magic save function
def save_file(file_path, data, mode='w', encoding='utf-8', debug=False):
    """Write ``data`` to ``file_path`` unless ``debug`` is set.

    Args:
        file_path: Destination path, handed straight to ``open``.
        data: Payload to write; bytes-like for binary modes, ``str`` for
            text modes.
        mode: Any ``open`` mode. Modes containing ``'b'`` are opened without
            an encoding; every other mode is opened as text using
            ``encoding``. (Previously only ``'w'`` and ``'wb'`` wrote
            anything; other modes silently discarded the data.)
        encoding: Text encoding, used for text modes only.
        debug: When true, nothing is written and a notice is printed instead.
    """
    if debug:
        print('Debug mode, file not saved')
        return

    # open() rejects an encoding argument in binary mode, so only pass it
    # along when writing text.
    open_kwargs = {} if 'b' in mode else {'encoding': encoding}
    with open(file_path, mode, **open_kwargs) as f:
        f.write(data)
    print(f'File saved to {file_path}')
# function to allow joining of text, tags, alt_text_a and alt_text_b using key args
def build_text(columns):
    """Join the values of ``columns`` into one comma-separated caption.

    Args:
        columns: Mapping of column name to cell value. Values are emitted in
            the mapping's iteration order.

    Returns:
        The values joined with ``', '``. Missing values (``None`` or a float
        NaN) are skipped and non-string values are converted with ``str``,
        so a single empty or non-text cell does not raise ``TypeError``.
    """
    parts = []
    for value in columns.values():
        # NaN is the only float that does not compare equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            continue
        parts.append(str(value))
    return ', '.join(parts)
if __name__ == '__main__':
    # Command-line entry point: dump every row of every Parquet file in the
    # input folder as an image file plus a same-named .txt caption file.
    parser = argparse.ArgumentParser(description='Extract images and text from Parquet files to a folder')
    parser.add_argument('--input-parq-dir', type=str,
                        help='Parquet folder', required=True)
    parser.add_argument('--output-dir', type=str,
                        help='Directory to export data', required=True)
    # Selects which column(s) make up the caption written next to each image.
    parser.add_argument('--select-caption', type=str, default='TEXT',
                        help='To extract more than one column use comma seperated argument, available options: "text, tags, alt_text_a, alt_text_b"',
                        required=False)
    args = parser.parse_args()

    parquet_dir = Path(args.input_parq_dir)
    output_dir = Path(args.output_dir)
    # Create the export folder up front so the first save does not fail with
    # FileNotFoundError when the directory does not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    # Parse the caption selection once. Splitting on ',' and stripping accepts
    # both "text,tags" and "text, tags"; empty entries are dropped.
    requested_columns = [
        name.strip() for name in args.select_caption.split(',') if name.strip()
    ]

    # All Parquet files are assumed to sit directly in the input folder and to
    # end with .parquet. Sorted so the processing order is deterministic.
    parquet_files = sorted(
        os.path.join(parquet_dir, f)
        for f in os.listdir(parquet_dir)
        if f.endswith('.parquet')
    )

    for parquet_file in parquet_files:
        df = pd.read_parquet(parquet_file)

        # Resolve the requested names against this file's columns: exact match
        # first, then case-insensitive, so the default 'TEXT' still finds a
        # lowercase 'text' column. Names with no match are ignored.
        columns_by_lower = {str(name).lower(): name for name in df.columns}
        caption_columns = []
        for name in requested_columns:
            if name in df.columns:
                caption_columns.append(name)
            elif name.lower() in columns_by_lower:
                caption_columns.append(columns_by_lower[name.lower()])

        for _, row in df.iterrows():
            file_name = row['file_name']
            # NOTE(review): assumes the 'image' cell already holds the encoded
            # image bytes, since it is written out unchanged — confirm.
            image_data = row['image']

            text_columns = {col: row[col] for col in caption_columns}
            built_text = build_text(text_columns)

            image_file_path = os.path.join(output_dir, file_name)
            save_file(image_file_path, image_data, mode='wb', debug=False)

            # Caption goes next to the image: same base name, .txt extension.
            text_file_path = os.path.join(output_dir, f"{os.path.splitext(file_name)[0]}.txt")
            save_file(text_file_path, built_text, debug=False)