In [1]:
import json
import numpy
import pandas
import pathlib
import pydash
import subprocess
import tqdm

title = 'Der Gang in die Nacht'
wikidata = 'Q3793222'
reference = 'https://archive.org/details/silent-der-gang-in-die-nacht'
local_file = pathlib.Path.home() / 'der-gang-in-die-nacht' / 'mkv' / 'der-gang-in-die-nacht.mkv'

if not local_file.exists():
    raise Exception('Local file does not exist.')

ffprobe_info = subprocess.check_output(['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_format', '-show_streams', str(local_file)])
ffprobe_info = pydash.get(json.loads(ffprobe_info.decode()), 'streams')
ffprobe_info = [x for x in ffprobe_info if x['codec_type'] == 'video']

if len(ffprobe_info) != 1:
    raise Exception ('Multiple video tracks!?')

# note that the accuracy of this is only as good as the digital representation maintains the characteristics of the original analogue item.
# also image can often be letterboxed and/or pillorboxed, so use of ffmpeg's cropdetect filter would be preferable.

a = ffprobe_info[0]['display_aspect_ratio'].split(':')
aspect_ratio = round(float(a[0])/float(a[1]), 2)

# similar case for aspect ratio, especially for silent film normalised to modern framerates.

f = ffprobe_info[0]['r_frame_rate'].split('/')
fps = round(float(f[0])/float(f[1]), 2)

d = ffprobe_info[0]['tags']['DURATION']
duration = d.split('.')[0]

dataframe = pandas.DataFrame(columns=['S', 'P', 'O', 'R'])

dataframe.loc[len(dataframe)] = [(title), ('title'), (title), (reference)]
dataframe.loc[len(dataframe)] = [(title), ('wikidata'), (wikidata), ('https://www.wikidata.org/wiki/Q3793222')]
dataframe.loc[len(dataframe)] = [(title), ('duration'), (duration), (reference)]
dataframe.loc[len(dataframe)] = [(title), ('fps'), (fps), (reference)]
dataframe.loc[len(dataframe)] = [(title), ('aspect ratio'), (aspect_ratio), (reference)]

print(len(dataframe))
dataframe.head()

5


Unnamed: 0,S,P,O,R
0,Der Gang in die Nacht,title,Der Gang in die Nacht,https://archive.org/details/silent-der-gang-in...
1,Der Gang in die Nacht,wikidata,Q3793222,https://www.wikidata.org/wiki/Q3793222
2,Der Gang in die Nacht,duration,01:20:58,https://archive.org/details/silent-der-gang-in...
3,Der Gang in die Nacht,fps,20.0,https://archive.org/details/silent-der-gang-in...
4,Der Gang in die Nacht,aspect ratio,1.33,https://archive.org/details/silent-der-gang-in...


In [2]:
# generate FFprobe report on frame differencing and colour attributes.

attributes = ['YDIF', 'HUEMED', 'HUEAVG', 'SATMAX']
attributes = ','.join([f'lavfi.signalstats.{x}' for x in attributes])

ffprobe_call = ['ffprobe', '-v', 'quiet', '-f', 'lavfi', '-i']
ffprobe_call += [f'movie={str(local_file)},signalstats']
ffprobe_call += ['-show_entries', f'frame=pkt_pts_time:frame_tags={attributes}', '-print_format', 'csv'] 

f = open(str(pathlib.Path.home() / 'temp.csv'), "w")
subprocess.call(ffprobe_call, stdout=f)

0

In [3]:
window = 4
threshold = 70

shot_detect = pandas.read_csv(str(pathlib.Path.home() / 'temp.csv'), header=None)
shot_detect.columns = ['FRAME', 'NO', 'YDIF', 'HUEMED', 'HUEAVG', 'SATMAX']

shot_detect['FRAME'], shot_detect['SHOT'] = shot_detect.index+1, 1
shot_detect = shot_detect.loc[shot_detect.YDIF > threshold]

debounce = shot_detect.to_dict('records')
for x in debounce:
    region = [x for x in range(int(x['FRAME'])-window, int(x['FRAME']))]
    scores = [y['FRAME'] for y in debounce if y['SHOT'] == 1]
    overlap = [x for x in scores if x in region]

    if not len(overlap):
        x['SHOT'] = 1
    else:
        x['SHOT'] = 0

shots = [1]+[x['FRAME'] for x in debounce if x['SHOT'] == 1]
shots = [{'FRAME':x, 'SHOT':n+1} for n,x in enumerate(shots)]

data = pandas.read_csv(str(pathlib.Path.home() / 'temp.csv'), header=None)
data.columns = ['FRAME', 'NO', 'YDIF', 'HUEMED', 'HUEAVG', 'SATMAX']
data['FRAME'] = data.index+1
data = pandas.merge(data, pandas.DataFrame(shots), on='FRAME', how='left')
data['SHOT'] = data['SHOT'].ffill()

for x in data.SHOT.unique():

    shot_name = f'Shot {str(int(x)).zfill(4)}'
 
    section = data.copy()
    section = section.loc[section.SHOT.isin([x])]

    raw_hues = list(section.HUEMED)
    hue_val, hue_count = numpy.unique(raw_hues, return_counts=True)

    # shot name should possibly feature the title itself, eg Der Gang Shot #0007
 
    dataframe.loc[len(dataframe)] = [(shot_name), ('shot from'), (title), (reference)]

    # shot length should be transformed from frame to timecode, consistent with duration.
    # in fact duration property could possibly be reused here.

    dataframe.loc[len(dataframe)] = [(shot_name), ('shot length'), (len(section)), (reference)]

    # hue value needs transformation
    
    # # qctools hue conversion to red == 0
    # # hue = [((h*-1)+160)%360 for h in hue]

    # also possibly further transformation into hex, rgb or colour block.

    dataframe.loc[len(dataframe)] = [(shot_name), ('hue'), (numpy.argmax(hue_count)), (reference)]

print(len(dataframe))
dataframe.tail()

815


Unnamed: 0,S,P,O,R
810,Shot 0269,shot length,5,https://archive.org/details/silent-der-gang-in...
811,Shot 0269,hue,0,https://archive.org/details/silent-der-gang-in...
812,Shot 0270,shot from,Der Gang in die Nacht,https://archive.org/details/silent-der-gang-in...
813,Shot 0270,shot length,11182,https://archive.org/details/silent-der-gang-in...
814,Shot 0270,hue,26,https://archive.org/details/silent-der-gang-in...


In [4]:
png_path = pathlib.Path.home() / 'der-gang-in-die-nacht' / 'png' / '%07d.png'
subprocess.call(['ffmpeg', '-v', 'quiet', '-t', '50', '-i', str(local_file), str(png_path)])
images = sorted([x for x in png_path.parents[0].iterdir() if x.suffix == '.png'])

for x in tqdm.tqdm(images):
    new_path = x.parents[0] / x.stem
    if not pathlib.Path(f'{new_path}.txt').exists():
        subprocess.call(['tesseract', '-c', 'debug_file=/dev/null', str(x), str(new_path), '-l', 'deu'])

100%|██████████| 1000/1000 [00:00<00:00, 49053.32it/s]


In [5]:
text = sorted([x for x in png_path.parents[0].iterdir() if x.suffix == '.txt'])

for x in sorted(text):

    frame_name = f'Frame {x.stem}'
 
    dataframe.loc[len(dataframe)] = [(frame_name), ('frame number'), (x.stem), (reference)]

    shot_numb = data.copy()
    shot_numb = shot_numb.loc[shot_numb.FRAME.isin([int(x.stem)])]
    shot_numb = shot_numb.to_dict('records')[0]['SHOT']
    shot_name = f'Shot {str(int(shot_numb)).zfill(4)}'
    dataframe.loc[len(dataframe)] = [(frame_name), ('from shot'), (shot_name), (reference)]

    with open(x) as ocr_text:
        dataframe.loc[len(dataframe)] = [(frame_name), ('ocr text'), (ocr_text.read().replace('\n', ' ')[:350].strip()), (reference)]

print(len(dataframe))
dataframe.tail()

3815


Unnamed: 0,S,P,O,R
3810,Frame 0000999,from shot,Shot 0001,https://archive.org/details/silent-der-gang-in...
3811,Frame 0000999,ocr text,Der Gang in die Nacht Sine Tragödie in 5 Akten,https://archive.org/details/silent-der-gang-in...
3812,Frame 0001000,frame number,0001000,https://archive.org/details/silent-der-gang-in...
3813,Frame 0001000,from shot,Shot 0001,https://archive.org/details/silent-der-gang-in...
3814,Frame 0001000,ocr text,Der Gang in die Nacht Sine Tragödie in 5 Akten,https://archive.org/details/silent-der-gang-in...


In [6]:
export = dataframe.to_dict('records')
export = [x for x in export if x['S'] in ['Der Gang in die Nacht', 'Shot 0001', 'Frame 0000027', 'Frame 0000900']]

with open(pathlib.Path.cwd() / 'dataset.json', 'w') as outgoing:
    json.dump(export, outgoing, ensure_ascii=False, indent=4)

print('all done.')

all done.
