In [1]:
import colorsys
import io
import json
import numpy
import pandas
import pathlib
import pydash
import subprocess
import tqdm

def ffprobe_file_analysis(source):

    ''' Extract basic file characteristics.

    Runs ffprobe on `source` and reads the properties of its single video stream.

    Args:
        source: path (str or path-like) of the media file to inspect.

    Returns:
        dict with the keys 'aspect ratio' (width/height as a float rounded to
        two places), 'fps' (float rounded to two places) and 'duration' (the
        stream's DURATION tag with the fractional seconds removed).

    Raises:
        subprocess.CalledProcessError: ffprobe exited with a non-zero status.
        ValueError: the file does not contain exactly one video stream.
        KeyError: the video stream lacks 'display_aspect_ratio',
            'r_frame_rate' or a 'DURATION' tag.
    '''

    # probe the file that was passed in rather than a notebook-level global.
    ffprobe_call = ['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_format', '-show_streams', str(source)]
    ffprobe_info = json.loads(subprocess.check_output(ffprobe_call).decode())

    # a missing 'streams' entry is treated like an empty stream list.
    video_streams = [x for x in (ffprobe_info.get('streams') or []) if x.get('codec_type') == 'video']

    if len(video_streams) != 1:
        raise ValueError(f'Expected exactly one video track, found {len(video_streams)}.')

    video = video_streams[0]

    # 'W:H' -> W/H
    a = video['display_aspect_ratio'].split(':')
    aspect_ratio = round(float(a[0])/float(a[1]), 2)

    # 'numerator/denominator' -> frames per second
    f = video['r_frame_rate'].split('/')
    fps = round(float(f[0])/float(f[1]), 2)

    # keep everything before the first '.', e.g. '01:20:58.350000000' -> '01:20:58'
    d = video['tags']['DURATION']
    duration = d.split('.')[0]

    return {'aspect ratio': aspect_ratio, 'fps': fps, 'duration': duration}

def ffprobe_image_analysis(source):

    ''' More detailed image analysis.

    Runs ffprobe with the lavfi `signalstats` filter over `source` and returns
    one dataframe row per line of ffprobe's csv output, with the columns
    'frame', 'location', 'HUEMED' and 'YDIF'.
    '''

    attributes = ['HUEMED', 'YDIF']

    # fully qualified tag names, as ffprobe expects them in -show_entries.
    tag_names = []
    for attribute in attributes:
        tag_names.append(f'lavfi.signalstats.{attribute}')
    entries = 'frame=pkt_pts_time:frame_tags=' + ','.join(tag_names)

    # NOTE(review): the path is placed unescaped into the filtergraph string;
    # paths containing filtergraph special characters may need escaping.
    movie_path = str(source)
    command = [
        'ffprobe', '-v', 'quiet',
        '-f', 'lavfi',
        '-i', f'movie={movie_path},signalstats',
        '-show_entries', entries,
        '-print_format', 'csv',
    ]

    raw_csv = subprocess.check_output(command).decode()

    # the csv output carries no header row, so columns are named by hand.
    dataframe = pandas.read_csv(io.StringIO(raw_csv), header=None)
    dataframe.columns = ['frame', 'location', *attributes]

    return dataframe

def shot_detect(l, t, w):

    ''' Simple shot detection algorithm. '''

    # detect all frames above ydif threshold level.
    l = [int(x >= t) for x in l]

    # debounce detection.
    for n, x in enumerate(l):
        if n > w:
            if sum(l[n-w:n]):
                l[n] = 0

    # convert to frame number (shot number can be inferred by placement).
    return [1]+[n for n,x in enumerate(l) if x == 1]

def shot_length(row, data):

    ''' Determine how many frames comprise a shot.

    Args:
        row: mapping or Series with a 'shot' entry identifying the shot.
        data: dataframe with one row per frame and a 'shot' column.

    Returns:
        int, the number of rows in `data` whose shot equals row['shot'].
    '''

    # count on a boolean mask; this function is called once per row through
    # DataFrame.apply, so copying the whole frame table each time is wasteful.
    return int(data.shot.isin([row['shot']]).sum())

def hue_average(row, data):

    ''' Reduce FFprobe HUEMED data to RGB.

    Args:
        row: mapping or Series with a 'shot' entry identifying the shot.
        data: dataframe with one row per frame and 'shot' and 'HUEMED' columns.

    Returns:
        str of the form 'R, G, B' (integers 0-255): the fully saturated, full
        value colour for the median HUEMED of the shot's frames.
    '''

    # select the shot's frames with a mask; this function is called once per
    # row through DataFrame.apply, so the whole table is not copied each time.
    same_shot = data.shot.isin([row['shot']])
    hue = numpy.median(data.loc[same_shot, 'HUEMED'].to_numpy())

    # NOTE(review): presumably mirrors and offsets the signalstats hue angle
    # onto the HSV colour wheel; confirm the 160 degree offset.
    hue = ((hue*-1)+160)%360

    rgb = colorsys.hsv_to_rgb(hue/360, 1.0, 1.0)

    return ', '.join(str(int(x*255)) for x in rgb)

# source film; every later cell derives its data from this file.
local_file = pathlib.Path.home() / 'der-gang-in-die-nacht' / 'mkv' / 'der-gang-in-die-nacht.mkv'

if not local_file.exists():
    raise Exception('Local file does not exist.')

# ffmpeg output pattern: one zero-padded, numbered jpg per frame.
frames = pathlib.Path.home() / 'der-gang-in-die-nacht' / 'jpg' / '%07d.jpg'

# make sure both output directories exist before listing or writing to them;
# listing a missing jpg directory would raise, and the external tools are
# not relied upon to create their own output directories.
frames.parents[0].mkdir(parents=True, exist_ok=True)
ocr_directory = frames.parents[1] / 'txt'
ocr_directory.mkdir(parents=True, exist_ok=True)

# frame extraction is slow, so only run it when no jpg exists yet.
# check_call makes a failed extraction visible instead of silently leaving
# the directory empty ('-v quiet' hides ffmpeg's own error output).
if not any(x.suffix == '.jpg' for x in frames.parents[0].iterdir()):
    subprocess.check_call(['ffmpeg', '-v', 'quiet', '-i', str(local_file), str(frames)])

# ocr every frame that has no text file yet. tesseract is given the output
# path without extension and the result is expected at '<stem>.txt'.
for x in tqdm.tqdm(sorted(x for x in frames.parents[0].iterdir() if x.suffix == '.jpg')):
    tesseract_output = ocr_directory / f'{x.stem}'
    tesseract_output_with_ext = ocr_directory / f'{x.stem}.txt'
    if not tesseract_output_with_ext.exists():
        subprocess.call(['tesseract', str(x), str(tesseract_output), '-l', 'deu'], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)

100%|██████████| 97160/97160 [00:01<00:00, 59918.93it/s]


In [2]:
# work claims: statements whose subject is the film itself.

title = 'Der Gang in die Nacht'
wikidata = 'Q3793222'
reference = 'https://archive.org/details/silent-der-gang-in-die-nacht'

# claim table columns: S, I, P, O, R (one claim per row).
dataframe = pandas.DataFrame(columns=['S', 'I', 'P', 'O', 'R'])

# (P, O, R) for each claim; S is always the title and I is always 'work'.
work_claims = [
    ('title', title, reference),
    ('wikidata', wikidata, 'https://www.wikidata.org/wiki/Q3793222'),
    ('represented by', local_file.name, reference),
]

for claim_predicate, claim_object, claim_reference in work_claims:
    dataframe.loc[len(dataframe)] = [title, 'work', claim_predicate, claim_object, claim_reference]

print(len(dataframe))
dataframe.head()

3


Unnamed: 0,S,I,P,O,R
0,Der Gang in die Nacht,work,title,Der Gang in die Nacht,https://archive.org/details/silent-der-gang-in...
1,Der Gang in die Nacht,work,wikidata,Q3793222,https://www.wikidata.org/wiki/Q3793222
2,Der Gang in die Nacht,work,represented by,der-gang-in-die-nacht.mkv,https://archive.org/details/silent-der-gang-in...


In [3]:
# media claims: statements whose subject is the media file.

media_metadata = ffprobe_file_analysis(local_file)

# (P, O) for each claim; S is the file name, I is 'media file' and R is the
# shared reference for every row.
media_claims = [
    ('representation of', title),
    ('duration', media_metadata['duration']),
    ('fps', media_metadata['fps']),
    ('aspect ratio', media_metadata['aspect ratio']),
]

for claim_predicate, claim_object in media_claims:
    dataframe.loc[len(dataframe)] = [local_file.name, 'media file', claim_predicate, claim_object, reference]

print(len(dataframe))
dataframe.head()

7


Unnamed: 0,S,I,P,O,R
0,Der Gang in die Nacht,work,title,Der Gang in die Nacht,https://archive.org/details/silent-der-gang-in...
1,Der Gang in die Nacht,work,wikidata,Q3793222,https://www.wikidata.org/wiki/Q3793222
2,Der Gang in die Nacht,work,represented by,der-gang-in-die-nacht.mkv,https://archive.org/details/silent-der-gang-in...
3,der-gang-in-die-nacht.mkv,media file,representation of,Der Gang in die Nacht,https://archive.org/details/silent-der-gang-in...
4,der-gang-in-die-nacht.mkv,media file,duration,01:20:58,https://archive.org/details/silent-der-gang-in...


In [4]:
# shot claims.

# a frame whose YDIF reaches the threshold is treated as a cut; detections
# within the debounce window after an earlier one are ignored.
ydif_threshold = 7
debounce_window = 24

analysis = ffprobe_image_analysis(local_file)
analysis['frame'] = analysis.index+1

# number the shots in order and spread each shot number over its frames.
shotlist = shot_detect(list(analysis.YDIF), ydif_threshold, debounce_window)
shotlist = pandas.DataFrame([{'frame':x, 'shot':n+1} for n,x in enumerate(shotlist)])
analysis = pandas.merge(analysis, shotlist, on='frame', how='left')
analysis.shot = analysis.shot.ffill().astype('int')

# shot length and hue are the same for every frame of a shot, so compute
# them once per shot instead of once per frame, then attach them to the
# per-frame table.
shot_dataframe = analysis[['shot']].drop_duplicates()
shot_dataframe['shot_length'] = shot_dataframe.apply(shot_length, data=analysis, axis=1)
shot_dataframe['rgb'] = shot_dataframe.apply(hue_average, data=analysis, axis=1)
analysis = pandas.merge(analysis, shot_dataframe, on='shot', how='left')

# collect the claims in a list and append them in one step; growing the
# claim table one row at a time copies it on every insert.
shot_claims = []
for x in shot_dataframe.to_dict('records'):
    label = f"{local_file.name} shot #{x['shot']}"
    shot_claims.append([(local_file.name), ('media file'), ('has shot'), (label), (reference)])
    shot_claims.append([(label), ('shot'), ('shot of'), (local_file.name), (reference)])

    shot_claims.append([(label), ('shot'), ('shot number'), (x['shot']), (reference)])
    shot_claims.append([(label), ('shot'), ('shot length'), (x['shot_length']), (reference)])
    shot_claims.append([(label), ('shot'), ('hue'), (x['rgb']), (reference)])

dataframe = pandas.concat(
    [dataframe, pandas.DataFrame(shot_claims, columns=dataframe.columns)],
    ignore_index=True,
)

print(len(dataframe))
dataframe.head(20)

2232


Unnamed: 0,S,I,P,O,R
0,Der Gang in die Nacht,work,title,Der Gang in die Nacht,https://archive.org/details/silent-der-gang-in...
1,Der Gang in die Nacht,work,wikidata,Q3793222,https://www.wikidata.org/wiki/Q3793222
2,Der Gang in die Nacht,work,represented by,der-gang-in-die-nacht.mkv,https://archive.org/details/silent-der-gang-in...
3,der-gang-in-die-nacht.mkv,media file,representation of,Der Gang in die Nacht,https://archive.org/details/silent-der-gang-in...
4,der-gang-in-die-nacht.mkv,media file,duration,01:20:58,https://archive.org/details/silent-der-gang-in...
5,der-gang-in-die-nacht.mkv,media file,fps,20.0,https://archive.org/details/silent-der-gang-in...
6,der-gang-in-die-nacht.mkv,media file,aspect ratio,1.33,https://archive.org/details/silent-der-gang-in...
7,der-gang-in-die-nacht.mkv,media file,has shot,der-gang-in-die-nacht.mkv shot #1,https://archive.org/details/silent-der-gang-in...
8,der-gang-in-die-nacht.mkv shot #1,shot,shot of,der-gang-in-die-nacht.mkv,https://archive.org/details/silent-der-gang-in...
9,der-gang-in-die-nacht.mkv shot #1,shot,shot number,1,https://archive.org/details/silent-der-gang-in...


In [5]:
# frame claims.

frame_dataframe = analysis.copy()

# collect the claims in a list and append them in one step; growing the
# claim table one row at a time copies it on every insert, which is
# quadratic over several hundred thousand rows.
frame_claims = []
for x in frame_dataframe.to_dict('records'):
    shot_label = f"{local_file.name} shot #{x['shot']}"
    frame_label = f"{local_file.name} frame #{x['frame']}"

    frame_claims.append([(shot_label), ('shot'), ('has frame'), (frame_label), (reference)])
    frame_claims.append([(frame_label), ('frame'), ('frame of'), (shot_label), (reference)])
    frame_claims.append([(frame_label), ('frame'), ('frame number'), (x['frame']), (reference)])

    # ocr files are named after the zero-padded frame number. read them as
    # utf-8 explicitly so German umlauts do not depend on the locale default.
    ocr_path = pathlib.Path.home() / 'der-gang-in-die-nacht' / 'txt' / f"{str(x['frame']).zfill(7)}.txt"
    ocr_text = ocr_path.read_text(encoding='utf-8')
    if len(ocr_text):
        frame_claims.append([(frame_label), ('frame'), ('ocr text'), (ocr_text), (reference)])

dataframe = pandas.concat(
    [dataframe, pandas.DataFrame(frame_claims, columns=dataframe.columns)],
    ignore_index=True,
)

print(len(dataframe))
dataframe.head(20)

310381


Unnamed: 0,S,I,P,O,R
0,Der Gang in die Nacht,work,title,Der Gang in die Nacht,https://archive.org/details/silent-der-gang-in...
1,Der Gang in die Nacht,work,wikidata,Q3793222,https://www.wikidata.org/wiki/Q3793222
2,Der Gang in die Nacht,work,represented by,der-gang-in-die-nacht.mkv,https://archive.org/details/silent-der-gang-in...
3,der-gang-in-die-nacht.mkv,media file,representation of,Der Gang in die Nacht,https://archive.org/details/silent-der-gang-in...
4,der-gang-in-die-nacht.mkv,media file,duration,01:20:58,https://archive.org/details/silent-der-gang-in...
5,der-gang-in-die-nacht.mkv,media file,fps,20.0,https://archive.org/details/silent-der-gang-in...
6,der-gang-in-die-nacht.mkv,media file,aspect ratio,1.33,https://archive.org/details/silent-der-gang-in...
7,der-gang-in-die-nacht.mkv,media file,has shot,der-gang-in-die-nacht.mkv shot #1,https://archive.org/details/silent-der-gang-in...
8,der-gang-in-die-nacht.mkv shot #1,shot,shot of,der-gang-in-die-nacht.mkv,https://archive.org/details/silent-der-gang-in...
9,der-gang-in-die-nacht.mkv shot #1,shot,shot number,1,https://archive.org/details/silent-der-gang-in...


In [6]:
# one dict per claim, keyed by the claim table's column names.
export = dataframe.to_dict('records')

# ensure_ascii=False writes non-ascii characters (e.g. umlauts in the ocr
# text) verbatim, so the file encoding is pinned to utf-8 instead of being
# left to the locale default.
with open(pathlib.Path.cwd() / 'dataset.json', 'w', encoding='utf-8') as outgoing:
    json.dump(export, outgoing, ensure_ascii=False, indent=4)

print('all done.')

all done.
