In [1]:
# default_exp face_mesh.capture_session

# Viseme dataset

> Create a dataset that can be used to classify a set of face mesh landmarks as a viseme.

## Default file/directory layout

- `data`
    - viseme-config.json
    - `capture_session`
        - `viseme_{%Y%m%d_%H%M%S}_{viseme_id}` (one for each capture session)
            - data.npy
            - metadata.json
            - {row_id}.jpeg (one for each "row" in data.npy)
    - `viseme_dataset_{%Y%m%d_%H%M%S}` (one for each dataset created from capture session data)
        - data.npy
        - metadata.json
        - `processed_{%Y%m%d_%H%M%S}` (one for each processed dataset)
            - metadata.json
            - data.npy
            - stats.npz
            - `model_{%Y%m%d_%H%M%S}` (one for each model trained)
                - metadata.json
                - state_dict.npz

## Default file/directory layout

- `data`
    - viseme-config.json
    - `capture_session`
        - `viseme_{%Y%m%d_%H%M%S}_{viseme_id}` (one for each capture session)
            - data.npy
            - metadata.json
            - {row_id}.jpeg (one for each "row" in data.npy)
    - `viseme_dataset_{%Y%m%d_%H%M%S}` (one for each dataset created from capture session data)
        - data.npy
        - metadata.json
        - `processed_{%Y%m%d_%H%M%S}` (one for each processed dataset)
            - metadata.json
            - data.npy
            - stats.npz
            - `model_{%Y%m%d_%H%M%S}` (one for each model trained)
                - metadata.json
                - state_dict.npz

## How to train a viseme classifier using your own dataset

So we'll be training a model that can classify visemes from face mesh landmarks.

- Create a `data` directory containing viseme-config.json
- Run `capture_session` for each viseme/expression you want to classify
    - TODO: command line `capture_session`
    - TODO: suggest how many samples per session, how many sessions, etc.
- Run `ml.data#viseme_dataset_from_capture_sessions` to create a dataset containing multiple visemes
- Run `ml.data#processed_dataset_from_viseme_dataset` to create a ML ready dataset
- Copy the `processed` dataset to google drive
- TODO: Run training notebook in colab
- Copy state_dict.npz (trained weights) back to your machine
- TODO: Try out with 10d_test_np_model.ipynb

### Example viseme-config.json
```
{
    "expressions": {
        "0": "No expression",
        "1": "oo",
        "2": "ee",
        "3": "ah",
        "4": "Random Talking"
    }
}
```

We can use this classifier as part of the expression pointer control system to change modes - e.g. start/stop moving the pointer.

expressions to capture
- nothing
- oo
- ee
- ar/ah
- random talking - without exaggerating expressions ...
    - this should be ignored by pointer control
    - MAYBE we should remove random talking examples that are classified as 0:nothing?

while recording data
- keep fingers on keyboard
- exaggerate the expression - unless we're doing nothing/random talking
- change lighting over different capture sessions
- move around slowly
    - up,down,left,right,corners etc
- move forward and backward a bit

In [2]:
#export
from expoco.core import *
import ipywidgets as widgets # TODO: remove if not used
import numpy as np
import pandas as pd
import cv2, time, math, json, shutil
import win32api, win32con

import mediapipe as mp
mp_face_mesh = mp.solutions.face_mesh

from pathlib import Path

In [3]:
#export
# Column labels for one row of landmark data: one "<landmark id><axis>" name per coordinate,
# for landmark ids 0-467 and axes x, y, z (e.g. '0x', '0y', '0z', '1x', ... '467z').
# Stored in each capture session's metadata.json as `column_names`.
COLUMN_NAMES = landmark_ids_to_col_names(range(468), None, ['x','y','z'])

In [4]:
# peek at the first and last few column names to confirm the "<landmark id><axis>" naming
COLUMN_NAMES[:5], COLUMN_NAMES[-5:]

(['0x', '0y', '0z', '1x', '1y'], ['466y', '466z', '467x', '467y', '467z'])

In [5]:
#export
def _new_metadata(stop_after, path, video_capture, expression_id, expression_name, comments):
    """Build the initial metadata dict for a capture session.

    The capture width/height are read from `video_capture`; `count` starts at 0 and
    `start_date` is set to the current time via `now()`.
    Raises `AssertionError` if the capture is taller than it is wide.
    """
    capture_width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    capture_height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    assert capture_width >= capture_height
    # key order is kept stable because this dict is later dumped to metadata.json
    return {
        'count': 0,
        'stop_after': stop_after,
        'path': str(path.resolve()),
        'expression_id': expression_id,
        'expression_name': expression_name,
        'capture_width': capture_width,
        'capture_height': capture_height,
        'start_date': now(),
        'column_names': COLUMN_NAMES,
        'comments': comments,
    }
# TODO: add relevant software versions etc

In [6]:
#export
def _setup_variables(expression_id):
    """Look up the expression, open the default camera and create the session directory.

    Returns `(expression_id, expression_name, path, video_capture, width, height)` where
    `expression_id` has been converted to `str` and `path` is the new capture session directory.

    Raises:
        Exception: if `expression_id` is not listed under "expressions" in data/viseme-config.json.
        RuntimeError: if the default camera (device 0) cannot be opened.
        AssertionError: if the camera reports a frame that is taller than it is wide.

    The camera is released before any of these errors propagate, so a failed setup does not
    keep the device locked. The caller owns (and must release) the returned `video_capture`.
    """
    expression_id = str(expression_id)
    with open('data/viseme-config.json') as f: config = json.load(f)
    if expression_id not in config.get('expressions', {}):
        raise Exception(f'{expression_id} is missing from expressions section of data/viseme-config.json')
    expression_name = config['expressions'][expression_id]
    video_capture = cv2.VideoCapture(0)
    try:
        if not video_capture.isOpened():
            raise RuntimeError('Could not open the default camera (device 0)')
        width, height = [int(video_capture.get(p)) for p in [cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT]]
        # explicit raise (rather than `assert`) so the check still runs under `python -O`
        if width < height:
            raise AssertionError(f'expected a landscape capture but got {width}x{height}')
        # only create the session directory once we know the camera is usable
        path = Path(f'data/capture_session/viseme_{now()}_{expression_id}') # TODO: capture_sessions
        path.mkdir(parents=True, exist_ok=True)
    except BaseException:
        video_capture.release()
        raise
    return expression_id, expression_name, path, video_capture, width, height

In [7]:
#export
def _update_image(image, image_display_helper, text):
    """Draw `text` near the top-left corner of `image` and show the result with `image_display_helper`.

    NOTE(review): `cv2.putText` is expected to draw onto `image` in place, so the caller's
    array also carries the overlay afterwards - confirm if callers need a clean frame.
    """
    text_origin = (20, 40)
    text_colour = (200, 200, 200)
    annotated = cv2.putText(image, text, text_origin, cv2.FONT_HERSHEY_COMPLEX, 1, text_colour)
    image_display_helper.show(annotated)

In [8]:
#export
def _capture_and_process(video_capture, face_mesh): # TODO: DRY
    retval, image = video_capture.read() # TODO: check retval
    image = cv2.flip(image, 1)
    return image, face_mesh.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

In [9]:
#export
def _countdown(video_capture, expression_name, image_display_helper, face_mesh):
    "Show a live 3, 2, 1 countdown (one second per step) before capturing `expression_name`."
    seconds_left = 3
    while seconds_left > 0:
        # the face mesh results are not needed during the countdown, only the frame
        frame, _results = _capture_and_process(video_capture, face_mesh)
        _update_image(frame, image_display_helper, f'Capture: {expression_name} in {seconds_left}s')
        time.sleep(1)
        seconds_left -= 1

when we `_save_results`
- we want to save some images so we can check why things are being misclassified etc
- so that we don't create too much data, we can
    - save one in 10 images at high quality or
    - save all images at low quality
- if we wanted to be able to re-calculate landmarks (i.e. if mediapipe changed) we might need to save all images at high quality
    - TODO: see if we get the same landmark data from low res images

In [10]:
#export
def _save_results(path, results, data, image, capture_count):
    # save all landmarks calculated
    row = []
    for landmark_id in range(468):
        landmark = results.multi_face_landmarks[0].landmark[landmark_id]
        for coord in ['x','y','z']:
            row.append(getattr(landmark, coord))
    data.append(row)
    # save one in 10 images
#     if capture_count % 10 == 0: 
#         img_name = f'{now()}_{capture_count}.png'
#         assert cv2.imwrite(f'{path}/{img_name}', image)
#         data['img_path'].append(img_name)
#     else:
#         data['img_path'].append('')
    # but we can save all as low res - using less space than a single png (even at max compression)
    assert cv2.imwrite(f'{path}/{capture_count}.jpeg', image, [cv2.IMWRITE_JPEG_QUALITY, 50])

In [11]:
#export
def dry_run():
    """Preview the mirrored webcam feed until ESC is pressed, without saving anything.

    Returns the last frame read from the camera (as read, i.e. not mirrored).
    Raises `RuntimeError` if the camera stops delivering frames.
    The camera is released and the display helper closed however the preview ends.
    Uses `win32api` to poll the ESC key.
    """
    video_capture = cv2.VideoCapture(0)
    image_display_helper = None
    try:
        retval, image = video_capture.read()
        if not retval or image is None:
            raise RuntimeError('Failed to read a frame from the video capture device')
        image_display_helper = ImageDisplayHelper(cv2.flip(image, 1), 'expoco: Dry Run')
        while not win32api.GetAsyncKeyState(win32con.VK_ESCAPE):
            retval, frame = video_capture.read()
            if not retval or frame is None:
                raise RuntimeError('Failed to read a frame from the video capture device')
            image = frame
            image_display_helper.show(cv2.flip(image, 1))
            time.sleep(.05)
    finally:
        # always free the camera, otherwise the next capture cannot open it
        video_capture.release()
        if image_display_helper is not None: image_display_helper.close()
    return image

In [12]:
#do_not_test
# preview the webcam feed without recording anything; `image` keeps the last frame read
image = dry_run() # press ESC to stop

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x02\x80\x00\x00\x01\xe0\x08\x02\x00\x00\x00\xba\xb3K…

In [13]:
#export
def capture_session(expression_id, stop_after, comments):
    """Run a video capture session

    Shows a countdown, then captures `stop_after` frames for `expression_id` (a key of the
    "expressions" section of data/viseme-config.json). For every frame the face landmarks are
    collected and a JPEG is saved; at the end `metadata.json` and `data.npy` are written to the
    session directory.

    `comments` is free text stored in the metadata. Returns the session directory `Path`.

    Any error raised while capturing propagates to the caller and nothing is written to
    `metadata.json`/`data.npy`; the camera, the face mesh and the display helper are released
    in every case.
    """
    expression_id, expression_name, path, video_capture, _width, _height = _setup_variables(expression_id)
    data = []
    image_display_helper = None
    try:
        # the context manager closes the face mesh graph when the session ends
        with mp_face_mesh.FaceMesh(max_num_faces=1) as face_mesh:
            image, _ = _capture_and_process(video_capture, face_mesh)
            image_display_helper = ImageDisplayHelper(image, 'expoco: Capture session')
            _countdown(video_capture, expression_name, image_display_helper, face_mesh)
            metadata = _new_metadata(
                    stop_after, path, video_capture, expression_id, expression_name, comments)
            for capture_count in range(1, stop_after+1):
                image, results = _capture_and_process(video_capture, face_mesh)
                # NOTE(review): the label is drawn on `image` before it is saved, so the
                # stored JPEGs presumably include the overlay text - confirm this is wanted
                _update_image(image, image_display_helper, f'{expression_name} {capture_count}')
                _save_results(path, results, data, image, capture_count)
                time.sleep(.05)
        # count the rows actually collected; also well defined when stop_after is 0
        metadata['count'] = len(data)
        metadata['end_date'] = now()
        with open(path/'metadata.json', 'w') as f: json.dump(metadata, f, indent=2)
        # reshape keeps the (rows, columns) layout even when no rows were captured
        rows = np.array(data, dtype=float).reshape(-1, len(COLUMN_NAMES))
        np.save(path/'data.npy', rows, allow_pickle=False)
    finally:
        video_capture.release()
        if image_display_helper is not None: image_display_helper.close()
    return path

In [14]:
#do_not_test
# record a short throw-away session (10 frames) for expression id 0; `path` is the new session directory
path = capture_session(0, 10, 'throw away')

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x02\x80\x00\x00\x01\xe0\x08\x02\x00\x00\x00\xba\xb3K…

In [15]:
#do_not_test
# To inspect an earlier session instead of the one just captured, point `path` at it, e.g.
# path = Path('data/capture_session/viseme_20211112_202956_2')
print(f'loading from {path}')
data = np.load(path / 'data.npy')
print(f'data.shape {data.shape}')
# metadata.json carries the column names that label the landmark coordinates in data.npy
metadata = json.loads((path / 'metadata.json').read_text())
df = pd.DataFrame(data=data, columns=metadata['column_names'])
df

loading from data\capture_session\viseme_20211115_140302_0
data.shape (10, 1404)


Unnamed: 0,0x,0y,0z,1x,1y,1z,2x,2y,2z,3x,...,464z,465x,465y,465z,466x,466y,466z,467x,467y,467z
0,0.428121,0.668234,-0.031121,0.424556,0.611835,-0.059705,0.427168,0.627673,-0.030588,0.415625,...,-0.00284,0.44826,0.516319,-0.010177,0.515493,0.497179,0.005602,0.521562,0.490471,0.005393
1,0.428745,0.674618,-0.030308,0.425435,0.620659,-0.060191,0.427651,0.635384,-0.030591,0.415999,...,-0.00289,0.44755,0.518719,-0.01039,0.514392,0.497362,0.006018,0.520232,0.491379,0.005717
2,0.430875,0.681061,-0.02956,0.428211,0.626459,-0.060861,0.429878,0.64093,-0.030717,0.418732,...,-0.00308,0.450003,0.520975,-0.010906,0.516298,0.499514,0.006833,0.521996,0.494007,0.006591
3,0.4311,0.681225,-0.029019,0.426938,0.626984,-0.061031,0.429416,0.641318,-0.030562,0.417771,...,-0.004387,0.450133,0.522557,-0.012147,0.517336,0.501239,0.004538,0.523078,0.495647,0.004179
4,0.432932,0.681675,-0.030326,0.429838,0.627055,-0.060984,0.431691,0.642002,-0.031101,0.420273,...,-0.002665,0.452007,0.524025,-0.010463,0.518701,0.50394,0.007247,0.52457,0.497681,0.007093
5,0.436304,0.682005,-0.030923,0.432699,0.626776,-0.060911,0.434713,0.641937,-0.031379,0.42314,...,-0.001836,0.454156,0.523944,-0.009591,0.520283,0.503566,0.008173,0.526127,0.497399,0.008057
6,0.439656,0.681621,-0.030459,0.437169,0.624497,-0.060681,0.43853,0.640355,-0.031162,0.427288,...,-0.002276,0.457879,0.523852,-0.009938,0.523473,0.50388,0.00854,0.5294,0.497005,0.008573
7,0.441659,0.682853,-0.029996,0.438605,0.625046,-0.060693,0.44027,0.640821,-0.030948,0.428654,...,-0.00251,0.458569,0.523145,-0.010248,0.523864,0.502302,0.007997,0.529623,0.496206,0.007916
8,0.450512,0.683204,-0.03013,0.449241,0.626454,-0.061695,0.449586,0.641615,-0.031263,0.438178,...,-0.002741,0.467244,0.522323,-0.01057,0.53226,0.502306,0.010057,0.537918,0.495325,0.010238
9,0.452678,0.678798,-0.03164,0.451579,0.621085,-0.061688,0.451808,0.63689,-0.03186,0.440669,...,-0.001142,0.469571,0.520327,-0.008969,0.534015,0.501711,0.012017,0.539766,0.494742,0.012317


# clear out old viseme data

In [16]:
# data_path = Path('../data')
# for path in data_path.glob('viseme_capture_session*'):
#     print('removing', path)
#     shutil.rmtree(path)

In [19]:
#hide
# nbdev: convert the project's notebooks into library modules (see the "Converted ..." output)
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 05a_ml_data.ipynb.
Converted 05b_ml_model.ipynb.
Converted 10a_mediapipe_face_mesh_identify_landmarks.ipynb.
Converted 10b_mediapipe_face_mesh_capture_session.ipynb.
Converted 10c_mediapipe_face_mesh_train_model.ipynb.
Converted 10d_test_np_model.ipynb.
Converted 20a_gui_capture_command.ipynb.
Converted 70_cli.ipynb.
Converted index.ipynb.
Converted project_lifecycle.ipynb.
