In [None]:
# yolov7 base import & variables
import argparse
import time
from pathlib import Path

import cv2
import torch
import torch.backends.cudnn as cudnn
from numpy import random
import easydict

from models.experimental import attempt_load
from utils.datasets import LoadStreams, LoadImages
from utils.general import check_img_size, check_requirements, check_imshow, non_max_suppression, apply_classifier, \
    scale_coords, xyxy2xywh, strip_optimizer, set_logging, increment_path
from utils.plots import plot_one_box
from utils.torch_utils import select_device, load_classifier, time_synchronized, TracedModel
# Run configuration, mirroring the command-line options of YOLOv7's detect.py.
# NOTE: EasyDict subclasses dict, so a missing "update" key would make
# ``opt.update`` resolve to the (always truthy) dict method instead of a flag.
# The key must therefore be spelled exactly "update"; storing it shadows the
# method on this instance, which nothing in the visible cells calls.
opt = easydict.EasyDict({
    "weights": 'yolov7-e6.pt',   # checkpoint handed to attempt_load()
    "source": 'vd000.mp4',       # input file, or numeric id / URL / .txt for streams

    "img_size": 640,             # inference size; validated against the model stride
    "conf_thres": 0.25,          # confidence threshold passed to non_max_suppression()
    "iou_thres": 0.45,           # IoU threshold passed to non_max_suppression()
    "device": '',                # passed to select_device()
    "view_img": False,           # not read by the visible cells
    "save_txt": False,           # not read by the visible cells
    "save_conf": False,          # not read by the visible cells
    "nosave": False,             # not read by the visible cells
    "classes": None,             # optional class filter passed to non_max_suppression()

    "agnostic_nms": False,       # class-agnostic flag passed to non_max_suppression()
    "augment": False,            # forwarded to the model call as ``augment``
    "update": False,             # read by the run cell: strip optimizer state from weights
    "project": 'runs/detect',    # not read by the visible cells
    "name": 'exp',               # not read by the visible cells
    "exist_ok": False,           # not read by the visible cells
    "no_trace": False,           # when False the model is wrapped in TracedModel
})

In [None]:
def _landmark_px(landmarks, idx, width, height):
    """Return face-mesh landmark ``idx`` as integer pixel coordinates ``(x, y)``.

    Landmark coordinates are normalized, so they are scaled by the frame size
    and truncated with ``int()``.
    """
    point = landmarks[idx]
    return int(point.x * width), int(point.y * height)


def _horizontal_direction(iris_center, corner, eye_width):
    """Classify one eye's horizontal gaze as 'Right', 'Left' or 'Center'.

    The iris-centre-to-corner distance is divided by the eye width and compared
    with the module-level ``thres`` / ``thres_`` values (defined in another
    cell). A zero eye width (degenerate, e.g. a tiny face) is reported as
    'Center' instead of dividing by zero.
    """
    if not eye_width:
        return 'Center'
    ratio = round(distance(iris_center[0], iris_center[1], corner[0], corner[1]) / eye_width, 5)
    if ratio < thres:
        return 'Right'
    if ratio > thres_:
        return 'Left'
    return 'Center'


def _estimate_gaze(landmarks, width, height):
    """Estimate gaze direction and the grid geometry for one detected face.

    Args:
        landmarks: the ``landmark`` sequence of one MediaPipe face (iris
            landmarks 468-477 are read, so ``refine_landmarks`` must be on).
        width, height: frame size in pixels.

    Returns:
        dict with keys ``dir`` (horizontal label), ``ear`` (vertical label),
        ``gaze_x`` / ``gaze_y`` (target point of the gaze lines),
        ``inner_left`` / ``inner_right`` (x of the two inner grid lines) and
        ``eye_y`` (y of the outer corner of the right eye, landmark 33).
    """
    def px(idx):
        return _landmark_px(landmarks, idx, width, height)

    def span(a, b):
        (ax, ay), (bx, by) = px(a), px(b)
        return distance(ax, ay, bx, by)

    def midpoint(a, b):
        (ax, ay), (bx, by) = px(a), px(b)
        return (ax + bx) / 2, (ay + by) / 2

    x33, y33 = px(33)      # outer corner of the right eye
    x263, _ = px(263)      # outer corner of the left eye

    # Grid geometry ---------------------------------------------------------
    range_w = int(width * .07)     # margin added outside each outer eye corner
    inner_left = x33 - range_w
    inner_right = x263 + range_w

    right_line_x = (inner_left / 2) / 2
    rightcenter_line_x = (inner_left / 2) + (inner_left / 2) / 2
    center_line_x = (inner_right - inner_left) / 2 + inner_left
    leftcenter_line_x = inner_right + (width - inner_right) / 4
    left_line_x = inner_right + (width - inner_right) * 3 / 4

    up_line_y = y33 / 2
    middle_line_y = y33 + (height * .75 - y33) / 2
    down_line_y = height * .75 + height * .125

    # Horizontal direction per eye -------------------------------------------
    # Right eye: iris edge landmarks 469/471, eyelid corners 33/133.
    dir_r = _horizontal_direction(midpoint(469, 471), px(133), span(33, 133))
    # Left eye: iris edge landmarks 474/476, eyelid corners 263/362.
    dir_l = _horizontal_direction(midpoint(474, 476), px(263), span(263, 362))

    # Combine both eyes. The x targets are mirrored relative to the labels
    # ('Right' -> left_line_x), as in the original mapping.
    seen = {dir_r, dir_l}
    if seen == {'Right'}:
        dir_, gaze_line_x = 'Right', left_line_x
    elif seen == {'Left'}:
        dir_, gaze_line_x = 'Left', right_line_x
    elif seen == {'Right', 'Center'}:
        dir_, gaze_line_x = 'RightCenter', leftcenter_line_x
    elif seen == {'Left', 'Center'}:
        dir_, gaze_line_x = 'LeftCenter', rightcenter_line_x
    else:
        # Both eyes centred, or the two eyes disagree (Right vs Left): aim at the
        # centre line. (Previously the both-centred case aimed at right_line_x.)
        dir_, gaze_line_x = 'Center', center_line_x

    # Eye openness ------------------------------------------------------------
    # NOTE(review): this multiplies by the eye width and divides by 1000 rather
    # than dividing by the width as in the usual eye-aspect-ratio formula; the
    # thresholds below are tuned to this expression, so it is kept as is.
    right_ear = (abs(span(161, 163)) + abs(span(157, 154))) / 2 * abs(span(133, 33)) / 1000
    left_ear = (abs(span(381, 384)) + abs(span(388, 390))) / 2 * abs(span(263, 362)) / 1000

    # Use the eye whose iris centre has the larger z.
    # NOTE(review): the original comment said "higher value is closer to the
    # camera", but MediaPipe documents smaller z as closer -- confirm intent.
    using_ear = right_ear if landmarks[468].z > landmarks[473].z else left_ear

    # Contiguous bands; previously values in (thres_ear/2, 0.4] fell through to 'UP'.
    if using_ear <= 0.15:
        ear, gaze_line_y = 'CLOSE', down_line_y
    elif using_ear <= thres_ear / 2:
        ear, gaze_line_y = 'DOWN', down_line_y
    elif using_ear < thres_ear:
        ear, gaze_line_y = 'MIDDLE', middle_line_y
    else:
        ear, gaze_line_y = 'UP', up_line_y

    return {
        'dir': dir_, 'ear': ear,
        'gaze_x': gaze_line_x, 'gaze_y': gaze_line_y,
        'inner_left': inner_left, 'inner_right': inner_right, 'eye_y': y33,
    }


def _draw_gaze_overlay(frame, landmarks, gaze, width, height):
    """Draw the gaze grid, gaze lines and direction labels onto ``frame`` in place."""
    color = (255, 0, 0)
    inner_left, inner_right = gaze['inner_left'], gaze['inner_right']

    # Vertical grid lines: the two inner lines, then the midpoint of each outer band.
    cv2.line(frame, (inner_left, 0), (inner_left, height), color, 3)
    cv2.line(frame, (inner_right, 0), (inner_right, height), color, 3)
    outer_left = int(inner_left / 2)
    cv2.line(frame, (outer_left, 0), (outer_left, height), color, 3)
    outer_right = int((width - inner_right) / 2 + inner_right)
    cv2.line(frame, (outer_right, 0), (outer_right, height), color, 3)

    # Horizontal grid lines: desk line at 75% height, and the eye-corner line.
    desk_y = int(height * 0.75)
    cv2.line(frame, (0, desk_y), (width, desk_y), color, 1)
    cv2.line(frame, (0, gaze['eye_y']), (width, gaze['eye_y']), color, 1)

    # Gaze lines from each iris centre (468 right, 473 left); skipped for 'UP'.
    if gaze['ear'] != 'UP':
        offset = width * .07
        for iris_idx, shift in ((468, -offset), (473, offset)):
            start = _landmark_px(landmarks, iris_idx, width, height)
            cv2.line(frame, start, (int(gaze['gaze_x'] + shift), int(gaze['gaze_y'])), color, 2)

    font = cv2.FONT_HERSHEY_SIMPLEX
    cv2.putText(frame, gaze['dir'], (int(width * 0.3), int(height * 0.3)), font, .5, color, 1)
    cv2.putText(frame, gaze['ear'], (int(width * 0.3), int(height * 0.4)), font, .5, color, 1)


def detect():
    """Run YOLOv7 detection plus face-mesh gaze estimation and write ``run.mp4``.

    Reads its settings from the module-level ``opt``. For every frame it runs
    MediaPipe FaceMesh, draws the mesh, gaze grid and labels, runs the YOLOv7
    model with NMS, draws the boxes, prints per-frame timing and appends the
    annotated frame to ``run.mp4`` in the working directory. Each detected
    face's landmarks are also appended to the module-level ``total_landmarks``.

    Relies on names defined in other cells: ``mp_face_mesh``, ``mp_drawing``,
    ``mp_drawing_styles``, ``FACEMESH_CONTOURS``, ``total_landmarks``,
    ``distance``, ``thres``, ``thres_`` and ``thres_ear``.

    NOTE(review): the face-mesh stage treats each frame as a single ndarray, so
    stream sources (where the loader yields a list of frames) are not handled.
    """
    source, weights, imgsz, trace = opt.source, opt.weights, opt.img_size, not opt.no_trace
    webcam = source.isnumeric() or source.endswith('.txt') or source.lower().startswith(
        ('rtsp://', 'rtmp://', 'http://', 'https://'))

    # Initialize
    set_logging()
    device = select_device(opt.device)
    half = device.type != 'cpu'  # half precision only supported on CUDA

    # Load model
    model = attempt_load(weights, map_location=device)  # load FP32 model
    stride = int(model.stride.max())  # model stride
    imgsz = check_img_size(imgsz, s=stride)  # check img_size

    if trace:
        model = TracedModel(model, device, opt.img_size)

    if half:
        model.half()  # to FP16

    # Second-stage classifier (disabled)
    classify = False
    if classify:
        modelc = load_classifier(name='resnet101', n=2)  # initialize
        # load_state_dict() returns a key report, not the module, so it cannot be chained.
        modelc.load_state_dict(torch.load('weights/resnet101.pt', map_location=device)['model'])
        modelc.to(device).eval()

    # Set Dataloader
    if webcam:
        cudnn.benchmark = True  # set True to speed up constant image size inference
        dataset = LoadStreams(source, img_size=imgsz, stride=stride)
    else:
        dataset = LoadImages(source, img_size=imgsz, stride=stride)

    # Get names and colors
    names = model.module.names if hasattr(model, 'module') else model.names
    colors = [[random.randint(0, 255) for _ in range(3)] for _ in names]

    # Run inference
    if device.type != 'cpu':
        model(torch.zeros(1, 3, imgsz, imgsz).to(device).type_as(next(model.parameters())))  # run once
    old_img_w = old_img_h = imgsz
    old_img_b = 1

    save_path = 'run.mp4'
    out = None       # created lazily from the first frame's geometry
    vid_cap = None
    try:
        # One FaceMesh for the whole run: avoids rebuilding the graph per frame
        # and lets landmark tracking carry over between frames.
        with mp_face_mesh.FaceMesh(max_num_faces=1, refine_landmarks=True,
                                   min_detection_confidence=0.5,
                                   min_tracking_confidence=0.5) as face_mesh:
            for path, img, im0s, vid_cap in dataset:
                frame_h, frame_w = im0s.shape[:2]

                if out is None:
                    if vid_cap:
                        fps = vid_cap.get(cv2.CAP_PROP_FPS)
                        w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                        h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                    else:  # no capture object: fall back to the frame's own size
                        fps, w, h = 30, frame_w, frame_h
                    out = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))

                # Face mesh ----------------------------------------------------
                rgb = cv2.cvtColor(im0s, cv2.COLOR_BGR2RGB)
                rgb.flags.writeable = False  # read-only input for process()
                results = face_mesh.process(rgb)

                face = gaze = None
                if results.multi_face_landmarks:
                    for face in results.multi_face_landmarks:
                        # eye / face contours
                        mp_drawing.draw_landmarks(
                            image=im0s,
                            landmark_list=face,
                            connections=FACEMESH_CONTOURS,
                            landmark_drawing_spec=None,
                            connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_contours_style())
                        # irises
                        mp_drawing.draw_landmarks(
                            image=im0s,
                            landmark_list=face,
                            connections=mp_face_mesh.FACEMESH_IRISES,
                            landmark_drawing_spec=None,
                            connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_iris_connections_style())
                        total_landmarks.append(face.landmark)
                        gaze = _estimate_gaze(face.landmark, frame_w, frame_h)

                # YOLOv7 -------------------------------------------------------
                img = torch.from_numpy(img).to(device)
                img = img.half() if half else img.float()  # uint8 to fp16/32
                img /= 255.0  # 0 - 255 to 0.0 - 1.0
                if img.ndimension() == 3:
                    img = img.unsqueeze(0)

                # Warmup again whenever the input shape changes
                if device.type != 'cpu' and (old_img_b != img.shape[0] or old_img_h != img.shape[2] or old_img_w != img.shape[3]):
                    old_img_b = img.shape[0]
                    old_img_h = img.shape[2]
                    old_img_w = img.shape[3]
                    for _ in range(3):
                        model(img, augment=opt.augment)[0]

                # Inference
                t1 = time_synchronized()
                pred = model(img, augment=opt.augment)[0]
                t2 = time_synchronized()

                # Apply NMS
                pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms)
                t3 = time_synchronized()

                # Apply Classifier
                if classify:
                    pred = apply_classifier(pred, modelc, img, im0s)

                # Gaze grid and labels, for the last face seen in this frame
                if gaze is not None:
                    _draw_gaze_overlay(im0s, face.landmark, gaze, frame_w, frame_h)

                # Process detections
                for i, det in enumerate(pred):  # detections per image
                    s = '%g: ' % i if webcam else ''

                    if len(det):
                        # Rescale boxes from img_size to im0 size
                        det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0s.shape).round()

                        # Print results
                        for c in det[:, -1].unique():
                            n = (det[:, -1] == c).sum()  # detections per class
                            s += f"{n} {names[int(c)]}{'s' * (n > 1)}, "  # add to string

                        for *xyxy, conf, cls in reversed(det):
                            label = f'{names[int(cls)]} {conf:.2f}'
                            plot_one_box(xyxy, im0s, label=label, color=colors[int(cls)], line_thickness=1)

                    # Print time (inference + NMS)
                    print(f'{s}Done. ({(1E3 * (t2 - t1)):.1f}ms) Inference, ({(1E3 * (t3 - t2)):.1f}ms) NMS')

                    out.write(im0s)
    finally:
        # vid_cap is None for image sources, and out is None if no frame was read.
        if vid_cap is not None:
            vid_cap.release()
        if out is not None:
            out.release()

In [None]:
# Entry point: run detection with gradient tracking disabled.
with torch.no_grad():
    # Read the flag by key: opt is a dict subclass, so attribute access on a
    # missing "update" key would return the (always truthy) dict method.
    if opt.get('update', False):  # update all models (to fix SourceChangeWarning)
        for opt.weights in ['yolov7.pt']:
            detect()
            strip_optimizer(opt.weights)
    else:
        detect()