## First of all install these must dependencies to run the program
### FFMPEG tool for transcoding multimedia files:
* For Ubuntu: https://linuxize.com/post/how-to-install-ffmpeg-on-ubuntu-18-04/
* For Windows: https://www.wikihow.com/Install-FFmpeg-on-Windows

In [1]:
#! pip install imutils
#! pip install tqdm
#! pip install opencv-python

## Importing the modules

In [2]:
import os,sys
import cv2
import subprocess
import re
import math
from subprocess import check_call, PIPE, Popen
import shlex
from tqdm import tqdm
import pandas as pd
from imutils.object_detection import non_max_suppression
import numpy as np
import time
from sklearn.cluster import KMeans

*Here we can see we are using opencv library for python which is a very important open source video analysis library avaialble*<br>

*In this we will use several pre-trained models or haar cascade files( It is ML object detection algorithms to determine objects in an image* 

In [13]:
haar_upper_body_cascade = cv2.CascadeClassifier("models/haarcascade_upperbody.xml")
haar_full_body_cascade = cv2.CascadeClassifier("models/haarcascade_fullbody.xml")
haar_face_cascade = cv2.CascadeClassifier('models/haarcascade_frontalface_alt.xml')
lbm_face_cascade = cv2.CascadeClassifier('models/lbpcascade_frontalface.xml')
haar_hand_cascade = cv2.CascadeClassifier('models/hand.xml')
haar_lowerbody_cascade = cv2.CascadeClassifier('model/haarcascade_lowerbody.xml')
text_detection_model = "models/frozen_east_text_detection.pb"

#text_detection_model = "https://raw.githubusercontent.com/oyyd/ frozen_east_text_detection.pb/master/frozen_east_text_detection.pb"

*Loading different files where we will store our keyframes, segments and result about the video summarization *

In [14]:
DATA_FOLDER = '../dataset'
SEGMENT_OUTPUT = '../segments'
OUTPUT_FOLDER = 'results'

*This is use to get what is the frame we are getting in video segments, P-frame or I-frame. Genrally I-frame have all the independent features to represent the image data*

In [15]:
def get_video_frame_types(filename):
    cmd = 'ffprobe -v error -show_entries frame=pict_type -of default=noprint_wrappers=1'.split()
    out = subprocess.check_output(cmd + [filename]).decode()
    frame_types = out.replace('pict_type=', '').split()
    return zip(range(len(frame_types)), frame_types)

*It calculates the video length in seconds*

In [16]:
def get_video_length(video_fn):
    res = subprocess.run(["ffprobe", "-v", "error", "-show_entries",
                             "format=duration", "-of",
                             "default=noprint_wrappers=1:nokey=1", video_fn],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    return float(res.stdout)

In [17]:
pwd

'/Users/vipul/Desktop/AI_ML_Capstone_Mar/Capstone Project 1/Solution Files'

In [18]:
get_video_length("../dataset/nptel_ml/ML.mp4")/60

3.4160277833333335

*This function will split the data into number of segments divided by seconds we specify in our case 30 sec*

In [19]:
re_metadata = re.compile('Duration: (\d{2}):(\d{2}):(\d{2})\.\d+,.*\n.* (\d+(\.\d+)?) fps')


def get_video_metadata(video_fn):
    p1 = Popen(["ffmpeg", "-hide_banner", "-i", video_fn], stderr=PIPE, universal_newlines=True)
    output = p1.communicate()[1]
    matches = re_metadata.search(output)
    if matches:
        video_length = int(matches.group(1)) * 3600 + int(matches.group(2)) * 60 + int(matches.group(3))
        video_fps = float(matches.group(4))
    else:
        raise Exception("Can't parse required video metadata")
    return video_length, video_fps

In [20]:
def split_video_in_segment(video_fn, num, out_path, by='size'):
    assert num > 0
    assert by in ['size', 'count']
    split_video_size = num if by == 'size' else None
    split_video_count = num if by == 'count' else None
    # parsing meta-data
    video_length, video_fps = get_video_metadata(video_fn)

    # calculate the video video split count
    if split_video_size:
        split_video_count = math.ceil(video_length / split_video_size)
        if split_video_count == 1:
            raise Exception("Too small split size! Please increase the target split size!!")
    else: # split video count
        split_video_size = round(video_length / split_video_count)
    
    # For windows
    #pth, ext = video_fn.rsplit(".", 1)
    #print("pth: {},ext: {}".format(pth,ext))
    #temp_pth = pth.rsplit("\\",1)
    #print("temp_pth: {}".format(temp_pth))
    #folder_name = temp_pth[0].rsplit("\\",1)
    #print("folder_name: {}".format(folder_name))
    #pth = out_path + os.sep + folder_name[1] + os.sep + temp_pth[1]
    #print("updated path: {},ext: {}".format(pth,ext))
    
    # For Linux Machines
    pth, ext = video_fn.rsplit(".", 1)
    temp_pth = pth.rsplit("/",1)
    folder_name = temp_pth[0].rsplit("/",1)
    pth = out_path + os.sep + folder_name[1] + os.sep + temp_pth[1]
    #cmd = 'ffmpeg -hide_banner -loglevel panic -i "{}" -c copy -map 0 -segment_time {} -reset_timestamps 1 -g {} -sc_threshold 0 -force_key_frames "expr:gte(t,n_forced*{})" -f segment -y "{}-%d.{}"'.format(video_fn, split_video_size, round(split_video_size*video_fps), split_video_size, pth, ext)
    cmd = 'ffmpeg -i "{}" -c copy -map 0 -f segment -segment_time 60 -reset_timestamps 1 -y "{}-%d.{}"'.format(video_fn, pth, ext)
    check_call(shlex.split(cmd), universal_newlines=True)

    # returning the list of output (index start from 0)
    return ['{}-{}.{}'.format(pth, i, ext) for i in range(split_video_count)]

## Task 1: Video Segmentation

*This loop will iterate over the folders and the videos inside it and call the split function, the split will be saved in segments folder*

In [21]:
for file in tqdm(os.listdir(DATA_FOLDER)):
    for f in tqdm(os.listdir(DATA_FOLDER+os.sep+file)):
        if f.endswith((".mp4", ".avi", ".mov", ".wmv", ".qt", ".MTS", ".M2TS", ".TS", ".mkv", ".flv", ".vob")):
            path=os.path.join(DATA_FOLDER,file,f)
            print("Path: {}".format(path))
            print(split_video_in_segment(path, 30,SEGMENT_OUTPUT))

  0%|          | 0/2 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s][A

Path: ../dataset/nptel_ai/How to Learn and Follow the Course.mp4



100%|██████████| 3/3 [00:00<00:00,  4.78it/s][A
 50%|█████     | 1/2 [00:00<00:00,  1.59it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A
100%|██████████| 2/2 [00:00<00:00, 12.30it/s][A
100%|██████████| 2/2 [00:00<00:00,  2.51it/s]

['../segments/nptel_ai/How to Learn and Follow the Course-0.mp4', '../segments/nptel_ai/How to Learn and Follow the Course-1.mp4', '../segments/nptel_ai/How to Learn and Follow the Course-2.mp4', '../segments/nptel_ai/How to Learn and Follow the Course-3.mp4', '../segments/nptel_ai/How to Learn and Follow the Course-4.mp4', '../segments/nptel_ai/How to Learn and Follow the Course-5.mp4', '../segments/nptel_ai/How to Learn and Follow the Course-6.mp4', '../segments/nptel_ai/How to Learn and Follow the Course-7.mp4', '../segments/nptel_ai/How to Learn and Follow the Course-8.mp4', '../segments/nptel_ai/How to Learn and Follow the Course-9.mp4', '../segments/nptel_ai/How to Learn and Follow the Course-10.mp4', '../segments/nptel_ai/How to Learn and Follow the Course-11.mp4', '../segments/nptel_ai/How to Learn and Follow the Course-12.mp4', '../segments/nptel_ai/How to Learn and Follow the Course-13.mp4', '../segments/nptel_ai/How to Learn and Follow the Course-14.mp4']
Path: ../dataset/np




*This is the for loop which get all the segments in different files, and get all the i-frames and classify and cluster the image based upon the face,hand,upper_body,text and Area of text occupied in image and store the data in csv file in result*

## Follwing Task in this block of cell and code
* Task 2: Assessment of Instructor Presence and Interaction
* Task 3: Assessment of use of blackboard, slides

The result of the output saved in Result folder

In [22]:
# load the pre-trained text detecter to detect text in the frame
print("[INFO] loading text detector...")
net = cv2.dnn.readNet(text_detection_model)
for seg_file in tqdm(os.listdir(SEGMENT_OUTPUT)):
    # Initialize the parameters for result
    params = {"Keyframe_number": [], "Instructor_detected": [], "Hand_detected": [],
                "Instructor_upperBody_detected": [],
                "Instructor_fullBody_detected": [], "Teaching_method": [],
                "Text_detected": [], "Percent_of_area_occupied_by_text": [], "Too_much_text_occupied": []}
    video_found = False
    total_video_len = 0
    total_segments = 0
    #print("Taking segments.....")
    for video_fn in os.listdir(SEGMENT_OUTPUT + os.sep + seg_file):
        # loop through the files with video extensions only 
        if video_fn.endswith(
                (".mp4", ".avi", ".mov", ".wmv", ".qt", ".MTS", ".M2TS", ".TS", ".mkv", ".flv", ".vob")):
            video_found = True
            path = os.path.join(SEGMENT_OUTPUT, seg_file, video_fn)
            temp_video_len = get_video_length(path)
            total_video_len += temp_video_len
            total_segments += 1
            frame_types = get_video_frame_types(SEGMENT_OUTPUT + os.sep + seg_file + os.sep + video_fn)
            i_frames = [x[0] for x in frame_types if x[1] == 'I']
            # once I frame is got , we can loop in I frames to detect features
            if i_frames:
                basename = os.path.splitext(os.path.basename(video_fn))[0]
                cap = cv2.VideoCapture(SEGMENT_OUTPUT + os.sep + seg_file + os.sep + video_fn)

                keyframe_number = ""
                Instructor_detected = False
                Hand_detected = False
                Instructor_upperBody_detected = False
                Instructor_fullBody_detected = False
                Teaching_method = "No board used"
                Text_detected = False
                Percent_of_area_occupied_by_text = 0
                Too_much_text_occupied = False

                for frame_no in i_frames:

                    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no)
                    ret, frame = cap.read()
                    keyframe_index = basename + '_i_frame_' + str(frame_no)
                    keyframe_number = keyframe_index
                    outname = keyframe_index + '.jpg'
                    cv2.imwrite('keyframes/' + outname, frame)
                    # print('Saved: ' + outname)
                    # converts to gray
                    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

                    # Checking which board instructor is using slides, greenBoard, blackboard
                    hsv_image = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
                    reshape = hsv_image.reshape((hsv_image.shape[0] * hsv_image.shape[1], 3))

                    # Find and display most dominant colors
                    cluster = KMeans(n_clusters=3).fit(reshape)
                    centroids = cluster.cluster_centers_
                    # Get the number of different clusters, create histogram, and normalize
                    labels = np.arange(0, len(np.unique(cluster.labels_)) + 1)
                    (hist, _) = np.histogram(cluster.labels_, bins=labels)
                    hist = hist.astype("float")
                    hist /= hist.sum()

                    # It will create the histogram and iterate into every colors
                    colors = sorted([(percent, color) for (percent, color) in zip(hist, centroids)], reverse=True)

                    if 36 < colors[0][1][0] < 86:
                        if 25 < colors[0][1][1] < 255:
                            if 25 < colors[0][1][2] < 255:
                                Teaching_method = "GreenBoard"

                    if 0 < colors[0][1][0] < 179:
                        if 5 < colors[0][1][1] < 50:
                            if 50 < colors[0][1][2] < 255:
                                Teaching_method = "BlackBoard"

                    if 0 < colors[0][1][0] < 172:
                        if 0 < colors[0][1][1] < 111:
                            if 168 < colors[0][1][2] < 255:
                                Teaching_method = "slide"
                    # Detect the face
                    #print("Now detecting face...")
                    faces = lbm_face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5,
                                                                minSize=(50, 100),
                                                                # Min size for valid detection, changes according
                                                                # to video size or body size in the video.
                                                                flags=cv2.CASCADE_SCALE_IMAGE
                                                                )
                    # Upper body detection
                    #print("Upper body Detection...")
                    upper_body = haar_upper_body_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5,
                                                                            minSize=(50, 100),
                                                                            # Min size for valid detection, changes
                                                                            # according to video size or body size
                                                                            # in the video.
                                                                            flags=cv2.CASCADE_SCALE_IMAGE
                                                                            )
                    # Full body detection
                    #print("Full Body Detection...")
                    full_body = haar_full_body_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5,
                                                                        minSize=(50, 100),
                                                                        # Min size for valid detection, changes
                                                                        # according to video size or body size in
                                                                        # the video.
                                                                        flags=cv2.CASCADE_SCALE_IMAGE
                                                                        )

                    # Hand detection
                    #print("Hand detection....")
                    hands = haar_hand_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5,
                                                                minSize=(50, 100),
                                                                # Min size for valid detection, changes according
                                                                # to video size or body size in the video.
                                                                flags=cv2.CASCADE_SCALE_IMAGE
                                                                )
                    # Getting detection points

                    #for (x, y, w, h) in faces:
                    if len(faces)>0:
                        Instructor_detected = True
                    #for (x, y, w, h) in upper_body:
                    if len(upper_body)>0:
                        Instructor_upperBody_detected = True
                    #for (x, y, w, h) in full_body:
                    if len(full_body)>0:
                        Instructor_fullBody_detected = True
                    #for (x, y, w, h) in hands:
                    if len(hands)>0:
                        Hand_detected = True
                    # Text detection
                    # Detecting the dimention of image
                    image = frame.copy()
                    (H, W) = image.shape[:2]

                    # taking the ratio of new dimention set
                    # for both the width and height
                    (newW, newH) = (320, 320)
                    rW = W / float(newW)
                    rH = H / float(newH)

                    # taking new dimention image after resizing
                    image = cv2.resize(image, (newW, newH))
                    (H, W) = image.shape[:2]

                    # define two output layer for model
                    layerNames = [
                        "feature_fusion/Conv_7/Sigmoid",
                        "feature_fusion/concat_3"]
                    # forward pass
                    # to obtain two output layer of model
                    blob = cv2.dnn.blobFromImage(image, 1.0, (W, H),
                                                    (123.68, 116.78, 103.94), swapRB=True, crop=False)

                    start = time.time()
                    net.setInput(blob)
                    (scores, geometry) = net.forward(layerNames)
                    end = time.time()

                    # show timing information on text prediction
                    print("[INFO] text detection took {:.6f} seconds".format(end - start))

                    # scores value row and column
                    # bounding boxes initialization
                    # confidence scores
                    (numRows, numCols) = scores.shape[2:4]
                    rects = []
                    confidences = []

                    # loop over the number of rows
                    for y in range(0, numRows):
                        # extract the scores (probabilities), followed by the geometrical
                        # to detect the bounding box of the data
                        # surround text
                        scoresData = scores[0, 0, y]
                        xData0 = geometry[0, 0, y]
                        xData1 = geometry[0, 1, y]
                        xData2 = geometry[0, 2, y]
                        xData3 = geometry[0, 3, y]
                        anglesData = geometry[0, 4, y]

                        # loop over the number of columns
                        for x in range(0, numCols):
                            # ignore if the scores is not enough
                            if scoresData[x] < 0.5:
                                continue

                            # computing our feature maps
                            # be 4x smaller than the input image
                            (offsetX, offsetY) = (x * 4.0, y * 4.0)

                            # rotation angle for prediction
                            # compute the sin and cosine
                            angle = anglesData[x]
                            cos = np.cos(angle)
                            sin = np.sin(angle)

                            # getting width and height
                            # the bounding box
                            h = xData0[x] + xData2[x]
                            w = xData1[x] + xData3[x]

                            # x and y staring and ending coordinates
                            # the text prediction bounding box
                            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
                            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
                            startX = int(endX - w)
                            startY = int(endY - h)

                            # add the bounding box coordinates and probability score to
                            # our respective lists
                            rects.append((startX, startY, endX, endY))
                            confidences.append(scoresData[x])

                    # suppress weak and overlapping bounding
                    # boxes
                    boxes = non_max_suppression(np.array(rects), probs=confidences)

                    # loop over the bounding boxes
                    area_temp = []
                    for (startX, startY, endX, endY) in boxes:
                        # scale the box coordinates
                        # ratios
                        startX = int(startX * rW)
                        startY = int(startY * rH)
                        endX = int(endX * rW)
                        endY = int(endY * rH)
                        Text_detected = True
                        # draw the bounding box on the image
                        area_temp.append(((abs(endX - startX) * abs(endY - startY)) / (H * W)) * 100)
                    Percent_of_area_occupied_by_text = sum(area_temp)
                    # get all the parameters
                    params["Keyframe_number"].append(keyframe_number)
                    params["Instructor_detected"].append(Instructor_detected)
                    params["Hand_detected"].append(Hand_detected)
                    params["Instructor_upperBody_detected"].append(Instructor_upperBody_detected)
                    params["Instructor_fullBody_detected"].append(Instructor_fullBody_detected)
                    params["Teaching_method"].append(Teaching_method)
                    params["Text_detected"].append(Text_detected)
                    params["Percent_of_area_occupied_by_text"].append(Percent_of_area_occupied_by_text)
                    if Percent_of_area_occupied_by_text > 60:
                        params["Too_much_text_occupied"].append("Too Much Text detected")
                    else:
                        params["Too_much_text_occupied"].append(Too_much_text_occupied)

                cap.release()
            else:
                print('No I-frames in ' + video_fn)
    if video_found:
        data = pd.DataFrame(params)
        data.set_index("Keyframe_number")
        data.to_csv(OUTPUT_FOLDER + os.sep + seg_file + "video_details" +".csv", index=False)
        # average details
        Instructor_presence = ""
        if len(data.Instructor_detected.unique()) == 2:
            Instructor_presence = "Part of the video frame"
        elif list(data.Instructor_detected.unique())[0]:
            Instructor_presence = "Whole screen"
        else:
            Instructor_presence = "No Instructor detected"

        Interaction_instructor = []
        if len(data.Hand_detected.unique()) == 2:
            Interaction_instructor.append("hand")
        elif list(data.Hand_detected.unique())[0]:
            Interaction_instructor.append("Hand")

        if len(data.Instructor_upperBody_detected.unique()) == 2:
            Interaction_instructor.append("UpperBody")
        elif list(data.Instructor_upperBody_detected.unique())[0]:
            Interaction_instructor.append("UpperBody")

        if len(data.Instructor_fullBody_detected.unique()) == 2:
            Interaction_instructor.append("FullBody")
        elif list(data.Instructor_fullBody_detected.unique())[0]:
            Interaction_instructor.append("FullBody")

        avg_params = {
            "Total_I_KeyFrames": [len(data.index)],
            "Total_segments_obtained": [total_segments],
            "Video_length": [str(total_video_len/60) + ' mins'],
            "Instructor_presence": [Instructor_presence],
            "Instructor_gesture_interaction": [Interaction_instructor],
            "Instructor_teaching_methods": [list(data.Teaching_method.unique())[:-1] if len(data.Teaching_method.unique()) > 1 else list(data.Teaching_method.unique())[0]]
        }
        df = pd.DataFrame(avg_params)
        df.to_csv(OUTPUT_FOLDER + os.sep + seg_file + "video_average_details" + ".csv", index=False)

[INFO] loading text detector...


error: OpenCV(4.5.2) /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-iwig8vc6/opencv/modules/dnn/src/caffe/caffe_io.cpp:1133: error: (-2:Unspecified error) FAILED: fs.is_open(). Can't open "https://raw.githubusercontent.com/oyyd/ frozen_east_text_detection.pb/master/frozen_east_text_detection.pb" in function 'ReadProtoFromBinaryFile'
