# FINAL PROJECT: CV-BASED TENNIS COACHING
## Prepared for: Exam of *AI Lab: Computer Vision and NLP* by Professor Pannone
### Prepared by: Federica Bruni (1933963) , Maria Emilia Russo (1966203)

### short project overview
This notebook is the code of the project 'CV-BASED TENNIS COACHING' which aims at analysing videos from a tennis game and being able to:
- predict the next stroke
- compare two videos containing the same stroke and get a score out of 100 on how similar these strokes look.

## step0 - imports & costants

In [460]:
import pandas as pd
import numpy as np
import os
import json
import io
import base64
from IPython.display import HTML
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import cv2
from sklearn import preprocessing
from dtaidistance import dtw

In [461]:
#this variable holds True if the needed envirorment for this project exists
setted_up = False

#this variable holds the absolute path to the notebook directory
dir_path = '/Users/uni/opt/' #INSERT YOUR PATH HERE

#this variables hold the absolute path to the train & test directories
train_dir = ''
test_dir = ''

#since each training file corresponds to one and only one output directory we use a dictionary to keep track of
#mapping between video name and directory number
train_dict= {}
train_num = 0


## step1 - build needed structure

In [462]:
def set_up():
    '''function that constructs a system of directories to organize our data for this project.
    
    The idea is the following:
    notebook-directory-path  --> train - directory of training videos 
                             --> test - directory of testing videos '''
    
    #sanity check to avoid building the same enirorment twice
    global setted_up
    
    if setted_up == True:
        return 'The required system of directories already exists'
    else:
        setted_up = True
    
    #if it does not exist yet: build train directory to collect training videos
    train_directory = os.path.join(dir_path, 'train/')
    if not os.path.exists(train_directory):
        os.mkdir(train_directory)
        global train_dir
        train_dir = train_directory
    
    
    #if it does not exist yet: build test directory to collect testing videos
    test_directory = os.path.join(dir_path, 'test/')
    if not os.path.exists(test_directory):
        os.mkdir(test_directory)
        global test_dir
        test_dir = test_directory
    
    return 'Envirorment set-up was successfully completed'


## step2 - For each training file build file-related structure and populate with openpose

In [463]:
def populate():
    ''' function that firstly builds directories to hold training files openpose result and then runs openpose. 
    
    In particular for each file:
    notebook-directory-path  --> train - directory of training videos -> output_train_0 --> images
                                                                                        --> json
                                                                                        --> video (mp4)
    NB: 3 assumptions: 
        - input video is in '.avi'
        - filename format: stroke_name + _ + number '''

    global train_dir
    global train_dict
    global train_num
    
    #this check is needed to ensure that populate has the train directory to which
    if setted_up == False:
        return 'The envirorment must be setted up before population'
    
    #list of train_videos names
    train_videos = [file.split('.')[0] for file in os.listdir(train_dir) if file.endswith('.avi')]
    
    for vid in train_videos:
        
        #first we make sure that the ouput of this file does not exist already by verifying that the video name
        #has not been stored as key in the train dictionary
        if vid in train_dict.keys():
            print('The output of ' + vid + ' already exists')
            continue

        train_dict[vid]=train_num
        train_num += 1
    
        #filename with extenction
        tot_name = vid + '.avi'
        
        #path to training video
        video_path = os.path.join(train_dir,tot_name)
        
        #path to output of openpose
        train_out_dir_path = os.path.join(dir_path, 'output_train'+str(train_num-1)+'/')
        
        #no need to check if directory already exists because we've already checked that no mapping exists
        
        os.mkdir(train_out_dir_path)
            
        #if the directory did not exist before then for sure neither its subdirectory existed
        train_json = os.path.join(train_out_dir_path,'json/')
        os.mkdir(train_json)
            
        train_images = os.path.join(train_out_dir_path,'images/')
        os.mkdir(train_images)
            
        train_mp4 = os.path.join(train_out_dir_path,'video/')
        os.mkdir(train_mp4)
            
        #path to save the video result
        video_save = os.path.join(train_mp4,tot_name)
            
        #openpose in action
        !cd "opt/openpose" && build/examples/openpose/openpose.bin --video $video_path --write_json $train_json --display 0 --write_images $train_images --write_images_format jpg --write_video $video_save
    
        # convert the result into MP4
        video_mp4 = os.path.join(train_mp4,vid+'.mp4')
        !ffmpeg -y -loglevel info -i $video_save $video_mp4
            
        #print confirmation
        print('The output of '+ vid + ' has been stored')
        
            
    return 'The population was successfully completed'

In [193]:
set_up()

'The required system of directories already exists'

In [194]:
populate()

The output of backhand_1 already exists
The output of backhand_2 already exists
Starting OpenPose demo...
Configuring OpenPose...
Starting thread(s)...
We have introduced an additional boost in accuracy in the CUDA version of about 0.2% with respect to the CPU/OpenCL versions. We will not port this to CPU given the considerable slow down in speed it would add to it. Nevertheless, this accuracy boost is almost insignificant so the CPU/OpenCL versions can be safely used.
Empty frame detected, frame number 76 of 77. In /Users/uni/opt/openpose/src/openpose/producer/producer.cpp:checkFrameIntegrity():290
Empty frame detected, frame number 76 of 77. In /Users/uni/opt/openpose/src/openpose/producer/producer.cpp:checkFrameIntegrity():290
Empty frame detected, frame number 76 of 77. In /Users/uni/opt/openpose/src/openpose/producer/producer.cpp:checkFrameIntegrity():290
OpenPose demo successfully finished. Total time: 93.494168 seconds.
ffmpeg version 5.0.1 Copyright (c) 2000-2022 the FFmpeg dev

frame=   72 fps=0.0 q=-1.0 Lsize=     364kB time=00:00:02.30 bitrate=1296.4kbits/s speed=9.86x    
video:362kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.422464%
[1;36m[libx264 @ 0x138607d20] [0mframe I:1     Avg QP:22.58  size: 24963
[1;36m[libx264 @ 0x138607d20] [0mframe P:38    Avg QP:23.51  size:  7422
[1;36m[libx264 @ 0x138607d20] [0mframe B:33    Avg QP:28.69  size:  1924
[1;36m[libx264 @ 0x138607d20] [0mconsecutive B-frames: 27.8% 22.2% 33.3% 16.7%
[1;36m[libx264 @ 0x138607d20] [0mmb I  I16..4: 33.9% 59.3%  6.7%
[1;36m[libx264 @ 0x138607d20] [0mmb P  I16..4:  4.1%  7.9%  1.5%  P16..4: 22.7% 12.5% 10.5%  0.0%  0.0%    skip:40.8%
[1;36m[libx264 @ 0x138607d20] [0mmb B  I16..4:  1.1%  0.9%  0.2%  B16..8: 38.2%  7.0%  2.3%  direct: 1.9%  skip:48.5%  L0:57.3% L1:34.3% BI: 8.4%
[1;36m[libx264 @ 0x138607d20] [0m8x8 transform intra:56.9% inter:74.5%
[1;36m[libx264 @ 0x138607d20] [0mcoded y,uvDC,uvAC intra: 40.8% 55.5% 13.9% inter: 14.

'The population was successfully completed'

## step3 - build train keypoint dataframe

In [486]:
def loadTrainData():
    '''function that translates all output training json files in readable pandas dataframes.
    The columns are:file number, frame number, x and y of each of the 24 keypoints that openpose identified'''
    
    path_out = dir_path
    
    #number of output train files
    numOfSet= len(train_dict)
    

    #initialize empty dataframe
    train = pd.DataFrame()
    
    #for each training video output
    for d in range(0, numOfSet):
        
        #path to json file directory
        path = os.path.join(path_out, 'output_train'+ str(d)+'/json/')
        
        #frame numbers
        Frames = [name for name in os.listdir(path)]
        
        #for each frame of training video
        for frame_num,f in enumerate(Frames):
            
            #find path to json file of that frame
            file = open(path + f)

            #load file
            j = json.load(file)
            
            #initialize row
            row = [[]]
            
            #if there is at least one person in frame
            if (j['people'] != []):
                
                #in order to find main player out of all people identified we save for each element identified
                #the size (difference between yhead-yfoot), the biggest player is the one closest to the camera
                #and so our target
                identify = []
                for i in j['people']:
                    body_size= (i['pose_keypoints_2d'][1])-(i['pose_keypoints_2d'][33])
                    identify.append(body_size)
                z = identify.index(max(identify))
                      
                #add keypoints found
                row[0] += j['people'][z]['pose_keypoints_2d']
            else:
                continue
            #then we initialize the columns
            columns =  ['file_num', 'frame_num']
            for i in range(25):
                X = 'X' + str(i)
                Y = 'Y' + str(i)
                P = 'P' + str(i)
                columns += [X, Y, P]
            
            #since 25 keypoints and (X,Y,P) each keypoint
            if (len(row[0]) == 75):
                
                    #complete row
                    row = [d] + [frame_num] + row[0]
                    
                    #build tmp dataframe
                    tmp = pd.DataFrame ([row], columns=columns)
                    
                    #concatenate with existing one
                    train = pd.concat([train,tmp], ignore_index=True)
                    
                    #start over
                    row=[[]]
               
    train = train.drop(list(train.filter(like='P', axis=1)), axis = 1)
    return train

In [487]:
trainData= loadTrainData()

## step4 - define labels

In [299]:
#we assign for each tennis shot a unique number
label_dict = {'forehand':0, 'backhand':1, 'serve':2}

#we build dataframe to represent the assignment
label_df={'label_names':['forehand','backhand','serve'], 'label_nums':[0,1,2]}
df = pd.DataFrame(data=label_df)
df

Unnamed: 0,label_names,label_nums
0,forehand,0
1,backhand,1
2,serve,2


## step5 - manually assign labels

In [300]:
#this step could be a nightmare, assigning each frame with a label number. 
#We insted make it efficient taking two assumptions:
#  1) all training files are saved with shotname+_+number
#  2) we assume that each training file represents only one stroke type (hence it can give that name to the file)
# so our implementation directly taking the name from the filename adds the correspondinf lable to all file's frames

In [301]:

def addTrainLabels():
    '''function that returns the current train labels dataset, which is the dataframe that maps each training file
    number to a label wrt the file name'''
    
    #each time the returned dataframe is up-to-date because it is compiled from scratch each time
    trainLabels = pd.DataFrame()
    col = ['file_num','label_num']
    for name, num in train_dict.items():
        n = df.loc[df['label_names']==(name.split('_')[0])][['label_nums']]
        ndf = pd.DataFrame ([[num,int(n.apply(int))]], columns=col)
        trainLabels= pd.concat([trainLabels,ndf], ignore_index=True)
    return trainLabels


In [302]:
#create current training file-label dataframe
training_labels = addTrainLabels()

In [303]:
def TrainedDataset(trainData, labels):
    '''function that merges train dataframe with current label dataframe'''
    train = pd.merge(trainData, labels, on=['file_num'],how = 'inner')
    return train



In [304]:
#final training dataframe
final_train=TrainedDataset(trainData, training_labels)
final_train

Unnamed: 0,file_num,frame_num,X0,Y0,X1,Y1,X2,Y2,X3,Y3,...,Y20,X21,Y21,X22,Y22,X23,Y23,X24,Y24,label_num
0,0,0,0.000,0.000,117.887,115.932,114.937,115.919,0.000,0.000,...,169.740,125.709,168.769,0.000,0.000,0.000,0.000,122.780,169.711,1
1,0,1,0.000,0.000,450.438,184.400,463.165,182.436,486.642,181.473,...,337.988,490.566,334.067,423.089,316.484,425.014,315.493,443.608,323.315,1
2,0,2,420.172,172.655,440.701,184.427,446.533,182.444,0.000,0.000,...,336.036,484.735,324.312,421.106,320.358,423.106,318.407,441.678,324.308,1
3,0,3,413.318,182.459,432.861,195.149,429.955,194.209,423.104,210.825,...,347.722,462.209,343.834,425.047,323.313,427.975,321.374,445.562,326.252,1
4,0,4,0.000,0.000,451.422,184.435,466.108,183.406,490.570,190.251,...,338.954,489.610,333.089,434.825,319.383,436.800,317.472,450.476,324.294,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252,3,67,0.000,0.000,322.312,163.859,336.005,165.792,339.958,200.063,...,301.797,284.180,302.753,333.115,308.622,332.088,309.618,314.503,307.680,0
253,3,68,356.579,146.234,337.955,158.941,333.075,160.907,323.314,186.368,...,284.150,309.590,285.141,348.718,302.736,342.846,304.728,324.283,301.804,0
254,3,69,0.000,0.000,324.291,148.233,338.939,151.151,336.022,170.704,...,279.307,306.678,285.162,337.969,286.140,340.920,288.078,333.086,288.107,0
255,3,70,0.000,0.000,324.258,159.931,335.029,161.904,337.983,188.298,...,300.793,285.157,302.777,340.936,305.724,338.970,306.693,319.409,302.745,0


## step6 - train an SVM model

In [362]:
#we will use Scikit-Learn's support vector classifier to train an SVM model on our data
svm = SVC(kernel = 'linear', probability = True)

In [366]:
def splitdata(trainData):
    X = trainData.iloc[:, 2:-1]
    y = trainData.iloc[:, -1].values
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2)
    return X_train, X_val, y_train, y_val

In [367]:
def train(trainData):
    X_train, X_val, y_train, y_val = splitdata(trainData)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_val)
    print(classification_report(y_val, y_pred))
    return svm

In [368]:
SVM = train(final_train)

              precision    recall  f1-score   support

           0       1.00      0.87      0.93        31
           1       0.84      1.00      0.91        21

    accuracy                           0.92        52
   macro avg       0.92      0.94      0.92        52
weighted avg       0.94      0.92      0.92        52



## step7 - load test data

In [274]:
def buildTestData(video):
        ''' function that firstly builds directories to hold test files openpose result and then runs openpose. 
    
        In particular for each file:
        notebook-directory-path  --> train - directory of training videos -> output_test_0 --> images
                                                                                        --> json
                                                                                        --> video (mp4)
        NB: 3 assumptions: 
            - input video is in '.avi'
            - filename format: test + _ + number '''

        #filename without extenction
        vid = video[:-4]
        
        #path to training video
        video_path = os.path.join(test_dir,video)
        
        #path to output of openpose
        train_out_dir_path = os.path.join(dir_path, 'output_'+vid+'/')
        os.mkdir(train_out_dir_path)

        train_json = os.path.join(train_out_dir_path,'json/')
        os.mkdir(train_json)
            
        train_images = os.path.join(train_out_dir_path,'images/')
        os.mkdir(train_images)
            
        train_mp4 = os.path.join(train_out_dir_path,'video/')
        os.mkdir(train_mp4)
            
        #path to save the video result
        video_save = os.path.join(train_mp4,video)
            
        #openpose in action
        !cd "opt/openpose" && build/examples/openpose/openpose.bin --video $video_path --write_json $train_json --display 0 --write_images $train_images --write_images_format jpg --write_video $video_save
    
        # convert the result into MP4
        video_mp4 = os.path.join(train_mp4,vid+'.mp4')
        !ffmpeg -y -loglevel info -i $video_save $video_mp4
            
        #print confirmation
        print('The output of '+ vid + ' has been stored')
        
            
        return 'The population was successfully completed'

In [276]:
buildTestData('test_2.avi')

Starting OpenPose demo...
Configuring OpenPose...
Starting thread(s)...
We have introduced an additional boost in accuracy in the CUDA version of about 0.2% with respect to the CPU/OpenCL versions. We will not port this to CPU given the considerable slow down in speed it would add to it. Nevertheless, this accuracy boost is almost insignificant so the CPU/OpenCL versions can be safely used.
Empty frame detected, frame number 329 of 330. In /Users/uni/opt/openpose/src/openpose/producer/producer.cpp:checkFrameIntegrity():290
Empty frame detected, frame number 329 of 330. In /Users/uni/opt/openpose/src/openpose/producer/producer.cpp:checkFrameIntegrity():290
Empty frame detected, frame number 329 of 330. In /Users/uni/opt/openpose/src/openpose/producer/producer.cpp:checkFrameIntegrity():290
OpenPose demo successfully finished. Total time: 512.587124 seconds.
ffmpeg version 5.0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with Apple clang version 13.1.6 (clang-1316.0.21.2.5)
  c

'The population was successfully completed'

## step8 - build test keypoint dataframe

In [489]:
def loadTestData(video):
    '''function that translates a given video's output test json files in readable pandas dataframes.
    The columns are: frame number, x and y of each of the 24 keypoints that openpose identified'''

    #initialize to empty dataframe
    test = pd.DataFrame()
    
    #filename without extenction
    vid = video[:-4]
    
    #path to output folder
    path= os.path.join(dir_path, 'output_'+vid+'/')
        
    #frame number
    Frames = [name for name in os.listdir(path+'json/')]
        
    #for each frame
    for i,frame in enumerate(Frames):
            
        #path to frame json
        file = open(path +'json/'+frame)

        #load file
        j = json.load(file)

        #initialize
        row = [[]]
            
        #if there is at least one person in frame
        if (j['people'] != []):
                
            #in order to find main player out of all people identified we save for each element identified
            #the size (difference between yhead-yfoot), the biggest player is the one closest to the camera
            #and so our target
            identify = []
            for w in j['people']:
                body_size= (w['pose_keypoints_2d'][1])-(w['pose_keypoints_2d'][33])
                identify.append(body_size)
            z = identify.index(max(identify))
                      
            #add keypoints found
            row[0] += j['people'][z]['pose_keypoints_2d']
            
        else:
            continue
        
        #initialize columns
        columns =  ['frame_num']
        for k in range(25):
            X = 'X' + str(k)
            Y = 'Y' + str(k)
            P = 'P' + str(k)
            columns += [X, Y, P]
        columns += ['label']
            
        if (len(row[0]) == 75):
                #complete row, set all labels to 0
                row = [i]+ row[0] +[0]  
                fdf = pd.DataFrame ([row], columns=columns)
                test = pd.concat([test,fdf], ignore_index=True)
                    
                #start over
                row=[[]]
                                   
    test = test.drop(list(test.filter(like='P', axis=1)), axis = 1)
    return test

In [490]:
testdf = loadTestData('test_2.avi')
testdf

Unnamed: 0,frame_num,X0,Y0,X1,Y1,X2,Y2,X3,Y3,X4,...,Y20,X21,Y21,X22,Y22,X23,Y23,X24,Y24,label
0,0,339.954,123.780,354.589,139.412,346.800,139.428,0.000,0.000,0.000,...,319.402,389.833,314.510,363.387,298.868,364.411,295.942,381.028,292.017,0
1,1,0.000,0.000,356.559,141.358,364.382,142.323,367.316,170.685,349.703,...,310.568,363.405,303.754,373.192,304.702,378.071,302.762,383.971,302.754,0
2,2,305.708,121.821,291.005,139.415,289.071,139.402,301.775,158.971,321.344,...,0.000,298.821,268.533,285.164,270.467,279.279,271.446,276.341,264.607,0
3,3,0.000,0.000,350.677,132.561,367.304,133.561,363.402,157.972,0.000,...,276.353,335.054,279.300,366.322,285.162,364.366,286.120,351.661,283.222,0
4,4,0.000,0.000,210.819,138.412,224.488,142.318,231.369,169.733,250.928,...,291.039,178.527,293.001,217.639,302.793,212.777,303.749,199.079,296.890,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,324,0.000,0.000,223.516,127.685,245.031,129.633,252.850,160.892,238.223,...,296.875,182.411,293.966,236.264,292.976,243.079,293.943,242.092,294.953,0
325,325,0.000,0.000,159.913,133.548,178.555,134.509,185.376,156.062,0.000,...,288.114,149.165,289.070,140.376,301.784,132.577,302.770,115.932,292.981,0
326,326,177.555,113.941,0.000,0.000,172.643,130.621,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0
327,327,0.000,0.000,354.640,134.495,371.246,135.472,369.284,157.034,0.000,...,274.405,335.062,277.354,369.276,287.118,374.182,288.071,370.284,288.100,0


## step 9 - prediction 

In [491]:
def test(model, testData):
    '''this function given a dataframe and a SVM model returns the input dataframe updated to the predicted labels'''
    X = testData.iloc[:, 1:-1]
    ypred = model.predict(X)
    ydf = pd.DataFrame()
    columns=['frame_num','pred']
    for i,j in enumerate(ypred):
        tmp = [[]]
        tmp[0] = df[df['label_nums']==j]['label_names'].iloc[0]
        tmp = [i] + [tmp[0]]
        
        d = pd.DataFrame ([tmp], columns=columns)
        ydf = pd.concat([ydf,d], ignore_index=True)


    del testData['label']

    testData = pd.merge(testData, ydf, on=['frame_num'],how = 'inner')


    return testData

In [492]:
predicted_df = test(SVM, testdf)


## step 10 - output prediction video

In [424]:
def foreback_output(video_in, TestData,out_name):
    '''this function displays on a new mp4 file the predictions taken from the dataframe we implemented in step10'''
    out = dir_path+ 'output_'+video_in[:-4]+'/'
    out_img = out+'images/'
    Frames = [name for name in os.listdir(out_img)]
    Frames.sort()
    
    for i,f in enumerate(Frames):
        img = cv2.imread(out_img+f)
        shot = TestData['pred'][i]
        
        if i == 0:
            height, width, channels = img.shape
            video_out = cv2.VideoWriter(out+out_name, cv2.VideoWriter_fourcc('m', 'p', '4', 'v'), 30, (width, height))

        imgf = plot_foreback(img,width, height,shot)
        video_out.write(imgf)
    video_out.release()
    
    return

def plot_foreback(img, width, height, shot_type):
    shot = 'shot: {}'.format(shot_type)
    
    img = cv2.putText(img, shot, (int(width/30), int(13*height/15)), cv2.FONT_HERSHEY_DUPLEX, 1, (255, 255, 255), 1, cv2.LINE_AA)
    return img

In [425]:
foreback_output('test_2.avi', predicted_df, 'predicted_test_22.mp4')

## step11 - evaluate your tennis wrt to your faviorite player's stroke

In [439]:
def loadCompData(path):
    '''function that returns list of x and y of keypoints '''
    Files = [name for name in os.listdir(path+'json/')]
    Files.sort()
    
    df = [[] for i in range(len(Files))] 
    
    for i,f in enumerate(Files):
        dff=[]
        file = open(path +'json/'+f)
        j = json.load(file)
        if (j['people'] != []):
            #trick per trovare il giocatore TODO: migliora, per ora la mia idea: quello che ha piu dettagli
            identify = []
            for k in j['people']:
                body_size= (k['pose_keypoints_2d'][1])-(k['pose_keypoints_2d'][33])
                identify.append(body_size)
            z = identify.index(max(identify))
            q = j['people'][z]['pose_keypoints_2d']
            
            w=0
            while w != 72:
                l = q[w:w+2]
                dff.extend(l)
                w+=3
            df[i]=dff
    return df,i

In [454]:
#try with two videos
d1,i = loadCompData('/Users/uni/opt/output_test_1/')
d2,j = loadCompData('/Users/uni/opt/output_train2/')

In [455]:
def distance(m1,m2,i,j):
    '''distance between two videos'''
    model_points = m1/ np.linalg.norm(m1)
    input_points = m2/np.linalg.norm(m2)
    x = dtw.distance(model_points, input_points)
    return int(100-(100*(dtw.distance(model_points, input_points))))

In [458]:
def compare(model1, model2,i,j):
    '''function that compares two videos by comparing with dtw distance each keypoint of normalized to l2'''
    scores = []
    for k in range(24):
        m1 = [row[k] for row in model1]
        m2 = [row[k] for row in model2]
        scores.append(distance(m1,m2,i,j))
    return 'The two shots are '+ str(int(np.mean(scores)))+'% similar'

In [459]:
compare(d1, d2,i,j)

'The two shots are 71% similar'