In [7]:
import matplotlib.pyplot as plt
import numpy as np
import os, sys
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm
from scipy.signal import find_peaks, peak_prominences
import torch

#### Open CSV files with OpenFace analysis results

In [13]:
def get_one_video(file_name: str) -> pd.DataFrame:
    """Read one comma-separated analysis file and keep the columns used downstream.

    Parameters
    ----------
    file_name : str
        Path to the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The columns 'timestamp', 'confidence', 'pose_Rx', 'pose_Ry' and
        'pose_Rz', in that order.

    Raises
    ------
    KeyError
        If any of the five expected columns is missing from the file.
    """
    # pd.read_csv opens and closes the file itself, so no separate handle is needed.
    video_df = pd.read_csv(file_name, sep=',')
    return video_df[['timestamp', 'confidence', 'pose_Rx', 'pose_Ry', 'pose_Rz']]

#### Process OpenFace analysis results: if a frame's confidence is not above 0.6, replace its values with the average of the two neighbouring frames (the first and last frames copy their single neighbour)

In [14]:
def check_confidence(df, threshold=0.6):
    """Replace low-confidence rows with values taken from their neighbours.

    A row is kept as is when its 'confidence' is strictly greater than
    ``threshold``. Otherwise its 'confidence', 'pose_Rx', 'pose_Ry' and
    'pose_Rz' are replaced by

    * the mean of the previous and next rows (interior rows),
    * the next row's values (first row),
    * the previous row's values (last row).

    Neighbour values are always read from the *input* data, so a
    low-confidence neighbour is used unmodified. 'timestamp' is never changed.
    A frame with a single row is returned unchanged because it has no neighbour.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'timestamp', 'confidence', 'pose_Rx', 'pose_Ry', 'pose_Rz'.
    threshold : float, default 0.6
        Rows whose confidence is not above this value are replaced.

    Returns
    -------
    pd.DataFrame
        A new frame with a fresh 0..n-1 index; the input is not modified.
    """
    tracked = ['confidence', 'pose_Rx', 'pose_Ry', 'pose_Rz']

    # Work positionally so the result does not depend on the input's index labels.
    result = df.reset_index(drop=True)
    original = result[tracked].to_numpy(dtype=float)
    repaired = original.copy()
    last = len(original) - 1

    for pos in range(len(original)):
        # NaN confidence compares False here and is therefore treated as low.
        if original[pos, 0] > threshold:
            continue
        if 0 < pos < last:
            repaired[pos] = (original[pos - 1] + original[pos + 1]) / 2
        elif pos < last:
            # First row: only a following neighbour exists.
            repaired[pos] = original[pos + 1]
        elif pos > 0:
            # Last row: only a preceding neighbour exists.
            repaired[pos] = original[pos - 1]
        # Otherwise the frame has one row and nothing to borrow from.

    result[tracked] = repaired
    return result

#### Optional: get the informant id and search type from the corpus fragment name

In [15]:
def get_meta_info(token_name):
    """Split a fragment name into its informant id and search type.

    The name is cut on underscores. The informant id is the third
    dash-separated item of the first piece, and the search type is the
    second piece, e.g. 'RSLM-cr1-s14_NET_0' -> ('s14', 'NET').

    Raises IndexError when the name has fewer pieces than that.
    """
    pieces = token_name.split('_')
    head, search_type = pieces[0], pieces[1]
    informant = head.split('-')[2]
    return informant, search_type

#### Get results: count peaks, amplitude, frequencies etc

In [16]:
def create_results(path: str, min_peak_diff: float = 0.02) -> pd.DataFrame:
    """Summarise the 'pose_Ry' signal of every CSV file in a directory.

    For each ``*.csv`` file the function loads it with ``get_one_video``,
    repairs low-confidence rows with ``check_confidence``, finds all local
    maxima and minima of 'pose_Ry', and keeps an extremum only if it differs
    from at least one adjacent extremum by more than ``min_peak_diff``.

    Parameters
    ----------
    path : str
        Directory containing the CSV files (with or without trailing slash).
    min_peak_diff : float, default 0.02
        Minimal absolute difference in 'pose_Ry' between an extremum and one of
        its neighbouring extrema for the extremum to be kept.

    Returns
    -------
    pd.DataFrame
        One row per file with the columns

        * 'token_name'      – file name,
        * 'informant'       – id returned by ``get_meta_info``,
        * 'timestamps_num'  – timestamp difference between the last and first
          kept extremum (or between the last and first row when fewer than two
          extrema are kept),
        * 'peaks_num'       – number of kept extrema,
        * 'peaks_amplitude' – max minus min of 'pose_Ry' over the kept extrema
          (over the whole signal when fewer than two are kept),
        * 'frequencies'     – 1 / 'avg_time', or 0 when fewer than two extrema,
        * 'avg_time'        – mean timestamp gap between consecutive kept
          extrema, or 0 when fewer than two extrema.
    """
    columns = ['token_name', 'informant', 'timestamps_num', 'peaks_num',
               'peaks_amplitude', 'frequencies', 'avg_time']
    records = []

    # sorted() makes the row order independent of the file system's listing order.
    for file_name in tqdm(sorted(os.listdir(path=path))):
        if not file_name.endswith('.csv'):
            continue

        df = check_confidence(get_one_video(os.path.join(path, file_name)))
        informant, _search_type = get_meta_info(file_name.split('.')[0])

        # Plain arrays give positional indexing regardless of the frame's index.
        y = df['pose_Ry'].to_numpy(dtype=float)
        timestamps = df['timestamp'].to_numpy(dtype=float)

        max_peaks, _ = find_peaks(y)
        min_peaks, _ = find_peaks(-y)
        all_peaks = np.sort(np.concatenate((max_peaks, min_peaks)))

        if len(all_peaks) > 1:
            new_peaks = []
            for i, peak in enumerate(all_peaks):
                neighbours = []
                if i > 0:
                    neighbours.append(all_peaks[i - 1])
                if i < len(all_peaks) - 1:
                    neighbours.append(all_peaks[i + 1])
                # Keep the extremum if it stands out from either adjacent extremum.
                if max(abs(y[peak] - y[other]) for other in neighbours) > min_peak_diff:
                    new_peaks.append(peak)
        else:
            new_peaks = list(all_peaks)

        record = {'token_name': file_name,
                  'informant': informant,
                  'peaks_num': len(new_peaks)}

        # Here the frequency is computed.
        if len(new_peaks) > 1:
            peak_values = y[new_peaks]
            span = timestamps[new_peaks[-1]] - timestamps[new_peaks[0]]
            period = span / (len(new_peaks) - 1)
            record['peaks_amplitude'] = peak_values.max() - peak_values.min()
            record['avg_time'] = period
            record['frequencies'] = 1 / period
            record['timestamps_num'] = span
        else:
            # Too few extrema to define a period: fall back to whole-signal values.
            record['peaks_amplitude'] = y.max() - y.min()
            record['avg_time'] = 0
            record['frequencies'] = 0
            record['timestamps_num'] = timestamps[-1] - timestamps[0]

        records.append(record)

    # Build the frame once instead of appending row by row.
    negation_info_df = pd.DataFrame(records, columns=columns)
    negation_info_df['peaks_num'] = negation_info_df['peaks_num'].astype('int')
    negation_info_df['frequencies'] = negation_info_df['frequencies'].astype('float64')

    return negation_info_df

In [18]:
# Summarise every processed CSV in the directory, then preview the first rows.
df = create_results(path='../new_extracted/annot_cut_3/processed/')
df.head()

In [20]:
def get_fragment_name(fragment):
    """Return the fragment name for a file name by dropping a trailing '.csv'.

    Only the extension at the very end is removed; a '.csv' that occurs
    elsewhere in the name is left alone. Names without the extension are
    returned unchanged.
    """
    extension = '.csv'
    if fragment.endswith(extension):
        return fragment[:-len(extension)]
    return fragment

# Rename the file-name column to 'fragment' and convert each file name to a fragment name.
df.rename(columns={'token_name': 'fragment'}, inplace=True)
df['fragment'] = df['fragment'].apply(get_fragment_name)

#### Add annotated information about sign type

In [21]:
# Load the semicolon-separated sign annotations and discard the 'Column1' column.
signs_df = pd.read_csv('../sign_annotations.csv', sep=';').drop(columns=['Column1'])

In [22]:
# Display the annotation table (one 'sign' label per 'fragment').
signs_df

Unnamed: 0,fragment,sign
0,RSLM-cr1-s14_NET_0,НЕТ
1,RSLM-cr4-s13_NE_6,НЕРЕГУЛЯРНОЕ
2,RSLM-cr5-s22_NE_10,НЕРЕГУЛЯРНОЕ
3,RSLM-m5-s40-d-std_NE_20,НЕРЕГУЛЯРНОЕ
4,RSLM-m5-s40-d-std_NEG_0,НЕРЕГУЛЯРНОЕ
...,...,...
69,RSLN-n5-s25-d-std_NET_155,НЕТ
70,RSLN-n5-s25-d-std_NET_156,НЕТ
71,RSLN-n5-s25-d-std_NET_158,НЕТ
72,RSLN-n5-s25-d-std_NET_159,НЕТ


In [23]:
# Attach the sign annotation to each fragment. The result gets its own name
# (`df1`, which the following cells display and save) instead of overwriting
# `df`, so this cell can be re-run: `df` still has its 'fragment' column.
df1 = df.set_index('fragment').join(signs_df.set_index('fragment'))

In [24]:
# Display the per-fragment measures together with the 'sign' annotation column.
df1

Unnamed: 0_level_0,informant,timestamps_num,peaks_num,peaks_amplitude,frequencies,avg_time,sign
fragment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
RSLM-cr1-s14_NET_0,s14,0.28,3,0.2410,7.142857,0.140000,НЕТ
RSLM-cr4-s13_NE_6,s13,0.52,4,0.2630,5.769231,0.173333,НЕРЕГУЛЯРНОЕ
RSLM-cr5-s22_NE_10,s22,0.12,2,0.1060,8.333333,0.120000,НЕРЕГУЛЯРНОЕ
RSLM-m5-s40-d-std_NEG_0,s40,0.20,2,0.1700,5.000000,0.200000,НЕРЕГУЛЯРНОЕ
RSLM-m5-s40-d-std_NE_20,s40,0.36,3,0.3350,5.555556,0.180000,НЕРЕГУЛЯРНОЕ
...,...,...,...,...,...,...,...
RSLN-n5-s25-d-std_NET_155,s25,1.12,11,0.4710,8.928571,0.112000,НЕТ
RSLN-n5-s25-d-std_NET_156,s25,0.76,6,0.5640,6.578947,0.152000,НЕТ
RSLN-n5-s25-d-std_NET_158,s25,1.00,10,0.7710,9.000000,0.111111,НЕТ
RSLN-n5-s25-d-std_NET_159,s25,1.24,14,0.1805,10.483871,0.095385,НЕТ


### Save all

In [25]:
# Write the final table to CSV (the index is written as the first column).
# NOTE(review): this file lands in the same directory that create_results()
# scans for '*.csv' inputs, so a later re-run will try to process it as an
# input file — consider saving it to a different directory.
df1.to_csv('../new_extracted/annot_cut_3/processed/processed_fragments.csv')