In [2]:
import matplotlib.pyplot as plt
import numpy as np
import os, sys
import pandas as pd
import seaborn as sns
from scipy.signal import find_peaks, peak_prominences
import torch

In [3]:
def get_one_video(file_name: str) -> pd.DataFrame:
    """Read one per-video CSV and keep only the timing, confidence and head-rotation columns.

    Parameters
    ----------
    file_name : str
        Path to a comma-separated file whose header contains the columns
        ' timestamp', ' confidence', ' pose_Rx', ' pose_Ry' and ' pose_Rz'
        (note the leading space in every name).

    Returns
    -------
    pandas.DataFrame
        The five columns listed above, in that order, with their original
        (space-prefixed) names.

    Raises
    ------
    FileNotFoundError
        If ``file_name`` does not exist.
    KeyError
        If any of the expected columns is missing from the file.
    """
    #  read_csv opens and closes the file itself, so no separate open() is needed
    video_df = pd.read_csv(file_name, sep=',')
    return video_df[[' timestamp', ' confidence', ' pose_Rx', ' pose_Ry', ' pose_Rz']]

## Работа с корпусными данными

In [5]:
#  The annotation of the signs covered by non-manual marking is stored in a separate file;
#  only its first seven columns are kept.
with open('распространение.csv', encoding='utf-8') as spreading_file:
    spreading_table = pd.read_csv(spreading_file, sep=',')

spr_df = spreading_table.iloc[:, :7]

spr_df.head()

Unnamed: 0,token_name,spreading_POS,spreading_annot,Annotation_begin,Annotation_end,annot_before,annot_after
0,Поиск 1_166,SUBJ_NOTEXIST,опыт_нет,13690.0,14610.0,1,0
1,Поиск 1_104,VERB_NEG,слышать_нет,19850.0,20660.0,1,0
2,Поиск 2_226,SUBJ_VERB.NEG,я_не.знать,27530.0,28880.0,1,0
3,Поиск 6_10,VERB_NEVER,ехать_никогда,32700.0,33540.0,0,0
4,Поиск 2_77,SUBJ_VERB.NEG,я_не.помнить,40290.0,40910.0,1,0


In [44]:
def negation_type(spr):
    """Map a spreading label to a negation type.

    The exact label 'DISC' gives 'PART'. Otherwise the first marker from the
    ordered table below that occurs as a substring of ``spr`` decides the
    result. Returns None when no marker matches.
    """
    if spr == 'DISC':
        return 'PART'

    #  Order matters: when several markers occur in one label the earliest one wins.
    ordered_markers = (
        ('DOUBLE', 'DOUBLE'),
        ('VERB.NEG', 'INCORP'),
        ('VERB_NEG', 'VERB.NEG'),
        ('NOTEXIST', 'VERB.NEG'),
        ('NOBODY', 'PRON'),
        ('NOTHING', 'PRON'),
        ('NEVER', 'PRON'),
    )
    for marker, label in ordered_markers:
        if marker in spr:
            return label
    return None

In [45]:
def get_token(full_name):
    """Return the third '/'-separated component of ``full_name`` with '.mp4' replaced by '.csv'."""
    components = full_name.split('/')
    video_name = components[2]
    return video_name.replace('.mp4', '.csv')

def get_path(full_path):
    """Extract the third '-'-separated field from the seventh backslash-separated path component.

    The '.eaf' suffix is removed from that component before it is split on '-'.
    """
    components = full_path.split('\\')
    annotation_name = components[6].replace('.eaf', '')
    fields = annotation_name.split('-')
    return fields[2]

#  Per-fragment metadata file: tab-separated, no header row.
info_columns = ['name', 'path', 'annot1', 'annot2', 'full_annot', 'other']
with open('sentences_info.csv', encoding='utf-8') as inf:
    df_inf = pd.read_csv(inf, sep='\t', header=None, names=info_columns)

#  Reduce the raw values to the keys used for lookups: 'name' goes through
#  get_token and 'path' through get_path.
df_inf['name'] = df_inf['name'].apply(get_token)
df_inf['path'] = df_inf['path'].apply(get_path)

df_inf.head()

Unnamed: 0,name,path,annot1,annot2,full_annot,other
0,Поиск 1_0.csv,s14,46990,51660,"[(46970, 47650, 'вход'), (47650, 48350, 'собак...",
1,Поиск 1_1.csv,s14,66590,66951,"[(66581, 66951, 'нет'), (66951, 67201, 'ладно')]",
2,Поиск 1_2.csv,s21,68415,68725,"[(67665, 68415, 'поднять.скатерть'), (68415, 6...",
3,Поиск 1_3.csv,s21,71487,72007,"[(69997, 71487, 'кресло'), (71487, 72007, 'нет')]",
4,Поиск 1_4.csv,s23,54005,54345,"[(53995, 54345, 'нет'), (54345, 54905, 'точно')]",


In [46]:
#  Column order of the resulting table (the same order as in the displayed output).
negation_info_columns = ['token_name', 'timestamps_num', 'peaks_num',
                         'abs_amplitude', 'peaks_amplitude', 'check_full_sentence', 'frequencies',
                         'avg_time', 'informant', 'spreading', 'spreading_num']

#  One dict per fragment. The DataFrame is built once after the loop:
#  DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
negation_records = []

#  Every fragment with non-manual marking is a separate CSV in this folder.
#  sorted() makes the row order independent of the file system.
for file_name in sorted(os.listdir(path='./processed_with_marking/')):
    if not file_name.endswith('.csv'):
        continue

    df = get_one_video('./processed_with_marking/' + file_name)
    df.columns = ['timestamp', 'confidence', 'pose_Rx', 'pose_Ry', 'pose_Rz']
    y = df['pose_Ry']
    timestamps = df['timestamp']

    #  fragment name
    df_dict = {'token_name': file_name}

    #  first way of measuring the amplitude (full range of pose_Ry); not used further on
    df_dict['abs_amplitude'] = y.max() - y.min()

    #  local maxima and local minima of pose_Ry, merged in temporal order
    max_peaks, _ = find_peaks(y)
    min_peaks, _ = find_peaks(-y)
    all_peaks = np.sort(np.concatenate((max_peaks, min_peaks)))

    #  Drop extrema that differ from every neighbouring extremum by 0.02 or less.
    #  find_peaks returns positions, so values are read positionally with .iat.
    new_peaks = []
    if len(all_peaks) > 1:
        last = len(all_peaks) - 1
        for i, peak in enumerate(all_peaks):
            diff = []
            if i > 0:
                diff.append(abs(y.iat[peak] - y.iat[all_peaks[i - 1]]))
            if i < last:
                diff.append(abs(y.iat[peak] - y.iat[all_peaks[i + 1]]))
            if max(diff) > 0.02:
                new_peaks.append(peak)
    else:
        new_peaks = list(all_peaks)

    df_dict['peaks_num'] = len(new_peaks)

    if len(new_peaks) > 1:
        #  amplitude over the retained extrema, and mean time between consecutive extrema
        peak_values = y.iloc[new_peaks]
        peaks_span = timestamps.iat[new_peaks[-1]] - timestamps.iat[new_peaks[0]]
        avg_time = peaks_span / (len(new_peaks) - 1)
        df_dict['check_full_sentence'] = False
        df_dict['peaks_amplitude'] = peak_values.max() - peak_values.min()
        df_dict['avg_time'] = avg_time
        df_dict['frequencies'] = 1 / avg_time
        df_dict['timestamps_num'] = peaks_span
    else:
        #  0 or 1 extremum: flag the fragment and fall back to the full-range amplitude
        df_dict['check_full_sentence'] = True
        df_dict['peaks_amplitude'] = df_dict['abs_amplitude']
        df_dict['avg_time'] = 0
        df_dict['frequencies'] = 0
        df_dict['timestamps_num'] = timestamps.iat[-1] - timestamps.iat[0]

    #  Spreading label of the fragment and the number of '_'-separated parts in it
    #  ('VERB_NEG' is counted as one part). splitext strips only the extension.
    token = os.path.splitext(file_name)[0]
    df_dict['spreading'] = spr_df.loc[spr_df['token_name'] == token, 'spreading_POS'].iloc[0]
    df_dict['spreading_num'] = len(df_dict['spreading'].replace('VERB_NEG', 'VERB.NEG').split('_'))

    #  informant
    df_dict['informant'] = df_inf.loc[df_inf['name'] == file_name, 'path'].values[0]

    negation_records.append(df_dict)

negation_info_df = pd.DataFrame(negation_records, columns=negation_info_columns)
negation_info_df['peaks_num'] = negation_info_df['peaks_num'].astype('int')
negation_info_df['frequencies'] = negation_info_df['frequencies'].astype('float64')
negation_info_df['neg_type'] = negation_info_df.spreading.apply(negation_type)

negation_info_df.head()

Unnamed: 0,token_name,timestamps_num,peaks_num,abs_amplitude,peaks_amplitude,check_full_sentence,frequencies,avg_time,informant,spreading,spreading_num,neg_type
0,Поиск 1_0.csv,0.24,1,0.253,0.253,True,0.0,0.0,s14,DISC,1.0,PART
1,Поиск 1_104.csv,0.32,3,0.11,0.068,False,6.25,0.16,s4,VERB_NEG,1.0,VERB.NEG
2,Поиск 1_111.csv,0.167,3,0.066,0.049,False,11.976048,0.0835,s5,DISC,1.0,PART
3,Поиск 1_112.csv,0.234,5,0.241,0.175,False,17.094017,0.0585,s25,DISC,1.0,PART
4,Поиск 1_113.csv,0.166,2,0.23,0.212,False,6.024096,0.166,s17,DISC,1.0,PART


In [47]:
#  Mean frequency over the fragments whose frequency is non-zero.
negation_info_df.loc[negation_info_df['frequencies'] != 0, 'frequencies'].mean()

9.053858433540954

In [48]:
#  Save the per-fragment table without the index column.
negation_info_df.to_csv(path_or_buf='negation_info.csv', index=False)

Статистика по информантам

In [30]:
#  Per-informant means of peak count and peak amplitude.
by_informant = negation_info_df.groupby('informant')
informant_df = by_informant[['peaks_num', 'peaks_amplitude']].mean()

#  The mean frequency ignores fragments whose frequency is 0.
nonzero_freq = negation_info_df[negation_info_df.frequencies != 0]
informant_df['frequencies'] = nonzero_freq.groupby('informant')['frequencies'].mean()

#  Number of fragments per informant.
informant_df['inf_count'] = by_informant['peaks_num'].count()
informant_df

Unnamed: 0_level_0,peaks_num,peaks_amplitude,frequencies,inf_count
informant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
s1,4.0,0.486,4.991681,1
s13,5.0,0.259545,6.921145,11
s14,5.625,0.142375,9.11748,8
s15,3.5,0.0565,9.821429,2
s16,2.5,0.1055,7.5,2
s17,6.4,0.2243,12.130786,10
s18,4.0,0.224,6.29888,3
s19,3.727273,0.316727,9.777017,11
s2,3.347826,0.193609,9.26365,23
s21,4.333333,0.2695,7.087302,6


Статистика по типам отрицания

In [31]:
#  Per-negation-type means of peak count and peak amplitude.
by_neg_type = negation_info_df.groupby('neg_type')
mean_df = by_neg_type[['peaks_num', 'peaks_amplitude']].mean()

#  The mean frequency ignores fragments whose frequency is 0.
nonzero_freq = negation_info_df[negation_info_df.frequencies != 0]
mean_df['frequencies'] = nonzero_freq.groupby('neg_type')['frequencies'].mean()

#  Number of fragments per negation type.
mean_df['type_count'] = by_neg_type['peaks_num'].count()
mean_df

Unnamed: 0_level_0,peaks_num,peaks_amplitude,frequencies,type_count
neg_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PART,3.315789,0.258868,10.3978,38
PRON,5.25,0.228542,8.460794,24
VERB.NEG,4.776316,0.241934,8.701362,76


## Работа с элицитированными данными

In [42]:
#  Spreading labels of the elicited fragments: tab-separated, no header row.
elicitation_columns = ['token_name', 'spreading_POS', 'unnamed']
with open('./elicitation/elicitation with marking/sentences_info_e.csv', encoding='utf-8') as inf:
    spr_df_e = pd.read_csv(inf, sep='\t', header=None, names=elicitation_columns)

spr_df_e.head()

Unnamed: 0,token_name,spreading_POS,unnamed
0,be.not nothing - c3.mp4,VERB.NEG,
1,break not nothing - c3.mp4,VERB.NEG,
2,call nobody - c1 (1).mp4,NOBODY,
3,call nobody - c3.mp4,NOBODY,
4,come not nobody - c3.mp4,VERB.NEG_NOBODY,


In [43]:
def get_informant(file_name):
    """Return the first of 'c1', 'c2', 'c3', 'c4' that occurs in ``file_name``, or None."""
    for code in ('c1', 'c2', 'c3', 'c4'):
        if code in file_name:
            return code
    return None

In [54]:
def elic_negation_type(spr):
    """Map the spreading label of an elicited fragment to a negation type.

    A label containing a verbal-negation marker ('VERB.NEG' or 'VERB_NEG')
    together with 'NOBODY' or 'NOTHING' gives 'DOUBLE'. Any other label is
    matched exactly against the table below; unknown labels give None.
    """
    has_verb_negation = 'VERB.NEG' in spr or 'VERB_NEG' in spr
    has_negative_pronoun = 'NOBODY' in spr or 'NOTHING' in spr
    if has_verb_negation and has_negative_pronoun:
        return 'DOUBLE'

    exact_labels = {
        'NEG_NOTHING': 'DOUBLE',
        'VERB.NEG': 'INCORP',
        'VERB_NEG': 'VERB.NEG',
        'NOBODY': 'PRON',
        'NOTHING': 'PRON',
        'NEVER': 'PRON',
    }
    return exact_labels.get(spr)

In [55]:
#  Column order of the resulting table (the same order as in the displayed output).
el_negation_info_columns = ['token_name', 'timestamps_num', 'peaks_num',
                            'abs_amplitude', 'peaks_amplitude', 'check_full_sentence', 'frequencies',
                            'avg_time', 'informant', 'negation_type', 'spreading', 'spreading_num']

#  One dict per fragment. The DataFrame is built once after the loop:
#  DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
el_negation_records = []

#  sorted() makes the row order independent of the file system.
for file_name in sorted(os.listdir(path='./elicitation/processed_videos_elicitation_with_marking/')):
    if not file_name.endswith('.csv'):
        continue

    df = get_one_video('./elicitation/processed_videos_elicitation_with_marking/' + file_name)
    df.columns = ['timestamp', 'confidence', 'pose_Rx', 'pose_Ry', 'pose_Rz']
    y = df['pose_Ry']
    timestamps = df['timestamp']

    #  fragment name
    df_dict = {'token_name': file_name}

    #  first way of measuring the amplitude (full range of pose_Ry)
    df_dict['abs_amplitude'] = y.max() - y.min()

    #  local maxima and local minima of pose_Ry, merged in temporal order
    max_peaks, _ = find_peaks(y)
    min_peaks, _ = find_peaks(-y)
    all_peaks = np.sort(np.concatenate((max_peaks, min_peaks)))

    #  Drop extrema that differ from every neighbouring extremum by 0.02 or less.
    #  find_peaks returns positions, so values are read positionally with .iat.
    new_peaks = []
    if len(all_peaks) > 1:
        last = len(all_peaks) - 1
        for i, peak in enumerate(all_peaks):
            diff = []
            if i > 0:
                diff.append(abs(y.iat[peak] - y.iat[all_peaks[i - 1]]))
            if i < last:
                diff.append(abs(y.iat[peak] - y.iat[all_peaks[i + 1]]))
            if max(diff) > 0.02:
                new_peaks.append(peak)
    else:
        new_peaks = list(all_peaks)

    df_dict['peaks_num'] = len(new_peaks)

    if len(new_peaks) > 1:
        #  amplitude over the retained extrema, and mean time between consecutive extrema
        peak_values = y.iloc[new_peaks]
        peaks_span = timestamps.iat[new_peaks[-1]] - timestamps.iat[new_peaks[0]]
        avg_time = peaks_span / (len(new_peaks) - 1)
        df_dict['check_full_sentence'] = False
        df_dict['peaks_amplitude'] = peak_values.max() - peak_values.min()
        df_dict['avg_time'] = avg_time
        df_dict['frequencies'] = 1 / avg_time
        df_dict['timestamps_num'] = peaks_span
    else:
        #  0 or 1 extremum: flag the fragment and fall back to the full-range amplitude
        df_dict['check_full_sentence'] = True
        df_dict['peaks_amplitude'] = df_dict['abs_amplitude']
        df_dict['avg_time'] = 0
        df_dict['frequencies'] = 0
        df_dict['timestamps_num'] = timestamps.iat[-1] - timestamps.iat[0]

    #  Spreading label (looked up by the .mp4 name), the number of '_'-separated
    #  parts in it ('VERB_NEG' is counted as one part) and the negation type.
    video_name = file_name.replace('.csv', '.mp4')
    df_dict['spreading'] = spr_df_e.loc[spr_df_e['token_name'] == video_name, 'spreading_POS'].iloc[0]
    df_dict['spreading_num'] = len(df_dict['spreading'].replace('VERB_NEG', 'VERB.NEG').split('_'))
    df_dict['negation_type'] = elic_negation_type(df_dict['spreading'])

    #  informant
    df_dict['informant'] = get_informant(file_name)

    el_negation_records.append(df_dict)

    #  Optional visual check of the detected extrema:
    #plt.plot(df['timestamp'], y)
    #plt.plot(df['timestamp'][new_peaks], y[new_peaks], "x")
    #plt.show()

el_negation_info_df = pd.DataFrame(el_negation_records, columns=el_negation_info_columns)
el_negation_info_df['peaks_num'] = el_negation_info_df['peaks_num'].astype('int')
el_negation_info_df['frequencies'] = el_negation_info_df['frequencies'].astype('float64')

el_negation_info_df.head()

Unnamed: 0,token_name,timestamps_num,peaks_num,abs_amplitude,peaks_amplitude,check_full_sentence,frequencies,avg_time,informant,negation_type,spreading,spreading_num
0,be.not nothing - c3.csv,0.368,2,0.154,0.154,False,2.717391,0.368,c3,VERB.NEG,VERB.NEG,1.0
1,break not nothing - c3.csv,0.567,1,0.263,0.263,True,0.0,0.0,c3,VERB.NEG,VERB.NEG,1.0
2,call nobody - c1 (1).csv,0.568,5,0.147,0.134,False,7.042254,0.142,c1,PRON,NOBODY,1.0
3,call nobody - c3.csv,0.167,2,0.198,0.04,False,5.988024,0.167,c3,PRON,NOBODY,1.0
4,come not nobody - c3.csv,1.268,7,0.496,0.496,False,4.731861,0.211333,c3,DOUBLE,VERB.NEG_NOBODY,2.0


In [56]:
#  Mean frequency over the elicited fragments whose frequency is non-zero.
el_negation_info_df.loc[el_negation_info_df['frequencies'] != 0, 'frequencies'].mean()

4.584930125579849

In [57]:
#  Save the per-fragment table of the elicited data without the index column.
el_negation_info_df.to_csv(path_or_buf='el_negation_info.csv', index=False)