In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import os
import re
import textgrid
import warnings
from tqdm import tqdm_notebook
warnings.filterwarnings("ignore")

In [51]:
class DocParser(object):
    """Extract per-utterance speech-rate records from transcription files.

    ``parse`` dispatches on ``lang`` to a format-specific reader. Every reader
    returns a list of dicts that may carry the keys ``PARTICIPANT``, ``TEXT``,
    ``DURATION``, ``SYLLABLES``, ``RATE`` and ``R_YEAR``.

    ``DURATION`` is the difference of two ``TIME_VALUE`` attributes (for
    TextGrid input: the interval length multiplied by 1000). ``RATE`` divides
    the syllable count by ``DURATION / 1000``, so time values are presumably
    milliseconds and the rate syllables per second -- TODO confirm.
    """

    _LATIN_VOWELS = frozenset({'o', 'e', 'a', 'i', 'ə', 'u'})
    _CYRILLIC_VOWELS = frozenset({'у', 'е', 'ы', 'а', 'о', 'э', 'я', 'и', 'ю', 'ё'})

    # Characters counted as one syllable each, keyed by language.
    _VOWELS = {
        'Beserman': _LATIN_VOWELS,
        'Chukchi': _LATIN_VOWELS,
        'Bashkir': frozenset({'o', 'e', 'a', 'i', 'ə', 'u', 'ö', 'ä'}),
        'Ustja': _CYRILLIC_VOWELS,
        'Rogovatka': _CYRILLIC_VOWELS,
        'Malinino': _CYRILLIC_VOWELS,
        'Opochka': _CYRILLIC_VOWELS,
        'Azeri': frozenset({'o', 'e', 'a', 'i', 'ə', 'u', 'ü', 'ö'}),
        # Mixed Cyrillic/Latin inventory, reproduced as written (it contains
        # both a Latin 'y' and a Cyrillic 'у').
        'Kabardian': frozenset({'э', 'ы', 'а', 'и', 'е', 'y', 'я', 'о', 'ю', 'ё',
                                'ə', 'e', 'a', 'o', 'u', 'у'}),
    }

    # Any of these marks an utterance as uncountable (except for Kabardian).
    _BRACKETS = frozenset({'(', ')', '[', ']', '<', '>'})

    def __init__(self, lang):
        """Remember which language (and therefore which reader) to use."""
        self.lang = lang

    def parse(self, xml, name):
        """Parse the content of one file with the reader that matches ``self.lang``.

        Args:
            xml: full text of the file (ELAN XML, or TextGrid text for Azeri).
            name: file name; only the Azeri and Russian-dialect readers use it,
                to derive the recording year.

        Returns:
            A list of record dicts, or ``None`` when ``self.lang`` has no reader.
        """
        if self.lang == 'Bashkir':
            return self._get_data_bashkir(xml)
        if self.lang == 'Beserman' or self.lang == 'Chukchi':
            return self._get_data_beserman(xml)
        if self.lang == 'Azeri':
            return self._get_data_azeri(xml, name)
        if self.lang == 'Kabardian':
            return self._get_data_kabardian(xml)
        if self.lang in {'Ustja', 'Rogovatka', 'Malinino', 'Opochka'}:
            # 'Ustja' is read with the Rogovatka reader; _get_data_ustja is not
            # used by this dispatcher.
            return self._get_data_rogovatka(xml, name)
        return None

    # ------------------------------------------------------------------ helpers

    def __calculate_syllables(self, text):
        """Count the vowel characters of ``text`` for the current language.

        Returns 0 for ``None``. For every language except Kabardian the count
        is abandoned (0 is returned) as soon as a bracket character is met.

        Raises:
            ValueError: ``self.lang`` has no vowel inventory.
        """
        if text is None:
            return 0

        vowels = self._VOWELS.get(self.lang)
        if vowels is None:
            raise ValueError('No vowel inventory for language %r' % (self.lang,))

        brackets_abort = self.lang != 'Kabardian'
        n_syl = 0
        for char in text:
            if char.lower() in vowels:
                n_syl += 1
            elif brackets_abort and char in self._BRACKETS:
                return 0
        return n_syl

    @staticmethod
    def _rate(syllables, duration_ms):
        """Return ``syllables / (duration_ms / 1000)``, or NaN for a zero duration.

        A zero-length annotation has no defined rate; NaN keeps the record
        instead of aborting the whole file with ZeroDivisionError.
        """
        if not duration_ms:
            return float('nan')
        return syllables / (duration_ms / 1000)

    @staticmethod
    def _extract_year(text):
        """Return the first ``20dd`` four-digit run in ``text`` as int, else None."""
        match = re.search(r'20\d{2}', text)
        if match is None:
            return None
        return int(match.group(0))

    @staticmethod
    def _has_cyrillic(text):
        """True when ``text`` contains a character in the range а-я."""
        return re.search('[а-я]', text) is not None

    @staticmethod
    def _index_time_slots(soup):
        """Map TIME_SLOT_ID -> TIME_SLOT tag with a single pass over the document.

        The first tag wins for a repeated id, matching what ``soup.find`` would
        return. TIME_VALUE is read lazily, so slots without a value only matter
        when an annotation actually references them.
        """
        slots = {}
        for slot in soup.find_all('TIME_SLOT'):
            slots.setdefault(slot.attrs.get('TIME_SLOT_ID'), slot)
        return slots

    @staticmethod
    def _duration(exp, slots):
        """Return end minus start TIME_VALUE of one ALIGNABLE_ANNOTATION."""
        start_time = int(slots[exp.attrs['TIME_SLOT_REF1']].attrs['TIME_VALUE'])
        end_time = int(slots[exp.attrs['TIME_SLOT_REF2']].attrs['TIME_VALUE'])
        return end_time - start_time

    def _fill_text(self, data, exp):
        """Add TEXT / SYLLABLES / RATE to ``data`` from the annotation value.

        Iterates the children of ANNOTATION_VALUE; the last child wins, and an
        empty value leaves ``data`` without these three keys.
        """
        for child in exp.find('ANNOTATION_VALUE'):
            data['TEXT'] = child
            data['SYLLABLES'] = self.__calculate_syllables(child)
            data['RATE'] = self._rate(data['SYLLABLES'], data['DURATION'])

    def _elan_records(self, soup, tiers, speaker_attr, extra=None,
                      skip_cyrillic=False, require_text=False):
        """Build one record per ALIGNABLE_ANNOTATION found in ``tiers``.

        Args:
            soup: parsed document, used to resolve time slots.
            tiers: TIER tags to read.
            speaker_attr: TIER attribute stored as PARTICIPANT.
            extra: constant key/value pairs copied into every record.
            skip_cyrillic: drop annotations whose text contains а-я.
            require_text: drop annotations that end up without TEXT.
        """
        slots = self._index_time_slots(soup)
        records = []
        for tier in tiers:
            speaker = tier.attrs[speaker_attr]
            for exp in tier.find_all('ALIGNABLE_ANNOTATION'):
                if skip_cyrillic and self._has_cyrillic(exp.text):
                    continue
                data = {'PARTICIPANT': speaker}
                if extra:
                    data.update(extra)
                data['DURATION'] = self._duration(exp, slots)
                self._fill_text(data, exp)
                if require_text and 'TEXT' not in data:
                    continue
                records.append(data)
        return records

    def __find_year(self, soup):
        """Return the year found in MEDIA_DESCRIPTOR's MEDIA_URL, or None."""
        descriptor = soup.find('MEDIA_DESCRIPTOR')
        if descriptor is None:
            return None
        return self._extract_year(descriptor.attrs.get('MEDIA_URL', ''))

    # ------------------------------------------------------------------ readers

    def _get_data_rogovatka(self, xml, name):
        """Read 'Praat' and 'default-lt' tiers; speaker is the TIER_ID.

        R_YEAR comes from the file name (0 when it holds no year). Annotations
        with an empty value are skipped.
        """
        soup = bs(xml, 'xml')
        r_year = self._extract_year(name)
        if r_year is None:
            r_year = 0

        tiers = soup.find_all('TIER', LINGUISTIC_TYPE_REF='Praat')
        tiers += soup.find_all('TIER', LINGUISTIC_TYPE_REF='default-lt')
        return self._elan_records(soup, tiers, 'TIER_ID',
                                  extra={'R_YEAR': r_year}, require_text=True)

    def _get_data_azeri(self, xml, name):
        """Read TextGrid tiers whose name contains 'ransc'.

        Intervals are indexed as ``exp[0]``, ``exp[1]``, ``exp[2]`` -- presumably
        (start, end, text) in seconds. Empty intervals, intervals containing
        а-я and intervals with zero counted syllables are skipped. R_YEAR is the
        string '20' plus characters 4-5 of the file name.
        """
        tg = textgrid.TextGrid(xml)
        records = []

        for tier in tg.tiers:
            if 'ransc' not in tier.nameid:
                continue
            for exp in tier.simple_transcript:
                text = exp[2]
                if text == '' or self._has_cyrillic(text):
                    continue
                duration = (float(exp[1]) - float(exp[0])) * 1000
                syllables = self.__calculate_syllables(text)
                if syllables == 0:
                    continue
                records.append({
                    'TEXT': text,
                    'PARTICIPANT': tier.nameid,
                    'DURATION': duration,
                    'SYLLABLES': syllables,
                    'RATE': self._rate(syllables, duration),
                    'R_YEAR': '20' + name[4:6],
                })
        return records

    def _get_data_ustja(self, xml):
        """Read 'Praat' tiers only; speaker is the TIER_ID, no R_YEAR is set.

        Kept for callers outside this class; ``parse`` does not route here.
        """
        soup = bs(xml, 'xml')
        tiers = soup.find_all('TIER', LINGUISTIC_TYPE_REF='Praat')
        return self._elan_records(soup, tiers, 'TIER_ID')

    def _get_data_beserman(self, xml):
        """Read 'orig' and 'default-lt' tiers that have a PARTICIPANT attribute.

        Annotations containing а-я are skipped; R_YEAR is taken from the media
        URL (None when no year is found there).
        """
        soup = bs(xml, 'xml')

        tiers = soup.find_all('TIER', PARTICIPANT=True, LINGUISTIC_TYPE_REF='orig')
        tiers += soup.find_all('TIER', PARTICIPANT=True, LINGUISTIC_TYPE_REF='default-lt')

        year = self.__find_year(soup)
        return self._elan_records(soup, tiers, 'PARTICIPANT',
                                  extra={'R_YEAR': year}, skip_cyrillic=True)

    def _get_data_kabardian(self, xml):
        """Read 'default-lt' tiers that have a PARTICIPANT attribute; no R_YEAR."""
        soup = bs(xml, 'xml')
        tiers = soup.find_all('TIER', LINGUISTIC_TYPE_REF='default-lt', PARTICIPANT=True)
        return self._elan_records(soup, tiers, 'PARTICIPANT')

    def _get_data_bashkir(self, xml):
        """Read 'id' tiers and assemble each utterance from the speaker's 'tx' tier.

        For every ALIGNABLE_ANNOTATION of an 'id' tier, the REF_ANNOTATIONs of
        the same participant's 'tx' tier that point at it are joined with
        spaces. TEXT is the longest leading run of those words that contains no
        а-я character; when the very first word already contains one, the
        record is emitted without TEXT / SYLLABLES / RATE.

        NOTE(review): DURATION always spans the whole annotation, even when
        TEXT was cut short at a Cyrillic word -- confirm this is intended.
        """
        soup = bs(xml, 'xml')
        slots = self._index_time_slots(soup)
        records = []

        for tier in soup.find_all('TIER', PARTICIPANT=True, LINGUISTIC_TYPE_REF='id'):
            part = tier.attrs['PARTICIPANT']
            expressions = tier.find_all('ALIGNABLE_ANNOTATION')
            if not expressions:
                continue

            # Group the word annotations by the utterance they reference once
            # per tier instead of searching the tier for every utterance.
            texts = soup.find('TIER', PARTICIPANT=part, LINGUISTIC_TYPE_REF='tx')
            words_by_parent = {}
            for ref in texts.find_all('REF_ANNOTATION'):
                words_by_parent.setdefault(ref.attrs.get('ANNOTATION_REF'), []).append(ref)

            for exp in expressions:
                data = {'PARTICIPANT': part, 'DURATION': self._duration(exp, slots)}
                ann_id = exp.attrs['ANNOTATION_ID']

                text = ''
                clean_prefix = None
                for word in words_by_parent.get(ann_id, []):
                    text += word.find('ANNOTATION_VALUE').text
                    text += ' '
                    if not self._has_cyrillic(text):
                        clean_prefix = text[:-1]

                if clean_prefix is not None:
                    data['TEXT'] = clean_prefix
                    data['SYLLABLES'] = self.__calculate_syllables(clean_prefix)
                    data['RATE'] = self._rate(data['SYLLABLES'], data['DURATION'])

                records.append(data)
        return records

In [50]:
class DataCollector(object):
    """Collect parsed utterances for one language and attach speaker metadata.

    Pipeline (see ``get_data``): parse every transcription file of the
    language, keep the files in which a single participant accounts for at
    least 70% of the summed DURATION, and return that participant's rows
    enriched with AGE, GENDER, LANGUAGE and FILE.
    """

    _FRAME_COLUMNS = ['PARTICIPANT', 'TEXT', 'DURATION', 'SYLLABLES', 'RATE', 'R_YEAR']
    _OUTPUT_COLUMNS = _FRAME_COLUMNS + ['AGE', 'GENDER', 'LANGUAGE', 'FILE']
    _LONG_COLUMNS = ['Name', 'File Name', 'Duration', 'Part', 'Lang']

    # Languages whose transcriptions are read from '.eaf' files.
    _ELAN_LANGS = frozenset({'Beserman', 'Bashkir', 'Chukchi', 'Ustja', 'Kabardian', 'Rogovatka'})

    _DEFAULT_DATA_DIR = r'C:\Users\Masha\Documents\Diploma\Data'
    _MIN_SHARE = 0.7

    def __init__(self, lang, data_dir=None, meta_dir=None):
        """Load the metadata table of ``lang`` and create its parser.

        Args:
            lang: language name; also the sub-directory and CSV base name.
            data_dir: directory holding one sub-directory per language.
                Defaults to the location that used to be hard-coded.
            meta_dir: directory holding ``<lang>.csv`` (';'-separated, UTF-8).
                Defaults to ``./Data/Meta``.
        """
        self.lang = lang
        self.data_dir = self._DEFAULT_DATA_DIR if data_dir is None else data_dir
        self.meta_dir = os.path.join('.', 'Data', 'Meta') if meta_dir is None else meta_dir
        self.meta = pd.read_csv(os.path.join(self.meta_dir, self.lang + '.csv'),
                                sep=';', encoding='utf-8')
        self.parser = DocParser(lang)

    def get_data(self):
        """Run the whole pipeline and return the enriched utterance table."""
        frames = self._collect_data()
        long = self._select_long(frames)
        data = self._process_long(long, frames)
        return data

    def _collect_data(self):
        """Parse every transcription file below ``<data_dir>/<lang>``.

        Returns:
            dict mapping file name -> DataFrame of parsed records. Files with
            the same name in different sub-directories overwrite each other.
            Empty for a language with no known file type.
        """
        if self.lang in self._ELAN_LANGS:
            extension = '.eaf'
        elif self.lang == 'Azeri':
            extension = '.TextGrid'
        else:
            return dict()

        frames = dict()
        for root, _dirs, files in os.walk(os.path.join(self.data_dir, self.lang)):
            for name in tqdm_notebook(files):
                # Suffix test, so that backup copies such as 'x.eaf.001' are
                # not parsed as a second recording.
                if not name.endswith(extension):
                    continue
                with open(os.path.join(root, name), 'r', encoding='utf-8') as f:
                    xml = f.read()
                data = self.parser.parse(xml, name)
                frames[name] = pd.DataFrame(data, columns=self._FRAME_COLUMNS)
        return frames

    def _select_long(self, frames):
        """Find, per file, the participant holding >= 70% of the total DURATION.

        Returns:
            DataFrame with columns Name, File Name, Duration (DURATION / 1000),
            Part (the share) and Lang. The columns are present even when no
            participant qualifies.
        """
        records = []
        for file_name, frame in frames.items():
            speech_duration = frame.DURATION.sum()
            for participant in set(frame.PARTICIPANT.values):
                p_duration = frame[frame.PARTICIPANT == participant].DURATION.sum()
                p_part = p_duration / speech_duration
                if p_part >= self._MIN_SHARE:
                    records.append({'Name': participant, 'File Name': file_name,
                                    'Duration': p_duration / 1000, 'Part': p_part,
                                    'Lang': self.lang})
        return pd.DataFrame(records, columns=self._LONG_COLUMNS)

    def _process_long(self, long, frames):
        """Return the dominant participant's rows of every selected file.

        Rows with SYLLABLES == 0 are dropped; AGE, GENDER, LANGUAGE and FILE
        are added. For Azeri, PARTICIPANT is replaced by the name from the
        metadata table.

        Args:
            long: result of ``_select_long``.
            frames: result of ``_collect_data``.
        """
        selected_files = set(long['File Name'].values)
        pieces = []

        for key in frames:
            if key not in selected_files:
                continue
            source = frames[key]
            part = long[long['File Name'] == key]['Name'].values[0]
            year = source.R_YEAR.values[0]

            # Work on a copy: assigning columns to a filtered view is what
            # triggers SettingWithCopyWarning.
            f = source[source.PARTICIPANT == part].copy()
            if self.lang == 'Azeri':
                age, gen, part = self.__get_meta(part, year, key)
                f['PARTICIPANT'] = part
            else:
                age, gen = self.__get_meta(part, year, key)
            f['AGE'] = age
            f['GENDER'] = gen
            f['LANGUAGE'] = self.lang
            f['FILE'] = key

            pieces.append(f[f.SYLLABLES != 0])

        if not pieces:
            return pd.DataFrame(columns=self._OUTPUT_COLUMNS)
        # A single concat replaces the removed DataFrame.append; original row
        # labels are kept, as before.
        return pd.concat(pieces)[self._OUTPUT_COLUMNS]

    def _meta_rows(self, criteria):
        """Return the metadata rows whose columns equal every value in ``criteria``.

        Raises:
            IndexError: no row matches (same exception type as indexing an
                empty result, but with a message that names the lookup).
        """
        rows = self.meta
        for column, value in criteria.items():
            rows = rows[rows[column] == value]
        if len(rows) == 0:
            raise IndexError('No %s metadata row for %r' % (self.lang, criteria))
        return rows

    def __get_meta(self, part, year, file):
        """Look up age and gender of a speaker in the metadata table.

        Args:
            part: participant label taken from the transcription.
            year: recording year from the transcription (used for Beserman,
                Ustja and Rogovatka; the other languages read the table only).
            file: transcription file name.

        Returns:
            ``(age, gender)``, or ``(age, gender, name)`` for Azeri.

        Raises:
            ValueError: ``self.lang`` has no metadata rule.
            IndexError: the speaker or file is missing from the table.
        """
        if self.lang == 'Beserman':
            rows = self._meta_rows({'Name': part})
            age = year - rows['Year'].values[0]
            gen = rows['Gender'].values[0]

        elif self.lang == 'Chukchi':
            rows = self._meta_rows({'Name': part, 'File': file})
            age = rows['Record'].values[0] - rows['Year'].values[0]
            gen = rows['Gender'].values[0]

        elif self.lang == 'Azeri':
            rows = self._meta_rows({'File': file})
            age = rows['Record'].values[0] - rows['Year'].values[0]
            gen = rows['Gender'].values[0]
            part = rows['Name'].values[0]
            return age, gen, part

        elif self.lang == 'Bashkir':
            # The table stores file names without their 4-character extension.
            rows = self._meta_rows({'Name': part, 'File': file[:-4]})
            age = rows['Age'].values[0]
            gen = rows['Gender'].values[0].capitalize()

        elif self.lang == 'Ustja':
            rows = self._meta_rows({'Speaker': part})
            age = int(year - rows['Year of birth'].values[0])
            gen = rows['Sex'].values[0][0].capitalize()

        elif self.lang == 'Rogovatka':
            rows = self._meta_rows({'string_id': part})
            age = year - rows['year_of_birth'].values[0]
            gen = rows['sex'].values[0].capitalize()

        elif self.lang == 'Kabardian':
            rows = self._meta_rows({'Name': part})
            age = 0  # no age is derived for Kabardian
            gen = rows['Gender'].values[0]

        else:
            raise ValueError('No metadata rule for language %r' % (self.lang,))

        return age, gen

In [None]:
# Gather the per-language tables and combine them in one step.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previous rows on every iteration; pd.concat does it once.
all_data_columns = ['PARTICIPANT', 'TEXT', 'DURATION', 'SYLLABLES', 'RATE', 'R_YEAR', 'AGE',
                    'GENDER', 'LANGUAGE', 'FILE']

per_language = []
for lang in ['Beserman', 'Bashkir', 'Chukchi', 'Azeri', 'Ustja', 'Kabardian', 'Rogovatka']:
    dc = DataCollector(lang)
    data = dc.get_data()
    per_language.append(data)

# Row labels are kept as they come from each language, as before.
all_data = pd.concat(per_language)[all_data_columns]

In [15]:
# Write the complete, unfiltered table as a ';'-separated UTF-8 file
# without the row index.
all_data.to_csv(
    'all_data.csv',
    sep=';',
    encoding='utf-8',
    index=False,
)

In [16]:
# Keep only utterances whose RATE is strictly below 10; rows with a missing
# RATE fail the comparison and are dropped as well.
# NOTE(review): RATE is presumably syllables per second -- confirm the unit.
rate_is_plausible = all_data['RATE'].lt(10)
clean_data = all_data.loc[rate_is_plausible]

In [17]:
# Write the filtered table as a ';'-separated file without the row index.
clean_data.to_csv(
    'clean_data.csv',
    sep=';',
    index=False,
)