In [5]:
%matplotlib inline
import NSTcorpus
import NSTlexicon
import NSTjulius
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

Read lexicon information to create items for lexicon look-up
Words (list of str) - words (length 927167)
Pos (dict, key:lexical entry, val: POS - part of speech, e.g. NN)
Morph (dict, key: word, val: morphological tags, e.g. 'AKT|PRS')
Phon (dict, key: word, val: phonological transcript)
NumSyll (dict, key: word, val: number of syllables)
Syll (dict, key: word, val: list of transcribed syllables)
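Accent (dict, key: word, val: list of accent values, e.g. [2, 0]; a list of such lists, e.g. [[2, 0], [1, 0]], when the accent assignment is ambiguous)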
Decomp:
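Base (dict, key: word, val: base form, e.g. 'vinge' for 'vingar'; can be a list when there are several candidate base forms)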
Sem:
SemInfo:

In [6]:



# Unpack the eleven look-up structures returned by NSTlexicon.read_lexicon(),
# in the order in which they are described in the text above.
(
    Words, Pos, Morph, Phon, NumSyll, Syll,
    Accent, Decomp, Base, Sem, SemInfo,
) = NSTlexicon.read_lexicon()



reading lex
100000
200000
300000
400000
500000
600000
700000
800000
900000


KeyError: 'sjuk'

Read the NST database to obtain lists of wav files, transcripts, region of youth of the speaker (all length 502115).

In [None]:
# Corpus directories to read (hard-coded absolute paths).
topdirlist = [
    "/Volumes/Nephthys/NewNST/0467_sv_train_1",
    "/Volumes/Nephthys/NewNST/0467_sv_train_2",
    "/Volumes/Nephthys/NewNST/0467_sv_train_3",
]
# Three lists: wav file names, transcripts and speakers' region of youth.
(
    NSTwavfilenames,
    NSTtranscripts,
    NSTregionofyouth,
) = NSTcorpus.read_corpus_raw(topdirlist)

We want to focus only on disyllabic nouns, i.e. entries for which NumSyll has the value 2 and Pos contains 'NN'. This leads to a subset of length 50404.

In [4]:
# Lexicon entries with exactly two syllables whose POS entry contains 'NN'.
RawDisyll = [
    word
    for word, syllable_count in NumSyll.items()
    if syllable_count == 2 and 'NN' in Pos[word]
]

In [5]:
# Size of the raw list of disyllabic noun candidates.
print(len(RawDisyll))

50404


This is not yet specific enough and contains a lot of proper names, items with punctuation that might not work like alphanumeric words, and disyllabic items that aren't stressed on the first syllable like "banan" and "poäng", i.e. iambic stress. For the accent analysis we focus on trochaic patterns.

In [6]:
# Single pass applying the three filters in their original order; the
# short-circuiting `and` means Accent is only consulted for words that
# already passed the first two tests.
RawDisyll = [
    word
    for word in RawDisyll
    if word[0].islower()  # proper names are capitalised, ignore those
    and word.isalpha()  # ignore items like a-lag a:et
    and not (len(Accent[word]) == 2 and Accent[word][0] == 0)  # first accent value 0: not trochaic
]

This leaves 45887 items.



In [7]:
# Candidate count after the capitalisation, alphabetic and accent filters.
len(RawDisyll)

45887

Now, we want to gather information about individual occurrences. We first construct a list of token types (i.e. specific morphological forms) and dictionaries to look up frequencies. We limit this to items from the lexicon that occur at least once.

In [8]:
# Frequency look-ups computed from the corpus transcripts and the lexicon's Base mapping.
TypeFreq, LemmaFreq = NSTlexicon.TokenBaseFrequencies(NSTtranscripts, Base)

# Candidate types from RawDisyll, selected with threshold 0
# (presumably "frequency above 0", i.e. attested at least once -- confirm in NSTlexicon).
Types = NSTlexicon.BaseFreqAboveX(NSTtranscripts, RawDisyll, 0)

In [9]:
# Number of word types returned by BaseFreqAboveX.
print(len(Types))


8131


Selecting just word types that actually occur in the corpus, this boils down to 8131 types.
Types list different morphological forms separately, i.e. this list contains 'apa' (monkey, sg.) and 'apor' (monkey, pl.) as two entries (This is important because the accent assignment may change between singular and plural, e.g. tiger1/tigrar2 so we don't want to conflate the recordings under a single lemma).


One final problem is tokens whose POS is ambiguous. For the present analysis we focus on nouns, but many Swedish word forms can be from multiple syntactic categories, such as 'vara', which could be noun ('product') or verb ('to be'). So, while 'vara' has a very high frequency count, most of these are in fact verbs.

In [16]:
# Show the POS entry and the raw type frequency for the ambiguous form 'vara'.
for lookup in (Pos, TypeFreq):
    print(lookup['vara'])


['NN', 'VB']
5057


We therefore use POS tagging (spacy) in order to filter out tokens that are in fact not nouns in their specific sentence context. To illustrate, an example where 'vara' is a noun, with the corresponding spacy output:

In [19]:
import spacy


# Load the 'sv_core_news_sm' spaCy pipeline and tag one example sentence
# in which 'vara' is used as a noun.
nlp = spacy.load('sv_core_news_sm')
doc = nlp("En vara är av dålig kvalite")

# One output line per token: surface form, lemma, POS tag.
for tok in doc:
    print(tok.text, tok.lemma_, tok.pos_)

En en DET
vara vara NOUN
är vara AUX
av av ADP
dålig dålig ADJ
kvalite kvalite ADP


To do this, we first need to retrieve all transcripts of recordings for each token.
We construct Type2Wav, a dictionary that lists for each key (type) all wav files with an occurrence, and Type2Trans, a dictionary that lists all transcripts.

In [22]:
from string import punctuation
def strip_punctuation(s):
    """Return ``s`` with every character listed in ``string.punctuation`` removed.

    Only the characters in ``string.punctuation`` are dropped; whitespace and
    all other characters are kept in their original order.
    """
    kept = []
    for ch in s:
        if ch in punctuation:
            continue
        kept.append(ch)
    return ''.join(kept)

def type_dictionaries(Types,Transcripts,Wavfiles):
    """Map each word type to the wav files and transcripts in which it occurs.

    Every transcript is stripped of punctuation (via ``strip_punctuation``) and
    split on whitespace; a type "occurs" in a transcript when it equals one of
    the resulting tokens exactly.

    Parameters
    ----------
    Types : iterable of str
        Word types to look up.
    Transcripts : sequence of str
        Transcripts, index-aligned with ``Wavfiles``.
    Wavfiles : sequence
        Wav file names, index-aligned with ``Transcripts``.

    Returns
    -------
    (type2wav, type2trans) : tuple of dict
        Two dicts keyed by type.  For a given type both lists have the same
        length and order: entry ``j`` of ``type2wav[t]`` is the recording whose
        (unstripped) transcript is entry ``j`` of ``type2trans[t]``.  A
        transcript containing the type several times is listed once per
        occurrence.  A type that matches no transcript maps to an empty list
        in both dicts (previously this raised ``KeyError``).
    """
    # Inverted index: token -> indices of the transcripts containing it.
    index={}
    for i, transcript in enumerate(Transcripts):
        for token in strip_punctuation(transcript).split():
            index.setdefault(token,[]).append(i)

    type2wav={}
    type2trans={}

    # `word_type` rather than `type`, so the builtin is not shadowed.
    for word_type in Types:
        hits=index.get(word_type,[])
        type2wav.setdefault(word_type,[]).extend(Wavfiles[w] for w in hits)
        type2trans.setdefault(word_type,[]).extend(Transcripts[w] for w in hits)

    return type2wav, type2trans

In [23]:
# Per-type lists of wav files and transcripts for every candidate type.
Type2Wav, Type2Trans = type_dictionaries(
    Types,
    NSTtranscripts,
    NSTwavfilenames,
)


Now we create a data frame with one row per token (occurrence) of each type, with columns ID (e.g. vingar1), Type (vingar), Base (vinge), Accent (2,0), AccList (e.g. can be [[2,0],[1,0]]), Morph (e.g. PLU|IND|NOM|UTR, can be a list), POS (should always be NN at this stage because filtered earlier), WavFile (complete path), JuliusTranscript (complete path).

In particular, we first pass any item that has more than one POS through spacy.

In [None]:
import re
import os.path
import spacy

#nlp = spacy.load("sv_pipeline")
nlp=spacy.load('sv_core_news_sm')   

def _resolve_noun_base(base_entry):
    """Reduce a lexicon ``Base`` entry to a single base form.

    A non-list entry is returned unchanged.  For a list of candidates, the
    first candidate that has a ``Pos`` entry containing 'NN' is chosen; if no
    candidate qualifies, the first candidate is used.  An empty list yields
    ``None``.
    """
    if not isinstance(base_entry, list):
        return base_entry
    for candidate in base_entry:
        if candidate in Pos and 'NN' in Pos[candidate]:
            return candidate
    return base_entry[0] if base_entry else None


def _record_token(word_type, counter, base, transcript, wavfile):
    """Append one accepted occurrence of ``word_type`` to the parallel result lists."""
    Tokens.append(word_type)
    IDList.append(word_type+str(counter))
    AccList.append(Accent[word_type])
    POSList.append('NN')
    BaseList.append(base)
    TranscriptList.append(transcript)
    WavList.append(wavfile)


# Parallel lists, one entry per accepted token; they become the columns of TokenDF.
Tokens=[]
IDList=[]
BaseList=[]
AccList=[]
MorphList=[]   # not filled by this cell
POSList=[]
WavList=[]
JuliusList=[]  # not filled by this cell
TranscriptList=[]

startidx=0
# Optional resume from the last checkpoint index.
# NOTE(review): resuming only restores the index, not the lists above, and the
# path below differs from the one the checkpoint is currently written to.
#with open('/Volumes/Seth/NSTsvensktal/TokenDF_i.txt', 'r') as f:
#    for line in f:
#        startidx=int(line.strip())
#print(startidx)

for i in range(startidx,len(Types)):
    if i%1000==0:
        print(i)
        # Checkpoint everything collected so far.  'WavFile' is included so the
        # checkpoint has the same columns as the final TokenDF.
        TokenDF=pd.DataFrame({'ID':IDList, 'Token':Tokens,'Base':BaseList, 'Acc':AccList, 'POS':POSList, 'Transcript':TranscriptList, 'WavFile':WavList})
        TokenDF.to_csv('/Volumes/Nephthys/NewNST/TokenDF'+str(i)+'.csv')
        with open('/Volumes/Nephthys/NewNST/TokenDF_i.txt', 'w') as f:
            f.write(str(i))

    t=Types[i]
    idcounter=1             # per-type running number used in the token ID
    verbose=(i%100==0)      # print diagnostics for every 100th type only

    if isinstance(Accent[t][0],list):
        continue #exclude any words that have ambiguous accent assignment, i.e. where Acc[t] is a list of lists

    if isinstance(Pos[t],list):
        # Several possible POS: keep only occurrences that spaCy tags as NOUN
        # in their sentence context.
        thisbase=_resolve_noun_base(Base[t])
        for transcript, wavfile in zip(Type2Trans[t], Type2Wav[t]):
            doc=nlp(transcript)
            # POS tags of the spaCy tokens whose text equals the type.  spaCy's
            # tokenisation can differ from the whitespace split used to build
            # Type2Trans, in which case there is no matching token; such
            # occurrences are skipped instead of aborting the whole run.
            postags=[token.pos_ for token in doc if token.text==t]
            if not postags:
                if verbose:
                    print('spacy has no separate token for ' + t + ' - discard.')
                continue
            thispos=postags[0]  # first occurrence in the transcript
            if thispos!='NOUN':
                if verbose:
                    print('spacy found ' + thispos + ' for ' + t + '- discard.')
                continue
            if verbose:
                print(transcript)
                print('spacy found NOUN for ' + t)
            _record_token(t, idcounter, thisbase, transcript, wavfile)
            idcounter=idcounter+1
    elif Pos[t]=="NN":
        # Unambiguous noun: every occurrence is kept.  The base form is
        # resolved exactly as in the ambiguous-POS branch, so BaseList never
        # receives a whole list of candidates.
        thisbase=_resolve_noun_base(Base[t])
        for transcript, wavfile in zip(Type2Trans[t], Type2Wav[t]):
            _record_token(t, idcounter, thisbase, transcript, wavfile)
            idcounter=idcounter+1
                        
            

In [26]:
# Assemble the token table from the parallel lists (one row per accepted
# token) and write it to a CSV file in the working directory.
token_columns = {
    'ID': IDList,
    'Token': Tokens,
    'Base': BaseList,
    'Acc': AccList,
    'POS': POSList,
    'Transcript': TranscriptList,
    'WavFile': WavList,
}
TokenDF = pd.DataFrame(token_columns)
TokenDF.to_csv('TokenDFspacywav.csv')


In [27]:
# (rows, columns) of the token table.
TokenDF.shape

(123681, 7)

Now, for every entry in TokenDF, we use the Julius speech recognizer with language models from KTH (https://www.speech.kth.se/asr/) to obtain word boundary timestamps for the recognized tokens. 
(From command line via NSTjulius.runjulius(wavlist)). We use NSTjulius.parseJulius() to check whether the output contains the target token. If not, the item is labelled as a 'miss'. We add a new column to the TokenDF which indicates either the filename of the recognised Julius output, or 'miss'.
Of the 123681 tokens in TokenDF, this procedure fails for 20436 files and is successful for the remaining 103245.

In [29]:

# Number of rows whose 'Julius' entry is the marker 'miss'.
is_miss = TokenDF['Julius'] == 'miss'
print(is_miss.sum())

123681
20436


Next we want to parse the julius output to find the cutpoints c1 and c2 for the relevant token for each item, then cut the wav file, and finally get the pitch for the cut wav file via praat.

In [42]:
import subprocess

def cutwav(wavfilename, cutpoint1, cutpoint2, outfilename):
    """Run the Praat script ``cutwav.praat`` on one wav file.

    Praat is started from its hard-coded macOS application path with
    ``--run cutwav.praat`` followed by the four arguments, in the order given
    here.  ``cutwav.praat`` is looked up relative to the current working
    directory.  What the script does with the arguments is defined in
    ``cutwav.praat`` itself (presumably it writes the stretch between the two
    cut points to ``outfilename`` -- confirm in the script).

    Parameters
    ----------
    wavfilename : path of the input wav file.
    cutpoint1, cutpoint2 : cut points; any value is accepted and converted
        with ``str()`` before being passed on the command line.
    outfilename : path handed to the script as its last argument.

    Returns
    -------
    int
        Exit status of the Praat process (0 conventionally means success), so
        that callers can detect a failed cut.  Earlier versions discarded it.
    """
    command=["/Applications/Praat.app/Contents/MacOS/Praat", "--run", "cutwav.praat"]
    # subprocess needs string arguments; str() leaves strings unchanged.
    command.extend(str(arg) for arg in (wavfilename, cutpoint1, cutpoint2, outfilename))
    return subprocess.call(command)


In [None]:
import NSTtools
import os, re


# For every row with an existing Julius output file: look up the cut points,
# cut the token out of the recording and extract its pitch.
# NOTE: PitchFile only receives entries for rows that succeed, so it is NOT
# index-aligned with TokenDF.
PitchFile=[]
for i in range(TokenDF.shape[0]):
    if i%100==0:
        print('============')
        print(i)

        print('============')
        # Record the current row index so progress can be checked from outside.
        with open('progress.txt', 'w') as f:
            f.write(str(i))

    row=TokenDF.loc[i]  # one lookup per row instead of one per field
    juliusfile=row['Julius']
    b=row['Base']

    print(juliusfile)

    if not os.path.isfile(juliusfile):
        # No Julius output for this row (e.g. the entry is 'miss').
        continue

    print('is path')
    try:
        [Parse,WIndex]=NSTjulius.parse_julius_output(Base, jfile=juliusfile)
        cutpoint1=Parse[WIndex[b]][1]
        cutpoint2=Parse[WIndex[b]][2]

        # Raw string: '\.' in a normal string literal is an invalid escape sequence.
        filestemre=re.search(r'^(.*)\.wav$', row['WavFile'])
        filestem=filestemre.group(1)
        tokenfile=filestem+'_'+row['ID']+'.wav'
        print('cut...')
        # Cut points are divided by 100 before being handed to Praat
        # (presumably a conversion from 10 ms frames to seconds -- confirm).
        cutwav(row['WavFile'], str(int(cutpoint1)/100), str(int(cutpoint2)/100), tokenfile)
        print('get pitch...')
        pf=NSTtools.getpitch(tokenfile)
        PitchFile.append(pf)
        print('got pitch')
    except Exception as err:
        # Best effort: a failing row must not stop the run, but the reason is
        # reported instead of being silently dropped.  KeyboardInterrupt is no
        # longer swallowed, so the loop can be interrupted.
        print('skipped row ' + str(i) + ': ' + repr(err))
        
        
        
        

Add pitch file info to TokenDF

In [4]:
import re, os

# One entry per TokenDF row: path of the cut token wav file if it exists on
# disk, otherwise "".
TokenFiles=[]
for i in range(TokenDF.shape[0]):
    row=TokenDF.loc[i]  # one lookup per row instead of one per field
    tokenfile=""

    if os.path.isfile(row['Julius']):
        # Raw string: '\.' in a normal string literal is an invalid escape sequence.
        stemre=re.search(r'(.*)\.wav$', row['WavFile'])
        # A WavFile that does not end in '.wav' gives no match; such rows get ""
        # instead of raising AttributeError.
        if stemre is not None:
            candidate=stemre.group(1)+'_'+row['ID']+'.wav'
            if os.path.isfile(candidate):
                tokenfile=candidate

    TokenFiles.append(tokenfile)

In [6]:
import re, os

# One entry per TokenDF row: path of the token's pitch file if it exists on
# disk, otherwise "".
PitchFiles=[]

for i in range(TokenDF.shape[0]):
    row=TokenDF.loc[i]  # one lookup per row instead of one per field
    pitchfile=""

    if os.path.isfile(row['Julius']):
        # Raw string: '\.' in a normal string literal is an invalid escape sequence.
        stemre=re.search(r'(.*)\.wav$', row['WavFile'])
        # A WavFile that does not end in '.wav' gives no match; such rows get ""
        # instead of raising AttributeError.
        if stemre is not None:
            pitchfilename=stemre.group(1)+'_'+row['ID']+'_pitch.txt'
            if os.path.isfile(pitchfilename):
                pitchfile=pitchfilename

    PitchFiles.append(pitchfile)

In [7]:
# Add the pitch-file paths ("" where no pitch file was found) as a new column;
# PitchFiles has one entry per TokenDF row.
TokenDF['PitchFile']=PitchFiles

The final stages of preprocessing, normalisation and smoothing, are done in R.