In [1]:
# celex_segmental_info_entropy.ipynb
# H. Muller
# 2023-05-29

# Input: celex components and cgn
# Output: compounds in cgn with information about relative frequency, paradigmatic probability, segmental information etc.

In [3]:
from os import environ

# Resolve input/output paths. Each path can be overridden through an
# environment variable of the given name; when the variable is unset or
# empty, the default location under ../DataProcessed is used instead.
lemmaPath = environ.get('lemma') or '../DataProcessed/dml.txt'
wordPath = environ.get('word') or '../DataProcessed/dmw.txt'
phonoPath = environ.get('phono') or '../DataProcessed/dpw.txt'
cgnPath = environ.get('cgn') or '../DataProcessed/cgn_predictors_comp-o.csv'
outfile = environ.get('outfile') or '../DataProcessed/cgn_processed_comp-o.csv'

In [2]:
import pandas as pd

# Load the four input tables. All of them are tab-separated, have a header
# row, and use their first column as the index.
tsv_options = dict(sep='\t', header=0, index_col=0)
lemma = pd.read_csv(lemmaPath, **tsv_options)
word = pd.read_csv(wordPath, **tsv_options)
phono = pd.read_csv(phonoPath, **tsv_options)
cgn = pd.read_csv(cgnPath, **tsv_options)

# Compute segmental information

In [4]:
# Derive a DISC column from PhonStrsDISC with the "'" and "-" characters
# removed (presumably stress and syllable-boundary markers -- TODO confirm),
# then replace every missing value in the table by an empty string so that
# the string operations used later do not hit NaN.
disc_plain = phono['PhonStrsDISC'].str.replace("'", "").str.replace('-', '')
phono = phono.assign(DISC=disc_plain).fillna('')

In [5]:
# Sanity check of the prefix lookup: show the first few words whose DISC
# transcription starts with each of the two example prefixes.
for disc_prefix in ('jar@', 'jar'):
    print(phono.loc[phono.DISC.str.startswith(disc_prefix), 'Word'].head())

142049         jaren
142050     jarenlang
142051    jarenlange
142054         jarig
142055        jarige
Name: Word, dtype: object
141437                jaar
141438      jaarabonnement
141439    jaarabonnementen
141440           jaarbasis
141441         jaarbericht
Name: Word, dtype: object


In [6]:
import math

# Cache of word-type counts per DISC prefix. Many cgn rows share the same
# left constituent and interfix, so every distinct prefix is looked up in
# `phono` only once instead of once per row. The cache is reset whenever this
# cell is re-run; re-run it if `phono` is modified afterwards.
_word_type_counts = {}


def _count_word_types(prefix):
    """Return the number of distinct `Word` values in `phono` whose DISC starts with `prefix`."""
    if prefix not in _word_type_counts:
        matching_words = phono.loc[phono.DISC.str.startswith(prefix), 'Word']
        _word_type_counts[prefix] = len(matching_words.drop_duplicates())
    return _word_type_counts[prefix]


# define a function to create the new columns
def get_fraction(row):
    """Compute the type-based segmental information for one cgn row.

    The value is ``-log2(numerator / denominator)`` where

    * ``denominator`` is the number of distinct words in the global ``phono``
      table whose DISC transcription starts with ``row['leftPhonoCELEX']``;
    * ``numerator`` is the number of distinct words whose DISC transcription
      starts with ``row['leftPhonoCELEX'] + row['IfixPhonoCELEX']``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the string fields ``'leftPhonoCELEX'`` and
        ``'IfixPhonoCELEX'``.

    Returns
    -------
    pandas.Series
        A one-element Series holding the segmental information. It is NaN
        when no word starts with the left constituent (the ratio is 0/0), and
        +inf when words start with the left constituent but none continues
        with the interfix (the ratio is 0).
    """
    left = row['leftPhonoCELEX']
    interfix = row['IfixPhonoCELEX']

    # combine for computing numerator and denominator
    numeratorStr = left + interfix
    denominatorStr = left

    # compute segmental info
    numerator = _count_word_types(numeratorStr)
    denominator = _count_word_types(denominatorStr)

    if denominator == 0:
        # Undefined ratio; previously this raised ZeroDivisionError and
        # aborted the whole DataFrame.apply.
        TypeSegmentalInfo = math.nan
    elif numerator == 0:
        # -log2(0): previously this raised "math domain error".
        TypeSegmentalInfo = math.inf
    else:
        TypeSegmentalInfo = -1 * math.log(numerator / denominator, 2)

    return pd.Series([TypeSegmentalInfo])

# Run get_fraction on every cgn row and store the resulting single-column
# frame as the new TypeSegmentalInfo column, then preview the table.
segmental_info = cgn.apply(get_fraction, axis=1)
cgn[['TypeSegmentalInfo']] = segmental_info
cgn.head()

Unnamed: 0,FileNameTierName,ID,WordOrtho,WordPhono,Phone,PhoneStart,PhoneEnd,variant,UtteranceBorder,UtteranceStart,...,WordEnd,leftPhonoLen,rightPhonoLen,IfixLen,PhoneDuration,WordDuration,LeftDuration,RightDuration,IfixDuration,TypeSegmentalInfo
0,fn001092.awdN00551,182-apekool.,apekool,ap@kol,a,57.578,57.7,Netherlandic,False,55.054,...,58.176,2,3,1,0.122,0.598,0.193,0.354,0.051,2.087463
1,fn001092.awdN00551,182-apekool.,apekool,ap@kol,p,57.7,57.771,Netherlandic,False,55.054,...,58.176,2,3,1,0.071,0.598,0.193,0.354,0.051,2.087463
2,fn001092.awdN00551,182-apekool.,apekool,ap@kol,@,57.771,57.822,Netherlandic,False,55.054,...,58.176,2,3,1,0.051,0.598,0.193,0.354,0.051,2.087463
3,fn001092.awdN00551,182-apekool.,apekool,ap@kol,k,57.822,57.953,Netherlandic,False,55.054,...,58.176,2,3,1,0.131,0.598,0.193,0.354,0.051,2.087463
4,fn001092.awdN00551,182-apekool.,apekool,ap@kol,o,57.953,58.136,Netherlandic,False,55.054,...,58.176,2,3,1,0.183,0.598,0.193,0.354,0.051,2.087463


# Write results to file

In [7]:
# Number of cgn rows per interfix value
cgn.Interfix.value_counts()

Interfix
s     11809
en     6709
e       755
Name: count, dtype: int64

In [8]:
# Distinct values of the Head column of the lemma table
heads = list(set(lemma['Head']))

In [9]:
# Left constituents occurring in cgn that are not listed as a Head in the lemma table
[stem for stem in set(cgn['leftOrtho']) if stem not in heads]

['eenrichting',
 'drieland',
 'vijfjaar',
 'driepunt',
 'klootje',
 'prinsje',
 'pandje',
 'tweepersoon',
 'eengezin',
 'sportfonds',
 'conserve']

In [10]:
# Show all column names of the cgn table, including the newly added TypeSegmentalInfo
cgn.columns

Index(['FileNameTierName', 'ID', 'WordOrtho', 'WordPhono', 'Phone',
       'PhoneStart', 'PhoneEnd', 'variant', 'UtteranceBorder',
       'UtteranceStart', 'UtteranceEnd', 'UtteranceDuration', 'WordOrthoLow',
       'WordSylNum', 'SegCV', 'WordCV', 'UtteranceID', 'UttSylNum',
       'SpeechRate', 'leftOrtho', 'Interfix', 'rightOrtho', 'IfixPhono',
       'leftPhono', 'rightPhono', 'wordPhonoCELEX', 'leftPhonoCELEX',
       'IfixPhonoCELEX', 'rightPhonoCELEX', 'WordStart', 'WordEnd',
       'leftPhonoLen', 'rightPhonoLen', 'IfixLen', 'PhoneDuration',
       'WordDuration', 'LeftDuration', 'RightDuration', 'IfixDuration',
       'TypeSegmentalInfo'],
      dtype='object')

In [11]:
# Save the cgn table, including its index, as a tab-separated file at `outfile`
cgn.to_csv(path_or_buf=outfile, sep='\t')