In [39]:
# celex_segmental_info_entropy.ipynb
# H. Muller
# 2023-05-29

# Input: CELEX components (lemma, word and phonology tables) and the CGN compounds file
# Output: compounds in CGN with information about relative frequency, paradigmatic probability, segmental information, etc.

In [40]:
from os import environ

# Resolve the input/output paths. A non-empty environment variable takes
# precedence; otherwise (unset or empty) the project default is used.
lemmaPath = environ.get('lemma') or '../DataProcessed/dml.txt'
wordPath = environ.get('word') or '../DataProcessed/dmw.txt'
phonoPath = environ.get('phono') or '../DataProcessed/dpw.txt'
cgnPath = environ.get('cgn') or '../DataProcessed/cgn_predictors_comp-o.csv'
outfile = environ.get('outfile') or '../DataProcessed/cgn_processed_comp-o.csv'

In [41]:
import pandas as pd

# Load the four tab-separated input tables in one pass. For each file the
# first row supplies the column names and the first column becomes the index.
lemma, word, phono, cgn = (
    pd.read_csv(table_path, sep='\t', header=0, index_col=0)
    for table_path in (lemmaPath, wordPath, phonoPath, cgnPath)
)

# Compute segmental information

In [42]:
# Add a DISC column: the PhonStrsDISC transcription with the "'" and "-"
# marks removed (presumably stress and syllable-boundary marks -- confirm
# against the CELEX documentation). Afterwards every missing value in the
# table is replaced by an empty string so string matching never sees NaN.
phono = phono.assign(
    DISC=phono['PhonStrsDISC'].str.replace("'", "").str.replace('-', '')
).fillna('')

In [43]:
# Sanity check: show the first few words whose DISC transcription starts with
# each prefix, to verify that prefix matching finds the expected types.
for prefix in ('jar@', 'jar'):
    print(phono.loc[phono.DISC.str.startswith(prefix), 'Word'].head())

142049         jaren
142050     jarenlang
142051    jarenlange
142054         jarig
142055        jarige
Name: Word, dtype: object
141437                jaar
141438      jaarabonnement
141439    jaarabonnementen
141440           jaarbasis
141441         jaarbericht
Name: Word, dtype: object


In [44]:
import math

# define a function to create the new columns
def get_fraction(row):
    """Compute the type-based segmental information for one compound row.

    The value is ``-log2(n_left_ifix / n_left)`` where

    * ``n_left`` is the number of distinct ``Word`` values in the global
      ``phono`` table whose ``DISC`` string starts with the left constituent,
    * ``n_left_ifix`` is the number of distinct ``Word`` values whose ``DISC``
      string starts with the left constituent followed by the interfix.

    Parameters
    ----------
    row : mapping
        One row of the compounds table (or any mapping) providing the keys
        'leftPhonoCELEX' and 'IfixPhonoCELEX'.

    Returns
    -------
    pd.Series
        A one-element Series holding the segmental information. The element
        is NaN when the value is undefined: a constituent is missing, or no
        type in ``phono`` starts with the left constituent or with
        left constituent + interfix.
    """
    undefined = pd.Series([float('nan')])

    left = row['leftPhonoCELEX']
    interfix = row['IfixPhonoCELEX']

    # A missing constituent can neither be concatenated nor matched.
    if pd.isna(left) or pd.isna(interfix):
        return undefined

    def count_types(prefix):
        # Number of distinct words whose transcription starts with `prefix`;
        # na=False keeps the mask boolean even if DISC contains missing values.
        matches = phono.DISC.str.startswith(prefix, na=False)
        return len(phono.loc[matches, 'Word'].drop_duplicates())

    denominator = count_types(left)
    if denominator == 0:
        # No type shares the left constituent: the ratio would divide by zero.
        return undefined

    numerator = count_types(left + interfix)
    if numerator == 0:
        # log2(0) is undefined; report a missing value instead of raising.
        return undefined

    # math.log2 is more accurate than math.log(x, 2) for base-2 logarithms.
    TypeSegmentalInfo = -math.log2(numerator / denominator)

    return pd.Series([TypeSegmentalInfo])

import swifter

# Run get_fraction on every compound (row-wise, through swifter's apply) and
# store the resulting values as the TypeSegmentalInfo column.
segmental_info = cgn.swifter.apply(get_fraction, axis=1)
cgn[['TypeSegmentalInfo']] = segmental_info
cgn.head()

Pandas Apply:   0%|          | 0/1846 [00:00<?, ?it/s]

Unnamed: 0,variant,WordOrtho,WordPhono,WordDuration,leftOrtho,Interfix,rightOrtho,leftPhonoCELEX,IfixPhonoCELEX,rightPhonoCELEX,...,SameLeftFQ,SameLeftDiffIfixFQ,RelFreq,SameLeftAndIfix,SameLeft,SameLeftDiffIfix,IfixNum,ParaProbType,RightPositionalEntropy,TypeSegmentalInfo
0,Netherlandic,apekool,ap@kol,0.598,aap,e,kool,ap,@,kol,...,61.0,9.0,1.0,9.0,11.0,2.0,2.0,0.818182,-0.0,2.087463
1,Netherlandic,vrouwenstem,vrA+w@stEm,0.863,vrouw,en,stem,vrMw,@,stEm,...,311.0,2.0,0.428571,63.0,64.0,1.0,2.0,0.984375,1.295836,0.221127
2,Netherlandic,levenslustige,lev@slYst@G@,0.764,leven,s,lustige,lev@,s,l}st@G@,...,2639.0,0.0,0.129032,119.0,119.0,0.0,1.0,1.0,0.753828,0.733285
3,Netherlandic,zonnegod,zOn@GOt,0.509,zon,e,god,zOn,@,GOt,...,2052.0,1139.0,1.0,53.0,60.0,7.0,2.0,0.883333,-0.0,1.208947
4,Netherlandic,zonnekind,zOn@kInt,0.558,zon,e,kind,zOn,@,kInt,...,,,,,,,,,,1.208947


# write results to file

In [50]:
# Inspect which columns the compounds table currently holds.
cgn.columns

Index(['variant', 'WordOrtho', 'WordPhono', 'WordDuration', 'leftOrtho',
       'Interfix', 'rightOrtho', 'leftPhonoCELEX', 'IfixPhonoCELEX',
       'rightPhonoCELEX', 'leftPhono', 'IfixPhono', 'rightPhono',
       'LeftDuration', 'RightDuration', 'IfixDuration', 'SpeechRate',
       'SubtlexAbsFreq', 'SameLeftAndIfixFQ', 'SameLeftFQ',
       'SameLeftDiffIfixFQ', 'RelFreq', 'SameLeftAndIfix', 'SameLeft',
       'SameLeftDiffIfix', 'IfixNum', 'ParaProbType', 'RightPositionalEntropy',
       'TypeSegmentalInfo'],
      dtype='object')

In [51]:
# Count how often each interfix occurs among the compounds.
cgn.Interfix.value_counts()

Interfix
s     1018
en     740
e       88
Name: count, dtype: int64

In [52]:
# List the distinct left constituents of the compounds that do not occur
# among the Head values of the lemma table.
heads = list(set(lemma['Head']))
[left for left in set(cgn['leftOrtho']) if left not in heads]

['drieland',
 'sportfonds',
 'vijfjaar',
 'prinsje',
 'eengezin',
 'driepunt',
 'pandje',
 'eenrichting',
 'tweepersoon',
 'conserve',
 'klootje']

In [53]:
# Write the processed compounds as a tab-separated file.
# DataFrame.fillna returns a new frame rather than modifying cgn, so a bare
# fillna call has no effect; missing values are instead rendered as empty
# fields at write time via na_rep, which also leaves cgn's dtypes untouched.
cgn.to_csv(outfile, sep='\t', na_rep='')
print(f'cgn written to {outfile}')

cgn written to ../DataProcessed/cgn_processed_comp-o.csv


  values = values.astype(str)
