## Stress Patterns in Intra-word Code-switching
### Data: joining meta&annotated df

#### Source: Praat Annotation, Excel sheet
##### Annotated files #: 4 audio files (inflected forms) & 4 additional audio files (uninflected forms)
##### Date: 5/8/2025


### Stress Dataset

In [1]:
import csv
import pandas as pd
import os
import numpy as np

In [2]:
# Load metadata

# Read all sheets into a dictionary
file_path = "/Users/aidyn/Documents/Fall_2024_IndStudy/CompletedAnnotations/StimuliListUpdated.xlsx"
sheets_dict = pd.read_excel(file_path, sheet_name=None)

# Access each sheet by its name
for sheet_name, sheet_data in sheets_dict.items():
    print(f"Sheet Name: {sheet_name}")
    print(sheet_data.head())  

Sheet Name: RussianNounCS
        Word  StressedSyll NounGender  Declension StressShift ShiftDirect  \
0   правдаға             1          f           1          no          na   
1  группалар             1          f           1          no          na   
2   пользаны             1          f           1          no          na   
3   книгалар             1          f           1          no          na   
4     силаны             1          f           1          no          na   

  AttestedInCS  
0          yes  
1          yes  
2          yes  
3          yes  
4          yes  
Sheet Name: WordList
  SyllStrWord      Word Language SuffixCase SyllStrCase  ForLater  Stress  \
0   CVC-CV-CV   жылқыға      Kaz        DAT          CV       NaN     NaN   
1  CVC-CV-CVC  шалғылар      Kaz     PLURAL         CVC       NaN     NaN   
2   CVC-CV-CV   пайданы      Kaz        ACC          CV       NaN     NaN   
3   CV-CV-CVC   шекелер      Kaz     PLURAL         CVC       NaN     NaN   
4  

In [3]:
pwd


'/Users/aidyn/Documents/Fall_2024_IndStudy/CompletedAnnotations'

In [4]:
# Save dict into CSV files
output_directory = "/Users/aidyn/Documents/Fall_2024_IndStudy/CompletedAnnotations/metadata_csv"  

# Iterate over the dictionary and save each sheet as a CSV file
for sheet_name, sheet_data in sheets_dict.items():
    # Construct the output file path
    output_file = f"{sheet_name}.csv"
    
    # Save the DataFrame as a CSV file
    sheet_data.to_csv(output_file, index=False)
    print(f"Saved {sheet_name} to {output_file}")

Saved RussianNounCS to RussianNounCS.csv
Saved WordList to WordList.csv
Saved VowelQualityKaz to VowelQualityKaz.csv
Saved VowelQualityRus to VowelQualityRus.csv
Saved AdditionRN to AdditionRN.csv


In [6]:
# Read csv files

stimulus_cs = pd.read_csv("/Users/aidyn/Documents/Fall_2024_IndStudy/CompletedAnnotations/metadata_csv/RussianNounCS.csv")
stimulus_kaz_rus = pd.read_csv("/Users/aidyn/Documents/Fall_2024_IndStudy/CompletedAnnotations/metadata_csv/WordList.csv")
vowel_quality_kaz = pd.read_csv("/Users/aidyn/Documents/Fall_2024_IndStudy/CompletedAnnotations/metadata_csv/VowelQualityKaz.csv")
vowel_quality_rus = pd.read_csv("/Users/aidyn/Documents/Fall_2024_IndStudy/CompletedAnnotations/metadata_csv/VowelQualityRus.csv")

In [7]:
stimulus_cs

Unnamed: 0,Word,StressedSyll,NounGender,Declension,StressShift,ShiftDirect,AttestedInCS
0,правдаға,1,f,1,no,na,yes
1,группалар,1,f,1,no,na,yes
2,пользаны,1,f,1,no,na,yes
3,книгалар,1,f,1,no,na,yes
4,силаны,1,f,1,no,na,yes
...,...,...,...,...,...,...,...
115,окнa,2,n,2,yes,backward,no
116,селa,2,n,2,yes,backward,yes
117,земли,1,f,2,yes,backward,no
118,души,1,f,2,yes,backward,yes


In [8]:
stimulus_kaz_rus

Unnamed: 0,SyllStrWord,Word,Language,SuffixCase,SyllStrCase,ForLater,Stress,WordForm,LatinScript,Gloss,WordClass
0,CVC-CV-CV,жылқыға,Kaz,DAT,CV,,,inflected,jylqyǵa,horse.DAT,Noun
1,CVC-CV-CVC,шалғылар,Kaz,PLURAL,CVC,,,inflected,shalǵylar,scythe.Pl,Noun
2,CVC-CV-CV,пайданы,Kaz,ACC,CV,,,inflected,paidany,benefit.ACC,Noun
3,CV-CV-CVC,шекелер,Kaz,PLURAL,CVC,,,inflected,shekeler,forehead.Pl,Noun
4,CV-CV-CV,қаланы,Kaz,ACC,CV,,,inflected,qalany,city.Pl,Noun
...,...,...,...,...,...,...,...,...,...,...,...
195,,олжа,Kaz,,,,,uninflected,olja,trophy,Noun
196,,сәби,Kaz,,,,,uninflected,sábi,child,Noun
197,,жеңге,Kaz,,,,,uninflected,jeńge,daughter-in-law,Noun
198,,төбе,Kaz,,,,,uninflected,tóbe,hill,Noun


In [9]:
stimulus_kaz_rus = stimulus_kaz_rus[["Word", "Language", "SuffixCase", "WordForm","LatinScript", "Gloss","WordClass"]]

In [10]:
stimulus_kaz_rus

Unnamed: 0,Word,Language,SuffixCase,WordForm,LatinScript,Gloss,WordClass
0,жылқыға,Kaz,DAT,inflected,jylqyǵa,horse.DAT,Noun
1,шалғылар,Kaz,PLURAL,inflected,shalǵylar,scythe.Pl,Noun
2,пайданы,Kaz,ACC,inflected,paidany,benefit.ACC,Noun
3,шекелер,Kaz,PLURAL,inflected,shekeler,forehead.Pl,Noun
4,қаланы,Kaz,ACC,inflected,qalany,city.Pl,Noun
...,...,...,...,...,...,...,...
195,олжа,Kaz,,uninflected,olja,trophy,Noun
196,сәби,Kaz,,uninflected,sábi,child,Noun
197,жеңге,Kaz,,uninflected,jeńge,daughter-in-law,Noun
198,төбе,Kaz,,uninflected,tóbe,hill,Noun


In [12]:
vowel_quality_kaz = vowel_quality_kaz[["VowelCyrillic", "VowelIPA", "VowelHeight", "VowelBackness", "Language"]]
vowel_quality_kaz

Unnamed: 0,VowelCyrillic,VowelIPA,VowelHeight,VowelBackness,Language
0,ы,ə,mid,back,Kaz
1,і,ɪ,high,front,Kaz
2,а,ɑ,low,back,Kaz
3,е,ie,high,front,Kaz
4,ы,ə,mid,back,Kaz
5,і,ɪ,high,front,Kaz
6,а,ɑ,low,back,Kaz
7,е,ie,high,front,Kaz
8,и,ij,high,front,Kaz
9,o,o,mid,back,Kaz


In [14]:
vowel_quality_rus

Unnamed: 0,VowelCyrillic,VowelIPA,VowelHeight,VowelBackness,Language
0,ы,ə,mid,central,Rus
1,і,i,high,front,Rus
2,а,a,low,central,Rus
3,е,e,mid,front,Rus
4,o,o,mid,back,Rus
5,у,ʊ,high,back,Rus
6,ы,ɨ,high,front,Rus
7,у,u,high,back,Rus
8,e,ɛ,mid,front,Rus
9,"o,a",ʌ,mid,back,Rus


In [12]:
# Load annotated data - for 1 speaker only
# Read all sheets into a dictionary
file_path_sd = "/Users/aidyn/Documents/Fall_2024_IndStudy/DataProcessing/speaker1Measurements_updated.xlsx"
sheets_dict = pd.read_excel(file_path_sd, sheet_name=None)

# Iterate over the dictionary and save each sheet as a CSV file
for sheet_name, sheet_data in sheets_dict.items():
    # Construct the output file path
    output_file = f"{sheet_name}.csv"
    
    # Save the DataFrame as a CSV file
    sheet_data.to_csv(output_file, index=False)
    print(f"Saved {sheet_name} to {output_file}")

Saved speaker1Measurements copy to speaker1Measurements copy.csv


In [15]:
pwd

'/Users/aidyn/Documents/Fall_2024_IndStudy/CompletedAnnotations'

In [16]:
# Excel workbooks
excel_files = ['male_main.xlsx', 'male_add.xlsx', 'female_main.xlsx', 'female_add.xlsx']

# Read Excel files into DataFrame
dataframes = [pd.read_excel(file) for file in excel_files]

# Combine all DataFrames into one 
combined_df = pd.concat(dataframes, ignore_index=True)

# Print first few rows of the DataFrame
print(combined_df.head())

# Save the combined DataFrame to a CSV file
combined_df.to_csv('combined_data.csv', index=False)

         Filename     Word  WordInterval Syllable             SylInterval  \
0  Speaker_1_male   вечерa             2  'syll$'  'parallelsyllinterval'   
1  Speaker_1_male   вечерa             2  'syll$'  'parallelsyllinterval'   
2  Speaker_1_male   вечерa             2  'syll$'  'parallelsyllinterval'   
3  Speaker_1_male  олжаның             4  'syll$'  'parallelsyllinterval'   
4  Speaker_1_male  олжаның             4  'syll$'  'parallelsyllinterval'   

   SegIntervalNr                 Vowel     Begin       End  Duration_in_ms  \
0              2             s1_cv_vʲɛ  1.889375  2.044858      155.482845   
1              3  s2_cv_ʧʲɪ_unstressed  2.044858  2.238501      193.642926   
2              4     s3_cv_ra_stressed  2.238501  2.499526      261.024641   
3              6              s1_vc_ol  8.478645  8.669182      190.536827   
4              7              s2_cv_ʒɑ  8.669182  8.915570      246.388278   

   ...          F0AtPoint1          F0AtPoint2          F0AtPoint3  

In [17]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1976 entries, 0 to 1975
Data columns (total 50 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Filename         1976 non-null   object 
 1   Word             1975 non-null   object 
 2   WordInterval     1976 non-null   int64  
 3   Syllable         1976 non-null   object 
 4   SylInterval      1976 non-null   object 
 5   SegIntervalNr    1976 non-null   int64  
 6   Vowel            1976 non-null   object 
 7   Begin            1976 non-null   float64
 8   End              1976 non-null   float64
 9   Duration_in_ms   1976 non-null   float64
 10  Max_dB           1976 non-null   float64
 11  Min_dB           1976 non-null   float64
 12  TimeMax_dB       1976 non-null   float64
 13  TimeMin_dB       1976 non-null   float64
 14  Mean_dB          1976 non-null   float64
 15  Centre_mean_dB   1976 non-null   float64
 16  Centre_max_dB    1976 non-null   float64
 17  Centre_min_dB 

In [50]:
full_sample_data = pd.read_csv("/Users/aidyn/Documents/Fall_2024_IndStudy/CompletedAnnotations/combined_data.csv")
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
full_sample_data[:10]

Unnamed: 0,Filename,Word,WordInterval,Syllable,SylInterval,SegIntervalNr,Vowel,Begin,End,Duration_in_ms,...,F0AtPoint1,F0AtPoint2,F0AtPoint3,F0AtPoint4,F0AtPoint5,F0AtPoint6,F0AtPoint7,F0AtPoint8,F0AtPoint9,F0AtPoint10
0,Speaker_1_male,вечерa,2,'syll$','parallelsyllinterval',2,s1_cv_vʲɛ,1.889375,2.044858,155.482845,...,109.63265204141156,104.35065626539527,102.49127821549959,105.52702923754437,110.79783808095402,116.12595990115243,117.94314337335328,116.39063899071706,112.27079598361979,108.1947021660767
1,Speaker_1_male,вечерa,2,'syll$','parallelsyllinterval',3,s2_cv_ʧʲɪ_unstressed,2.044858,2.238501,193.642926,...,108.1947021660767,--undefined--,--undefined--,--undefined--,--undefined--,--undefined--,--undefined--,124.37495874393628,117.86532184168861,115.52645918680702
2,Speaker_1_male,вечерa,2,'syll$','parallelsyllinterval',4,s3_cv_ra_stressed,2.238501,2.499526,261.024641,...,115.52645918680702,117.86416387695218,121.29690852810644,123.80898203049698,129.1979305385731,136.61081017697276,146.63234866977515,154.93604548571605,157.7039796119457,141.9820290517729
3,Speaker_1_male,олжаның,4,'syll$','parallelsyllinterval',6,s1_vc_ol,8.478645,8.669182,190.536827,...,--undefined--,123.7284450331763,120.85441035690577,118.32445192588568,114.26760413960666,111.71603508552155,113.7282582549003,117.921971214716,118.34365327483549,118.48326706520108
4,Speaker_1_male,олжаның,4,'syll$','parallelsyllinterval',7,s2_cv_ʒɑ,8.669182,8.91557,246.388278,...,118.48326706520108,117.09417019379566,116.48717933813998,114.5024660615793,112.7206940243868,116.28266974225471,118.16324849591815,115.9261553215855,114.39753759926442,112.67862903968746
5,Speaker_1_male,олжаның,4,'syll$','parallelsyllinterval',8,s3_cvc_nəŋ,8.91557,9.155325,239.754521,...,112.67862903968746,112.74893274269924,115.2186955991557,120.85881910377083,130.530433173532,136.78238021681457,148.16670467285218,151.58312972284287,146.22474970564122,136.3487737467197
6,Speaker_1_male,шекелер,6,'syll$','parallelsyllinterval',10,s1_cv_ʃie,13.508227,13.757677,249.449606,...,--undefined--,--undefined--,--undefined--,--undefined--,--undefined--,--undefined--,--undefined--,133.9575451597551,131.96014028329182,128.15429854249075
7,Speaker_1_male,шекелер,6,'syll$','parallelsyllinterval',11,s2_cv_kie,13.757677,13.923753,166.076394,...,128.15429854249075,--undefined--,--undefined--,--undefined--,--undefined--,--undefined--,--undefined--,134.01029549934054,129.4883327428241,121.69093662830693
8,Speaker_1_male,шекелер,6,'syll$','parallelsyllinterval',12,s3_cvc_lier,13.923753,14.150535,226.781868,...,121.69093662830693,111.0768725844573,106.0353728054569,108.62285836418842,107.7774144593974,107.9682284398636,108.8041558323174,108.32436854623924,--undefined--,--undefined--
9,Speaker_1_male,сөрелер,8,'syll$','parallelsyllinterval',14,s1_cv_sɵ,18.792268,19.070072,277.803623,...,--undefined--,--undefined--,--undefined--,--undefined--,--undefined--,--undefined--,133.58886245359832,132.24675122338584,133.48262100827304,132.12643755792863


In [33]:
full_sample_data['Word'].isna().sum()

0

In [54]:
full_sample_data[full_sample_data['Word'].isna()]
#full_sample_data[full_sample_data['Vowel'].isna()]

Unnamed: 0,Filename,Word,WordInterval,Syllable,SylInterval,SegIntervalNr,Vowel,Begin,End,Duration_in_ms,...,F0AtPoint1,F0AtPoint2,F0AtPoint3,F0AtPoint4,F0AtPoint5,F0AtPoint6,F0AtPoint7,F0AtPoint8,F0AtPoint9,F0AtPoint10


In [53]:
full_sample_data.loc[927, 'Word'] = 'вопрос'
full_sample_data

Unnamed: 0,Filename,Word,WordInterval,Syllable,SylInterval,SegIntervalNr,Vowel,Begin,End,Duration_in_ms,...,F0AtPoint1,F0AtPoint2,F0AtPoint3,F0AtPoint4,F0AtPoint5,F0AtPoint6,F0AtPoint7,F0AtPoint8,F0AtPoint9,F0AtPoint10
0,Speaker_1_male,вечерa,2,'syll$','parallelsyllinterval',2,s1_cv_vʲɛ,1.889375,2.044858,155.482845,...,109.63265204141156,104.35065626539527,102.49127821549959,105.52702923754437,110.79783808095402,116.12595990115243,117.94314337335328,116.39063899071706,112.27079598361979,108.1947021660767
1,Speaker_1_male,вечерa,2,'syll$','parallelsyllinterval',3,s2_cv_ʧʲɪ_unstressed,2.044858,2.238501,193.642926,...,108.1947021660767,--undefined--,--undefined--,--undefined--,--undefined--,--undefined--,--undefined--,124.37495874393628,117.86532184168861,115.52645918680702
2,Speaker_1_male,вечерa,2,'syll$','parallelsyllinterval',4,s3_cv_ra_stressed,2.238501,2.499526,261.024641,...,115.52645918680702,117.86416387695218,121.29690852810644,123.80898203049698,129.1979305385731,136.61081017697276,146.63234866977515,154.93604548571605,157.7039796119457,141.9820290517729
3,Speaker_1_male,олжаның,4,'syll$','parallelsyllinterval',6,s1_vc_ol,8.478645,8.669182,190.536827,...,--undefined--,123.7284450331763,120.85441035690577,118.32445192588568,114.26760413960666,111.71603508552155,113.7282582549003,117.92197121471601,118.34365327483549,118.48326706520108
4,Speaker_1_male,олжаның,4,'syll$','parallelsyllinterval',7,s2_cv_ʒɑ,8.669182,8.915570,246.388278,...,118.48326706520108,117.09417019379566,116.48717933813998,114.5024660615793,112.7206940243868,116.28266974225471,118.16324849591815,115.9261553215855,114.39753759926442,112.67862903968746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971,Speaker_4_female_additional,окно,156,'syll$','parallelsyllinterval',234,s2_ccv_kno_stressed,398.959062,399.439655,480.592105,...,--undefined--,--undefined--,246.48895075384854,232.74159198647908,230.467029163604,224.1702694030495,218.02993473046484,227.54182244463976,--undefined--,--undefined--
1972,Speaker_4_female_additional,гора,158,'syll$','parallelsyllinterval',236,s1_cv_ɡʌ_pause,403.482342,403.784519,302.177810,...,--undefined--,--undefined--,175.8019646440638,217.02488005426304,208.93863487451924,223.12953273541066,241.8365263342389,264.4203175566175,282.5889539864136,280.0630737493057
1973,Speaker_4_female_additional,гора,158,'syll$','parallelsyllinterval',237,s2_cv_ra_stressed,403.784519,404.086697,302.177810,...,280.0630737493057,278.85880546451733,244.79838287400074,116.51275780077238,112.65014979021846,111.74023158396626,115.22541705709473,118.15196206515186,--undefined--,--undefined--
1974,Speaker_4_female_additional,сырға,160,'syll$','parallelsyllinterval',239,s1_cvc_sər,408.327752,408.683643,355.890808,...,--undefined--,--undefined--,--undefined--,--undefined--,--undefined--,284.0288893155089,293.0452036921328,--undefined--,274.0988493118092,238.92555967541682


In [55]:
for i in full_sample_data.columns:
    print(i)
# print(sample_data.columns)

Filename
Word
WordInterval
Syllable
SylInterval
SegIntervalNr
Vowel
Begin
End
Duration_in_ms
Max_dB
Min_dB
TimeMax_dB
TimeMin_dB
Mean_dB
Centre_mean_dB
Centre_max_dB
Centre_min_dB
Word_beg
Word_end
Word_dur_ms
Syl_beg
Syl_end
Syl_dur_ms
MaxF0Hz
TimeF0Max
MinF0Hz
TimeF0Min
MeanF0
Centre_MeanF0
TimeF0AtPoint1
TimeF0AtPoint2
TimeF0AtPoint3
TimeF0AtPoint4
TimeF0AtPoint5
TimeF0AtPoint6
TimeF0AtPoint7
TimeF0AtPoint8
TimeF0AtPoint9
TimeF0AtPoint10
F0AtPoint1
F0AtPoint2
F0AtPoint3
F0AtPoint4
F0AtPoint5
F0AtPoint6
F0AtPoint7
F0AtPoint8
F0AtPoint9
F0AtPoint10


In [16]:
# The columns we need: 'Filename', 'Word', 'Syllable','Vowel','Begin', 'End', 'Duration_in_ms','Mean_dB','Word_dur_ms','Syl_dur_ms', 
# 'MeanF0', ?calculated MeanF0 from 10 points

In [56]:
full_sample_data[full_sample_data['Vowel'].isna()]

Unnamed: 0,Filename,Word,WordInterval,Syllable,SylInterval,SegIntervalNr,Vowel,Begin,End,Duration_in_ms,...,F0AtPoint1,F0AtPoint2,F0AtPoint3,F0AtPoint4,F0AtPoint5,F0AtPoint6,F0AtPoint7,F0AtPoint8,F0AtPoint9,F0AtPoint10


In [13]:
# Replace '--undefined--' with NaN

#full_sample_data.replace('--undefined--', np.nan, inplace=True)

In [20]:
#print(sample_data[['MeanF0']])

### The columns we need: 
- 'Filename' 
- 'Word' 
- 'Vowel'-- 'Syllable'
- 'Begin', 'End' 
- 'Duration_in_ms' (vowel),
- MaxdB, MindB
- 'Mean_dB' (vowel),
- centre_meandB
- 'Word_dur_ms',
- 'MeanF0' (syllable)
- word begin, end
- max fo
- min f0
- centre mean f0
- fo point1-10 
    - mean of F0AtPoint1-Point10 is similar to MeanF0.
    - is there any difference in this calculation?
        - (11) **f0 at 10 equidistant points (can be used to create avarage time-normalized f0 contours)**
        - (12) **time for these 10 equidistant points**


In [57]:
new_sd = full_sample_data[['Filename', 'Word', 'Word_beg','Word_end','Word_dur_ms','Vowel','Begin', 'End', 'Duration_in_ms','Max_dB','Min_dB', 'Mean_dB','Centre_mean_dB','MaxF0Hz','MinF0Hz','MeanF0','Centre_MeanF0']]

In [58]:
#new_sd.rename(columns={"Vowel": "Syllable"}, inplace=True)
#new_sd

new_sd = new_sd.copy()         
new_sd.rename(columns={"Vowel": "Annotation"}, inplace=True)
new_sd

Unnamed: 0,Filename,Word,Word_beg,Word_end,Word_dur_ms,Annotation,Begin,End,Duration_in_ms,Max_dB,Min_dB,Mean_dB,Centre_mean_dB,MaxF0Hz,MinF0Hz,MeanF0,Centre_MeanF0
0,Speaker_1_male,вечерa,1.889375,2.499526,610.150412,s1_cv_vʲɛ,1.889375,2.044858,155.482845,74.985990,63.362449,69.668919,69.955847,117.94501777565652,102.48061042356133,110.45141920183515,112.27203953943248
1,Speaker_1_male,вечерa,1.889375,2.499526,610.150412,s2_cv_ʧʲɪ_unstressed,2.044858,2.238501,193.642926,78.978600,54.501599,69.067156,69.564769,134.5442548372237,105.78469042787755,119.9122666683785,132.1938115430081
2,Speaker_1_male,вечерa,1.889375,2.499526,610.150412,s3_cv_ra_stressed,2.238501,2.499526,261.024641,82.423066,62.212040,77.371060,81.115531,158.85675082207482,114.03170346059693,135.30754057252486,134.50084575302895
3,Speaker_1_male,олжаның,8.478645,9.155741,677.095577,s1_vc_ol,8.478645,8.669182,190.536827,79.912234,68.533809,77.636074,79.089145,129.2106468283745,111.47316521453874,117.96414989958585,115.0118364127969
4,Speaker_1_male,олжаның,8.478645,9.155741,677.095577,s2_cv_ʒɑ,8.669182,8.915570,246.388278,79.246411,72.590683,75.541177,74.920249,118.48326706520108,112.45081127359856,115.66350048323727,115.45099148902773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971,Speaker_4_female_additional,окно,398.826399,399.439655,613.255551,s2_ccv_kno_stressed,398.959062,399.439655,480.592105,69.127785,52.744015,64.442786,68.399444,257.13410883608185,217.8142795800798,230.5511318516636,227.5493482413709
1972,Speaker_4_female_additional,гора,403.482342,404.086697,604.355619,s1_cv_ɡʌ_pause,403.482342,403.784519,302.177810,75.707378,43.795685,69.512423,73.469518,285.4730316882915,155.3990954053691,232.93896847936605,219.51178124256617
1973,Speaker_4_female_additional,гора,403.482342,404.086697,604.355619,s2_cv_ra_stressed,403.784519,404.086697,302.177810,74.260741,40.934942,67.350124,71.654074,282.51776373132617,111.71723674820116,164.18209122959635,119.95880583324052
1974,Speaker_4_female_additional,сырға,408.327752,409.006706,678.953701,s1_cvc_sər,408.327752,408.683643,355.890808,77.374015,46.955878,69.869074,72.911484,297.5424642400837,238.9255596754168,279.24196314622105,287.8251457071888


In [59]:
new_sd[new_sd['Annotation'].isna()]

Unnamed: 0,Filename,Word,Word_beg,Word_end,Word_dur_ms,Annotation,Begin,End,Duration_in_ms,Max_dB,Min_dB,Mean_dB,Centre_mean_dB,MaxF0Hz,MinF0Hz,MeanF0,Centre_MeanF0


In [68]:
# Extract SyllStr, IPA, and Stress using a regex pattern
#new_sd[["SyllStr", "IPA", "Stress"]] = new_sd["Syllable"].str.extract(r"^s\d+_([^_]+)_((?:[^_]+))(?:_(stressed))?$")

# Fill NaN values in "Stress" with an empty string (optional)
#new_sd["Stress"] = new_sd["Stress"].fillna("")

# Update "Syllable" column to keep only the syllable position (s2, s3, etc.)
#new_sd["Syllable"] = new_sd["Syllable"].str.extract(r"^(s\d+)")


# Display the updated DataFrame
#new_sd[:15]

In [69]:
#new_sd[new_sd['SyllStr'].isna()]

In [70]:
#new_sd = new_sd[['Filename', 'Word', 'Word_beg','Word_end','Word_dur_ms','Syllable',"SyllStr",'IPA','Stress','Begin', 'End', 'Duration_in_ms','Max_dB','Min_dB', 'Mean_dB','Centre_mean_dB','MaxF0Hz','MinF0Hz','MeanF0','Centre_MeanF0']]
#new_sd[:10]


In [60]:
# Inner join on the 'Word' column metadata on CS and new_sd
# another option 'outer' join
meta_df = pd.merge(new_sd, stimulus_kaz_rus, on='Word', how='left')

# Display the result
meta_df

Unnamed: 0,Filename,Word,Word_beg,Word_end,Word_dur_ms,Annotation,Begin,End,Duration_in_ms,Max_dB,...,MaxF0Hz,MinF0Hz,MeanF0,Centre_MeanF0,Language,SuffixCase,WordForm,LatinScript,Gloss,WordClass
0,Speaker_1_male,вечерa,1.889375,2.499526,610.150412,s1_cv_vʲɛ,1.889375,2.044858,155.482845,74.985990,...,117.94501777565652,102.48061042356133,110.45141920183515,112.27203953943248,Rus,GEN,inflected,vechera,evening.ACC,Noun
1,Speaker_1_male,вечерa,1.889375,2.499526,610.150412,s2_cv_ʧʲɪ_unstressed,2.044858,2.238501,193.642926,78.978600,...,134.5442548372237,105.78469042787755,119.9122666683785,132.1938115430081,Rus,GEN,inflected,vechera,evening.ACC,Noun
2,Speaker_1_male,вечерa,1.889375,2.499526,610.150412,s3_cv_ra_stressed,2.238501,2.499526,261.024641,82.423066,...,158.85675082207482,114.03170346059693,135.30754057252486,134.50084575302895,Rus,GEN,inflected,vechera,evening.ACC,Noun
3,Speaker_1_male,олжаның,8.478645,9.155741,677.095577,s1_vc_ol,8.478645,8.669182,190.536827,79.912234,...,129.2106468283745,111.47316521453874,117.96414989958585,115.0118364127969,Kaz,GEN,inflected,oljanyń,trophy.GEN,Noun
4,Speaker_1_male,олжаның,8.478645,9.155741,677.095577,s2_cv_ʒɑ,8.669182,8.915570,246.388278,79.246411,...,118.48326706520108,112.45081127359856,115.66350048323727,115.45099148902773,Kaz,GEN,inflected,oljanyń,trophy.GEN,Noun
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971,Speaker_4_female_additional,окно,398.826399,399.439655,613.255551,s2_ccv_kno_stressed,398.959062,399.439655,480.592105,69.127785,...,257.13410883608185,217.8142795800798,230.5511318516636,227.5493482413709,CS,,uninflected,okno,window,Noun
1972,Speaker_4_female_additional,гора,403.482342,404.086697,604.355619,s1_cv_ɡʌ_pause,403.482342,403.784519,302.177810,75.707378,...,285.4730316882915,155.3990954053691,232.93896847936605,219.51178124256617,CS,,uninflected,gora,mountain,Noun
1973,Speaker_4_female_additional,гора,403.482342,404.086697,604.355619,s2_cv_ra_stressed,403.784519,404.086697,302.177810,74.260741,...,282.51776373132617,111.71723674820116,164.18209122959635,119.95880583324052,CS,,uninflected,gora,mountain,Noun
1974,Speaker_4_female_additional,сырға,408.327752,409.006706,678.953701,s1_cvc_sər,408.327752,408.683643,355.890808,77.374015,...,297.5424642400837,238.9255596754168,279.24196314622105,287.8251457071888,Kaz,,uninflected,syrǵa,earring,Noun


In [71]:
# Mispronounced tokens - 7

meta_df.loc[234, 'Language'] = 'Rus'
meta_df.loc[234, 'SuffixCase'] = 'DAT'
meta_df.loc[234, 'WordForm'] = 'inflected'
meta_df.loc[234, 'LatinScript'] = 'vole'
meta_df.loc[234, 'Gloss'] = 'will.DAT'
meta_df.loc[234, 'WordClass'] = 'Noun'

meta_df.loc[235, 'Language'] = 'Rus'
meta_df.loc[235, 'SuffixCase'] = 'DAT'
meta_df.loc[235, 'WordForm'] = 'inflected'
meta_df.loc[235, 'LatinScript'] = 'vole'
meta_df.loc[235, 'Gloss'] = 'will.DAT'
meta_df.loc[235, 'WordClass'] = 'Noun'  

meta_df[meta_df['Language'].isna()]

Unnamed: 0,Filename,Word,Word_beg,Word_end,Word_dur_ms,Annotation,Begin,End,Duration_in_ms,Max_dB,...,WordForm,LatinScript,Gloss,WordClass,StressedSyll,NounGender,Declension,StressShift,ShiftDirect,AttestedInCS
212,Speaker_1_male,небa_misp,348.449254,349.246923,797.668972,s1_cv_ne,348.449254,348.964741,515.487249,79.698002,...,,,,,,,,,,
213,Speaker_1_male,небa_misp,348.449254,349.246923,797.668972,s2_cv_bʌ_stressed,348.964741,349.246923,282.181723,81.836902,...,,,,,,,,,,
1331,Speaker_4_female,вины_misp,22.973543,23.479154,505.611348,s1_cv_vʲi_stressed,22.973543,23.171554,198.010783,76.427367,...,,,,,,,,,,
1332,Speaker_4_female,вины_misp,22.973543,23.479154,505.611348,s2_cv_nɨ,23.171554,23.479154,307.600565,73.505544,...,,,,,,,,,,
1335,Speaker_4_female,звезды_misp,29.478327,30.142977,664.650269,s1_ccvc_zvʲoz_stressed,29.478327,29.835904,357.576676,77.087215,...,,,,,,,,,,
1336,Speaker_4_female,звезды_misp,29.478327,30.142977,664.650269,s3_cv_dɨ,29.835904,30.142977,307.073593,76.447961,...,,,,,,,,,,
1345,Speaker_4_female,горы_misp,42.649277,43.162072,512.795014,s1_cv_ɡo_stressed,42.649277,42.879933,230.656503,78.390866,...,,,,,,,,,,
1346,Speaker_4_female,горы_misp,42.649277,43.162072,512.795014,s2_cv_rɨ,42.879933,43.162072,282.138511,75.068718,...,,,,,,,,,,
1436,Speaker_4_female,души_misp,187.679216,188.293282,614.06631,s1_cv_du_stressed,187.679216,187.908059,228.843459,71.924373,...,,,,,,,,,,
1437,Speaker_4_female,души_misp,187.679216,188.293282,614.06631,s2_cv_ʃi,187.908059,188.293282,385.222851,76.944601,...,,,,,,,,,,


In [75]:
meta_df.loc[235, 'WordClass']

'Noun'

In [64]:
meta_df = pd.merge(meta_df, stimulus_cs, on='Word', how='left')

meta_df[:20]

Unnamed: 0,Filename,Word,Word_beg,Word_end,Word_dur_ms,Annotation,Begin,End,Duration_in_ms,Max_dB,...,WordForm,LatinScript,Gloss,WordClass,StressedSyll,NounGender,Declension,StressShift,ShiftDirect,AttestedInCS
0,Speaker_1_male,вечерa,1.889375,2.499526,610.150412,s1_cv_vʲɛ,1.889375,2.044858,155.482845,74.98599,...,inflected,vechera,evening.ACC,Noun,1.0,m,2.0,yes,forward,yes
1,Speaker_1_male,вечерa,1.889375,2.499526,610.150412,s2_cv_ʧʲɪ_unstressed,2.044858,2.238501,193.642926,78.9786,...,inflected,vechera,evening.ACC,Noun,1.0,m,2.0,yes,forward,yes
2,Speaker_1_male,вечерa,1.889375,2.499526,610.150412,s3_cv_ra_stressed,2.238501,2.499526,261.024641,82.423066,...,inflected,vechera,evening.ACC,Noun,1.0,m,2.0,yes,forward,yes
3,Speaker_1_male,олжаның,8.478645,9.155741,677.095577,s1_vc_ol,8.478645,8.669182,190.536827,79.912234,...,inflected,oljanyń,trophy.GEN,Noun,,,,,,
4,Speaker_1_male,олжаның,8.478645,9.155741,677.095577,s2_cv_ʒɑ,8.669182,8.91557,246.388278,79.246411,...,inflected,oljanyń,trophy.GEN,Noun,,,,,,
5,Speaker_1_male,олжаның,8.478645,9.155741,677.095577,s3_cvc_nəŋ,8.91557,9.155325,239.754521,76.262425,...,inflected,oljanyń,trophy.GEN,Noun,,,,,,
6,Speaker_1_male,шекелер,13.508227,14.150535,642.307868,s1_cv_ʃie,13.508227,13.757677,249.449606,76.437268,...,inflected,shekeler,forehead.Pl,Noun,,,,,,
7,Speaker_1_male,шекелер,13.508227,14.150535,642.307868,s2_cv_kie,13.757677,13.923753,166.076394,82.803796,...,inflected,shekeler,forehead.Pl,Noun,,,,,,
8,Speaker_1_male,шекелер,13.508227,14.150535,642.307868,s3_cvc_lier,13.923753,14.150535,226.781868,79.860326,...,inflected,shekeler,forehead.Pl,Noun,,,,,,
9,Speaker_1_male,сөрелер,18.792268,19.419988,627.719622,s1_cv_sɵ,18.792268,19.070072,277.803623,83.057102,...,inflected,sóreler,shelf.Pl,Noun,,,,,,


In [76]:
meta_df.to_csv('final_df.csv', index=False)

In [65]:
pwd

'/Users/aidyn/Documents/Fall_2024_IndStudy/CompletedAnnotations'

### Formants dataset

In [12]:
import pandas as pd
import glob
import os
from charset_normalizer import from_path

# Define your folder path
folder_path = "/Users/aidyn/Documents/Fall_2024_IndStudy/CompletedAnnotations/formants_new"

# Get list of all CSV files in the folder
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

# Function to detect encoding using charset-normalizer and read CSV
def read_csv_with_encoding(file_path):
    result = from_path(file_path).best()
    encoding = result.encoding if result else 'utf-8-sig'  # fallback to utf-8
    print(f"Reading {os.path.basename(file_path)} as {encoding}")
    return pd.read_csv(file_path, encoding=encoding)

# Read and combine all CSV files safely
dfs = []
for file in csv_files:
    try:
        df = read_csv_with_encoding(file)
        dfs.append(df)
    except Exception as e:
        print(f"❌ Failed to read {file}: {e}")

# Combine into one DataFrame
df_formants_all = pd.concat(dfs, ignore_index=True)

# Show first few rows
df_formants_all.head()



Reading formants6.csv as utf-8-sig
❌ Failed to read /Users/aidyn/Documents/Fall_2024_IndStudy/CompletedAnnotations/formants_new/formants6.csv: 'utf-8' codec can't decode byte 0xb2 in position 219: invalid start byte
Reading formants7.csv as utf-8-sig
❌ Failed to read /Users/aidyn/Documents/Fall_2024_IndStudy/CompletedAnnotations/formants_new/formants7.csv: 'utf-8' codec can't decode byte 0xb2 in position 103: invalid start byte
Reading formants5.csv as utf-8-sig
❌ Failed to read /Users/aidyn/Documents/Fall_2024_IndStudy/CompletedAnnotations/formants_new/formants5.csv: 'utf-8' codec can't decode byte 0x92 in position 97: invalid start byte
Reading formants4.csv as utf-8-sig
❌ Failed to read /Users/aidyn/Documents/Fall_2024_IndStudy/CompletedAnnotations/formants_new/formants4.csv: 'utf-8' codec can't decode byte 0xb2 in position 90: invalid start byte
Reading formants1.csv as utf_8
Reading formants3.csv as utf-8-sig
❌ Failed to read /Users/aidyn/Documents/Fall_2024_IndStudy/CompletedAnno

Unnamed: 0,file,time,word,phoneme,duration,F1,F2,F3
0,Speaker_1_male,0.944688,,,1.889375,700.622436,1433.047342,2194.3265
1,Speaker_1_male,1.923856,25G5@a,v≤,0.068962,183.441796,1435.802227,2413.905369
2,Speaker_1_male,2.001483,25G5@a,[_unstressed,0.086293,332.122411,1829.017775,2551.170752
3,Speaker_1_male,2.11954,25G5@a,ß≤,0.149821,2097.48422,2741.175778,3187.176626
4,Speaker_1_male,2.216664,25G5@a,j_unstressed,0.044427,476.466234,1491.016454,2458.569107
