### SOABB1 - SCRIPT 1: 
Aim: Cut the original recordings, separating the baby's vocalisations from other sounds, so that they can be listened to again and analysed afterwards, using start_times and stop_times (in secondes)

Goals: 
- creates files with the baby's vocalisations in the folders: 
		sounds/BB/BL
		sounds/BB/EXP
- creates files with the other sounds (i.e., portions of the recording where no vocalisations were detected) in the folders: 
		sounds/other/BL
		sounds/other/EXP

- In each folder, the _NM.wav files are unmodified sounds; the .wav files are modified sounds.

In [1]:
## CHANGE ON YOUR COMPUTER
user = "fannie"

In [2]:
### IMPORT PACKAGES
import matplotlib.pylab as plt
import numpy as np
import os, glob
import pandas as pd
import seaborn as sns
import re
from scipy import stats
from scipy.io import wavfile
#import pysptk
import statsmodels.api as sm
from statsmodels.formula.api import ols

# PARAM 4 F0 estimation
minF0 = 140
maxF0 = 1500
hop_length = 80

In [3]:
# OPEN THE FILE WITH THE SOUND TIMINGS AND SOUND TYPE CODING
DIR = "/Users/" + user + "/Documents/SOABB_ETUDE1/"
DIR_data = DIR + "audio_analysis/datasets/" # csv result files location 
DIR_SONS = DIR + "raw_data_1/" # audio files location 
nameresfile = DIR_data + "SOABB_N63_version230425.xlsx" # coding file (VTC or manual coding)
OUT_SOUNDS = DIR_data + "sounds/" # where to save the audio segments
df = pd.read_excel(nameresfile) # open file

## infer participant ID from filename
print(len(df))
allnames = df.groupby(["file"])["start_times"].mean().reset_index()
for i,r in allnames.iterrows(): allnames.loc[i, "BB"] = int(re.split("BB", re.split("_",r["file"])[0])[1])
df = pd.merge(df, allnames[["file","BB"]], on = ["file"])
print(len(df))

df = df.loc[~np.isnan(df["BB"])] ## exclude empty rows
#df = df.drop(columns=['coms_louise']) ## remove comments 
df.head() 

4102
4102


Unnamed: 0,file,start_times,stop_times,check_timings,duration,phase,who,voc_type,category1_M,category2_M,...,expressions (x/5),Reco miroir (0;1;2),attire attention soi (0;1;2),aime/aime pas (0;1;2),reco photo (0;1;2),attire attention actions (0;1;2),Unnamed: 34,coded,audio_order,BB
0,BB1_6M_1403_p100_NM,38.01,38.79,0.91,0.78,BL,B,B,,,...,0.0,,,,,,,,,1.0
1,BB1_6M_1403_p100_NM,39.7,40.79,2.89,1.09,BL,B,pB,,,...,0.0,,,,,,,,,1.0
2,BB1_6M_1403_p100_NM,43.68,45.95,4.05,2.27,BL,B,pB,,,...,0.0,,,,,,,,,1.0
3,BB1_6M_1403_p100_NM,50.0,50.7,1.14,0.7,BL,B,S,,,...,0.0,,,,,,,,,1.0
4,BB1_6M_1403_p100_NM,51.84,51.96,1.6,0.12,BL,B,V,,,...,0.0,,,,,,,,,1.0


### LOOP FOR PERFORMING F0 EXTRACTION AND CALCULATING THE MAIN MEASUREMENTS

In [4]:
### List subject numbers
df[["BB"]] = df[["BB"]].astype(int)
bb_list = np.unique(df["BB"])
print("N =", len(bb_list), bb_list)

## remove sounds outside the EXP/BL phases
df = df.loc[((df["phase"] == "EXP") | (df["phase"] == "BL"))]

N = 63 [  1   7  12  15  18  21  23  24  27  31  36  38  45  47  52  56  58  61
  66  68  69  70  71  72  74  77  78  81  88  90  91  92  94  96  99 100
 101 103 104 109 110 111 112 114 115 116 117 118 119 120 121 122 123 125
 127 129 130 131 133 134 148 149 150]


In [5]:
#### CODE TO MERGE VOCS THAT ARE CONTIGUOUS & OF THE SAME TYPE

# Adds columns to track merge/delete
df['delete_group'] = np.nan  # Start with None meaning not to delete
df['merge_group'] = np.nan  # Start with None meaning no group
df = df.reset_index()

current_row = df.iloc[0]
next_row = df.iloc[1]
    
# Loop through each row of the DataFrame starting from the 1st row (it does not count the header)
for i in range(1, (len(df)-2)):
    # Check if the next row's start time is less than 500ms after the current row's end time
    # and if they are of the same vocalization type and if they are from the same file (i.e., same baby)
    if (next_row['start_times'] - current_row['stop_times'] <= 0.5 and 
        next_row['voc_type'] == current_row['voc_type'] and 
        next_row['phase'] == current_row['phase'] and
        next_row['who'] == current_row['who'] and
        next_row['file'] == current_row['file']):
        
        # Assign groups to both current (merge) and next (delete) row
        df.at[current_row["index"], 'delete_group'] = 1  # Mark the current row to know we need to delete
        df.at[next_row["index"], 'merge_group'] = 1    # Mark the next row for merge
        
        # Merge the rows by extending the end time
        next_row['start_times'] = current_row['start_times']
        df.at[next_row["index"], 'start_times'] = next_row['start_times']
        
    current_row = df.iloc[i]
    next_row = df.iloc[i+1]
        
# deletes rows that have been merged
df = df.loc[np.isnan(df["delete_group"])]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  next_row['start_times'] = current_row['start_times']


In [5]:
### MAIN LOOP FOR CUTTING VOCALISATIONS AND SAVING FILE NAMES IN THE TABLE
for k in bb_list:
    curBB = df.loc[df["BB"] == k].sort_values(["start_times"]).reset_index() ## select data for this baby & sort them by Start Times    
    filename_NM = np.unique(curBB["file"])[0] ## find NM filename for this baby 
    filename_M = filename_NM.split("_NM")[0] ## find NM filename for this baby 
    cond = re.split( "_",filename_NM)[3] ## use filename to find condition
    folder = filename_NM.split("_" + cond)[0] ## use filename to find the audio recording folder
    audiofile = DIR_SONS + folder + "/" + filename_NM + ".wav"
    print("bb:", k, "- cond:", cond, "- files:", filename_NM, " - ", filename_M, "nb vocs?", len(curBB))
    
    ## SAVES CONDITION TO DF FOR ANALYSIS LATER
    df.loc[df["BB"] == k, "cond"] = cond
    
    ### OPENS M AND NM AUDIO fileS
    audiofile_NM = DIR_SONS + folder + "/" + filename_NM + ".wav"
    sr, x_NM = wavfile.read(audiofile_NM)
    if len(np.shape(x_NM)) > 1: ## TO CONVERT IN MONO IF NEEDED
        x_NM = x_NM[:,0]
        
    audiofile_M = DIR_SONS + folder + "/" + filename_M + ".wav"
    sr, x_M = wavfile.read(audiofile_M)
    if len(np.shape(x_M)) > 1: ## TO CONVERT IN MONO IF NEEDED
        x_M = x_M[:,0]

    ### Cut NM and M sounds to suppress the offset
    cutfile = int(0.01 * sr)  # sr = sampling rate (samples per second)
    x_NM = x_NM[:-cutfile]  # remove the last 10  milliseconds from NM files
    x_M = x_M[cutfile:]  # remove the last 10  milliseconds from M files
    
    ### CUTS EACH vocalisation IDENTIFIED BY START / STOP TIMES 
    ### & also the silence between this voc and the preceding one
    for ix, curvoc in curBB.iterrows():
        if ix > 0: prevoc = curBB.loc[ix-1] ## SELECTS THE PRECEDING VOC SO WE CAN FIND THE TIMING OF THE SILENCE BETWEEN THE CURRENT VOC AND THE PRECEDING ONE BELOW

        ### SAVE THIS SOUND for NM and M files
        cursound_NM = x_NM[int(curvoc["start_times"]*sr):int(curvoc["stop_times"]*sr)]#.astype(np.float64)
        curvocnamefile_NM = "/BB/" + curvoc["phase"] + "/" + filename_NM + "_" + str(np.round(curvoc["start_times"],2)) + "_to_" + str(np.round(curvoc["stop_times"],2))
        wavfile.write(OUT_SOUNDS + curvocnamefile_NM + ".wav", sr, cursound_NM)
        
        cursound_M = x_M[int(curvoc["start_times"]*sr):int(curvoc["stop_times"]*sr)]#.astype(np.float64)
        curvocnamefile_M = "/BB/" + curvoc["phase"] + "/" + filename_M + "_" + str(np.round(curvoc["start_times"],2)) + "_to_" + str(np.round(curvoc["stop_times"],2)) + "_" + curvoc["phase"]
        wavfile.write(OUT_SOUNDS + curvocnamefile_M + ".wav", sr, cursound_M)        
        
        ### ALSO SAVE THE PORTION OF THE AUDIO OCpreING BETWEEN THIS VOC AND THE PRECEDING VOC
        if ix > 0: ## WE ONLY DO THIS AFTER THE FIRST ONE
            if prevoc["stop_times"] > curvoc["start_times"]: 
                print("start_times and stop_times are not compatible ")
                continue ## TO FIND WHY BUG THEN REMOVE
            presound_NM = x_NM[int(prevoc["stop_times"]*sr):int(curvoc["start_times"]*sr)]#.astype(np.float64)
            prevocnamefile_NM = "/other/" + curvoc["phase"] + "/" + filename_NM + "_" + str(np.round(prevoc["stop_times"],2)) + "_to_" + str(np.round(curvoc["start_times"],2))
            wavfile.write(OUT_SOUNDS + prevocnamefile_NM + ".wav", sr, presound_NM)
            
            presound_M = x_M[int(prevoc["stop_times"]*sr):int(curvoc["start_times"]*sr)]#.astype(np.float64)
            prevocnamefile_M = "/other/" + curvoc["phase"] + "/" + filename_M + "_" + str(np.round(prevoc["stop_times"],2)) + "_to_" + str(np.round(curvoc["start_times"],2)) + "_" + curvoc["phase"]
            wavfile.write(OUT_SOUNDS + prevocnamefile_M + ".wav", sr, presound_M)        
        else:
            prevocnamefile_NM = "none"
            prevocnamefile_M = "none"

#        print(prevocnamefile_M)
#        print(curvocnamefile_M)
                
        ### SAVE fileNAMES IN THE TABLE NX_sounds_details.csv for later use in script 2
        df.loc[((df["BB"] == k) & (df["start_times"] == curvoc["start_times"])), "BBvoc_NM"] = curvocnamefile_NM
        df.loc[((df["BB"] == k) & (df["start_times"] == curvoc["start_times"])), "BBvoc_M"] = curvocnamefile_M
        df.loc[((df["BB"] == k) & (df["start_times"] == curvoc["start_times"])), "othersound_NM"] = prevocnamefile_NM
        df.loc[((df["BB"] == k) & (df["start_times"] == curvoc["start_times"])), "othersound_M"] = prevocnamefile_M

NameError: name 'bb_list' is not defined

In [7]:
## SAVES df to csv
df.to_csv(DIR_data + "N" + str(len(bb_list)) + "_sounds_details.csv", sep = ";")