# Automatically Extract the Relevant Teamwork Signals from Plant Recording 
---

I tried to automate the extraction of the teamwork-session signals from the plant recordings across all days and teams. A fully automatic run is not possible because:
- some recordings were broken and I needed to manually delete them
- some recordings for some days are missing (e.g. team_22 has no recording for day 1 & 3)
- for some teams there are multiple recordings for one day while there are teams with only one big recording per day
- ... 

For this reason, I will go through every team in different runs and adjust the code manually. 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import wavfile
from datetime import datetime
import os

# Input: raw plant recordings, one sub-folder per team.
plant_dir = r"..\data\raw-plant-data"
# Input: one excel sheet per team with the teamwork intervals.
interval_dir = r"..\data\teamwork-intervals\interval-sub-tables"
# Output: extracted teamwork signals are written below this folder.
store_dir = r"..\data\interim-plant-data-teamwork-extracted"

In [2]:
# Teamwork sessions took place on only 3 of the 4 days; recordings from any
# other date are skipped when the signals are extracted.
session_days = [
    "2023-01-10",
    "2023-01-12",
    "2023-01-13",
]

In [3]:
# One sub-folder per team. os.listdir() returns entries in arbitrary order and
# a team is selected further down by its position in this list, so sort the
# names to make that position stable across machines and file systems.
plant_data_folders = sorted(os.listdir(plant_dir))
plant_data_folders

['Team_01',
 'Team_02',
 'Team_03',
 'Team_04',
 'Team_05',
 'Team_06',
 'Team_07',
 'Team_08',
 'Team_09',
 'Team_10',
 'Team_11',
 'Team_12',
 'Team_13',
 'Team_15',
 'Team_16',
 'Team_17',
 'Team_18',
 'Team_19',
 'Team_20',
 'Team_22']

### Select the team here:

In [4]:
# NOTE: despite its name this value is used as an INDEX into the list of team
# folders, not as the team's own number (index 18 selects 'Team_20').
team_number = 18

In [5]:
# Pick the team folder at position `team_number` of the folder listing
# (adjust that index to get a particular team) and display it.
team_name = plant_data_folders[team_number]
team_name

'Team_20'

In [6]:
# Folder holding the corrected plant recordings of the selected team.
signal_dir = os.path.join(plant_dir, team_name, "Plant_corrected")
# os.listdir() gives no ordering guarantee. Sorting makes the processing order
# deterministic; for names of the form "BYB_Recording_<date>_<time>.wav" it is
# also chronological.
signal_labels = sorted(os.listdir(signal_dir))  # e.g. ["BYB_Recording_2023-01-10_08.39.08.wav", ...]
print(team_name)  # e.g. "Team_20"

# Lower-case the folder name ("Team_20" -> "team_20"). This label is ONLY used
# for accessing the .xlsx file with the teamwork session intervals of the team
# (e.g. team_20.xlsx).
team_label = team_name.lower()
print(team_label)  # e.g. "team_20"

Team_20
team_20


The following dictionary is specific to the column names in the teamwork intervals excel sheets. 

In [7]:
# Display the file names found in the selected team's recording folder.
signal_labels

['BYB_Recording_2023-01-10_08.39.08.wav',
 'BYB_Recording_2023-01-11_08.37.09.wav',
 'BYB_Recording_2023-01-12_08.25.58.wav',
 'BYB_Recording_2023-01-12_13.03.33.wav',
 'BYB_Recording_2023-01-13_08.31.08.wav']

Iterate over all `.wav` files of a particular team. 

In [8]:
def get_tw_intervals(team, day):
    """
    Load the teamwork intervals of one team for one session day.

    Reads "<team>.xlsx" from `interval_dir`, selects the two columns holding
    the start and end times of `day`, drops the first two rows and keeps only
    the rows whose end-time cell is a string.

    team: file stem of the team's excel sheet, e.g. "team_20".
    day:  session date; must be a key of the column lookup below.

    Returns the filtered two-column DataFrame (the sheet's row index is kept).
    """
    xlsx_path = os.path.join(interval_dir, team + ".xlsx")
    sheet = pd.read_excel(xlsx_path)
    print(xlsx_path)

    # Names of the (start, end) time columns for each session day.
    columns_per_day = {
        "2023-01-10": ["Day 1", "Unnamed: 1"],
        "2023-01-12": ["Day 3", "Unnamed: 9"],
        "2023-01-13": ["Day 4", "Unnamed: 17"],
    }
    start_col, end_col = columns_per_day[day]

    day_table = sheet[[start_col, end_col]]
    print(f"df_team_day:\n{day_table}")

    # The first two rows are not relevant, skip them.
    day_rows = day_table.iloc[2:]

    # Rows without a string in the end-time column (e.g. NaN padding) are dropped.
    has_end_time = day_rows[end_col].apply(lambda cell: isinstance(cell, str))
    day_intervals = day_rows[has_end_time]
    print(f"df_team_day_intervals:\n{day_intervals}")
    print(len(day_intervals))

    return day_intervals

In [9]:
def extract_teamwork_signal(day,recording_path,tw_interval,num_recs_per_day,recording_start):
    """
    Helper function to extract the teamwork signals from a plant recording.

    For every row of `tw_interval` the start/end clock times are converted into
    offsets relative to `recording_start`, the matching frames are cut out of
    the recording and written to
    "<store_dir>/<team>/<day>/sdm_<day>_<team>_<start_s>_<end_s>.wav".
    Intervals that do not overlap the recording at all are written (as an empty
    signal) with an "empty_" prefix into "<store_dir>/<team>/<day>_empty".
    Existing files are never overwritten.

    Relies on the module-level names `team_name` and `store_dir`.

    day:              session date, a key of `interval_dict` below.
    recording_path:   path of the .wav file to read.
    tw_interval:      DataFrame with the start/end time columns of `day`,
                      holding "HH:MM:SS" strings.
    num_recs_per_day: number of recordings of this team on `day`. Extraction is
                      only performed when this is 1; for more than one
                      recording only the recording info is printed.
    recording_start:  clock time at which the recording started, "HH:MM:SS".

    Returns None.
    """
    # Dictionary for the columns indicating start and end times for each day.
    interval_dict = {"2023-01-10": ["Day 1", "Unnamed: 1"],
                     "2023-01-12": ["Day 3", "Unnamed: 9"],
                     "2023-01-13": ["Day 4", "Unnamed: 17"]}
    FMT = '%H:%M:%S'

    # Load .wav file.
    sampling_rate, plant_wave = wavfile.read(recording_path)
    num_frames = len(plant_wave)

    if num_recs_per_day == 1: #num recordings per team per day
        start_col, end_col = interval_dict[day]
        # Loop invariant: parse the recording start only once.
        recording_start_time = datetime.strptime(recording_start, FMT)
        team_folder = team_name.lower()

        for teamwork_start, teamwork_end in zip(tw_interval[start_col], tw_interval[end_col]):
            print(f"Start timestamp recording: {recording_start}\nTimestamp teamwork interval: {teamwork_start} until {teamwork_end}")

            timedelta_start = datetime.strptime(teamwork_start, FMT) - recording_start_time
            timedelta_end = datetime.strptime(teamwork_end, FMT) - recording_start_time
            print(f"Teamwork start: {timedelta_start}\nTeamwork end: {timedelta_end}\nTime difference: {timedelta_end-timedelta_start}")

            # Convert time into seconds (relative to the start of the recording).
            start_s = int(timedelta_start.total_seconds())
            end_s = int(timedelta_end.total_seconds())
            print(f"TW start in s: {start_s}\nTW end in s: {end_s}")

            # Print recording info.
            print(f"Len(plant_wave): {num_frames}")
            print(f"sampling_rate: {sampling_rate}")

            # Compute the actual frames in the time series.
            frame_start = sampling_rate * start_s
            frame_end = sampling_rate * end_s
            print(f"Frame start: {frame_start}\nFrame end: {frame_end}")

            # Clamp to the recorded range. A session that began before the
            # recording yields NEGATIVE frame numbers, and a negative slice
            # index would silently count from the end of the array and return
            # an unrelated part of the signal.
            slice_start = min(max(frame_start, 0), num_frames)
            slice_end = min(max(frame_end, 0), num_frames)

            # Extract time series corresponding to teamwork session from plant recording.
            tw_signal = plant_wave[slice_start:slice_end]

            # Save extracted time series corresponding to teamwork session.
            tw_signal_label = f"sdm_{day}_{team_folder}_{start_s}_{end_s}.wav"
            path_dir = os.path.join(store_dir, team_folder, day)

            if len(tw_signal) == 0:
                # Nothing of this session is contained in the recording.
                print(f"Teamwork session not recorded. Frames {frame_start} to {frame_end} lie outside the recording (Len(plant_wave) {num_frames})")
                tw_signal_label = "empty_" + tw_signal_label
                print(f"Empty label: {tw_signal_label}")
                path_dir = os.path.join(store_dir, team_folder, day+"_empty")
            elif (slice_start, slice_end) != (frame_start, frame_end):
                print(f"Teamwork session only partially recorded. Extracted frames {slice_start} to {slice_end}")

            print(tw_signal_label)
            print(tw_signal)

            path_file = os.path.join(path_dir, tw_signal_label)
            os.makedirs(path_dir, exist_ok=True)
            # Keep results of earlier runs: only write files that do not exist yet.
            if not os.path.exists(path_file):
                wavfile.write(path_file, sampling_rate, tw_signal)

            ## Plot extracted time series corresponding to teamwork session.
            #xi = list(range(slice_start,slice_end))

            #plt.figure()
            #plt.plot(xi,tw_signal)
            #plt.show()

            print("********")

    elif num_recs_per_day > 1:
        # Several recordings for one day: nothing is extracted here, only the
        # recording info is reported.
        print(f"Len(plant_wave): {num_frames}")
        print(f"sampling_rate: {sampling_rate}")
        print("********")

In [10]:
# Go through every file of the selected team and extract the teamwork signals
# from the recordings made on a session day.
for label in signal_labels:
    print(label)

    # Recordings are named "BYB_Recording_<date>_<HH.MM.SS>.wav". Anything else
    # in the folder cannot be parsed below, so skip it instead of crashing.
    stem, extension = os.path.splitext(label)
    name_parts = stem.split("_")
    if extension.lower() != ".wav" or len(name_parts) < 4:
        print("Not a plant recording, skipped.")
        print("_____________")
        continue

    signal_path = os.path.join(signal_dir, label)
    day = name_parts[2]
    recording_start = name_parts[-1].replace(".", ":")  # "08.39.08" -> "08:39:08"
    print(f"Recording start: {recording_start}")

    if day in session_days:
        print(f"Day {day} is a teamwork session day.")

        # Get teamwork session intervals from excel sheet.
        intervals = get_tw_intervals(team_label, day)

        # Count how many recordings of this team carry the same date.
        num_rec_per_day = sum(day in s for s in signal_labels) # number of recordings per particular day.
        print(f"Number of recordings per day: {num_rec_per_day}")

        ## Extract teamwork session signal from plant recording(s).
        extract_teamwork_signal(day,signal_path,intervals,num_rec_per_day,recording_start)

    else:
        print("Day is not in session days.")
    print("_____________")

BYB_Recording_2023-01-10_08.39.08.wav
Recording start: 08:39:08
Day 2023-01-10 is a teamwork session day.
..\data\teamwork-intervals\interval-sub-tables\team_20.xlsx
df_team_day:
         Day 1 Unnamed: 1
0   Start time   End time
1      Team 20        NaN
2     10:27:10   10:46:25
3     10:53:45   11:11:00
4     11:24:25   11:48:20
5     13:05:45   13:25:05
6     13:34:40   13:53:55
7     14:25:30   14:40:00
8     14:53:28   15:16:32
9          NaN        NaN
10         NaN        NaN
df_team_day_intervals:
      Day 1 Unnamed: 1
2  10:27:10   10:46:25
3  10:53:45   11:11:00
4  11:24:25   11:48:20
5  13:05:45   13:25:05
6  13:34:40   13:53:55
7  14:25:30   14:40:00
8  14:53:28   15:16:32
7
Number of recordings per day: 1
Start timestamp recording: 08:39:08
Timestamp teamwork interval: 10:27:10 until 10:46:25
Teamwork start: 1:48:02
Teamwork end: 2:07:17
Time difference: 0:19:15
TW start in s: 6482
TW end in s: 7637
Len(plant_wave): 306690790
sampling_rate: 10000
Frame start in s: 6482