# Supervised approach

We start with a supervised approach, that is to say, we will quantify a set of specific, predefined behaviors.

In [99]:
# Import necessary packages
import numpy as np
import pandas as pd
import pickle as pkl
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns



In [100]:
# Locate the DeepOF outputs of this experiment
root = Path('//folder/becell/Lab Projects/ERCstG_HighMemory/Data/Marc/1_SOC/1_ProtocolControlsMales/')
deepof_path = root.joinpath("DeepOF/supervised_annotation.pkl")
deepof_conditions_path = root.joinpath("DeepOF/conditions.csv")

# Load the table of experimental conditions
target_values = pd.read_csv(deepof_conditions_path)

# Load the pickled DeepOF supervised annotations
with deepof_path.open('rb') as f:
    deepof_data = pkl.load(f)

In [106]:
# Compte abundance means for each behavior, individual, and time bin
def compute_means(data, number_of_bins=6):
    means = {}
    for id, behaviors in data.items():
        means[id] = {}
        
        # Divide rows of dataframe into number of bins
        numeric_index = pd.RangeIndex(len(behaviors))
        time_bins = pd.cut(numeric_index, bins=number_of_bins, labels=range(number_of_bins))
        behaviors = behaviors.copy()
        behaviors['time_bin'] = time_bins
        
        # Compute means for each behavior in the specified time bins
        for time_bin in range(number_of_bins):
            means[id][time_bin] = {}
            bin_df = behaviors[behaviors['time_bin'] == time_bin]
            for behavior in behaviors.columns[:-2]:  # exclude time_bin and maybe 'target' column
                means[id][time_bin][behavior] = np.nanmean(bin_df[behavior])
    
    # Convert means to DataFrame for easier handling
    # Columns: [id, time_bin, behavior1, behavior2, ...]
    means_df = pd.DataFrame()
    for id, time_bins in means.items():
        for time_bin, behaviors in time_bins.items():
            row = {'id': id, 'time_bin': time_bin}
            row.update(behaviors)
            means_df = pd.concat([means_df, pd.DataFrame([row])], ignore_index=True)
    means_df = means_df.fillna(0)
    
    return means_df

# Build the per-individual, per-time-bin abundance table (default: 6 time bins)
abundance_df = compute_means(deepof_data)

In [107]:
# Look up each individual's conditions by its experiment id and attach them
# as columns: 'learning' and 'group', according to the target values
conditions_by_experiment = target_values.set_index('experiment_id')
for condition in ('learning', 'group'):
    abundance_df[condition] = abundance_df['id'].map(conditions_by_experiment[condition])

In [110]:
# Count the missing values in each column of abundance_df
abundance_df.isna().sum()

id            0
time_bin      0
climbing      0
sniffing      0
huddle        0
lookaround    0
speed         0
learning      0
group         0
dtype: int64

In [111]:
# Save the combined DataFrame (abundance means plus condition columns) to CSV,
# without writing the row index
abundance_df.to_csv(root / 'DeepOF/abundance_means.csv', index=False)

## Biological questions

- Is the protocol working as expected?
- Which behavioral changes happen upon cue presentation?
- Are there differences between the direct and the mediated responses?
- Is it worth measuring more than one behavior? Does one behavior predict as well as an array of behaviors? Does measuring several increase the sensitivity to detect change (last minutes)?