# Supervised approach

We start with a supervised approach: that is, we quantify a set of specific, predefined behaviors.

In [36]:
# Import necessary packages
import numpy as np
import pandas as pd
import pickle as pkl
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

In [15]:
# Locate the DeepOF outputs of this experiment
root = Path('/Volumes/becell/Lab Projects/ERCstG_HighMemory/Data/Marc/1_SOC/1_ProtocolControlsMales')
deepof_path = root / "DeepOF" / "supervised_annotation.pkl"
deepof_conditions_path = root / "DeepOF" / "conditions.csv"

# Read the conditions table (one row per experiment)
target_values = pd.read_csv(deepof_conditions_path)

# Read the pickled DeepOF supervised annotations.
# NOTE(review): unpickling can execute arbitrary code — only load files
# produced by a trusted pipeline.
deepof_data = pkl.loads(deepof_path.read_bytes())

In [16]:
# Compute abundance means for each behavior, individual, and time bin
def compute_means(data, number_of_bins=6):
    """Average every behavior column within equal-width time bins, per recording.

    Parameters
    ----------
    data : mapping of id -> pandas.DataFrame
        One table per recording. Rows are binned by position, so they are
        assumed to be in chronological order (TODO confirm against the
        DeepOF export). The last column of each table is not averaged.
    number_of_bins : int, default 6
        Number of bins the row positions of each table are split into
        (same edges as ``pd.cut(range(len(table)), bins=number_of_bins)``).

    Returns
    -------
    pandas.DataFrame
        One row per (id, time_bin) with columns ``id``, ``time_bin`` and one
        column per behavior holding its NaN-ignoring mean in that bin.
        Values that cannot be computed (empty bin, all-NaN bin, behavior
        missing from a recording) are reported as 0.

    Notes
    -----
    The input tables are never modified. A recording with zero rows yields
    rows of zeros instead of raising.
    """
    rows = []
    for experiment_id, behaviors in data.items():
        n_frames = len(behaviors)

        # Every column except the last one is treated as a behavior.
        # NOTE(review): the trailing column is presumably a non-behavior
        # field of the DeepOF table — confirm which one is being skipped.
        behavior_columns = list(behaviors.columns[:-1])

        if n_frames:
            # Integer bin index (0 .. number_of_bins - 1) for every row position
            bin_codes = np.asarray(
                pd.cut(np.arange(n_frames), bins=number_of_bins, labels=False)
            )
        else:
            # pd.cut rejects empty input; an empty recording simply has no
            # rows in any bin
            bin_codes = np.empty(0, dtype=int)

        # Pull each column out once instead of re-slicing the frame per bin
        column_values = {
            behavior: behaviors[behavior].to_numpy() for behavior in behavior_columns
        }

        for time_bin in range(number_of_bins):
            in_bin = bin_codes == time_bin
            row = {'id': experiment_id, 'time_bin': time_bin}
            for behavior, values in column_values.items():
                binned = values[in_bin]
                # Skip nanmean on empty bins to avoid a RuntimeWarning; the
                # NaN placeholder becomes 0 in the final fillna below
                row[behavior] = np.nanmean(binned) if binned.size else np.nan
            rows.append(row)

    # Build the frame once from all records (no row-by-row concat), then
    # replace every missing mean with 0
    return pd.DataFrame(rows).fillna(0)

# Per-recording, per-time-bin behavior means (default: 6 bins per recording)
abundance_df = compute_means(deepof_data)

In [17]:
# Index the conditions table by experiment id once, then look up each
# recording's 'learning' and 'group' values. Ids with no matching
# experiment_id come out as NaN.
conditions_by_experiment = target_values.set_index('experiment_id')
recording_ids = abundance_df['id']

# Column indicating the learning condition according to the target values
abundance_df['learning'] = recording_ids.map(conditions_by_experiment['learning'])

# Column indicating the group according to the target values
abundance_df['group'] = recording_ids.map(conditions_by_experiment['group'])

In [30]:
# Count missing values per column of abundance_df
abundance_df.isna().sum()

id            0
time_bin      0
climbing      0
sniffing      0
huddle        0
lookaround    0
speed         0
learning      0
group         0
dtype: int64

In [13]:
# Save the combined dataframe to CSV.
# The destination is an absolute path, so it is not joined onto `root`:
# pathlib discards the left-hand side when the right-hand side is absolute.
# NOTE(review): machine-specific location — adjust before running elsewhere.
abundance_df.to_csv(Path('/Users/mcanela/Desktop/Behavior paper review/deepof_abundance_mean.csv'), index=False)

## Biological questions

- Is the protocol working as expected?
- Which behavioral changes occur upon cue presentation?
- Are there differences between the direct and mediated responses?
- Is it worth measuring more than one behavior? Does one behavior predict as well as an array of behaviors? Does it increase sensitivity to detect change (last minutes)?

# Unsupervised approach

In [32]:
# Locate the MoSeq results and the conditions table of this experiment
root = Path('/Volumes/becell/Lab Projects/ERCstG_HighMemory/Data/Marc/1_SOC/1_ProtocolControlsMales')
moseq_path = root / "MoSeq" / "2024_05_23-11_06_49" / "results"
deepof_conditions_path = root / "DeepOF" / "conditions.csv"

# Read the conditions table (one row per experiment)
target_values = pd.read_csv(deepof_conditions_path)

In [None]:
# Read the 'syllable' column of every MoSeq result CSV, keyed by file stem
syllable_dict = {
    csv_file.stem: pd.read_csv(csv_file, usecols=['syllable'])['syllable'].values
    for csv_file in moseq_path.glob('*.csv')
}

# Every distinct syllable label seen in any file
all_syllables = set().union(*syllable_dict.values())

In [None]:
# Create a DataFrame with one row per file and one column per syllable,
# holding how many times that syllable label occurs in the file.
# Sets have no guaranteed iteration order, so fix the column order explicitly
# to keep the table (and the exported CSV) reproducible.
syllable_columns = sorted(all_syllables)

rows = []
for file, syllables in syllable_dict.items():
    counts = Counter(syllables)
    # Counter returns 0 for labels absent from this file, so every row
    # carries every syllable column and no missing values can arise
    row = {syll: counts[syll] for syll in syllable_columns}
    row['id'] = file
    rows.append(row)

syllable_df = pd.DataFrame(rows)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,id
0,1235,1959,2978,351,766,361,193,113,212,129,...,8,21,3,0,9,0,0,6,0,20230728_Marc_ERC SOC S1_Males_box ab_02_01_1D...
1,2184,2555,860,1199,644,183,120,302,295,94,...,6,115,0,0,60,0,1,23,0,20230728_Marc_ERC SOC S1_Males_box cd_04_01_1D...
2,1851,2699,2620,773,334,76,171,98,111,79,...,0,14,0,0,41,0,0,0,0,20230728_Marc_ERC SOC S1_Males_box ab_05_01_1D...
3,1736,2653,2214,906,592,140,232,73,173,0,...,2,4,0,0,0,0,0,0,0,20230728_Marc_ERC SOC S1_Males_box ab_04_01_1D...
4,1001,1993,3074,797,712,128,230,73,238,110,...,3,22,0,0,0,0,0,0,0,20230728_Marc_ERC SOC S1_Males_box ab_03_01_1D...


In [40]:
# Keep only the part of each id that precedes the first "DLC"
# (presumably so it matches experiment_id in the conditions table — the null
# check that follows verifies the match)
recording_ids = syllable_df['id'].str.split('DLC').str[0]
syllable_df['id'] = recording_ids

# Index the conditions table by experiment id once for both lookups
conditions_by_experiment = target_values.set_index('experiment_id')

# Column indicating the learning condition according to the target values
syllable_df['learning'] = recording_ids.map(conditions_by_experiment['learning'])

# Column indicating the group according to the target values
syllable_df['group'] = recording_ids.map(conditions_by_experiment['group'])

In [44]:
# Count missing values per column of syllable_df
syllable_df.isna().sum()

0           0
1           0
2           0
3           0
4           0
5           0
6           0
7           0
8           0
9           0
10          0
11          0
12          0
13          0
14          0
15          0
16          0
17          0
18          0
19          0
20          0
21          0
22          0
23          0
id          0
learning    0
group       0
dtype: int64

In [45]:
# Save the combined dataframe to CSV.
# The destination is an absolute path, so it is not joined onto `root`:
# pathlib discards the left-hand side when the right-hand side is absolute.
# NOTE(review): machine-specific location — adjust before running elsewhere.
syllable_df.to_csv(Path('/Users/mcanela/Desktop/Behavior paper review/moseq_abundance_mean.csv'), index=False)