In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Read the two input tables produced by the preprocessing step: the audio
# feature table and the BERT embedding table (contents inferred from the file
# names; both are later joined on 'Participant_ID').
audio_df, text_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./preprocessing/audio_features.csv",
        "./preprocessing/bert_embeddings.csv",
    )
)


In [None]:
# Join the audio and text tables on the participant key. With no `how`
# argument this is pandas' default inner join, so only participants present
# in both tables are kept.
merged_df = audio_df.merge(text_df, on='Participant_ID')

# When both inputs carry a 'Label' column, the merge suffixes them as
# Label_x (audio) and Label_y (text). Collapse them back into one 'Label'.
duplicated_label_columns = {'Label_x', 'Label_y'}
if duplicated_label_columns.issubset(merged_df.columns):
    labels_agree = merged_df['Label_x'].eq(merged_df['Label_y']).all()
    if not labels_agree:
        # Only a warning: processing continues with the audio-side label.
        print("Warning: Label mismatch between audio and text data!")

    merged_df = (
        merged_df
        .drop(columns=['Label_y'])
        .rename(columns={'Label_x': 'Label'})
    )

print(merged_df.head())


   audio_name  audio_pcm_intensity_sma_max  audio_pcm_intensity_sma_min  \
0         NaN                     0.000017                 3.688526e-12   
1         NaN                     0.000013                 7.147003e-12   
2         NaN                     0.000013                 4.841324e-12   
3         NaN                     0.000014                 9.565283e-12   
4         NaN                     0.000476                 2.304610e-13   

   audio_pcm_intensity_sma_range  audio_pcm_intensity_sma_maxPos  \
0                       0.000017                         16222.0   
1                       0.000013                         72432.0   
2                       0.000013                         93868.0   
3                       0.000014                         32566.0   
4                       0.000476                        137333.0   

   audio_pcm_intensity_sma_minPos  audio_pcm_intensity_sma_amean  \
0                         45092.0                   1.485055e-07   
1   

In [None]:
# Split the merged table into per-modality feature matrices and the target.
#
# `filter(like='audio_')` is a substring match, so it also picks up the
# `audio_name` column. That column is NaN in the rows printed by
# `merged_df.head()` and is not an acoustic measurement, so it must not be
# handed to the feature selector. Drop it explicitly (errors='ignore' keeps
# this cell working if the column is absent), and also drop any other column
# that is entirely NaN, since such a column carries no information and can
# make estimators that reject missing values fail.
X_audio = (
    merged_df.filter(like='audio_')
    .drop(columns=['audio_name'], errors='ignore')
    .dropna(axis=1, how='all')
)

# Text features: every column whose name contains 'BERT'.
X_text = merged_df.filter(like='BERT')

# Target vector; raises KeyError if the merged table has no 'Label' column.
y = merged_df['Label']


In [6]:
# RFE for audio features
# Recursive feature elimination wraps a random forest and keeps the 30 audio
# columns it ranks highest.
print("Running RFE on audio features...")
# random_state is fixed so the forest, and therefore the set of selected
# features, is identical on every Restart & Run All. Without it each run can
# select a different subset.
audio_selector = RFE(
    estimator=RandomForestClassifier(random_state=42),
    n_features_to_select=30,
)
X_audio_selected = audio_selector.fit_transform(X_audio, y)
# support_ is a boolean mask over X_audio's columns marking the kept features.
audio_selected_columns = X_audio.columns[audio_selector.support_]


Running RFE on audio features...


In [None]:
# RFE for text features
# Recursive feature elimination wraps a random forest and keeps the 20 text
# (BERT) columns it ranks highest.
print("Running RFE on text features...")
# random_state is fixed so the forest, and therefore the set of selected
# features, is identical on every Restart & Run All. Without it each run can
# select a different subset.
text_selector = RFE(
    estimator=RandomForestClassifier(random_state=42),
    n_features_to_select=20,
)
X_text_selected = text_selector.fit_transform(X_text, y)
# support_ is a boolean mask over X_text's columns marking the kept features.
text_selected_columns = X_text.columns[text_selector.support_]


Running RFE on text features...


In [None]:
# Place the selected audio columns and the selected text columns side by side.
selected_feature_names = [*audio_selected_columns, *text_selected_columns]
X_combined_selected = np.concatenate((X_audio_selected, X_text_selected), axis=1)

# Feature columns first, with the target appended as the last column.
reduced_df = pd.DataFrame(
    X_combined_selected, columns=selected_feature_names
).assign(Label=y.values)

# Standardize every column except the trailing 'Label' column.
scaler = StandardScaler()
n_feature_columns = reduced_df.shape[1] - 1
reduced_df.iloc[:, :n_feature_columns] = scaler.fit_transform(
    reduced_df.iloc[:, :n_feature_columns]
)

# Persist the reduced, normalized table without the positional index.
output_csv = "./preprocessing/reduced_features.csv"
reduced_df.to_csv(output_csv, index=False)
print(f"Selected and normalized features saved to {output_csv}")
