# This notebook applies PCA to audio feature selection

In [None]:
import pickle
import os
import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.api import OLS
import statsmodels.api as sm
import statsmodels.stats as sts
from scipy import stats
import statsmodels.stats.api as sms
import re

import datetime
# Seed NumPy's global random generator so random draws repeat between runs.
np.random.seed(0)
# Current local date rendered as a "YYYY.MM.DD" string.
date = datetime.datetime.now().strftime("%Y.%m.%d")

def stdz(series: pd.Series):
    """Z-score a Series: subtract its mean, then divide by its standard deviation.

    Uses ``Series.mean()`` and ``Series.std()`` (pandas' default sample
    standard deviation). Returns a new Series; the input is not modified.
    """
    center = series.mean()
    spread = series.std()
    centered = series - center
    return centered / spread
def unitstdz(series:pd.Series):
    """Min-max scale a Series so its minimum maps to 0 and its maximum to 1.

    Returns a new Series; the input is not modified.
    """
    lowest = series.min()
    value_range = series.max() - lowest
    return (series - lowest) / value_range

from imblearn.under_sampling import RandomUnderSampler


# read data

In [2]:
# Load the 'video-based' sheet of the workbook into the `rating` frame.
rating = pd.read_excel('eyetracking-coordinates-imname.xlsx', sheet_name='video-based')

In [3]:
rating.columns

Index(['VideoNumber', 'GroupN(1=low(1-3),2=neutral(4-6),3=high(7-10))',
       'PrefereneMean', 'PurchaseDesireMean', 'NarrativeInterityMean',
       'EmpathyMean', 'TotalTime(S)', 'ShotNumber', 'ShotMean',
       'Shotvariance'],
      dtype='object')

In [4]:
# Remove the 'Shotvariance' column from the ratings.
# The frame is rebound rather than mutated with inplace=True, and
# errors='ignore' is passed, so executing this cell a second time is a no-op
# instead of raising KeyError for the already-removed column.
rating = rating.drop(columns='Shotvariance', errors='ignore')

In [5]:
rating.head()

Unnamed: 0,VideoNumber,"GroupN(1=low(1-3),2=neutral(4-6),3=high(7-10))",PrefereneMean,PurchaseDesireMean,NarrativeInterityMean,EmpathyMean,TotalTime(S),ShotNumber,ShotMean
0,1,2,5.957447,5.553191,5.085106,5.255319,33,8,4.125
1,2,2,5.0,4.0,5.0,6.333333,84,9,9.333333
2,3,1,3.0,1.777778,5.555556,2.777778,31,10,3.1
3,4,3,7.111111,6.777778,4.666667,6.777778,34,17,2.0
4,5,1,3.666667,2.333333,5.833333,2.833333,32,12,2.666667


In [6]:
def split_ad_id(ad_id):
    """Return the part of ``ad_id`` that precedes its leading delimiter.

    Only the first five characters are inspected for a delimiter. '+' is
    tried before '-'; when one is found there, the text before the first
    occurrence of that delimiter is returned. If neither appears in the
    first five characters, ``ad_id`` is returned unchanged.
    """
    head = ad_id[0:5]
    for delimiter in ('+', '-'):
        if delimiter in head:
            return ad_id.split(delimiter)[0]
    return ad_id

In [7]:
def lowercase_columns(df):
    """Lower-case every column label of ``df`` in place and return the same frame."""
    lowered_labels = df.columns.str.lower()
    df.columns = lowered_labels
    return df

# Read the audio feature table and convert its column names to lowercase.
features = pd.read_csv('audio_features.csv')
features = lowercase_columns(features)
# Derive the video identifier from the prefix of each filename
# (the text before the leading '+' or '-', see split_ad_id). The column is
# added after lower-casing, so it keeps its mixed-case name 'VideoNumber'.
features['VideoNumber'] = features['filename'].apply(split_ad_id)

In [8]:
features.columns

Index(['filename', 'duration', 'rms_mean', 'rms_std', 'rms_max', 'zcr_mean',
       'zcr_std', 'spectral_centroid_mean', 'spectral_centroid_std',
       'spectral_bandwidth_mean', 'spectral_bandwidth_std', 'pitch_mean',
       'pitch_std', 'mfcc_1_mean', 'mfcc_1_std', 'mfcc_2_mean', 'mfcc_2_std',
       'mfcc_3_mean', 'mfcc_3_std', 'mfcc_4_mean', 'mfcc_4_std', 'mfcc_5_mean',
       'mfcc_5_std', 'mfcc_6_mean', 'mfcc_6_std', 'mfcc_7_mean', 'mfcc_7_std',
       'mfcc_8_mean', 'mfcc_8_std', 'mfcc_9_mean', 'mfcc_9_std',
       'mfcc_10_mean', 'mfcc_10_std', 'mfcc_11_mean', 'mfcc_11_std',
       'mfcc_12_mean', 'mfcc_12_std', 'mfcc_13_mean', 'mfcc_13_std',
       'chroma_1_mean', 'chroma_1_std', 'chroma_2_mean', 'chroma_2_std',
       'chroma_3_mean', 'chroma_3_std', 'chroma_4_mean', 'chroma_4_std',
       'chroma_5_mean', 'chroma_5_std', 'chroma_6_mean', 'chroma_6_std',
       'chroma_7_mean', 'chroma_7_std', 'chroma_8_mean', 'chroma_8_std',
       'chroma_9_mean', 'chroma_9_std', 'chroma_

In [9]:
features.tail()

Unnamed: 0,filename,duration,rms_mean,rms_std,rms_max,zcr_mean,zcr_std,spectral_centroid_mean,spectral_centroid_std,spectral_bandwidth_mean,...,chroma_9_std,chroma_10_mean,chroma_10_std,chroma_11_mean,chroma_11_std,chroma_12_mean,chroma_12_std,mel_spectrogram_mean,mel_spectrogram_std,VideoNumber
106,95+The Promise of Similac (1).mp3,30.09,0.034744,0.020984,0.099656,0.057304,0.079066,2281.77672,1885.872308,2608.298671,...,0.29185,0.353905,0.304546,0.423634,0.346395,0.348586,0.283656,0.295351,2.807308,95
107,96+The Promise of Similac.mp3,15.07,0.033668,0.021797,0.098135,0.054048,0.079519,2098.451675,1915.33355,2465.571943,...,0.261239,0.321404,0.262159,0.454804,0.355037,0.380009,0.291834,0.287847,2.636283,96
108,97+The Wonder of Baby SMA® PRO Follow on Milk...,30.07,0.041637,0.02544,0.116514,0.059701,0.069674,2458.209766,1609.758632,2949.133262,...,0.290666,0.51718,0.306825,0.58972,0.308117,0.502666,0.266747,0.443862,3.29879,97
109,98+Unibale Baby formula- Babyactiv8.mp3,30.07,0.035107,0.016796,0.090101,0.062981,0.06325,3083.946332,1510.506873,3439.121626,...,0.285418,0.474783,0.308618,0.456359,0.284647,0.527332,0.329419,0.280122,2.320226,98
110,99+VINTAGE 1958 PET EVAPORATED MILK COMMERCIAL...,58.45,0.048806,0.03088,0.157743,0.033685,0.021835,1380.853342,525.336408,1634.023818,...,0.275557,0.49071,0.319638,0.545004,0.312044,0.478621,0.285852,0.608709,4.155523,99


In [10]:
# split_ad_id returns strings; cast the identifier to a 64-bit integer
# (presumably so it matches the dtype of rating['VideoNumber'] for the merge — confirm).
features['VideoNumber'] = features['VideoNumber'].astype(np.int64)

In [12]:
# Inner join on VideoNumber: only videos present in both the audio features
# and the ratings are kept.
data = features.merge(rating, on='VideoNumber', how='inner')

In [13]:
# Build ad_id from the audio filename by replacing the literal text '.mp3'
# with '.mp4'. The vectorised .str accessor is used instead of a per-row
# lambda; regex=False keeps the match literal (the '.' is not a wildcard),
# and missing filenames stay NaN rather than raising AttributeError.
data['ad_id'] = data['filename'].str.replace('.mp3', '.mp4', regex=False)

In [17]:
# Print the name of every column that contains at least one missing value.
has_missing = data.isna().any()
for col_name in has_missing.index[has_missing]:
    print(col_name)

In [18]:
# Treat +/-inf as missing, then discard every row that has any missing value.
data = (
    data
    .replace([np.inf, -np.inf], np.nan)
    .dropna(how='any')
)

In [21]:
# Names of the audio feature columns fed to the PCA, in column order:
# 11 energy / spectral / pitch statistics, then the mean and std of
# mfcc_1..mfcc_13 and chroma_1..chroma_12, then the two mel-spectrogram
# statistics.
# NOTE: this rebinds `features` from the feature DataFrame to a list of names.
features = (
    [
        'rms_mean', 'rms_std', 'rms_max',
        'zcr_mean', 'zcr_std',
        'spectral_centroid_mean', 'spectral_centroid_std',
        'spectral_bandwidth_mean', 'spectral_bandwidth_std',
        'pitch_mean', 'pitch_std',
    ]
    + [f'mfcc_{k}_{stat}' for k in range(1, 14) for stat in ('mean', 'std')]
    + [f'chroma_{k}_{stat}' for k in range(1, 13) for stat in ('mean', 'std')]
    + ['mel_spectrogram_mean', 'mel_spectrogram_std']
)

In [22]:
len(features)

63

# dispersion

In [23]:
data.head()

Unnamed: 0,filename,duration,rms_mean,rms_std,rms_max,zcr_mean,zcr_std,spectral_centroid_mean,spectral_centroid_std,spectral_bandwidth_mean,...,VideoNumber,"GroupN(1=low(1-3),2=neutral(4-6),3=high(7-10))",PrefereneMean,PurchaseDesireMean,NarrativeInterityMean,EmpathyMean,TotalTime(S),ShotNumber,ShotMean,ad_id
0,1+90 Years Crafting.mp3,32.299977,0.067268,0.042547,0.186023,0.06758,0.083763,3061.162958,2014.217722,3452.305987,...,1,2,5.957447,5.553191,5.085106,5.255319,33,8,4.125,1+90 Years Crafting.mp4
1,10+Aptamil Advert 2021.mp3,30.14,0.074573,0.049418,0.2598,0.065928,0.095349,2523.259214,1908.065955,3028.894412,...,10,2,4.166667,4.0,5.666667,3.333333,31,14,2.214286,10+Aptamil Advert 2021.mp4
2,100+Vintage Carnation infant formula commercia...,68.43,0.050167,0.053738,0.295545,0.04134,0.041643,1852.533373,1134.412981,2447.992161,...,100,1,3.333333,3.0,6.666667,2.333333,69,26,2.653846,100+Vintage Carnation infant formula commercia...
3,101+Welcome To Kendamil.mp3,60.3,0.063147,0.049256,0.265595,0.044595,0.055455,2108.228601,1422.951136,2701.332497,...,101,2,4.176471,3.588235,5.166667,3.388889,61,14,4.357143,101+Welcome To Kendamil.mp4
4,102+Welcome to the Kendamil family!.mp3,60.91,0.065595,0.049353,0.252705,0.045361,0.058613,2140.927916,1467.060556,2743.072238,...,102,2,4.285714,3.857143,5.0,2.857143,61,19,3.210526,102+Welcome to the Kendamil family!.mp4


In [25]:
def extract_video_number(filename):
    """Return the leading run of digits of ``filename`` as a string.

    The digits must sit at the very start of the name and be followed
    immediately by '+' or '-'. Returns None when the name does not start
    that way.
    """
    found = re.match(r'(\d+)[+-]', filename)
    if found is None:
        return None
    return found.group(1)

In [26]:
# Load the dispersion measures; lines=True reads the file as one JSON record per line.
dispersion1 = pd.read_json('dispersion_measure0629.json', lines=True)

In [30]:
dispersion1.columns

Index(['ad_id', 'id', 'std_x', 'std_y', 'combined_std', 'convex_hull_area',
       'convex_hull_area_shapely', 'mean_euclidean_distance', 'VideoNumber'],
      dtype='object')

In [33]:
# Parse the numeric video id out of each dispersion record's ad_id.
dispersion1['VideoNumber'] = dispersion1['ad_id'].apply(extract_video_number)

# Give the join key the same integer dtype on both sides.
data['VideoNumber'] = data['VideoNumber'].astype(int)
dispersion1['VideoNumber'] = dispersion1['VideoNumber'].astype(int)

# Attach the dispersion measures to the video rows. A video row is repeated
# once for every dispersion record that shares its VideoNumber.
dispersion_cols = [
    'id',
    'std_x',
    'std_y',
    'combined_std',
    'convex_hull_area',
    'convex_hull_area_shapely',
    'mean_euclidean_distance',
    'VideoNumber',
]
data = data.merge(dispersion1[dispersion_cols], on='VideoNumber', how='inner')

In [34]:
#data = data.merge(dispersion[['ad_id','mean_distance','std_distance']], on='ad_id')

In [35]:
data.head(10)

Unnamed: 0,filename,duration,rms_mean,rms_std,rms_max,zcr_mean,zcr_std,spectral_centroid_mean,spectral_centroid_std,spectral_bandwidth_mean,...,ShotNumber,ShotMean,ad_id,id,std_x,std_y,combined_std,convex_hull_area,convex_hull_area_shapely,mean_euclidean_distance
0,1+90 Years Crafting.mp3,32.299977,0.067268,0.042547,0.186023,0.06758,0.083763,3061.162958,2014.217722,3452.305987,...,8,4.125,1+90 Years Crafting.mp4,R_2qFdJq0jnOo6gpt,0.049059,0.132776,0.14155,0.067034,0.067034,0.11414
1,1+90 Years Crafting.mp3,32.299977,0.067268,0.042547,0.186023,0.06758,0.083763,3061.162958,2014.217722,3452.305987,...,8,4.125,1+90 Years Crafting.mp4,R_5iNTkLqxP6ZbD7H,0.071655,0.104923,0.127056,0.06368,0.06368,0.112329
2,1+90 Years Crafting.mp3,32.299977,0.067268,0.042547,0.186023,0.06758,0.083763,3061.162958,2014.217722,3452.305987,...,8,4.125,1+90 Years Crafting.mp4,R_x5FWpiRdYvqrumZ,0.069265,0.068579,0.097471,0.040371,0.040371,0.084495
3,1+90 Years Crafting.mp3,32.299977,0.067268,0.042547,0.186023,0.06758,0.083763,3061.162958,2014.217722,3452.305987,...,8,4.125,1+90 Years Crafting.mp4,R_2WDsrmAlPtoO0HZ,0.085395,0.055002,0.101575,0.0516,0.0516,0.083171
4,1+90 Years Crafting.mp3,32.299977,0.067268,0.042547,0.186023,0.06758,0.083763,3061.162958,2014.217722,3452.305987,...,8,4.125,1+90 Years Crafting.mp4,R_2DTAqyIUDqXnKfP,0.06929,0.08772,0.111785,0.053018,0.053018,0.097591
5,1+90 Years Crafting.mp3,32.299977,0.067268,0.042547,0.186023,0.06758,0.083763,3061.162958,2014.217722,3452.305987,...,8,4.125,1+90 Years Crafting.mp4,R_1Ic2grCw4d162ym,0.055206,0.09304,0.108186,0.040562,0.040562,0.094437
6,1+90 Years Crafting.mp3,32.299977,0.067268,0.042547,0.186023,0.06758,0.083763,3061.162958,2014.217722,3452.305987,...,8,4.125,1+90 Years Crafting.mp4,R_ysdra57Iv3uoFm9,0.106843,0.143136,0.178615,0.13356,0.13356,0.153353
7,1+90 Years Crafting.mp3,32.299977,0.067268,0.042547,0.186023,0.06758,0.083763,3061.162958,2014.217722,3452.305987,...,8,4.125,1+90 Years Crafting.mp4,R_pcjQCK9dAmqufrX,0.055028,0.104855,0.118417,0.063625,0.063625,0.099924
8,1+90 Years Crafting.mp3,32.299977,0.067268,0.042547,0.186023,0.06758,0.083763,3061.162958,2014.217722,3452.305987,...,8,4.125,1+90 Years Crafting.mp4,R_2V3B9AvjeLjviju,0.113406,0.120954,0.165804,0.130167,0.130167,0.14275
9,10+Aptamil Advert 2021.mp3,30.14,0.074573,0.049418,0.2598,0.065928,0.095349,2523.259214,1908.065955,3028.894412,...,14,2.214286,10+Aptamil Advert 2021.mp4,R_2cjymW7g7GFnOaL,0.08405,0.083635,0.118571,0.082235,0.082235,0.101962


In [36]:
len(data)

746

In [37]:
# Discard any row that has a missing value after the dispersion merge.
data = data.dropna()

In [38]:
len(data)

746

# PCA

In [39]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Target: convex hull area of each dispersion record.
y = data['convex_hull_area']
# Predictors: the audio feature columns listed in `features`.
# NOTE(review): after the dispersion merge `data` repeats each video's audio
# features once per dispersion record, so videos with more records weigh more
# in the scaling and in the PCA below — confirm this is intended.
X = data.loc[:, features]

# Standardize the features (zero mean, unit variance per column).
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [40]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV


In [41]:
# Apply PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Get explained variance ratios
explained_var_ratio = pca.explained_variance_ratio_

# Determine number of components to keep
total_var = 0
n_components = 0
for i, ratio in enumerate(explained_var_ratio):
    total_var += ratio
    if total_var >= 0.975:  # Adjust the threshold as needed
        n_components = i + 1
        break

print(f"Number of principal components selected: {n_components}")

# Apply PCA transformation with selected components
pca_final = PCA(n_components=n_components)
X_selected = pca_final.fit_transform(X_scaled)

# Get selected features (if needed, you can interpret the components to understand which original features contribute the most)
selected_features = X.columns[pca_final.components_.mean(axis=0).argsort()[::-1][:n_components]]
print("Selected features by PCA:", selected_features)

Number of principal components selected: 33
Selected features by PCA: Index(['mfcc_11_mean', 'mel_spectrogram_mean', 'mel_spectrogram_std',
       'pitch_mean', 'mfcc_13_mean', 'mfcc_6_std', 'spectral_bandwidth_std',
       'pitch_std', 'chroma_9_std', 'mfcc_4_mean', 'mfcc_9_mean', 'rms_max',
       'rms_mean', 'chroma_10_mean', 'zcr_mean', 'mfcc_12_mean', 'mfcc_1_std',
       'chroma_9_mean', 'spectral_bandwidth_mean', 'chroma_7_std',
       'spectral_centroid_mean', 'mfcc_10_mean', 'mfcc_2_std', 'mfcc_3_mean',
       'chroma_4_mean', 'chroma_5_mean', 'rms_std', 'mfcc_7_mean',
       'mfcc_3_std', 'chroma_11_mean', 'mfcc_12_std', 'mfcc_8_std',
       'mfcc_7_std'],
      dtype='object')


In [42]:
# Alphabetically sorted copy of the selected feature names, shown as the cell output.
selected_names = selected_features.to_numpy()
sorted_values = np.sort(selected_names)
sorted_values

array(['chroma_10_mean', 'chroma_11_mean', 'chroma_4_mean',
       'chroma_5_mean', 'chroma_7_std', 'chroma_9_mean', 'chroma_9_std',
       'mel_spectrogram_mean', 'mel_spectrogram_std', 'mfcc_10_mean',
       'mfcc_11_mean', 'mfcc_12_mean', 'mfcc_12_std', 'mfcc_13_mean',
       'mfcc_1_std', 'mfcc_2_std', 'mfcc_3_mean', 'mfcc_3_std',
       'mfcc_4_mean', 'mfcc_6_std', 'mfcc_7_mean', 'mfcc_7_std',
       'mfcc_8_std', 'mfcc_9_mean', 'pitch_mean', 'pitch_std', 'rms_max',
       'rms_mean', 'rms_std', 'spectral_bandwidth_mean',
       'spectral_bandwidth_std', 'spectral_centroid_mean', 'zcr_mean'],
      dtype=object)

In [50]:
len(sorted_values)

33

In [47]:
# Copy the rating sheet's group column (1=low, 2=neutral, 3=high, per its
# header) into a column with the shorter name 'preference'.
data['preference'] = data['GroupN(1=low(1-3),2=neutral(4-6),3=high(7-10))']

In [48]:
# Bin PurchaseDesireMean into three ordinal levels using right-closed
# intervals: (0, 3] -> 1, (3, 6] -> 2, (6, 10] -> 3. A value matching none of
# the intervals (outside (0, 10], or NaN) receives np.select's default of 0.
purchase_desire = data['PurchaseDesireMean']
bin_bounds = [(0, 3), (3, 6), (6, 10)]
conditions = [
    (lower < purchase_desire) & (purchase_desire <= upper)
    for lower, upper in bin_bounds
]

choices = [1, 2, 3]

data['purchase'] = np.select(conditions, choices)

In [44]:
data.columns

Index(['filename', 'duration', 'rms_mean', 'rms_std', 'rms_max', 'zcr_mean',
       'zcr_std', 'spectral_centroid_mean', 'spectral_centroid_std',
       'spectral_bandwidth_mean', 'spectral_bandwidth_std', 'pitch_mean',
       'pitch_std', 'mfcc_1_mean', 'mfcc_1_std', 'mfcc_2_mean', 'mfcc_2_std',
       'mfcc_3_mean', 'mfcc_3_std', 'mfcc_4_mean', 'mfcc_4_std', 'mfcc_5_mean',
       'mfcc_5_std', 'mfcc_6_mean', 'mfcc_6_std', 'mfcc_7_mean', 'mfcc_7_std',
       'mfcc_8_mean', 'mfcc_8_std', 'mfcc_9_mean', 'mfcc_9_std',
       'mfcc_10_mean', 'mfcc_10_std', 'mfcc_11_mean', 'mfcc_11_std',
       'mfcc_12_mean', 'mfcc_12_std', 'mfcc_13_mean', 'mfcc_13_std',
       'chroma_1_mean', 'chroma_1_std', 'chroma_2_mean', 'chroma_2_std',
       'chroma_3_mean', 'chroma_3_std', 'chroma_4_mean', 'chroma_4_std',
       'chroma_5_mean', 'chroma_5_std', 'chroma_6_mean', 'chroma_6_std',
       'chroma_7_mean', 'chroma_7_std', 'chroma_8_mean', 'chroma_8_std',
       'chroma_9_mean', 'chroma_9_std', 'chroma_

In [45]:
data.head()

Unnamed: 0,filename,duration,rms_mean,rms_std,rms_max,zcr_mean,zcr_std,spectral_centroid_mean,spectral_centroid_std,spectral_bandwidth_mean,...,ad_id,id,std_x,std_y,combined_std,convex_hull_area,mean_euclidean_distance,intercept,preference,purchase
0,1+90 Years Crafting.mp3,32.299977,0.067268,0.042547,0.186023,0.06758,0.083763,3061.162958,2014.217722,3452.305987,...,1+90 Years Crafting.mp4,R_2qFdJq0jnOo6gpt,0.049059,0.132776,0.14155,1.166466,0.11414,1,2,2
1,1+90 Years Crafting.mp3,32.299977,0.067268,0.042547,0.186023,0.06758,0.083763,3061.162958,2014.217722,3452.305987,...,1+90 Years Crafting.mp4,R_5iNTkLqxP6ZbD7H,0.071655,0.104923,0.127056,1.013764,0.112329,1,2,2
2,1+90 Years Crafting.mp3,32.299977,0.067268,0.042547,0.186023,0.06758,0.083763,3061.162958,2014.217722,3452.305987,...,1+90 Years Crafting.mp4,R_x5FWpiRdYvqrumZ,0.069265,0.068579,0.097471,0.889882,0.084495,1,2,2
3,1+90 Years Crafting.mp3,32.299977,0.067268,0.042547,0.186023,0.06758,0.083763,3061.162958,2014.217722,3452.305987,...,1+90 Years Crafting.mp4,R_2WDsrmAlPtoO0HZ,0.085395,0.055002,0.101575,0.960789,0.083171,1,2,2
4,1+90 Years Crafting.mp3,32.299977,0.067268,0.042547,0.186023,0.06758,0.083763,3061.162958,2014.217722,3452.305987,...,1+90 Years Crafting.mp4,R_2DTAqyIUDqXnKfP,0.06929,0.08772,0.111785,0.95193,0.097591,1,2,2


In [46]:
data.tail()

Unnamed: 0,filename,duration,rms_mean,rms_std,rms_max,zcr_mean,zcr_std,spectral_centroid_mean,spectral_centroid_std,spectral_bandwidth_mean,...,ad_id,id,std_x,std_y,combined_std,convex_hull_area,mean_euclidean_distance,intercept,preference,purchase
653,99+VINTAGE 1958 PET EVAPORATED MILK COMMERCIAL...,58.45,0.048806,0.03088,0.157743,0.033685,0.021835,1380.853342,525.336408,1634.023818,...,99+VINTAGE 1958 PET EVAPORATED MILK COMMERCIAL...,R_pcjQCK9dAmqufrX,0.061889,0.095416,0.11373,0.878797,0.104958,1,2,2
654,99+VINTAGE 1958 PET EVAPORATED MILK COMMERCIAL...,58.45,0.048806,0.03088,0.157743,0.033685,0.021835,1380.853342,525.336408,1634.023818,...,99+VINTAGE 1958 PET EVAPORATED MILK COMMERCIAL...,R_3QVndydHS3ggukh,0.072965,0.084651,0.111757,0.959388,0.101739,1,2,2
655,99+VINTAGE 1958 PET EVAPORATED MILK COMMERCIAL...,58.45,0.048806,0.03088,0.157743,0.033685,0.021835,1380.853342,525.336408,1634.023818,...,99+VINTAGE 1958 PET EVAPORATED MILK COMMERCIAL...,R_3n75JGLz1DIwqsc,0.072944,0.09115,0.116744,1.030377,0.103575,1,2,2
656,99+VINTAGE 1958 PET EVAPORATED MILK COMMERCIAL...,58.45,0.048806,0.03088,0.157743,0.033685,0.021835,1380.853342,525.336408,1634.023818,...,99+VINTAGE 1958 PET EVAPORATED MILK COMMERCIAL...,R_W3A7gkKqFyYxn5D,0.042931,0.06393,0.077007,0.576954,0.069639,1,2,2
657,99+VINTAGE 1958 PET EVAPORATED MILK COMMERCIAL...,58.45,0.048806,0.03088,0.157743,0.033685,0.021835,1380.853342,525.336408,1634.023818,...,99+VINTAGE 1958 PET EVAPORATED MILK COMMERCIAL...,R_2Xps7uqZpYygTm6,0.067836,0.06846,0.096376,0.854313,0.083729,1,2,2
