# This notebook applies PCA to visual feature selection

In [1]:
import pickle
import os
import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.api import OLS
import statsmodels.api as sm
import statsmodels.stats as sts
from scipy import stats
import statsmodels.stats.api as sms


import datetime
date = datetime.datetime.now()
date = date.strftime("%Y.%m.%d")
np.random.seed(0)

def stdz(series: pd.Series):
    """Standardize the given pandas Series"""
    return (series - series.mean())/series.std()
def unitstdz(series:pd.Series):
    return (series - series.min())/(series.max()-series.min())

from imblearn.under_sampling import RandomUnderSampler

import re
def extract_video_number(filename):
    match = re.match(r'(\d+)[+-]', filename)
    return match.group(1) if match else None

# read data

In [2]:
rating = pd.read_excel('../dataset/eyetracking-coordinates-imname.xlsx', sheet_name='video-based')

In [3]:
rating.columns

Index(['VideoNumber', 'GroupN(1=low(1-3),2=neutral(4-6),3=high(7-10))',
       'PrefereneMean', 'PurchaseDesireMean', 'NarrativeInterityMean',
       'EmpathyMean', 'TotalTime(S)', 'ShotNumber', 'ShotMean',
       'Shotvariance'],
      dtype='object')

In [4]:
rating.drop('Shotvariance',axis=1,inplace=True)

In [5]:
features = pd.read_csv('../dataset/infant_ads_visual_features.csv')

In [6]:
features.head()

Unnamed: 0,ad_id,inner_brightness_mean,inner_brightness_std,inner_sharpness_mean,inner_sharpness_std,objects_count_yolo_mean,objects_count_yolo_std,face_count_yolo_mean,face_count_yolo_std,region_size_avg_mean,...,texture_c3_contrast_std,texture_c3_correlation_mean,texture_c3_correlation_std,texture_c3_energy_mean,texture_c3_energy_std,texture_c3_homogeneity_mean,texture_c3_homogeneity_std,texture_c3_dissimilarity_mean,texture_c3_dissimilarity_std,VideoNumber
0,1+90 Years Crafting.mp4,1.484394,0.118956,43.936865,49.623876,1.823529,1.975958,0.823529,0.635934,0.082606,...,38.526797,0.997088,0.002315,0.234465,0.019319,0.582653,0.051157,2.524118,0.90194,1.0
1,10+Aptamil Advert 2021.mp4,1.392879,0.201502,50.400985,69.511979,1.564103,1.046168,1.25641,0.751068,0.091923,...,38.453309,0.991721,0.002525,0.227305,0.004294,0.623649,0.061499,3.228256,0.972189,10.0
2,100+Vintage Carnation infant formula commercia...,1.579462,0.492753,75.848081,138.295918,0.851351,0.822333,0.5,0.646381,0.171666,...,39.750976,0.998358,0.00137,0.286304,0.148167,0.689636,0.092245,1.517273,0.413373,100.0
3,101+Welcome To Kendamil.mp4,1.284288,0.116894,33.42706,43.549638,1.125,2.040456,0.625,1.078639,0.08412,...,229.951821,0.992722,0.009695,0.289033,0.104256,0.645588,0.113514,3.27195,1.74429,101.0
4,102+Welcome to the Kendamil family!.mp4,1.30504,0.127926,63.916788,98.977502,1.58,3.077834,0.46,1.053856,0.076532,...,166.883153,0.99365,0.007458,0.280196,0.093924,0.63927,0.113054,3.049412,1.484278,102.0


In [7]:
features.columns

Index(['ad_id', 'inner_brightness_mean', 'inner_brightness_std',
       'inner_sharpness_mean', 'inner_sharpness_std',
       'objects_count_yolo_mean', 'objects_count_yolo_std',
       'face_count_yolo_mean', 'face_count_yolo_std', 'region_size_avg_mean',
       'region_size_avg_std', 'region_count_mean', 'region_count_std',
       'rule_of_thirds_mean', 'rule_of_thirds_std', 'im_num_mean',
       'im_num_std', 'color_brightness_mean', 'color_brightness_std',
       'color_hue_mean', 'color_hue_std', 'color_saturation_mean',
       'color_saturation_std', 'color_brightness_contrast_mean',
       'color_brightness_contrast_std', 'color_color_diversity_mean',
       'color_color_diversity_std', 'color_clarity_mean', 'color_clarity_std',
       'color_black_mean', 'color_black_std', 'color_blue_mean',
       'color_blue_std', 'color_brown_mean', 'color_brown_std',
       'color_gray_mean', 'color_gray_std', 'color_green_mean',
       'color_green_std', 'color_orange_mean', 'color_orange_

In [8]:
data = features.merge(rating, on='VideoNumber', how='inner')

In [9]:
len(data)

111

In [10]:
data.tail()

Unnamed: 0,ad_id,inner_brightness_mean,inner_brightness_std,inner_sharpness_mean,inner_sharpness_std,objects_count_yolo_mean,objects_count_yolo_std,face_count_yolo_mean,face_count_yolo_std,region_size_avg_mean,...,texture_c3_dissimilarity_std,VideoNumber,"GroupN(1=low(1-3),2=neutral(4-6),3=high(7-10))",PrefereneMean,PurchaseDesireMean,NarrativeInterityMean,EmpathyMean,TotalTime(S),ShotNumber,ShotMean
106,95+The Promise of Similac (1).mp4,1.333013,0.208542,347.773251,1053.793986,1.282051,0.646803,1.128205,0.570295,0.084364,...,0.467679,95.0,3,7.5,6.75,4.125,7.0,31,14,2.214286
107,96+The Promise of Similac.mp4,1.286813,0.211484,46.54474,49.48652,1.733333,0.457738,1.533333,0.63994,0.08556,...,0.744364,96.0,2,7.0,6.0,4.5,7.0,16,6,2.666667
108,97+The Wonder of Baby SMA庐 PRO Follow on Milk...,1.676643,0.236861,66.527147,143.564321,0.7,0.464095,0.625,0.49029,0.169377,...,0.585037,97.0,3,7.25,6.875,4.25,7.25,31,14,2.214286
109,98+Unibale Baby formula- Babyactiv8.mp4,1.356488,0.277388,59.16913,120.130925,1.406977,1.172006,0.709302,0.82414,0.103759,...,1.182115,98.0,2,4.75,4.75,6.25,7.75,31,30,1.033333
110,99+VINTAGE 1958 PET EVAPORATED MILK COMMERCIAL...,1.93459,0.485412,0.0,0.0,1.380952,0.804748,0.857143,0.853564,0.078429,...,0.353413,99.0,2,4.5,3.666667,7.333333,5.0,59,8,7.375


In [11]:
for col in data.columns:
    if data[col].isna().any():
        print(col)

In [12]:
data = data.dropna(how='any')
data = data.replace([np.inf, -np.inf], np.nan).dropna()

In [14]:
features = ['color_brightness_mean',
       'color_brightness_std', 'color_hue_mean', 'color_hue_std',
       'color_saturation_mean', 'color_saturation_std',
       'color_brightness_contrast_mean', 'color_brightness_contrast_std',
       'color_color_diversity_mean', 'color_color_diversity_std',
       'color_clarity_mean', 'color_clarity_std', 'color_black_mean',
       'color_black_std', 'color_blue_mean', 'color_blue_std',
       'color_brown_mean', 'color_brown_std', 'color_gray_mean',
       'color_gray_std', 'color_green_mean', 'color_green_std',
       'color_orange_mean', 'color_orange_std', 'color_pink_mean',
       'color_pink_std', 'color_purple_mean', 'color_purple_std',
       'color_red_mean', 'color_red_std', 'color_white_mean',
       'color_white_std', 'color_yellow_mean', 'color_yellow_std',
       'texture_c1_contrast_mean', 'texture_c1_contrast_std',
       'texture_c1_correlation_mean', 'texture_c1_correlation_std',
       'texture_c1_energy_mean', 'texture_c1_energy_std',
       'texture_c1_homogeneity_mean', 'texture_c1_homogeneity_std',
       'texture_c1_dissimilarity_mean', 'texture_c1_dissimilarity_std',
       'texture_c2_contrast_mean', 'texture_c2_contrast_std',
       'texture_c2_correlation_mean', 'texture_c2_correlation_std',
       'texture_c2_energy_mean', 'texture_c2_energy_std',
       'texture_c2_homogeneity_mean', 'texture_c2_homogeneity_std',
       'texture_c2_dissimilarity_mean', 'texture_c2_dissimilarity_std',
       'texture_c3_contrast_mean', 'texture_c3_contrast_std',
       'texture_c3_correlation_mean', 'texture_c3_correlation_std',
       'texture_c3_energy_mean', 'texture_c3_energy_std',
       'texture_c3_homogeneity_mean', 'texture_c3_homogeneity_std',
       'texture_c3_dissimilarity_mean', 'texture_c3_dissimilarity_std',
        'inner_brightness_mean',
       'inner_brightness_std', 'inner_sharpness_mean', 'inner_sharpness_std',
       'objects_count_yolo_mean', 'objects_count_yolo_std',
       'face_count_yolo_mean', 'face_count_yolo_std', 'region_size_avg_mean',
       'region_size_avg_std', 'region_count_mean', 'region_count_std',
       'rule_of_thirds_mean', 'rule_of_thirds_std']

In [15]:
len(features)

78

# dispersion

In [16]:
data.head()

Unnamed: 0,ad_id,inner_brightness_mean,inner_brightness_std,inner_sharpness_mean,inner_sharpness_std,objects_count_yolo_mean,objects_count_yolo_std,face_count_yolo_mean,face_count_yolo_std,region_size_avg_mean,...,texture_c3_dissimilarity_std,VideoNumber,"GroupN(1=low(1-3),2=neutral(4-6),3=high(7-10))",PrefereneMean,PurchaseDesireMean,NarrativeInterityMean,EmpathyMean,TotalTime(S),ShotNumber,ShotMean
0,1+90 Years Crafting.mp4,1.484394,0.118956,43.936865,49.623876,1.823529,1.975958,0.823529,0.635934,0.082606,...,0.90194,1.0,2,5.957447,5.553191,5.085106,5.255319,33,8,4.125
1,10+Aptamil Advert 2021.mp4,1.392879,0.201502,50.400985,69.511979,1.564103,1.046168,1.25641,0.751068,0.091923,...,0.972189,10.0,2,4.166667,4.0,5.666667,3.333333,31,14,2.214286
2,100+Vintage Carnation infant formula commercia...,1.579462,0.492753,75.848081,138.295918,0.851351,0.822333,0.5,0.646381,0.171666,...,0.413373,100.0,1,3.333333,3.0,6.666667,2.333333,69,26,2.653846
3,101+Welcome To Kendamil.mp4,1.284288,0.116894,33.42706,43.549638,1.125,2.040456,0.625,1.078639,0.08412,...,1.74429,101.0,2,4.176471,3.588235,5.166667,3.388889,61,14,4.357143
4,102+Welcome to the Kendamil family!.mp4,1.30504,0.127926,63.916788,98.977502,1.58,3.077834,0.46,1.053856,0.076532,...,1.484278,102.0,2,4.285714,3.857143,5.0,2.857143,61,19,3.210526


In [17]:
dispersion1 = pd.read_json('dispersion_measure0629.json', lines=True)

In [18]:
dispersion1['VideoNumber'] = dispersion1['ad_id'].apply(extract_video_number)
data['VideoNumber'] = data['VideoNumber'].astype(int)
dispersion1['VideoNumber'] = dispersion1['VideoNumber'].astype(int)
data = data.merge(dispersion1[['id', 'std_x', 'std_y', 'combined_std', 'convex_hull_area',
       'convex_hull_area_shapely', 'mean_euclidean_distance', 'VideoNumber']], on='VideoNumber')

In [20]:
data.head(10)

Unnamed: 0,ad_id,inner_brightness_mean,inner_brightness_std,inner_sharpness_mean,inner_sharpness_std,objects_count_yolo_mean,objects_count_yolo_std,face_count_yolo_mean,face_count_yolo_std,region_size_avg_mean,...,TotalTime(S),ShotNumber,ShotMean,id,std_x,std_y,combined_std,convex_hull_area,convex_hull_area_shapely,mean_euclidean_distance
0,1+90 Years Crafting.mp4,1.484394,0.118956,43.936865,49.623876,1.823529,1.975958,0.823529,0.635934,0.082606,...,33,8,4.125,R_2qFdJq0jnOo6gpt,0.049059,0.132776,0.14155,0.067034,0.067034,0.11414
1,1+90 Years Crafting.mp4,1.484394,0.118956,43.936865,49.623876,1.823529,1.975958,0.823529,0.635934,0.082606,...,33,8,4.125,R_5iNTkLqxP6ZbD7H,0.071655,0.104923,0.127056,0.06368,0.06368,0.112329
2,1+90 Years Crafting.mp4,1.484394,0.118956,43.936865,49.623876,1.823529,1.975958,0.823529,0.635934,0.082606,...,33,8,4.125,R_x5FWpiRdYvqrumZ,0.069265,0.068579,0.097471,0.040371,0.040371,0.084495
3,1+90 Years Crafting.mp4,1.484394,0.118956,43.936865,49.623876,1.823529,1.975958,0.823529,0.635934,0.082606,...,33,8,4.125,R_2WDsrmAlPtoO0HZ,0.085395,0.055002,0.101575,0.0516,0.0516,0.083171
4,1+90 Years Crafting.mp4,1.484394,0.118956,43.936865,49.623876,1.823529,1.975958,0.823529,0.635934,0.082606,...,33,8,4.125,R_2DTAqyIUDqXnKfP,0.06929,0.08772,0.111785,0.053018,0.053018,0.097591
5,1+90 Years Crafting.mp4,1.484394,0.118956,43.936865,49.623876,1.823529,1.975958,0.823529,0.635934,0.082606,...,33,8,4.125,R_1Ic2grCw4d162ym,0.055206,0.09304,0.108186,0.040562,0.040562,0.094437
6,1+90 Years Crafting.mp4,1.484394,0.118956,43.936865,49.623876,1.823529,1.975958,0.823529,0.635934,0.082606,...,33,8,4.125,R_ysdra57Iv3uoFm9,0.106843,0.143136,0.178615,0.13356,0.13356,0.153353
7,1+90 Years Crafting.mp4,1.484394,0.118956,43.936865,49.623876,1.823529,1.975958,0.823529,0.635934,0.082606,...,33,8,4.125,R_pcjQCK9dAmqufrX,0.055028,0.104855,0.118417,0.063625,0.063625,0.099924
8,1+90 Years Crafting.mp4,1.484394,0.118956,43.936865,49.623876,1.823529,1.975958,0.823529,0.635934,0.082606,...,33,8,4.125,R_2V3B9AvjeLjviju,0.113406,0.120954,0.165804,0.130167,0.130167,0.14275
9,10+Aptamil Advert 2021.mp4,1.392879,0.201502,50.400985,69.511979,1.564103,1.046168,1.25641,0.751068,0.091923,...,31,14,2.214286,R_2cjymW7g7GFnOaL,0.08405,0.083635,0.118571,0.082235,0.082235,0.101962


In [21]:
len(data)

746

In [22]:
data = data.dropna()

In [23]:
len(data)

746

# PCA

In [24]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X = data[features]
y = data['convex_hull_area']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [25]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV


In [26]:
# Apply PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Get explained variance ratios
explained_var_ratio = pca.explained_variance_ratio_

# Determine number of components to keep
total_var = 0
n_components = 0
for i, ratio in enumerate(explained_var_ratio):
    total_var += ratio
    if total_var >= 0.975:  # Adjust the threshold as needed
        n_components = i + 1
        break

print(f"Number of principal components selected: {n_components}")

# Apply PCA transformation with selected components
pca_final = PCA(n_components=n_components)
X_selected = pca_final.fit_transform(X_scaled)

# Get selected features (if needed, you can interpret the components to understand which original features contribute the most)
selected_features = X.columns[pca_final.components_.mean(axis=0).argsort()[::-1][:n_components]]
print("Selected features by PCA:", selected_features)

Number of principal components selected: 39
Selected features by PCA: Index(['texture_c3_correlation_std', 'color_color_diversity_std',
       'color_yellow_mean', 'color_purple_mean', 'texture_c1_contrast_mean',
       'color_hue_mean', 'texture_c1_dissimilarity_mean', 'color_yellow_std',
       'region_size_avg_mean', 'color_purple_std', 'color_red_std',
       'region_size_avg_std', 'rule_of_thirds_mean', 'region_count_std',
       'color_blue_mean', 'color_black_mean', 'color_gray_mean',
       'inner_sharpness_std', 'texture_c2_contrast_std', 'color_red_mean',
       'inner_sharpness_mean', 'inner_brightness_std', 'color_green_std',
       'texture_c3_homogeneity_mean', 'texture_c1_correlation_std',
       'texture_c2_homogeneity_mean', 'color_orange_mean',
       'texture_c1_correlation_mean', 'color_black_std',
       'texture_c1_dissimilarity_std', 'color_saturation_mean',
       'texture_c1_energy_std', 'color_blue_std', 'objects_count_yolo_std',
       'color_saturation_std',

In [27]:
sorted_values = np.sort(selected_features.values)
sorted_values

array(['color_black_mean', 'color_black_std', 'color_blue_mean',
       'color_blue_std', 'color_color_diversity_std', 'color_gray_mean',
       'color_green_std', 'color_hue_mean', 'color_orange_mean',
       'color_orange_std', 'color_purple_mean', 'color_purple_std',
       'color_red_mean', 'color_red_std', 'color_saturation_mean',
       'color_saturation_std', 'color_yellow_mean', 'color_yellow_std',
       'inner_brightness_std', 'inner_sharpness_mean',
       'inner_sharpness_std', 'objects_count_yolo_std',
       'region_count_std', 'region_size_avg_mean', 'region_size_avg_std',
       'rule_of_thirds_mean', 'texture_c1_contrast_mean',
       'texture_c1_correlation_mean', 'texture_c1_correlation_std',
       'texture_c1_dissimilarity_mean', 'texture_c1_dissimilarity_std',
       'texture_c1_energy_std', 'texture_c2_contrast_std',
       'texture_c2_correlation_mean', 'texture_c2_correlation_std',
       'texture_c2_energy_mean', 'texture_c2_homogeneity_mean',
       'texture_

In [33]:
data['preference'] = data['GroupN(1=low(1-3),2=neutral(4-6),3=high(7-10))']

In [34]:
conditions = [
    (0 < data['PurchaseDesireMean']) & (data['PurchaseDesireMean'] <= 3),
    (3 < data['PurchaseDesireMean']) & (data['PurchaseDesireMean'] <= 6),
    (6 < data['PurchaseDesireMean']) & (data['PurchaseDesireMean'] <= 10)
]

choices = [1, 2, 3]

data['purchase'] = np.select(conditions, choices)