# Track Popularity - Feature Selection

Choosing the most effective variables  

In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [27]:
# Load the merged dataset that serves as input to feature selection.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
df = pd.read_pickle(filepath_or_buffer='df_merged_final.pkl')

# Bare last expression: the notebook renders a (truncated) preview of the frame
df


Unnamed: 0,track_popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,key,mode,feat,remix,you,me,i,release_year,release_month,duration_s
0,41,0.682,0.401,-10.068,0.023600,0.279000,0.011700,0.0887,0.566,97.091,...,2,1,0,0,0,0,0,2001,1,235.440
1,15,0.582,0.704,-6.242,0.034700,0.065100,0.000000,0.2120,0.698,150.863,...,5,1,0,0,0,0,0,2018,1,197.286
2,28,0.303,0.880,-4.739,0.044200,0.011700,0.009940,0.3470,0.404,135.225,...,9,1,0,0,0,0,1,2017,11,373.512
3,24,0.659,0.794,-5.644,0.054000,0.000761,0.132000,0.3220,0.852,128.041,...,10,0,0,0,0,0,0,2015,8,228.565
4,38,0.662,0.838,-6.300,0.049900,0.114000,0.000697,0.0881,0.496,129.884,...,1,1,0,0,0,0,0,2018,11,236.308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28351,72,0.744,0.715,-6.103,0.095543,0.101000,0.000000,0.0919,0.340,154.962,...,0,1,0,0,0,0,0,2017,2,179.773
28352,36,0.832,0.666,-4.920,0.063300,0.143000,0.000000,0.0720,0.810,109.536,...,1,0,0,0,0,0,0,2010,10,223.890
28353,49,0.963,0.603,-6.224,0.180000,0.067300,0.000006,0.2140,0.647,129.990,...,2,1,0,0,0,0,0,2014,12,260.240
28354,40,0.458,0.540,-6.457,0.027000,0.237360,0.000428,0.1150,0.657,142.218,...,5,0,0,0,0,0,0,2013,7,191.205


In [28]:
# Print the column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28356 entries, 0 to 28355
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   track_popularity       28356 non-null  int64  
 1   danceability           28356 non-null  float64
 2   energy                 28356 non-null  float64
 3   loudness               28356 non-null  float64
 4   speechiness            28356 non-null  float64
 5   acousticness           28356 non-null  float64
 6   instrumentalness       28356 non-null  float64
 7   liveness               28356 non-null  float64
 8   valence                28356 non-null  float64
 9   tempo                  28356 non-null  float64
 10  unique_playlist_count  28356 non-null  float64
 11  playlist_genre_edm     28356 non-null  int64  
 12  playlist_genre_latin   28356 non-null  int64  
 13  playlist_genre_pop     28356 non-null  int64  
 14  playlist_genre_r&b     28356 non-null  int64  
 15  pl

In [29]:
# Separate the predictors (X) from the regression target (y):
# X is every column except track_popularity, y is track_popularity itself.
X = df.drop('track_popularity', axis=1)
y = df['track_popularity']

# Hyperparameters

## Multivariable Analysis

In [32]:
# scikit-learn estimators and helpers for the model-based feature-selection comparison.
# NOTE(review): Pipeline, SelectFromModel and LinearSVR are not used in the cells
# visible in this notebook excerpt — confirm whether they are still needed.
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor # regressors, because the target (track_popularity) is numeric rather than categorical
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVR
from sklearn.linear_model import Ridge

### Summarization and Selection of Variables 

The Lasso penalty here is controlled by alpha = 5.
This penalty forces some coefficients to shrink to exactly zero, effectively performing feature selection.
A higher penalty (larger alpha) results in more coefficients being zeroed out, and a lower penalty (smaller alpha) retains more features. With alpha = 5 the penalty is strong: Lasso drops every feature visible in the output below.

In [35]:
# Fit four models on the full feature matrix and record, per feature, whether
# each model "selects" it (1) or not (0).
#
# A feature counts as selected when its coefficient (linear models) or its
# feature importance (tree ensembles) is non-zero.
# NOTE(review): only the L1 penalty of Lasso is designed to produce exact zeros;
# Ridge coefficients and tree importances are rarely exactly 0, so the `> 0`
# test may barely discriminate for those three models — consider a threshold.
# NOTE(review): the models are fitted on all rows, before any train/dev/test
# split is made — confirm this is intended.

# Lasso (L1 penalty, alpha=5)
lasso = Lasso(alpha=5).fit(X, y)
lasso_selected = (np.abs(lasso.coef_) > 0).astype(int)

# Ridge (L2 penalty, alpha=5)
ridge = Ridge(alpha=5).fit(X, y)
ridge_selected = (np.abs(ridge.coef_) > 0).astype(int)

# The tree ensembles are stochastic (bootstrap samples / random feature subsets).
# A fixed random_state makes the selection table reproducible on
# "Restart & Run All"; 42 is the same seed the notebook uses for its data splits.
gb = GradientBoostingRegressor(random_state=42).fit(X, y)
gb_selected = (gb.feature_importances_ > 0).astype(int)

rf = RandomForestRegressor(random_state=42).fit(X, y)
rf_selected = (rf.feature_importances_ > 0).astype(int)

# One row per feature, one 0/1 column per model
selection_df = pd.DataFrame({
    'Feature': X.columns,
    'Lasso': lasso_selected,
    'GradientBoost': gb_selected,
    'RandomForest': rf_selected,
    'Ridge': ridge_selected
})

# Number of models (0-4) that selected each feature
selection_df['Sum'] = selection_df[['Lasso', 'GradientBoost', 'RandomForest', 'Ridge']].sum(axis=1)

# Output the results
print(selection_df)

                  Feature  Lasso  GradientBoost  RandomForest  Ridge  Sum
0            danceability      0              1             1      1    3
1                  energy      0              1             1      1    3
2                loudness      0              1             1      1    3
3             speechiness      0              1             1      1    3
4            acousticness      0              1             1      1    3
5        instrumentalness      0              1             1      1    3
6                liveness      0              1             1      1    3
7                 valence      0              1             1      1    3
8                   tempo      0              1             1      1    3
9   unique_playlist_count      0              0             0      0    0
10     playlist_genre_edm      0              1             1      1    3
11   playlist_genre_latin      0              1             1      1    3
12     playlist_genre_pop      0      

# Creating DataFrame with most valuable variables 

 Selected variables - recommended by 3 or more models 

In [38]:
 #Selecting variables with a sum of selections >= 3 
# Boolean mask over selection_df: True where at least three models voted for the feature
enough_votes = selection_df['Sum'] >= 3
final_var = selection_df.loc[enough_votes, 'Feature'].tolist()

# Modelling frame: the retained features followed by the target column
df_model = df[final_var + ['track_popularity']].copy()

# Print the resulting schema as a sanity check
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28356 entries, 0 to 28355
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   danceability          28356 non-null  float64
 1   energy                28356 non-null  float64
 2   loudness              28356 non-null  float64
 3   speechiness           28356 non-null  float64
 4   acousticness          28356 non-null  float64
 5   instrumentalness      28356 non-null  float64
 6   liveness              28356 non-null  float64
 7   valence               28356 non-null  float64
 8   tempo                 28356 non-null  float64
 9   playlist_genre_edm    28356 non-null  int64  
 10  playlist_genre_latin  28356 non-null  int64  
 11  playlist_genre_pop    28356 non-null  int64  
 12  playlist_genre_r&b    28356 non-null  int64  
 13  playlist_genre_rap    28356 non-null  int64  
 14  playlist_genre_rock   28356 non-null  int64  
 15  key                

In [39]:
# Export the modelling frame as CSV. The row index is written as the first
# (unnamed) column, which is pandas' default made explicit here.
df_model.to_csv('df_model.csv', index=True)

## Setting proportions for Train, Test, and Dev ('dev' is the same as 'validation')

In [41]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors
y = df_model['track_popularity']
X = df_model.drop(columns=['track_popularity'])

# Denominator for the percentage report below
total_samples = len(df_model)

# Stage 1: keep 70% of the rows for training, hold out the remaining 30%
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Stage 2: cut the hold-out in half -> development (validation) and test sets
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Share of all rows that landed in each partition, in percent
train_percent, dev_percent, test_percent = (
    (len(part) / total_samples) * 100 for part in (X_train, X_dev, X_test)
)

# Report the partition sizes
print(f"Training set size: {train_percent:.2f}%")
print(f"Validation (Dev) set size: {dev_percent:.2f}%")
print(f"Testing set size: {test_percent:.2f}%")

Training set size: 70.00%
Validation (Dev) set size: 15.00%
Testing set size: 15.00%


In [42]:
# Persist the modelling frame as a pickle file
df_model.to_pickle(path="df_model.pkl")