<a href="https://colab.research.google.com/github/migub/recommender-systems/blob/main/Notebooks/02_Feature_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
## 1. Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
from google.colab import drive
drive.mount('/content/drive')
# Load preprocessed dataset
df = pd.read_csv('/content/drive/MyDrive/Recommender_Systems/train_preprocessed.csv')

# Display basic info
df.info()
display(df.head())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7086214 entries, 0 to 7086213
Data columns (total 16 columns):
 #   Column            Dtype  
---  ------            -----  
 0   genre_id          int64  
 1   media_id          int64  
 2   album_id          int64  
 3   context_type      int64  
 4   platform_name     int64  
 5   platform_family   int64  
 6   media_duration    float64
 7   listen_type       int64  
 8   user_gender       int64  
 9   user_id           int64  
 10  artist_id         int64  
 11  user_age          float64
 12  is_listened       bool   
 13  listen_hour       int64  
 14  listen_dayofweek  int64  
 15  song_age          float64
dtypes: bool(1), float64(3), int64(12)
memory usage: 817.7 MB


Unnamed: 0,genre_id,media_id,album_id,context_type,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,is_listened,listen_hour,listen_dayofweek,song_age
0,25471,222606,41774,12,1,0,0.502326,0,0,9241,55164,0.916667,False,13,3,4533.0
1,25571,250467,43941,0,2,1,0.260465,0,0,16547,55830,1.0,True,22,2,3927.0
2,16,305197,48078,1,2,1,0.15814,1,1,7665,2704,0.916667,True,13,5,859.0
3,7,900502,71521,0,0,0,0.581395,0,1,1580,938,1.0,False,9,5,5871.0
4,7,542335,71718,0,0,0,0.162791,0,1,1812,2939,0.5,True,18,5,3186.0


In [6]:
## 2. Encode Only Relevant Categorical Features
categorical_cols = ['genre_id', 'context_type', 'platform_name', 'platform_family', 'listen_type', 'user_gender']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for future use


In [7]:
## 3. Generate User & Media Statistics
# User-based features
user_features = df.groupby("user_id").agg({
    "is_listened": ["mean", "count"],  # User listen rate & count
    "media_duration": "mean",  # Average duration of listened songs
    "listen_hour": "median"  # Most common listening hour
}).reset_index()
user_features.columns = ["user_id", "user_listen_rate", "user_listen_count", "user_avg_duration", "user_median_listen_hour"]

# Media-based features
media_features = df.groupby("media_id").agg({
    "is_listened": ["mean", "count"],
    "song_age": "mean"
}).reset_index()
media_features.columns = ["media_id", "media_listen_rate", "media_listen_count", "media_avg_song_age"]

# Merge the new features
df = df.merge(user_features, on="user_id", how="left")
df = df.merge(media_features, on="media_id", how="left")

# Drop raw ID columns since they are now encoded into useful stats
df.drop(columns=['user_id', 'media_id', 'album_id', 'artist_id'], inplace=True)

display(df.head())

Unnamed: 0,genre_id,context_type,platform_name,platform_family,media_duration,listen_type,user_gender,user_age,is_listened,listen_hour,listen_dayofweek,song_age,user_listen_rate,user_listen_count,user_avg_duration,user_median_listen_hour,media_listen_rate,media_listen_count,media_avg_song_age
0,1276,12,1,0,0.502326,0,0,0.916667,False,13,3,4533.0,0.682927,205,0.516687,12.0,0.0,1,4533.0
1,1284,0,2,1,0.260465,0,0,1.0,True,22,2,3927.0,0.571429,35,0.54299,22.0,1.0,1,3927.0
2,14,1,2,1,0.15814,1,1,0.916667,True,13,5,859.0,0.98951,286,0.527452,17.0,1.0,1,859.0
3,6,0,0,0,0.581395,0,1,1.0,False,9,5,5871.0,0.588899,1063,0.475486,12.0,0.0,1,5871.0
4,6,0,0,0,0.162791,0,1,0.5,True,18,5,3186.0,0.932131,943,0.488698,15.0,0.866667,30,3197.9


In [None]:
## 4. Feature Importance using Random Forest
X = df.drop(columns=['is_listened'])  # Features
y = df['is_listened']  # Target variable

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Get feature importance
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': rf.feature_importances_})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(10,5))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title("Feature Importance (Random Forest)")
plt.show()

# Drop features with very low importance (threshold < 0.005)
low_importance_features = feature_importance[feature_importance['Importance'] < 0.005]['Feature'].tolist()
df.drop(columns=low_importance_features, inplace=True)

print(f"Dropped low importance features: {low_importance_features}")

In [4]:
## 5. Recursive Feature Elimination (RFE)
logreg = LogisticRegression(max_iter=500)
rfe = RFE(logreg, n_features_to_select=10)  # Select top 10 features
rfe.fit(X, y)

# Keep only selected features
selected_features = X.columns[rfe.support_]
df = df[selected_features + ['is_listened']]  # Keep target variable

print(f"Selected Features after RFE: {selected_features}")


NameError: name 'LogisticRegression' is not defined

In [None]:
## 6. Check Multicollinearity (Variance Inflation Factor - VIF)
# Compute VIF for each feature
X = df.drop(columns=['is_listened'])
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Drop features with VIF > 10 (high multicollinearity)
high_vif_features = vif_data[vif_data["VIF"] > 10]["Feature"].tolist()
df.drop(columns=high_vif_features, inplace=True)

print(f"Dropped high multicollinearity features: {high_vif_features}")

In [None]:
## 7. Save Selected Features
df.to_csv('/content/drive/MyDrive/Recommender_Systems/train_selected_features.csv', index=False)

print("Feature Selection Complete. Saved dataset with selected features.")