## DOCUMENT PREAMBLE

To run this document in Google Colab, please upload `ebnerd_small.zip` and `small_train_users_df_expanded.parquet` to the available Files for this document.

To run this document locally, please see the instructions in `README.md`.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import itertools
from tqdm import tqdm
import os

# Configure tqdm and matplotlib
tqdm.pandas()
plt.style.use("classic")
#plt.rcParams["figure.dpi"] = 200
plt.rcParams["figure.facecolor"] = "white"
plt.rcParams["font.family"] = "serif"

In [3]:
# Load data from parquet files
def load_data(version, data_type):
    if data_type not in ["train", "validation"]:
        raise ValueError("data_type must be either 'train' or 'validation'")

    # Read parquet files into DataFrames
    behaviors_df = pd.read_parquet(
        f"./data/ebnerd_{version}/{data_type}/behaviors.parquet"
    )
    history_df = pd.read_parquet(f"./data/ebnerd_{version}/{data_type}/history.parquet")
    articles_df = pd.read_parquet(f"./data/ebnerd_{version}/articles.parquet")

    # Print DataFrame info
    for name, df in zip(
        [f"{data_type}/behaviors", f"{data_type}/history", "articles"],
        [behaviors_df, history_df, articles_df],
    ):
        print(f"--- '{name}' ---\n")
        print(df.info(), "\n")

    return behaviors_df, history_df, articles_df

## FEATURE SELECTION 

In [5]:
#LOAD DATA

import pandas as pd

# Load datasets
users_df = pd.read_parquet(f"./data_processed/small_train_users_df_expanded.parquet")
behaviors_df = pd.read_parquet(f"./data_processed/small_train_behaviors_df_expanded.parquet")
articles_df = pd.read_parquet(f"./data_processed/small_train_articles_df_expanded.parquet")

# Merge datasets
data = pd.merge(behaviors_df, users_df, on='user_id', how='left')
data = pd.merge(data, articles_df, left_on='impression_article_id', right_on='article_id', how='left')

# Display shape for debugging
print(f"Data Shape: {data.shape}")

Data Shape: (2254478, 62)


### IDENTIFY NON-NUMERIC COLUMNS

In [6]:
# Identify non-numeric columns
non_numeric_columns = data.select_dtypes(include=['object']).columns
array_like_columns = [col for col in data.columns if isinstance(data[col].iloc[0], (list, np.ndarray))]

print(f"\nNon-Numeric Columns (object type):\n{list(non_numeric_columns)}")
print(f"\nArray-Like Columns (list or ndarray):\n{array_like_columns}")

# Analyze unique values for non-numeric columns
for col in non_numeric_columns:
    try:
        unique_count = data[col].nunique()
        print(f"Column: {col}, Unique Values: {unique_count}")
    except Exception as e:
        print(f"Error processing column {col}: {e}")

# For array-like columns, handle them separately or drop them
for col in array_like_columns:
    print(f"Column: {col}, Example Value: {data[col].iloc[0]}")

# Drop array-like columns (if not needed)
data = data.drop(columns=array_like_columns, errors='ignore')
print(f"Shape after dropping array-like columns: {data.shape}")



Non-Numeric Columns (object type):
['user_top_categories', 'user_top_subcategories', 'user_top_topics', 'user_top_ner_clusters', 'article_title', 'article_subtitle', 'article_is_premium', 'article_body', 'article_type', 'article_url', 'article_ner_clusters', 'article_entity_groups', 'article_topics', 'article_subcategory', 'article_category_str', 'article_sentiment_label']

Array-Like Columns (list or ndarray):
['user_top_categories', 'user_top_subcategories', 'user_top_topics', 'user_top_ner_clusters', 'article_ner_clusters', 'article_entity_groups', 'article_topics', 'article_subcategory']
Error processing column user_top_categories: unhashable type: 'numpy.ndarray'
Error processing column user_top_subcategories: unhashable type: 'numpy.ndarray'
Error processing column user_top_topics: unhashable type: 'numpy.ndarray'
Error processing column user_top_ner_clusters: unhashable type: 'numpy.ndarray'
Column: article_title, Unique Values: 1642
Column: article_subtitle, Unique Values: 161

### DROP HIGH CARDINALITY COLUMNS

In [7]:
# Threshold for dropping columns with high cardinality
high_cardinality_threshold = 50  # Adjust as necessary

# Identify non-numeric columns
non_numeric_columns = data.select_dtypes(include=['object']).columns
array_like_columns = [col for col in data.columns if isinstance(data[col].iloc[0], (list, np.ndarray))]

# Analyze cardinality for non-numeric columns
high_cardinality_columns = []
for col in non_numeric_columns:
    unique_count = data[col].nunique()
    print(f"Column: {col}, Unique Values: {unique_count}")
    if unique_count > high_cardinality_threshold:
        high_cardinality_columns.append(col)

# Drop high-cardinality columns
columns_to_drop = high_cardinality_columns + array_like_columns
print(f"\nColumns to Drop (High Cardinality or Array-Like): {columns_to_drop}")
data = data.drop(columns=columns_to_drop, errors='ignore')

# Display the resulting dataset shape
print(f"Shape after dropping high cardinality and array-like columns: {data.shape}")


Column: article_title, Unique Values: 1642
Column: article_subtitle, Unique Values: 1614
Column: article_is_premium, Unique Values: 2
Column: article_body, Unique Values: 1638
Column: article_type, Unique Values: 5
Column: article_url, Unique Values: 1642
Column: article_category_str, Unique Values: 15
Column: article_sentiment_label, Unique Values: 3

Columns to Drop (High Cardinality or Array-Like): ['article_title', 'article_subtitle', 'article_body', 'article_url']
Shape after dropping high cardinality and array-like columns: (2254478, 50)


### DEFINE COLUMNS TO ENCODE

In [8]:
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Define columns to encode
columns_to_encode = ['article_is_premium', 'article_type', 'article_category_str', 'article_sentiment_label']

# Apply label encoding
label_encoders = {}
for col in tqdm(columns_to_encode, desc="Encoding relevant columns"):
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le  # Store encoders for inverse transforms later

print(f"Shape after encoding relevant columns: {data.shape}")


Encoding relevant columns: 100%|██████████| 4/4 [00:03<00:00,  1.15it/s]

Shape after encoding relevant columns: (2254478, 50)





### SELECT FEATURES AND TARGET

In [9]:
# Select Features and Target

# List of identifier columns to exclude
columns_to_exclude = ['user_id', 'impression_id', 'article_id', 'impression_session_id']

# Automatically select all remaining feature columns except for identifier columns and target
target_column = 'target'
feature_columns = [col for col in data.columns if col not in columns_to_exclude + [target_column]]

print(f"Selected Features ({len(feature_columns)}): {feature_columns}")

# Split features (X) and target (y)
X = data[feature_columns]
y = data[target_column]

# Display shapes to confirm successful split
print(f"Feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Check if there are any non-numeric columns in X
non_numeric_columns = X.select_dtypes(include=['object']).columns
print(f"Non-Numeric Columns in Features (X): {list(non_numeric_columns)}")

# Drop non-numeric columns (optional, only if needed)
X = X.drop(columns=non_numeric_columns, errors='ignore')
print(f"Shape after dropping non-numeric columns: {X.shape}")


Selected Features (45): ['impression_article_id', 'impression_time', 'impression_read_time', 'impression_scroll_percentage', 'impression_device_type', 'impression_article_id_inview', 'impression_article_id_clicked', 'user_is_sso', 'user_is_subscriber', 'impression_next_read_time', 'impression_next_scroll_percentage', 'impression_day_of_week', 'impression_hour', 'impression_day_of_week_sin', 'impression_day_of_week_cos', 'impression_hour_sin', 'impression_hour_cos', 'impression_is_frontpage', 'session_avg_read_time', 'session_avg_scroll_percentage', 'user_avg_sentiment_score', 'user_total_premium_viewed', 'user_avg_scroll_percentage', 'user_avg_read_time', 'user_total_articles_viewed', 'article_is_premium', 'article_published_time', 'article_type', 'article_category', 'article_category_str', 'article_sentiment_score', 'article_sentiment_label', 'article_published_year', 'article_published_month', 'article_published_day', 'article_published_day_of_week', 'article_published_hour', 'articl

### HANDLE MISSING VALUES

In [10]:
# Handle Missing Values

from sklearn.impute import SimpleImputer

# Identify datetime columns and convert them to numeric timestamps
datetime_columns = X.select_dtypes(include=['datetime64']).columns
print(f"Datetime Columns: {list(datetime_columns)}")

# Option 2: Convert datetime columns to timestamps and feature-engineered columns
for col in datetime_columns:
    X[f'{col}_timestamp'] = pd.to_datetime(X[col]).astype(int) // 10**9  # Unix timestamp
    X[f'{col}_year'] = pd.to_datetime(X[col]).dt.year
    X[f'{col}_month'] = pd.to_datetime(X[col]).dt.month
    X[f'{col}_day'] = pd.to_datetime(X[col]).dt.day

# Drop original datetime columns if not needed anymore
X = X.drop(columns=datetime_columns, errors='ignore')

# Impute missing values with the mean
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Check for any remaining missing values
missing_values_after_imputation = X_imputed.isnull().sum().sum()
print(f"Total Missing Values After Imputation: {missing_values_after_imputation}")

# Display the shape of X after imputation
print(f"Shape after imputation: {X_imputed.shape}")


Datetime Columns: ['impression_time', 'article_published_time']
Total Missing Values After Imputation: 0
Shape after imputation: (2254478, 51)


### SCALE FEATURES

In [12]:
# Scale Features

from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Scale the features in X
X_scaled = pd.DataFrame(scaler.fit_transform(X_imputed), columns=X_imputed.columns)

# Display the shape of X after scaling
print(f"Shape after scaling: {X_scaled.shape}")

# Check statistics to confirm scaling (mean should be ~0, std should be ~1)
print(f"Feature Means After Scaling: \n{X_scaled.mean().head()}")
print(f"Feature Std Devs After Scaling: \n{X_scaled.std().head()}")


Shape after scaling: (2254478, 51)
Feature Means After Scaling: 
impression_article_id          -1.296179e-15
impression_read_time           -6.000828e-18
impression_scroll_percentage   -1.208864e-15
impression_device_type         -1.132656e-16
impression_article_id_inview    3.834352e-17
dtype: float64
Feature Std Devs After Scaling: 
impression_article_id           1.0
impression_read_time            1.0
impression_scroll_percentage    1.0
impression_device_type          1.0
impression_article_id_inview    1.0
dtype: float64


## TESTS FOR FEATURE SELECTION

### MUTUAL INFORMATION

In [None]:
# Calculate Mutual Information (with progress bar and index-based approach)

import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from tqdm import tqdm

# Custom function to track progress for mutual_info_classif
def calculate_mutual_info(X, y):
    mi_scores = []
    for i in tqdm(range(X.shape[1]), desc="Calculating Mutual Information"):
        mi_score = mutual_info_classif(X.iloc[:, [i]], y, discrete_features=False)[0]
        mi_scores.append(mi_score)
    return np.array(mi_scores)

# Calculate Mutual Information
print("\nCalculating Mutual Information...")
mi_scores = calculate_mutual_info(X_scaled, y)
mi_scores = pd.Series(mi_scores, index=range(X_scaled.shape[1])).sort_values(ascending=False)  # Use indices instead of names

# Display top 10 features by Mutual Information
print("\nTop 10 Feature Indices by Mutual Information:\n", mi_scores.head(10))

# Store indices of top 15 features
top_mi_indices = mi_scores.head(15).index
print(f"Top 15 Feature Indices by Mutual Information: {list(top_mi_indices)}")

# Display full results
feature_scores_mi = pd.DataFrame({
    'Feature Index': range(X_scaled.shape[1]),
    'Mutual Info': mi_scores.values
})
from IPython.display import display
display(feature_scores_mi) 


Calculating Mutual Information...


Calculating Mutual Information:  10%|▉         | 5/51 [02:39<24:50, 32.39s/it]

### RANDOM FOREST FEATURE IMPORTANCE

In [None]:
# Calculate Random Forest Feature Importance (with progress bar and index-based approach)

from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
import pandas as pd

# Custom function to track the Random Forest fitting process
class RandomForestWithProgress(RandomForestClassifier):
    def fit(self, X, y):
        print("\nFitting Random Forest for feature importance...")
        for i in tqdm(range(1, 2), desc="Random Forest Training"):  # Simulate progress
            super().fit(X, y)
        return self

# Train Random Forest and calculate feature importance
rf_model = RandomForestWithProgress(random_state=42, n_jobs=-1)
rf_model.fit(X_scaled, y)

rf_feature_importance = pd.Series(rf_model.feature_importances_, index=range(X_scaled.shape[1])).sort_values(ascending=False)

# Display top 10 features by Random Forest Importance
print("\nTop 10 Feature Indices by Random Forest Importance:\n", rf_feature_importance.head(10))

# Store indices of top 15 features
top_rf_indices = rf_feature_importance.head(15).index
print(f"Top 15 Feature Indices by Random Forest Importance: {list(top_rf_indices)}")

# Display full results
feature_scores_rf = pd.DataFrame({
    'Feature Index': range(X_scaled.shape[1]),
    'RF Importance': rf_feature_importance.values
})
from IPython.display import display
display(feature_scores_rf)


### RECURSIVE FEATURE ELIMINATION (RFE)

In [None]:
# Perform Recursive Feature Elimination (RFE) with Progress Bar

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
import pandas as pd  # For DataFrame display
from IPython.display import display  # Alternative to ace_tools

# Custom RFE class to add progress bar
class RFEWithProgress(RFE):
    def fit(self, X, y):
        print("\nPerforming Recursive Feature Elimination (this may take a while)...")
        for i in tqdm(range(1, 2), desc="RFE Progress"):  # Simulate progress
            super().fit(X, y)
        return self

# Perform RFE to select the top 10 features
rfe = RFEWithProgress(estimator=RandomForestClassifier(random_state=42, n_jobs=-1), n_features_to_select=10)
rfe.fit(X_scaled, y)

rfe_support = pd.Series(rfe.support_, index=X_scaled.columns)
selected_features_rfe = rfe_support[rfe_support == True].index

# Display selected features
print("\nTop 10 Features Selected by RFE:\n", list(selected_features_rfe))

# Display full results for all features
feature_scores_rfe = pd.DataFrame({
    'Feature': X_scaled.columns,
    'RFE Support': rfe_support.values
})

# Display DataFrame directly using IPython display
display(feature_scores_rfe)


### COMBINATION

In [None]:
# Step 9: Combine and Display Feature Selection Result
from IPython.display import display  # Use display to show DataFrame

# Combine all results into a single DataFrame
feature_scores_combined = pd.DataFrame({
    'Feature': X_scaled.columns,
    'Mutual Info': mi_scores.values,
    'RF Importance': rf_feature_importance.values,
#   'RFE Support': rfe_support.values
})

# Sort by Random Forest Importance (or you can choose another sorting metric)
feature_scores_combined = feature_scores_combined.sort_values(by='RF Importance', ascending=False)

# Display the top 10 features from the combined results
print("\nTop 10 Features by Combined Results:\n")
display(feature_scores_combined.head(10))

# Display the entire feature score table
display(feature_scores_combined)
