# FEATURE_ENGINEERING

To run this notebook in Google Colab, first upload `ebnerd_small.zip` and `small_train_users_df_expanded.parquet` to the notebook's Files panel.

To run this document locally, please see the instructions in `README.md`.

## DOCUMENT PREAMBLE

In [1]:
# Import and set up libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import itertools
from tqdm import tqdm
import os

# Additional configurations
tqdm.pandas()
plt.style.use("classic")
#plt.rcParams["figure.dpi"] = 200
plt.rcParams["figure.facecolor"] = "white"
plt.rcParams["font.family"] = "serif"

In [2]:
# Set document parameters
data_version = "small"
data_type = "train"
include_plots = False

## LOAD AND SUMMARISE DATA

In [3]:
# Load data from parquet files
def load_data(data_version, data_type, print_info=False):
    """
    Load the behaviors, history and articles parquet files for one dataset split.

    Parameters:
        data_version: Dataset size label used in the directory name
            (files are read from ``./data/ebnerd_{data_version}/``).
        data_type: Either "train" or "validation"; selects the sub-directory
            holding ``behaviors.parquet`` and ``history.parquet``.
        print_info: When True, print ``DataFrame.info()`` for each loaded frame.

    Returns:
        Tuple ``(behaviors_df, history_df, articles_df)``.

    Raises:
        ValueError: If ``data_type`` is not "train" or "validation".
    """
    if data_type not in ("train", "validation"):
        raise ValueError("data_type must be either 'train' or 'validation'")

    # Read parquet files into DataFrames
    behaviors_df = pd.read_parquet(
        f"./data/ebnerd_{data_version}/{data_type}/behaviors.parquet"
    )
    history_df = pd.read_parquet(f"./data/ebnerd_{data_version}/{data_type}/history.parquet")
    articles_df = pd.read_parquet(f"./data/ebnerd_{data_version}/articles.parquet")

    # Print DataFrame info
    if print_info:
        for name, df in zip(
            [f"{data_type}/behaviors", f"{data_type}/history", "articles"],
            [behaviors_df, history_df, articles_df],
        ):
            print(f"--- '{name}' ---\n")
            # DataFrame.info() writes its report itself and returns None, so
            # wrapping it in print() would append a stray "None" line.
            df.info()
            print()

    return behaviors_df, history_df, articles_df

# Load data
behaviors_df, history_df, articles_df = load_data(data_version, data_type, print_info=False)

In [4]:
# Summarise missing data in DataFrames
def summarise_missing_data(dataframes, names=None):
    """
    Print, for each DataFrame, the columns that contain null values together
    with their null count and null percentage, sorted by count (descending).

    Parameters:
        dataframes: Sequence of DataFrames to inspect.
        names: Optional labels, one per DataFrame; defaults to
            "DataFrame 0", "DataFrame 1", ...

    Returns:
        None. The summary is written to stdout only.
    """
    labels = names
    if labels is None:
        labels = [f"DataFrame {position}" for position in range(len(dataframes))]

    for label, frame in zip(labels, dataframes):
        null_counts = frame.isnull().sum()
        summary = pd.DataFrame(
            {
                "Missing count": null_counts,
                "Missing percentage": (null_counts / len(frame)) * 100,
            }
        )
        summary = summary.sort_values(by="Missing count", ascending=False)

        # Only columns that actually have missing values are reported
        summary = summary[summary["Missing count"] > 0]

        print(f"--- Missing data summary for '{label}' ---\n")
        if not summary.empty:
            print(summary, "\n")
        else:
            print("No missing data.\n")

    return

# Define DataFrame lists and summarise missing data.
dataframes = [behaviors_df, history_df, articles_df]
names = ["behaviors_df", "history_df", "articles_df"]
summarise_missing_data(dataframes, names)

--- Missing data summary for 'behaviors_df' ---

                        Missing count  Missing percentage
postcode                       228214           97.993447
age                            226546           97.277220
gender                         216668           93.035678
scroll_percentage              163789           70.329817
article_id                     162466           69.761730
next_scroll_percentage          26270           11.280149
next_read_time                   6218            2.669964 

--- Missing data summary for 'history_df' ---

No missing data.

--- Missing data summary for 'articles_df' ---

                 Missing count  Missing percentage
total_read_time          10882           52.473720
total_pageviews          10882           52.473720
total_inviews            10770           51.933648
image_ids                 1878            9.055840 



## FILTER DATA

In [5]:
# Filter DataFrame to keep only single-click entries
def filter_single_clicks(df):
    """
    Keep only the rows whose 'article_ids_clicked' entry holds exactly one
    element, and replace that one-element container by the element itself,
    cast to int.

    Returns a new DataFrame; the input is not modified.
    """
    has_one_click = df["article_ids_clicked"].apply(len).eq(1)
    kept = df[has_one_click]

    # Unwrap the single clicked id from its container
    clicked_id = kept["article_ids_clicked"].str[0].astype(int)

    return kept.assign(article_ids_clicked=clicked_id)

# Apply filtering function to behaviors DataFrame
behaviors_df = filter_single_clicks(behaviors_df)

## ADD TIME-BASED FEATURES

In [6]:
# Add time-related features to the DataFrame
def add_time_features(df, time_feature, prefix, include_long_term_features=True):
    """
    Derive calendar features from a datetime column and add them to `df`.

    The columns are written onto the passed DataFrame (which is also returned),
    each named `prefix` + feature name:
      - year, month, day, season (only when `include_long_term_features`)
      - day_of_week, hour
      - sine/cosine encodings of day_of_week (period 7) and hour (period 24)

    Season codes: 1 = winter (Dec-Feb), 2 = spring (Mar-May),
    3 = summer (Jun-Aug), 4 = autumn (Sep-Nov).
    """
    season_by_month = {
        12: 1, 1: 1, 2: 1,   # Winter
        3: 2, 4: 2, 5: 2,    # Spring
        6: 3, 7: 3, 8: 3,    # Summer
        9: 4, 10: 4, 11: 4,  # Autumn
    }

    def season_of(month):
        # Months outside 1-12 (e.g. missing values) map to None
        return season_by_month.get(month)

    if include_long_term_features:
        for part in ("year", "month", "day"):
            df[f"{prefix}{part}"] = getattr(df[time_feature].dt, part)

    df[f"{prefix}day_of_week"] = df[time_feature].dt.dayofweek
    df[f"{prefix}hour"] = df[time_feature].dt.hour

    # Cyclical encoding so that e.g. hour 23 and hour 0 end up close together
    for part, period in (("day_of_week", 7), ("hour", 24)):
        base = f"{prefix}{part}"
        df[f"{base}_sin"] = np.sin(2 * np.pi * df[base] / period)
        df[f"{base}_cos"] = np.cos(2 * np.pi * df[base] / period)

    if include_long_term_features:
        df[f"{prefix}season"] = df[f"{prefix}month"].apply(season_of)

    return df

# Add time features to the behaviors DataFrame
behaviors_df = add_time_features(behaviors_df, 'impression_time', 'impression_', include_long_term_features=False)


### PLOT

In [7]:
# Plot distributions of categorical features
def plot_categorical_distributions(df, categorical_features, df_name):
    """
    Draw one bar chart of value frequencies per categorical feature.

    Nothing is drawn unless the notebook-level `include_plots` flag is truthy.
    Features with more than 30 distinct values are reduced to their 30 most
    frequent values; bars are ordered by category value, not by frequency.
    """
    if not include_plots:
        return

    for column in categorical_features:
        plt.figure(figsize=(8, 4))

        frequencies = df[column].value_counts()
        if len(frequencies) > 30:
            frequencies = frequencies.nlargest(30)
        frequencies = frequencies.sort_index()

        frequencies.plot(kind="bar", color="skyblue", width=0.8)
        plt.title(f"{df_name}: Distribution of {column}")
        plt.xlabel(column)
        plt.ylabel("Frequency")
        plt.xticks(rotation=0, ha="right")
        plt.tight_layout()
        plt.show()

    return

# Define categorical features to plot
behaviors_categorical_features = [
    "device_type",
    "is_sso_user",
    "is_subscriber",
    "impression_day_of_week",
    "impression_hour",
]

# Plot categorical distributions for behaviors DataFrame
plot_categorical_distributions(behaviors_df, behaviors_categorical_features, "behaviors_df")

In [8]:
# Plot distributions of numerical features
def plot_numerical_distributions(df, numerical_features, df_name):
    """
    Draw a filled density curve (Gaussian KDE scaled by the number of
    non-missing observations) for each numerical feature.

    Nothing is drawn unless the notebook-level `include_plots` flag is truthy.
    Missing values are dropped before the density is estimated.
    """
    if not include_plots:
        return

    for column in numerical_features:
        plt.figure(figsize=(8, 4))

        values = df[column].dropna()
        estimator = gaussian_kde(values)
        grid = np.linspace(values.min(), values.max(), 1000)

        # Scale the density by the sample size so the y-axis reads as a frequency
        scaled = estimator(grid) * len(values)

        plt.plot(grid, scaled, color="black")
        plt.fill_between(grid, scaled, color="skyblue")
        plt.title(f"{df_name}: Distribution of {column}")
        plt.xlabel(column)
        plt.ylabel("Frequency")
        plt.xticks(rotation=0, ha="right")
        plt.tight_layout()
        plt.show()

    return

# Define numerical features to plot
behaviors_numerical_features = [
    "read_time",
    "next_read_time",
    "scroll_percentage",
    "next_scroll_percentage",
]

# Plot numerical distributions for behaviors DataFrame
plot_numerical_distributions(behaviors_df, behaviors_numerical_features, "behaviors_df")

## FIX MISSING DATA

In [9]:
def handle_missing_data(df):
    """
    Handles missing data by removing rows with incomplete information,
    except for front page impressions (where both 'article_id' and 'scroll_percentage' are missing).

    Steps:
      1. Drop rows missing 'next_scroll_percentage' or 'next_read_time'.
      2. Drop rows missing exactly one of 'article_id' / 'scroll_percentage'.
      3. Keep front page rows (both missing): set their 'article_id' to -1 and
         fill their 'scroll_percentage' with the median of the remaining rows.
      4. Add an 'is_frontpage' 0/1 flag and drop 'postcode', 'age', 'gender'.

    Front page rows are appended after the complete rows, so the row order of
    the result differs from the input. A one-line summary of the number of
    dropped rows is printed.

    Returns:
        The cleaned DataFrame (a new object; the input is not modified).
    """
    initial_row_count = df.shape[0]

    # Drop rows with missing next scroll percentage or read time
    df = df.dropna(subset=["next_scroll_percentage", "next_read_time"])

    # Identify front page rows where both 'article_id' and 'scroll_percentage' are missing
    frontpage_rows = df[df["article_id"].isna() & df["scroll_percentage"].isna()]

    # Drop remaining rows with missing article_id or scroll_percentage
    df = df.dropna(subset=["article_id", "scroll_percentage"], how="any")

    # Re-add front page rows
    df = pd.concat([df, frontpage_rows])

    # Set article_id for front page rows and fill missing scroll_percentage
    df.loc[frontpage_rows.index, "article_id"] = -1
    df["scroll_percentage"] = df["scroll_percentage"].fillna(df["scroll_percentage"].median())

    # Create a flag for front page impressions
    df["is_frontpage"] = (df["article_id"] == -1).astype(int)

    # Drop unnecessary columns
    df = df.drop(columns=["postcode", "age", "gender"])

    # Report number of dropped rows; guard the percentage so that an empty
    # input frame does not raise ZeroDivisionError.
    total_rows_dropped = initial_row_count - df.shape[0]
    if initial_row_count:
        dropped_percentage = (total_rows_dropped / initial_row_count) * 100
    else:
        dropped_percentage = 0.0
    print(
        f"Rows dropped: {total_rows_dropped} ({dropped_percentage:.2f}% of total).\n"
    )

    return df

# Handle missing data in the behaviors DataFrame
behaviors_df = handle_missing_data(behaviors_df)

Rows dropped: 26370 (11.38% of total).



## CAP OUTLIERS

In [10]:
# Cap outliers in specified columns of the DataFrame
def cap_outliers(df):
    """
    Cap outliers in the read-time and scroll-percentage columns.

    'read_time' and 'next_read_time' are clipped to [0, 95th percentile of the
    column]; 'scroll_percentage' and 'next_scroll_percentage' are clipped to
    [0, 100]. The columns are overwritten on the passed DataFrame, which is
    also returned.

    The percentile is computed with Series.quantile, which skips missing
    values. np.percentile would return NaN as soon as one value is missing,
    and a NaN bound makes the clip silently skip the upper cap.
    """
    for column in ("read_time", "next_read_time"):
        upper_bound = df[column].quantile(0.95)
        df[column] = df[column].clip(lower=0, upper=upper_bound)

    for column in ("scroll_percentage", "next_scroll_percentage"):
        df[column] = df[column].clip(lower=0, upper=100)

    return df

# Cap outliers in the behaviors DataFrame
behaviors_df = cap_outliers(behaviors_df)

### PLOT

In [11]:
# Plot numerical distributions for specified features
plot_numerical_distributions(behaviors_df, behaviors_numerical_features, "behaviors_df")

## ADD SESSION-BASED FEATURES

In [12]:
def add_session_metrics(df):
    """
    Attach per-session averages to every row.

    For each 'session_id' the mean 'read_time' and mean 'scroll_percentage'
    are computed and left-joined back as 'session_avg_read_time' and
    'session_avg_scroll_percentage'. The merge returns a new DataFrame with a
    fresh 0..n-1 index.
    """
    per_session = df.groupby("session_id").agg(
        session_avg_read_time=pd.NamedAgg(column="read_time", aggfunc="mean"),
        session_avg_scroll_percentage=pd.NamedAgg(
            column="scroll_percentage", aggfunc="mean"
        ),
    )
    per_session = per_session.reset_index()

    # Broadcast the session-level values back onto the individual rows
    return df.merge(per_session, on="session_id", how="left")

# Add session metrics to the behaviors DataFrame
behaviors_df = add_session_metrics(behaviors_df)

### PLOT

In [13]:
# Update numerical features list to include session metrics
behaviors_numerical_features = behaviors_numerical_features + [
    "session_avg_read_time",
    "session_avg_scroll_percentage",
]

# Plot distributions of the newly added session metrics
plot_numerical_distributions(behaviors_df, behaviors_numerical_features[-2:], "behaviors_df")

In [14]:
# Generate scatter plots for combinations of numerical features
def plot_scatter(df, features, df_name):
    """
    Draw a scatter plot for every unordered pair of the given features.

    Nothing is drawn unless the notebook-level `include_plots` flag is truthy.
    """
    if not include_plots:
        return

    # One figure per pair of features
    for horizontal, vertical in itertools.combinations(features, 2):
        plt.figure(figsize=(8, 4))
        plt.scatter(
            df[horizontal],
            df[vertical],
            color='skyblue',
            edgecolor='black',
            linewidth=1,
        )
        plt.title(f"{df_name}: {horizontal} vs {vertical}")
        plt.xlabel(horizontal)
        plt.ylabel(vertical)
        plt.tight_layout()
        plt.show()

    return

plot_scatter(behaviors_df, behaviors_numerical_features, "behaviors_df")

In [15]:
# Generate boxplots for numerical features grouped by categorical features
def plot_boxplots(df, categorical_features, numerical_features, df_name, include_plots=False):
    """
    Draw one box plot per (categorical feature, numerical feature) pair,
    with one box for each distinct category value (sorted ascending).

    Nothing is drawn unless the `include_plots` argument is truthy. Note that
    this is the function's own parameter, not the notebook-level flag.
    """
    if not include_plots:
        return

    box_style = dict(
        boxprops=dict(facecolor="skyblue", edgecolor="black"),
        medianprops=dict(color="black"),
        whiskerprops=dict(color="black"),
        flierprops=dict(markerfacecolor="black"),
        patch_artist=True,
    )

    for group_column in categorical_features:
        for value_column in numerical_features:
            plt.figure(figsize=(8, 4))

            # Distinct category values, in sorted order
            categories = sorted(df[group_column].unique())

            # Non-missing numerical values for each category
            samples = []
            for category in categories:
                in_category = df[group_column] == category
                samples.append(df[value_column][in_category].dropna())

            plt.boxplot(samples, **box_style)

            plt.title(f"{df_name}: {group_column} vs {value_column}")
            plt.xlabel(group_column)
            plt.ylabel(value_column)
            plt.xticks(
                ticks=range(1, len(categories) + 1),
                labels=categories,
            )
            plt.tight_layout()
            plt.show()

    return

# plot_boxplots has its own `include_plots` parameter (default False) instead
# of reading the notebook-level flag, so the flag must be forwarded explicitly;
# otherwise the box plots are never drawn, even with `include_plots = True`.
plot_boxplots(
    behaviors_df,
    behaviors_categorical_features,
    behaviors_numerical_features,
    "behaviors_df",
    include_plots=include_plots,
)

## CREATE USERS_DF DATAFRAME

In [16]:
import pandas as pd
from tqdm import tqdm

def create_users_df(history_df, articles_df, chunk_size=10000):
    """
    Build one row of aggregated reading-history features per user.

    Steps (each announced on stdout, with progress bars for steps 3 and 4):
      1. Explode the list-valued history columns to one row per read article.
      2. Prefix every articles_df column with "article_" (the id column is
         kept as "article_id").
      3. Left-merge the exploded history with the articles in slices of
         `chunk_size` rows to limit peak memory, keeping all article columns.
      4. Group by 'user_id' and compute, per user: the 10 most and 10 least
         frequent categories, subcategories, topics and NER clusters, plus
         mean sentiment score, premium article count, mean scroll percentage,
         mean read time and the number of distinct articles.

    Returns:
        DataFrame with one row per 'user_id'.
    """
    list_columns = [
        "impression_time_fixed",
        "scroll_percentage_fixed",
        "article_id_fixed",
        "read_time_fixed",
    ]
    aggregated_columns = [
        "article_category_str",
        "article_subcategory",
        "article_sentiment_score",
        "article_premium",
        "scroll_percentage_fixed",
        "read_time_fixed",
        "article_id_fixed",
        "article_topics",
        "article_ner_clusters",
    ]

    def ranked_values(values, most_frequent):
        # Up to 10 distinct values, taken from the top or bottom of the frequency ranking
        counts = values.value_counts()
        selected = counts.nlargest(10) if most_frequent else counts.nsmallest(10)
        return selected.index.tolist()

    def summarise_user(group):
        categories = group["article_category_str"]
        # These article columns are list-valued, so flatten them before counting
        subcategories = group["article_subcategory"].explode()
        topics = group["article_topics"].explode()
        ner_clusters = group["article_ner_clusters"].explode()

        return pd.Series(
            {
                "most_categories": ranked_values(categories, True),
                "most_subcategories": ranked_values(subcategories, True),
                "avg_sentiment_score": group["article_sentiment_score"].mean(),
                "premium_count": group["article_premium"].sum(),
                "avg_scroll_percentage": group["scroll_percentage_fixed"].mean(),
                "avg_read_time": group["read_time_fixed"].mean(),
                "total_articles_viewed": group["article_id_fixed"].nunique(),
                "most_topics": ranked_values(topics, True),
                "most_ner_clusters": ranked_values(ner_clusters, True),
                "least_categories": ranked_values(categories, False),
                "least_subcategories": ranked_values(subcategories, False),
                "least_topics": ranked_values(topics, False),
                "least_ner_clusters": ranked_values(ner_clusters, False),
            }
        )

    print("Step 1: Exploding history_df columns for individual rows...")
    exploded_history = history_df.explode(list_columns)
    print("Step 1 complete!")

    print("\nStep 2: Adding prefixes to articles_df columns...")
    prefixed_articles = articles_df.add_prefix("article_").rename(
        columns={"article_article_id": "article_id"}
    )
    print("Step 2 complete!")

    print("\nStep 3: Merging history_df with articles_df in chunks...")
    merged_parts = []
    for start in tqdm(range(0, len(exploded_history), chunk_size), desc="Merging Chunks"):
        # Slicing past the end is clamped, so the last slice may be shorter
        part = exploded_history.iloc[start:start + chunk_size]
        merged_parts.append(
            part.merge(
                prefixed_articles,
                left_on="article_id_fixed",
                right_on="article_id",
                how="left",
            )
        )
    merged_df = pd.concat(merged_parts, ignore_index=True)
    print("Step 3 complete!")

    print("\nStep 4: Aggregating user features...")
    tqdm.pandas(desc="Grouping by user_id and Aggregating")
    per_user = merged_df.groupby("user_id")[aggregated_columns].progress_apply(summarise_user)
    users_df = per_user.reset_index()

    print("Step 4 complete!")
    return users_df

# Create users DataFrame from history and articles DataFrames
users_df = create_users_df(history_df, articles_df)

Step 1: Exploding history_df columns for individual rows...
Step 1 complete!

Step 2: Adding prefixes to articles_df columns...
Step 2 complete!

Step 3: Merging history_df with articles_df in chunks...


Merging Chunks: 100%|██████████| 243/243 [00:02<00:00, 88.20it/s]


Step 3 complete!

Step 4: Aggregating user features...


Grouping by user_id and Aggregating: 100%|██████████| 15143/15143 [00:46<00:00, 322.68it/s]

Step 4 complete!





In [17]:
# Display a random sample of 5 rows from the DataFrame and transpose it
users_df.sample(n=5).T

Unnamed: 0,708,7455,6944,14307,10370
user_id,133045,1292585,1204187,2442303,1782371
most_categories,"[krimi, nyheder, underholdning, sport, natione...","[nyheder, krimi, sport, underholdning, forbrug...","[nyheder, underholdning, sport, krimi, natione...","[krimi, nyheder, sport]","[nyheder, krimi, underholdning, sport, musik]"
most_subcategories,"[133, 327, 349, 433, 127, 425, 501, 196, 432, ...","[133, 123, 425, 337, 199, 433, 208, 196, 432, ...","[196, 133, 432, 227, 130, 199, 131, 425, 123, ...","[133, 196, 227, 264]","[133, 127, 429, 431, 432, 425, 316, 138, 501, ..."
avg_sentiment_score,0.905026,0.868926,0.865173,0.99084,0.846777
premium_count,0,6,1,1,0
avg_scroll_percentage,72.142857,62.605263,65.338028,54.6,77.129032
avg_read_time,48.304348,28.071429,149.071429,45.4,34.545455
total_articles_viewed,30,40,83,5,38
most_topics,"[Katastrofe, Transportmiddel, Bil, Mindre ulyk...","[Kriminalitet, Erhverv, Økonomi, Kendt, Sport,...","[Kendt, Erhverv, Underholdning, Sport, Økonomi...","[Kriminalitet, Personfarlig kriminalitet, Kata...","[Katastrofe, Mindre ulykke, Transportmiddel, K..."
most_ner_clusters,"[Twitter, Ekstra Bladet, Ukraine, Emilie Meng,...","[Ekstra Bladet, Danmark, København, Ekstra Bla...","[Ekstra Bladet, Danmark, Twitter, danskere, S,...","[Ekstra Bladet, Peter Hallas, Rita, Da Italien...","[Ekstra Bladet, Twitter, USA, Charles, Tryg, F..."


## CREATE TARGET FEATURE

In [18]:
# Explode article IDs in view and create target variable for clicked articles
def explode_and_create_target(df):
    """
    Expand each impression into one row per in-view article and label it.

    'article_ids_inview' is exploded and converted to numeric (values that
    cannot be parsed become NaN). 'target' is 1 where the in-view article
    equals 'article_ids_clicked' and 0 otherwise. Exploded rows keep the
    index label of the impression they came from.
    """
    exploded = df.explode("article_ids_inview")
    exploded["article_ids_inview"] = pd.to_numeric(
        exploded["article_ids_inview"], errors="coerce"
    )

    was_clicked = exploded["article_ids_inview"] == exploded["article_ids_clicked"]
    exploded["target"] = was_clicked.astype(int)

    return exploded

# Process the behaviors DataFrame to explode article IDs and create target variable
behaviors_df = explode_and_create_target(behaviors_df)

In [19]:
# Define new column names for the behaviors DataFrame
# NOTE: these names are applied by position, not by matching the old names.
# The list must hold exactly one entry per column of behaviors_df, in the
# DataFrame's current column order. A length mismatch raises an error, but a
# changed column order (e.g. after editing an earlier step) would silently
# attach the wrong label to a column.
new_column_names = [
    "impression_id",
    "impression_article_id",
    "impression_time",
    "impression_read_time",
    "impression_scroll_percentage",
    "impression_device_type",
    "impression_article_id_inview",
    "impression_article_id_clicked",
    "user_id",
    "user_is_sso",
    "user_is_subscriber",
    "impression_session_id",
    "impression_next_read_time",
    "impression_next_scroll_percentage",
    "impression_day_of_week",
    "impression_hour",
    "impression_day_of_week_sin",
    "impression_day_of_week_cos",
    "impression_hour_sin",
    "impression_hour_cos",
    "impression_is_frontpage",
    "session_avg_read_time",
    "session_avg_scroll_percentage",
    "target",
]

# Assign new column names to the behaviors DataFrame
behaviors_df.columns = new_column_names

## EXPAND HISTORY_DF

In [20]:
# Explode the history DataFrame and rename specific columns
def explode_history_df(history_df):
    """
    Expand the list-valued history columns to one row per entry and shorten
    three of the resulting column names
    ('impression_time_fixed' -> 'time',
     'scroll_percentage_fixed' -> 'scroll_percentage',
     'read_time_fixed' -> 'read_time').

    Returns a new DataFrame; the input is not modified.
    """
    list_columns = [
        "impression_time_fixed",
        "scroll_percentage_fixed",
        "article_id_fixed",
        "read_time_fixed",
    ]
    shorter_names = {
        "impression_time_fixed": "time",
        "scroll_percentage_fixed": "scroll_percentage",
        "read_time_fixed": "read_time",
    }

    return history_df.explode(list_columns).rename(columns=shorter_names)

# Process the history DataFrame to explode and rename columns
history_df = explode_history_df(history_df)

In [21]:
# Define new column names for the history DataFrame
# NOTE: applied by position, so the list must match the current column order
# of history_df exactly (one name per column). This overwrites every existing
# column name, whatever it was before.
new_column_names = [
    "user_id",
    "impression_time",
    "impression_scroll_percentage",
    "article_id",
    "impression_read_time",
]

# Assign new column names to the history DataFrame
history_df.columns = new_column_names

### ADD TIME-BASED FEATURES AND FIX COLUMN TYPES

In [22]:
# Derive day-of-week / hour features (and their cyclical encodings) from the impression time
history_df = add_time_features(
    history_df,
    'impression_time',
    'impression_',
    include_long_term_features=False,
)

# Downcast columns one at a time to smaller dtypes for memory efficiency
compact_dtypes = {
    'impression_scroll_percentage': np.float32,
    'article_id': np.int32,
    'impression_read_time': np.float32,
}
for column_name, compact_dtype in compact_dtypes.items():
    history_df[column_name] = history_df[column_name].astype(compact_dtype)

### PLOT

In [23]:
# Define numerical features for the history DataFrame and plot their distributions
history_numerical_features = [
    "impression_scroll_percentage",
    "impression_read_time",
]

# Plot distributions of numerical features in the history DataFrame
plot_numerical_distributions(history_df, history_numerical_features, "history_df")

In [24]:
# Define categorical features for the history DataFrame and plot their distributions
history_categorical_features = [
    "impression_day_of_week",
    "impression_hour",
]

# Plot distributions of categorical features in the history DataFrame
plot_categorical_distributions(history_df, history_categorical_features, "history_df")

## EXPAND ARTICLES_DF

In [25]:
# Define new column names for the articles DataFrame
# NOTE: applied by position, so the list must hold exactly one name per
# column of articles_df, in the DataFrame's current column order. A changed
# column order would silently mislabel columns.
new_column_names = [
    "article_id",
    "article_title",
    "article_subtitle",
    "article_last_modified_time",
    "article_is_premium",
    "article_body",
    "article_published_time",
    "article_image_ids",
    "article_type",
    "article_url",
    "article_ner_clusters",
    "article_entity_groups",
    "article_topics",
    "article_category",
    "article_subcategory",
    "article_category_str",
    "article_total_inviews",
    "article_total_pageviews",
    "article_total_read_time",
    "article_sentiment_score",
    "article_sentiment_label",
]

# Assign new column names to the articles DataFrame
articles_df.columns = new_column_names

### ADD TIME-BASED FEATURES

In [26]:
# Add time-related features for the last modified time to the articles DataFrame
articles_df = add_time_features(articles_df, 'article_last_modified_time', 'article_last_modified_', include_long_term_features=True)

# Add time-related features for the published time to the articles DataFrame
articles_df = add_time_features(articles_df, 'article_published_time', 'article_published_', include_long_term_features=True)

### PLOT

In [27]:
# Define numerical features for the articles DataFrame and plot their distributions
articles_numerical_features = [
    "article_total_inviews",
    "article_total_pageviews",
    "article_total_read_time",
    "article_sentiment_score"
]

# Plot distributions of numerical features in the articles DataFrame
plot_numerical_distributions(articles_df, articles_numerical_features, "articles_df")

### LOG TRANSFORM SELECT FEATURES

In [28]:
# Apply log transformations to specified features in the DataFrame
def apply_log_transformations(df, to_log_features):
    """
    Add a log1p-transformed copy of each listed feature as '<feature>_log'.

    Missing values in the original feature are first replaced by 0, and this
    replacement is written back to the original column as well. Features not
    present in `df` are skipped. The DataFrame is modified in place and also
    returned; the original features are kept.
    """
    for column in to_log_features:
        if column not in df.columns:
            continue

        # Treat missing values as 0 so that log1p maps them to 0
        filled = df[column].fillna(0)
        df[column] = filled

        # log1p(x) = log(1 + x), which is defined at x = 0
        df[f"{column}_log"] = np.log1p(filled)

    return df

# List of features to apply log transformations
to_log_features = [
    "article_total_inviews",
    "article_total_pageviews",
    "article_total_read_time",
]

# Apply log transformations to the articles DataFrame
articles_df = apply_log_transformations(articles_df, to_log_features)


#### PLOT

In [29]:
# Update the list of numerical features to include log-transformed features
articles_numerical_features = articles_numerical_features + [
    "article_total_inviews_log",
    "article_total_pageviews_log",
    "article_total_read_time_log",
]

# Plot distributions of the newly added log-transformed numerical features
plot_numerical_distributions(articles_df, articles_numerical_features[-3:], "articles_df")

In [30]:
# Define categorical features for the articles DataFrame and plot their distributions
articles_categorical_features = [
    "article_is_premium",
    "article_type",
    "article_sentiment_label",
    "article_last_modified_year",
    "article_last_modified_month",
    "article_last_modified_day",
    "article_last_modified_day_of_week",
    "article_last_modified_hour",
    "article_last_modified_season",
    "article_published_year",
    "article_published_month",
    "article_published_day",
    "article_published_day_of_week",
    "article_published_hour",
    "article_published_season",
]

# Plot distributions of categorical features in the articles DataFrame
plot_categorical_distributions(articles_df, articles_categorical_features, "articles_df")

In [31]:
# Drop all columns whose name contains 'article_last_modified_' from the DataFrame.
# `filter(like=...)` matches the text anywhere in the column name, not only at
# the start. The drop is done in place.
articles_df.drop(articles_df.filter(like='article_last_modified_').columns, axis=1, inplace=True)

In [32]:
# Plot the top N most frequent values of list-type categorical features in a DataFrame
def plot_list_categorical_features(df, ranked_features, df_name, top_n=10, ticks_above=False):
    """Draw one bar chart per feature showing its ``top_n`` most frequent values.

    Each column is exploded so that every element of a list-valued entry is
    counted on its own, missing values are dropped, and the most frequent
    values are plotted. Nothing is drawn unless the notebook-level
    ``include_plots`` flag is truthy.

    Args:
        df: DataFrame holding the feature columns.
        ranked_features: Names of the columns to plot, one figure each.
        df_name: Label for the DataFrame, used as a prefix in the plot title.
        top_n: Maximum number of distinct values shown per figure.
        ticks_above: If true, tick labels are rotated 90 degrees, anchored at
            their bottom edge and given a negative tick pad so they are pulled
            up from the axis; otherwise they are rotated 45 degrees.

    Returns:
        None.
    """
    if not include_plots:
        return

    for feature_name in ranked_features:
        # Flatten list-valued entries into one row per element and drop gaps
        all_features = df[feature_name].explode().dropna()

        # A feature without any values has nothing to draw; skip it before a
        # figure is created instead of handing pandas an empty Series to plot
        if all_features.empty:
            continue

        feature_counts = all_features.value_counts().nlargest(top_n)

        plt.figure(figsize=(8, 4))
        ax = feature_counts.plot(kind="bar", color="skyblue", width=0.8)

        plt.title(f'{df_name}: Most frequent {feature_name}')
        # The x-axis label is the last underscore-separated word of the feature name
        plt.xlabel(f'{feature_name.rsplit("_", 1)[-1].title()}')
        plt.ylabel("Frequency")

        # Pin the tick positions before assigning the labels to them
        ax.set_xticks(ax.get_xticks())

        if ticks_above:
            ax.set_xticklabels(
                feature_counts.index, rotation=90, ha="center", va="bottom"
            )
            ax.tick_params(axis="x", pad=-10)
        else:
            ax.set_xticklabels(feature_counts.index, rotation=45, ha="center")

        plt.tight_layout()
        plt.show()

# List-type categorical features of the articles DataFrame
articles_list_categorical_features = [
    f"article_{suffix}"
    for suffix in (
        "ner_clusters",
        "entity_groups",
        "topics",
        "category",
        "subcategory",
        "category_str",
    )
]

# Draw the ten most frequent values of each list-type feature of the articles DataFrame
plot_list_categorical_features(
    articles_df,
    articles_list_categorical_features,
    df_name="articles_df",
    top_n=10,
    ticks_above=True,
)

In [33]:
# Show the current column names of the users DataFrame
users_df.columns.tolist()

['user_id',
 'most_categories',
 'most_subcategories',
 'avg_sentiment_score',
 'premium_count',
 'avg_scroll_percentage',
 'avg_read_time',
 'total_articles_viewed',
 'most_topics',
 'most_ner_clusters',
 'least_categories',
 'least_subcategories',
 'least_topics',
 'least_ner_clusters']

In [34]:
# Map each raw users DataFrame column to its 'user_'-prefixed name
# ('user_id' already carries the prefix and keeps its name)
user_column_renames = {
    "most_categories": "user_most_categories",
    "most_subcategories": "user_most_subcategories",
    "avg_sentiment_score": "user_avg_sentiment_score",
    "premium_count": "user_total_premium_viewed",
    "avg_scroll_percentage": "user_avg_scroll_percentage",
    "avg_read_time": "user_avg_read_time",
    "total_articles_viewed": "user_total_articles_viewed",
    "most_topics": "user_most_topics",
    "most_ner_clusters": "user_most_ner_clusters",
    "least_categories": "user_least_categories",
    "least_subcategories": "user_least_subcategories",
    "least_topics": "user_least_topics",
    "least_ner_clusters": "user_least_ner_clusters",
}

# New column names of the users DataFrame, in the order listed above
new_column_names = ["user_id", *user_column_renames.values()]

# Rename by label instead of assigning names by position: a positional
# assignment silently mislabels columns if their order ever differs, whereas a
# label-based rename is order-independent and safe to re-run
users_df.rename(columns=user_column_renames, inplace=True)

## PLOT USERS_DF AND FINAL DATAFRAMES

In [35]:
# Numerical features of the users DataFrame
users_numerical_features = [
    f"user_{statistic}"
    for statistic in (
        "avg_sentiment_score",
        "total_premium_viewed",
        "avg_scroll_percentage",
        "avg_read_time",
        "total_articles_viewed",
    )
]

# Draw the distribution of each numerical feature of the users DataFrame
plot_numerical_distributions(
    users_df,
    users_numerical_features,
    "users_df",
)

In [36]:
# List-type categorical features of the users DataFrame: the "most" rankings
# first, then the "least" rankings, each over the same four attributes
users_list_categorical_features = [
    f"user_{ranking}_{attribute}"
    for ranking in ("most", "least")
    for attribute in ("categories", "subcategories", "topics", "ner_clusters")
]

# Draw the ten most frequent values of each list-type feature of the users DataFrame
plot_list_categorical_features(
    users_df,
    users_list_categorical_features,
    df_name="users_df",
    top_n=10,
    ticks_above=True,
)

In [37]:
# Pair each DataFrame with its display name, split the pairs into the two
# parallel lists, and print the missing-data summary for every DataFrame
names, dataframes = (
    list(group)
    for group in zip(
        ("behaviors_df", behaviors_df),
        ("history_df", history_df),
        ("articles_df", articles_df),
        ("users_df", users_df),
    )
)
summarise_missing_data(dataframes, names=names)

--- Missing data summary for 'behaviors_df' ---

No missing data.

--- Missing data summary for 'history_df' ---

                              Missing count  Missing percentage
impression_scroll_percentage         255076           10.513192 

--- Missing data summary for 'articles_df' ---

                   Missing count  Missing percentage
article_image_ids           1878             9.05584 

--- Missing data summary for 'users_df' ---

                            Missing count  Missing percentage
user_avg_scroll_percentage              2            0.013207 



In [38]:
# Keep only the history rows that have an 'impression_scroll_percentage' value
history_df = history_df.dropna(axis=0, subset=['impression_scroll_percentage'])

# Remove the 'article_image_ids' column from articles_df
articles_df = articles_df.drop('article_image_ids', axis=1)

# Keep only the users that have a 'user_avg_scroll_percentage' value
users_df = users_df.dropna(axis=0, subset=['user_avg_scroll_percentage'])

In [39]:
# Show five randomly drawn rows of behaviors_df, transposed so each row becomes a column
behaviors_df.sample(5).transpose()

Unnamed: 0,13351,94640,89580,60960,115232
impression_id,123104591,129736060,113419458,736897,216145578
impression_article_id,9772706.0,-1.0,-1.0,-1.0,-1.0
impression_time,2023-05-21 20:28:00,2023-05-23 18:00:35,2023-05-22 08:37:27,2023-05-25 04:44:25,2023-05-19 11:25:17
impression_read_time,75.0,36.0,29.0,29.0,40.0
impression_scroll_percentage,100.0,100.0,100.0,100.0,100.0
impression_device_type,1,2,2,2,2
impression_article_id_inview,9527358,9777992,9775567,9779648,9772291
impression_article_id_clicked,9774652,9777693,9775551,9778971,9772088
user_id,1013631,1561198,1174591,1458893,1124471
user_is_sso,False,False,False,False,False


In [40]:
# Show five randomly drawn rows of history_df, transposed so each row becomes a column
history_df.sample(5).transpose()

Unnamed: 0,4514,958,14083,4751,10543
user_id,1545564,1417668,992585,2529886,2324053
impression_time,2023-05-02 09:45:23,2023-05-03 20:12:48,2023-05-17 16:29:23,2023-04-28 12:46:18,2023-04-27 18:55:05
impression_scroll_percentage,27.0,25.0,100.0,82.0,100.0
article_id,9745471,9748514,9770492,9740662,9739344
impression_read_time,3.0,23.0,9.0,32.0,2.0
impression_day_of_week,1,2,2,4,3
impression_hour,9,20,16,12,18
impression_day_of_week_sin,0.781831,0.974928,0.974928,-0.433884,0.433884
impression_day_of_week_cos,0.62349,-0.222521,-0.222521,-0.900969,-0.900969
impression_hour_sin,0.707107,-0.866025,-0.866025,0.0,-1.0


In [41]:
# Show five randomly drawn rows of articles_df, transposed so each row becomes a column
articles_df.sample(5).transpose()

Unnamed: 0,12985,154,9852,3239,16401
article_id,9715696,3987251,9575236,7309905,9752038
article_title,Trump giver første interview efter anholdelse:...,Rocksanger håner Rolling Stones,Dræbt på kræmmermarked: Knivstik og vild flugt,Lige nu: Her graver de efter nye spor,Sandra lukkede ned: Store CL-tæsk til dansk to...
article_subtitle,"Den kendte tv-vært Tucker Carlson, der angivel...",Chris Robinson fra The Black Crowes sammenlign...,Storkriminel dømt for dødstrusler og mishandli...,En varm sommer har gjort det muligt for politi...,"Györ var i storform, da holdet hjemme tævede O..."
article_is_premium,False,False,True,False,False
article_body,"Det er svært at vide, hvem der havde det mest ...",Chris Robinson og sangerens band The Black Cro...,"- Det var det. Farvel, Mark.\nDet var de sidst...",Ved du noget?\nTip os på 1224@eb.dk eller sms/...,Odense Håndbold missede lørdag mulighed for at...
article_published_time,2023-04-12 04:17:57,2013-03-20 09:03:07,2023-01-04 11:32:04,2018-09-17 10:35:51,2023-05-06 17:46:01
article_type,article_default,article_default,article_default,article_default,article_default
article_url,https://ekstrabladet.dk/nyheder/politik/trump-...,https://ekstrabladet.dk/musik/intlmusiknyt/art...,https://ekstrabladet.dk/krimi/draebt-paa-kraem...,https://ekstrabladet.dk/krimi/article7309905.ece,https://ekstrabladet.dk/sport/haandbold/sandra...
article_ner_clusters,"[Dominion Voting Systems, Ekstra Bladet, Fox N...",[Robinson],"[Deniz Yildiz Hvidberg Köktas, Døllefjelde Mar...",[],"[Althea Rienhardt, Ana Gros, Anne Mette Hansen..."
article_entity_groups,"[ORG, ORG, ORG, ORG, PER, PER, PER]",[PER],"[PER, LOC, ORG, ORG, LOC, LOC, PER, LOC, LOC, ...",[],"[PER, PER, PER, PER, LOC, EVENT, EVENT, ORG, M..."


In [42]:
# Show five randomly drawn rows of users_df, transposed so each row becomes a column
users_df.sample(5).transpose()

Unnamed: 0,10297,7397,11324,3544,5833
user_id,1770537,1282038,1946423,621147,1021720
user_most_categories,"[krimi, sport, nyheder, underholdning, natione...","[nyheder, sport, underholdning, krimi, natione...","[nyheder, underholdning, sport, krimi, natione...","[nyheder, sport, krimi, underholdning, natione...","[nyheder, sport, krimi, underholdning, natione..."
user_most_subcategories,"[133, 349, 327, 196, 433, 227, 270, 123, 199, ...","[133, 127, 196, 123, 227, 433, 432, 327, 130, ...","[133, 432, 123, 327, 196, 127, 337, 227, 349, ...","[196, 133, 123, 199, 227, 127, 208, 317, 437, ...","[133, 196, 327, 199, 432, 208, 123, 425, 130, ..."
user_avg_sentiment_score,0.904104,0.857029,0.846969,0.871563,0.860453
user_total_premium_viewed,8,6,0,1,5
user_avg_scroll_percentage,86.870968,96.76259,62.149425,67.458333,87.465116
user_avg_read_time,37.985915,108.390728,73.567308,50.888889,172.083333
user_total_articles_viewed,46,141,80,27,138
user_most_topics,"[Kriminalitet, Personfarlig kriminalitet, Kend...","[Kendt, Erhverv, Politik, Sport, Kriminalitet,...","[Kendt, Underholdning, Begivenhed, Erhverv, Sp...","[Erhverv, Sport, Økonomi, Privat virksomhed, K...","[Kendt, Sport, Erhverv, Katastrofe, Underholdn..."
user_most_ner_clusters,"[Ekstra Bladet, Hinge, Michella Winther, Amage...","[Ekstra Bladet, Danmark, Ukraine, Rusland, Sin...","[Ekstra Bladet, CNN, Ukraine, Twitter, Rusland...","[Danmark, Ekstra Bladet, Silkeborg, Ukraine, R...","[Ekstra Bladet, Danmark, Twitter, København, U..."


In [43]:
# Make sure the output directory for the processed data exists
os.makedirs('data_processed', exist_ok=True)

# Pair each output name with the DataFrame that is written under it
dataframes = dict(
    behaviors_df=behaviors_df,
    history_df=history_df,
    articles_df=articles_df,
    users_df=users_df,
)

# Save every DataFrame as 'data_processed/<version>_<type>_<name>_expanded.parquet'
for name, df in dataframes.items():
    file_path = os.path.join(
        'data_processed',
        '{}_{}_{}_expanded.parquet'.format(data_version, data_type, name),
    )
    df.to_parquet(file_path)

## DOCUMENT COMPLETE