# NEURAL NETWORK

## DOCUMENT PREAMBLE

In [10]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Configure matplotlib
tqdm.pandas()
plt.style.use("classic")
#plt.rcParams["figure.dpi"] = 200
plt.rcParams["figure.facecolor"] = "white"
plt.rcParams["font.family"] = "serif"

In [2]:
# Set document parameters
data_version = "demo"
data_type = "train"

In [3]:
# Load data from parquet files
def load_data(data_version, data_type, print_info=False):
    if data_type not in ["train", "validation"]:
        raise ValueError("data_type must be either 'train' or 'validation'")

    # Read parquet files into DataFrames
    behaviors_df = pd.read_parquet(
        f"./data/ebnerd_{data_version}/{data_type}/behaviors.parquet"
    )
    history_df = pd.read_parquet(f"./data/ebnerd_{data_version}/{data_type}/history.parquet")
    articles_df = pd.read_parquet(f"./data/ebnerd_{data_version}/articles.parquet")

    # Print DataFrame info
    if print_info:
        for name, df in zip(
            [f"{data_type}/behaviors", f"{data_type}/history", "articles"],
            [behaviors_df, history_df, articles_df],
        ):
            print(f"--- '{name}' ---\n")
            print(df.info(), "\n")

    return behaviors_df, history_df, articles_df

# Load data
behaviors_df, history_df, articles_df = load_data(data_version, data_type, print_info=False)

In [4]:
# Load data from parquet files
def load_data(version, data_type, print_info=False):
    base_path = f"./data_processed/{version}_{data_type}_"
    files = ["behaviors_df_expanded.parquet", "history_df_expanded.parquet", "articles_df_expanded.parquet", "users_df_expanded.parquet"]
    dataframes = [pd.read_parquet(f"{base_path}{file}") for file in files]
    
    if print_info:
        for df in dataframes:
            print(df.info(), "\n")
    
    return dataframes

# Load users_df
_, _, _, users_df = load_data(data_version, data_type, print_info=False)

In [5]:
# Filter rows where 'article_ids_clicked' has more than 1 element
behaviors_df = behaviors_df[behaviors_df['article_ids_clicked'].apply(lambda x: len(x) <= 1)]

# Convert the list to an integer value
behaviors_df['article_ids_clicked'] = behaviors_df['article_ids_clicked'].apply(lambda x: x[0] if x else None).astype('Int64')

In [6]:
# Merge history_df into behaviors_df
df = behaviors_df.merge(
    history_df,
    how='inner',
    left_on=['user_id'],
    right_on=['user_id']
)

In [7]:
# Merge articles_df into df
df = df.merge(
    articles_df.add_prefix('clicked_article_'),
    how='inner',
    left_on=['article_ids_clicked'],
    right_on=['clicked_article_article_id']
)

# Drop 'clicked_article_article_id' column
df = df.drop(columns=['clicked_article_article_id'])

In [8]:
# Merge users_df into df
df = df.merge(
    users_df,
    how='inner',
    left_on=['user_id'],
    right_on=['user_id']
)

print(df.iloc[1000].transpose())

impression_id                                                                  23290797
article_id                                                                    9777492.0
impression_time                                                     2023-05-24 07:22:14
read_time                                                                          52.0
scroll_percentage                                                                 100.0
device_type                                                                           3
article_ids_inview                    [9778386, 9777034, 9778007, 9778448, 9759955, ...
article_ids_clicked                                                             9778413
user_id                                                                          667805
is_sso_user                                                                       False
gender                                                                              NaN
postcode                        

In [None]:
# Create a Series mapping article_ids_clicked to topics for quick lookup
article_topics_map = df.set_index('article_ids_clicked')['clicked_article_topics'].to_dict()

# Precompute the impression times window for each article
impressions_df = df[['impression_time', 'article_ids_clicked']]

# Create a dictionary to store trendiness scores
trendiness_scores = {}

# Use tqdm to wrap the outer loop for progress tracking
for article_ids_clicked, group in tqdm(impressions_df.groupby('article_ids_clicked'), desc="Calculating Trendiness"):
    # Get the topics for the current article
    topics = article_topics_map.get(article_ids_clicked, [])
    
    # Calculate trendiness for each impression time in the group
    for impression_time in group['impression_time']:
        start_time = impression_time - pd.Timedelta(days=7)
        
        # Filter impressions in the time window
        relevant_impressions = impressions_df[
            (impressions_df['impression_time'] >= start_time) &
            (impressions_df['impression_time'] < impression_time)
        ]
        
        # Calculate trendiness by checking topic overlap
        trendiness = relevant_impressions['article_ids_clicked'].apply(
            lambda x: any(topic in article_topics_map.get(x, []) for topic in topics)
        ).sum()
        
        # Store the trendiness score
        trendiness_scores[(article_ids_clicked, impression_time)] = trendiness

# Map the trendiness scores back to the df
df['trendiness'] = df.apply(
    lambda row: trendiness_scores.get((row['article_ids_clicked'], row['impression_time']), 0), axis=1
)

In [12]:
import os

file_path = os.path.join('data_processed', f'DATA1.parquet')
df.to_parquet(file_path)

In [66]:
# Define the file path
file_path = os.path.join('data_processed', 'DATA1.parquet')

# Read the Parquet file into a DataFrame
df = pd.read_parquet(file_path)

In [67]:
print(df[['user_id', 'article_ids_clicked']])

       user_id  article_ids_clicked
0        22779              9759966
1       150224              9778661
2       160892              9777856
3      1001055              9776566
4      1001055              9776553
...        ...                  ...
24589  2053999              9775562
24590  2053999              9775361
24591  2060487              9775699
24592  2060487              9758424
24593  2096611              9770369

[24594 rows x 2 columns]


In [68]:
# Precompute the impression times window for each article
impressions_df = df[['user_id', 'impression_time', 'article_ids_inview']]

# Convert 'article_ids_inview' and 'article_ids_clicked' to lists explicitly
def convert_to_list(x):
    if isinstance(x, list):
        return x
    elif isinstance(x, str):
        return x.strip('[]').replace("'", "").split(', ')
    elif isinstance(x, np.ndarray):
        return x.tolist()
    else:
        return []

df['article_ids_inview'] = df['article_ids_inview'].apply(convert_to_list)

# seen_before_clicked calculation function
def calculate_seen_before_clicked(user_id, article_id, impression_time, hours=48):
    start_time = impression_time - pd.Timedelta(hours=hours)
    relevant_impressions = impressions_df[
        (impressions_df['user_id'] == user_id) &
        (impressions_df['impression_time'] >= start_time) & 
        (impressions_df['impression_time'] < impression_time)
    ]
    seen_before_clicked = relevant_impressions['article_ids_inview'].apply(lambda x: article_id in x).sum()
    return seen_before_clicked

# Apply the function with a progress bar
df['seen_before_clicked'] = df.progress_apply(
    lambda row: calculate_seen_before_clicked(row['user_id'], row['article_id'], row['impression_time']), axis=1
)

  0%|          | 0/24594 [00:00<?, ?it/s]

100%|██████████| 24594/24594 [00:08<00:00, 2838.53it/s]


In [69]:
print(df[['user_id', 'article_ids_clicked', 'seen_before_clicked']])

       user_id  article_ids_clicked  seen_before_clicked
0        22779              9759966                    0
1       150224              9778661                    0
2       160892              9777856                    0
3      1001055              9776566                    0
4      1001055              9776553                    0
...        ...                  ...                  ...
24589  2053999              9775562                    0
24590  2053999              9775361                    0
24591  2060487              9775699                    0
24592  2060487              9758424                    1
24593  2096611              9770369                    0

[24594 rows x 3 columns]


In [70]:
# Article Delay
df['article_delay'] = (df['impression_time'] - df['clicked_article_published_time']).dt.total_seconds() / 60

In [71]:
df['impression_hour'] = df['impression_time'].dt.hour
df['impression_day_of_week'] = df['impression_time'].dt.dayofweek

# Convert time of day and day of week to cyclical features
df['impression_hour_sin'] = np.sin(2 * np.pi * df['impression_hour'] / 24)
df['impression_hour_cos'] = np.cos(2 * np.pi * df['impression_hour'] / 24)
df['impression_day_of_week_sin'] = np.sin(2 * np.pi * df['impression_day_of_week'] / 7)
df['impression_day_of_week_cos'] = np.cos(2 * np.pi * df['impression_day_of_week'] / 7)

In [83]:
def get_ner_clusters(article_ids, articles_df):
    # Filter the articles_df to get NER clusters for the given article IDs
    clusters = articles_df[articles_df['article_id'].isin(article_ids)]['ner_clusters']
    # Flatten the list of lists and join into a single string
    return ' '.join([item for sublist in clusters for item in sublist])

# Extract NER clusters for each article_id_fixed
df['article_ner_clusters'] = df['article_id_fixed'].apply(lambda ids: get_ner_clusters(ids, articles_df))

# Convert the clicked_article_ner_clusters to strings
df['clicked_article_ner_clusters_str'] = df['clicked_article_ner_clusters'].apply(lambda x: ' '.join(x))

# Combine all NER cluster strings for TF-IDF computation
corpus = pd.concat([df['clicked_article_ner_clusters_str'], df['article_ner_clusters']])

# Create a TF-IDF vectoriser and fit it to the corpus
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Calculate cosine similarity between clicked NER clusters and article NER clusters
def calculate_similarity(row_index):
    # Compute similarity between the clicked NER and article NER for each row
    return cosine_similarity(tfidf_matrix[row_index], tfidf_matrix[row_index + len(df)])[0][0]

# Apply similarity calculation for each row
df['ner_similarity'] = [calculate_similarity(i) for i in range(len(df))]

# Display the results
df[['article_ids_clicked', 'ner_similarity']]

Unnamed: 0,article_ids_clicked,ner_similarity
0,9759966,0.0
1,9778661,0.0
2,9777856,0.0
3,9776566,0.0
4,9776553,0.0
...,...,...
24589,9775562,0.0
24590,9775361,0.0
24591,9775699,0.0
24592,9758424,0.0
