In [3]:
import numpy as np
import polars as pl



class Data:
    def __init__(self, path_to_behaviors, path_to_articles, path_to_history, path_to_embeddings):
        self.behaviors = pl.read_parquet(path_to_behaviors)
        self.articles = pl.read_parquet(path_to_articles)
        self.history = pl.read_parquet(path_to_history)
        self.article_embeddings = pl.read_parquet(path_to_embeddings)
        #self.article_embeddings_dict = {row["article_id"]: row["contrastive_vector"] for row in self.article_embeddings.to_dicts()}


In [19]:
#Load data

from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:

#Set up path
!ls drive/'My Drive'/tmp/

path_to_data = 'drive/My Drive/tmp'
path_to_embeddings = 'drive/My Drive/tmp/Ekstra_Bladet_contrastive_vector'


articles.parquet  Ekstra_Bladet_contrastive_vector  Ekstra_Bladet_word2vec  train  validation


In [91]:
#Load data
data = Data(f"{path_to_data}/train/behaviors.parquet", f"{path_to_data}/articles.parquet", f"{path_to_data}/train/history.parquet", f"{path_to_embeddings}/contrastive_vector.parquet")

print(data.history)

shape: (15_143, 5)
┌─────────┬──────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┐
│ user_id ┆ impression_time_fixe ┆ scroll_percentage_f ┆ article_id_fixed    ┆ read_time_fixed     │
│ ---     ┆ d                    ┆ ixed                ┆ ---                 ┆ ---                 │
│ u32     ┆ ---                  ┆ ---                 ┆ list[i32]           ┆ list[f32]           │
│         ┆ list[datetime[μs]]   ┆ list[f32]           ┆                     ┆                     │
╞═════════╪══════════════════════╪═════════════════════╪═════════════════════╪═════════════════════╡
│ 13538   ┆ [2023-04-27          ┆ [100.0, 35.0, …     ┆ [9738663, 9738569,  ┆ [17.0, 12.0, …      │
│         ┆ 10:17:43, 2023-04-…  ┆ 100.0]              ┆ … 9769366]          ┆ 16.0]               │
│ 14241   ┆ [2023-04-27          ┆ [100.0, 46.0, …     ┆ [9738557, 9738528,  ┆ [8.0, 9.0, … 12.0]  │
│         ┆ 09:40:18, 2023-04-…  ┆ 100.0]              ┆ … 9767852]     

In [110]:
# Ensure it is a NumPy array or a list to avoid issues:
user_id = data.history["user_id"].to_numpy()
impression_time_fixed = data.history["impression_time_fixed"].to_numpy()
scroll_percentage_fixed = data.history["scroll_percentage_fixed"].to_numpy()
article_id_fixed = data.history["article_id_fixed"].to_numpy()
read_time_fixed = data.history["read_time_fixed"].to_numpy()

print(len(user_id)) #15143 users

#Print structure at index 1
print(read_time_fixed[1])

#Find the max read time of the first user
max_value = np.max(read_time_fixed[1])
max_index = np.argmax(read_time_fixed[1])

print(max_value)
print(max_index)

print(read_time_fixed[1][max_index])


15143
[   8.    9.   28.   17.   91.   21.   14.   27.    8.   88.    2.  121.
   51.    3.    6.   41.  112.   19.   26.   50.   16.   11.   56.   17.
   27.   25.   14.    3.   36.    4.   23.   22.    3.   55.   22.  441.
   44.    3.   10.   19.    5.   45.   26.   55.    4.   19.    0.   40.
   63.   15.   46.   60.   13.   18.   12.   20.   16.   17.   31.    4.
   14.   13.   19.   10.    4.    4.   18.  776.   19.    5.   62.   85.
   22.   36.   30.   10.   24.    6.   30.   90.    9.    0.   28.   58.
 1574.   42.   58.  100.   12.   37.    7.   52.   21.   28.   36.   10.
    4.   12.   28.   12.   19.   14.   19.    7.   41.    6.   19.    9.
   12.    0.  148.    8.   16.    3.    8.   42.   38.   21.   13.    2.
   15.   12.   31.   37.    7.    6.    4.  258.    6.    7.    2.  139.
    8.   18.    3.   24.    4.   20.   19.   13.   23.    6.   10.   42.
    3.   40.    3.   13.   45.    9.    0.   61.   86.    0.   56.   14.
   64.   25.   23.   12.   48.   42.    6.   

In [153]:
# Create a dictionary to store article IDs and their corresponding read times
article_read_times = {}
article_max_read_times = {}
amount_of_clicks = {}

# Iterate through article_id_fixed and read_time_fixed
for i in range(len(article_id_fixed)):
    for j in range(len(article_id_fixed[i])):
        article_id = article_id_fixed[i][j]
        read_time = read_time_fixed[i][j]

        # If the article ID is already in the dictionary, append the read time
        if article_id in article_read_times:
            article_read_times[article_id].append(read_time)
        # Otherwise, create a new list for the article ID and add the read time
        else:
            article_read_times[article_id] = [read_time]

# Calculate the average read time for each article ID
average_read_times = {}
for article_id, read_times in article_read_times.items():
    if len(read_times) >= 2: #Require that atleast 5 people have clicked on the article
        average_read_times[article_id] = np.mean(read_times)
        article_max_read_times[article_id] = np.max(read_times)

# Print the results
#for article_id, avg_read_time in average_read_times.items():
#    print(f"Article ID: {article_id}, Average Read Time: {avg_read_time}")

#for article_id, max_read_time in article_max_read_times.items():
#    print(f"Article ID: {article_id}, Max Read Time: {max_read_time}")



In [158]:
#Print dictionary of average read times
print(average_read_times.items())
print(max(average_read_times.values())) #This probably corresponds to only one recorded user
print(sum(average_read_times.values())/len(average_read_times.items()))

#Print max of max read times
print(max(average_read_times.values())) #Max used seconds is 1800, rest is cut off

#Amount of clicks
print(len(read_time_fixed[1]))

#Showing that the dictionary keys are unique
print(len(average_read_times.items()))
print(len(set(average_read_times.keys())))

#Count unread articles
count = 0
for article_id, read_time in average_read_times.items():
  if read_time == 0:
    #print(f"Article: {article_id}, Read time: {read_time}")
    count += 1

print(count)

dict_items([(9738663, 60.622517), (9738569, 56.415718), (9738490, 41.970257), (9738667, 70.79342), (9738528, 48.387386), (9736689, 44.653156), (9738452, 52.959545), (9733713, 86.12619), (9738777, 53.567837), (9738760, 51.80671), (9739035, 51.867294), (9739088, 55.376755), (9739065, 43.850636), (9738646, 61.659885), (9737084, 51.72007), (9739202, 31.592407), (9739274, 35.121693), (9739200, 32.105076), (9739179, 51.558712), (9738567, 60.70842), (9739283, 61.279606), (9739634, 35.96794), (9737243, 123.92713), (9739072, 69.82227), (9739640, 40.896442), (9738974, 47.41578), (9737420, 53.56751), (9739443, 61.001453), (9728819, 39.060326), (9739344, 52.085464), (9738660, 84.20529), (9739342, 54.795666), (9740392, 63.858482), (9740356, 80.70466), (9740432, 29.042974), (9740405, 66.02631), (9740400, 96.56854), (9741311, 77.15075), (9741297, 45.62402), (9733782, 47.860294), (9741508, 38.397995), (9741449, 48.454895), (9741476, 34.639374), (9741360, 64.70581), (9741106, 50.104168), (9741036, 39.1