In [1]:
import pandas as pd
import numpy as np

In [8]:
interactions = pd.read_parquet('data/train_interactions.parquet')
users_meta = pd.read_parquet('data/users_meta.parquet')
items_meta = pd.read_parquet('data/items_meta.parquet')


In [None]:
interactions

Unnamed: 0,user_id,item_id,timespent,like,dislike,share,bookmarks
0,3810,138979,6,0,0,0,0
1,101874,331160,6,0,0,0,0
2,150332,73709,11,0,0,0,0
3,4982,189745,5,0,0,0,0
4,149601,289643,1,0,0,1,0
...,...,...,...,...,...,...,...
145667277,10718,93558,7,0,0,0,0
145667278,119164,60206,60,0,0,0,0
145667279,171518,99323,70,0,0,0,0
145667280,109533,74203,1,0,0,0,0


In [9]:

# Drop the embeddings column from items_meta
items_meta.drop(columns=['embeddings'], inplace=True)


In [10]:
users_meta.set_index('user_id', inplace=True)
# Step 3: Merge user and item information
interactions = interactions.merge(users_meta[['age']], on='user_id', how='left')


In [None]:

# Step 2: Normalize timespent
items_meta.set_index('item_id', inplace=True)
interactions['timespent'] /= interactions['item_id'].map(items_meta['duration'])



In [11]:

# Calculate mean_timespent, sum_like, and sum_dis directly in the interactions DataFrame
interactions['mean_timespent'] = interactions.groupby('item_id')['timespent'].transform('mean')
interactions['sum_like'] = interactions.groupby('item_id')['like'].transform('sum')
interactions['sum_dis'] = interactions.groupby('item_id')['dislike'].transform('sum')

# Display the updated DataFrame
print(interactions.head())


   user_id  item_id  timespent  like  dislike  share  bookmarks  age  \
0     3810   138979          6     0        0      0          0   36   
1   101874   331160          6     0        0      0          0   52   
2   150332    73709         11     0        0      0          0   24   
3     4982   189745          5     0        0      0          0   40   
4   149601   289643          1     0        0      1          0   34   

   mean_timespent  sum_like  sum_dis  
0       33.069952        37        0  
1        5.100000         6        0  
2       11.506805        52        5  
3       19.684796         1        0  
4       16.905465        54        2  


In [12]:
# After that, delete the duration column to save memory
items_meta.drop(columns=['duration'], inplace=True)

In [13]:

# Step 1: Create preference column
interactions['preference'] = 0
interactions.loc[interactions['like'] == 1, 'preference'] += 50
interactions.loc[interactions['dislike'] == 1, 'preference'] -= 100
interactions.loc[interactions['share'] == 1, 'preference'] += 30
interactions.loc[interactions['bookmarks'] == 1, 'preference'] += 100


In [14]:

# After that, delete the like, dislike, share, and bookmarks columns to save memory
interactions.drop(columns=['like', 'dislike', 'share', 'bookmarks'], inplace=True)


In [None]:
interactions

In [15]:
# Step 4: Prepare first training data
train_data1 = interactions[['user_id', 'item_id', 'timespent', 'preference', 'age', 'mean_timespent', 'sum_like', 'sum_dis']]






# Optional: View the resulting DataFrames
print("First Train Data:")
print(train_data1.head())




First Train Data:
   user_id  item_id  timespent  preference  age  mean_timespent  sum_like  \
0     3810   138979          6           0   36       33.069952        37   
1   101874   331160          6           0   52        5.100000         6   
2   150332    73709         11           0   24       11.506805        52   
3     4982   189745          5           0   40       19.684796         1   
4   149601   289643          1          30   34       16.905465        54   

   sum_dis  
0        0  
1        0  
2        5  
3        0  
4        2  


In [16]:
items_meta.drop(columns=['source_id'], inplace=True)

In [None]:
train_data1.shape

(145667282, 5)

In [None]:
train_data1.to_csv('/content/drive/MyDrive/VK_RecSys/train_data1.csv', index=False)

In [None]:
del users_meta
del items_meta

In [None]:
del interactions

In [None]:
train_data1['timespent'].min(axis=0)

0.005555555555555556

In [None]:
train_data1

Unnamed: 0,user_id,item_id,timespent,preference,age
0,3810,138979,0.111111,0,36
1,101874,331160,1.000000,0,52
2,150332,73709,0.687500,0,24
3,4982,189745,0.200000,0,40
4,149601,289643,0.043478,30,34
...,...,...,...,...,...
145667277,10718,93558,0.129630,0,36
145667278,119164,60206,1.016949,0,24
145667279,171518,99323,1.794872,0,38
145667280,109533,74203,0.040000,0,20


In [17]:
import pandas as pd

df = train_data1
# Define normalization functions
def normalize_columns(df):
    # Normalize timespent to [0, 1]

    # Normalize preference from -100 to +180 to [0, 1]
    df['preference'] = (df['preference'] + 100) / (280)  # Shift to [0, 1]

    # Normalize age from 18 to 60 to [0, 1]
    df['age'] = (df['age'] - 18) / (18 - 60)  # Scale to [-1, 0]

    return df

# Define data type optimization functions
def optimize_dataframe(df):
    df['user_id'] = df['user_id'].astype('int32')
    df['item_id'] = df['item_id'].astype('int32')
    df['timespent'] = df['timespent'].astype('float32')
    df['preference'] = df['preference'].astype('float32')  # Maintain as float for precision
    return df

# Load the data using chunks to avoid RAM overflow
chunk_size = 10**6  # Adjust based on your available memory
chunks = train_data1

data=normalize_columns(df)
train_data=optimize_dataframe(data)


print("Data normalization and optimization complete.")


Data normalization and optimization complete.


In [18]:
train_data

Unnamed: 0,user_id,item_id,timespent,preference,age,mean_timespent,sum_like,sum_dis
0,3810,138979,6.0,0.357143,-0.428571,33.069952,37,0
1,101874,331160,6.0,0.357143,-0.809524,5.100000,6,0
2,150332,73709,11.0,0.357143,-0.142857,11.506805,52,5
3,4982,189745,5.0,0.357143,-0.523810,19.684796,1,0
4,149601,289643,1.0,0.464286,-0.380952,16.905465,54,2
...,...,...,...,...,...,...,...,...
145667277,10718,93558,7.0,0.357143,-0.428571,36.742564,103,10
145667278,119164,60206,60.0,0.357143,-0.142857,35.342559,25,0
145667279,171518,99323,70.0,0.357143,-0.476190,23.017857,2,0
145667280,109533,74203,1.0,0.357143,-0.047619,17.402707,28,1


In [None]:
# Specify the features to use for ranking
X = data[['user_id', 'item_id', 'preference', 'age', 'mean_timespent', 'sum_like', 'sum_dis']]
y = data['timespent']  # This is just a placeholder; can be your target variable
