In [17]:
import heapq
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sqlalchemy import create_engine
# Fix the global NumPy seed so that results are reproducible
np.random.seed(1234)

# The connection string (which embeds the password) is read from the environment
# so that credentials are not stored in the notebook itself.
# Expected format: postgresql://<user>:<password>@<host>:<port>/<database>
DB_URL = os.environ.get("STARTML_DB_URL")
if not DB_URL:
    raise RuntimeError(
        "Environment variable STARTML_DB_URL is not set; it must contain the "
        "PostgreSQL connection string for the startml database."
    )

# Create the database connection
engine = create_engine(DB_URL)

# Load the data: all users, all posts, and the 1,000,000 most recent feed events
user_data = pd.read_sql("SELECT * FROM public.user_data", con=engine)
post_text_df = pd.read_sql("SELECT * FROM public.post_text_df", con=engine)
feed_data = pd.read_sql("SELECT * FROM public.feed_data ORDER BY timestamp DESC LIMIT 1000000", con=engine)

In [18]:
# Build one wide frame: every feed event enriched with the attributes of its
# user and then of its post (left joins keep every feed row).
user_feed = feed_data.merge(user_data, how='left', on='user_id')
all_data = user_feed.merge(post_text_df, how='left', on='post_id')

def generate_new_target(target, action):
    """Combine the raw ``target`` flag and the ``action`` into one graded score.

    Returns 3 when ``target`` equals 1 (regardless of ``action``); otherwise
    2 for a ``'like'`` action, 1 for a ``'view'`` action, and ``None`` for any
    other action value.
    """
    if target == 1:
        return 3
    score_by_action = {'like': 2, 'view': 1}
    # dict.get yields None for actions that are neither 'like' nor 'view'
    return score_by_action.get(action)

# Re-score every event with generate_new_target (3 / 2 / 1 scale).
# Only the two columns the function needs are iterated: DataFrame.apply(axis=1)
# would materialise a full row Series (post text included) for every feed event.
all_data['target'] = [
    generate_new_target(target, action)
    for target, action in zip(all_data['target'], all_data['action'])
]

# Time-based features derived from the event timestamp
all_data['timestamp'] = pd.to_datetime(all_data['timestamp'])
all_data['day_of_week'] = all_data['timestamp'].dt.dayofweek
all_data['hour'] = all_data['timestamp'].dt.hour
# Combined "<day_of_week>_<hour>" label, e.g. "2_23"
all_data['time_slot'] = all_data['day_of_week'].astype(str) + '_' + all_data['hour'].astype(str)

# Training frame: everything except the raw action and the raw timestamp
train = all_data.drop(['action', 'timestamp'], axis=1)

train.head()


Unnamed: 0,user_id,post_id,target,gender,age,country,city,exp_group,os,source,text,topic,day_of_week,hour,time_slot
0,13350,6119,1,1,42,Russia,Ivanovo,0,iOS,ads,A rather lame teen slasher from Brisbane. Whil...,movie,2,23,2_23
1,168096,1295,1,0,49,Russia,Tomsk,2,iOS,organic,Mandelson warns BBC on Campbell\n\nThe BBC sho...,politics,2,23,2_23
2,26960,1608,1,1,31,Russia,Irkutsk,3,Android,ads,A year to remember for Irish\n\nThere used to ...,sport,2,23,2_23
3,113678,749,1,0,22,Russia,Orekhovo-Borisovo Yuzhnoye,1,iOS,organic,Da Vinci Code is lousy history\n\nThe plot of ...,entertainment,2,23,2_23
4,6545,2735,1,1,16,Kazakhstan,Temirtaū,4,iOS,ads,August 16 : Positive cases reported worldwide ...,covid,2,23,2_23


In [79]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms.knns import KNNBasic

# Minimum activity thresholds for the filters below. At 1 they keep every row,
# because any user or post present in the frame has at least one interaction;
# raise them to actually prune sparse users / posts.
MIN_POSTS_PER_USER = 1
MIN_USERS_PER_POST = 1
# Seed for the SVD initialisation, so re-running this cell reproduces the same fit
# instead of depending on the current state of the global NumPy generator.
SVD_SEED = 1234

user_post_interactions = all_data[['user_id', 'post_id', 'target']].copy()
# One rating per (user, post) pair: keep the last occurrence in all_data's row order
user_post_interactions = user_post_interactions.drop_duplicates(subset=['user_id', 'post_id'], keep='last')

# Keep only users with at least MIN_POSTS_PER_USER rated posts
user_counts = user_post_interactions.groupby('user_id')['post_id'].count()
valid_users = user_counts[user_counts >= MIN_POSTS_PER_USER].index
user_post_interactions = user_post_interactions[user_post_interactions['user_id'].isin(valid_users)]

# Keep only posts rated by at least MIN_USERS_PER_POST users
post_counts = user_post_interactions.groupby('post_id')['user_id'].count()
valid_posts = post_counts[post_counts >= MIN_USERS_PER_POST].index
user_post_interactions = user_post_interactions[user_post_interactions['post_id'].isin(valid_posts)]

# Ratings are the re-scored targets, which take the values 1, 2 and 3
reader = Reader(rating_scale=(1, 3))

# Load the interactions into a Surprise Dataset (column order: user, item, rating)
dataset = Dataset.load_from_df(user_post_interactions[['user_id', 'post_id', 'target']], reader)

# Split into training and test sets
trainset, testset = train_test_split(dataset, test_size=0.25, random_state=42)

algo = SVD(random_state=SVD_SEED)

predictions = algo.fit(trainset).test(testset)

# One row per test prediction, with the absolute error of the estimate
result = pd.DataFrame(predictions, columns=['user_id', 'post_id', 'target', 'predict', 'details'])
result['error'] = (result['target'] - result['predict']).abs()
result.head()


Unnamed: 0,user_id,post_id,target,predict,details,error
0,167927,5955,1.0,1.523823,{'was_impossible': False},0.523823
1,39457,3454,1.0,1.195917,{'was_impossible': False},0.195917
2,168192,554,3.0,1.39933,{'was_impossible': False},1.60067
3,62782,7287,1.0,1.28508,{'was_impossible': False},0.28508
4,152530,5091,1.0,1.0,{'was_impossible': False},0.0


In [81]:
np.random.seed(1234)

# Unique user ids present in the interaction data
unique_user_ids = all_data['user_id'].unique()

# Random sample of users to evaluate. np.random.choice draws with replacement by
# default, so the number of distinct users actually scored can be below 2000.
random_ids = np.random.choice(unique_user_ids, size=2000)

# The candidate posts are the same for every user, so extract the ids once
candidate_post_ids = post_text_df['post_id'].tolist()

# user_id -> top-5 [(post_id, estimated rating), ...], best first
top_posts = {}
for user_id in random_ids:
    if user_id in top_posts:
        # Duplicate draw: predictions for this user were already computed
        continue
    predictions = [algo.predict(user_id, post_id) for post_id in candidate_post_ids]
    best_predictions = heapq.nlargest(5, predictions, key=lambda pred: pred.est)
    top_posts[user_id] = [(pred.iid, pred.est) for pred in best_predictions]

# Build user_id -> {post_id with target == 3} in a single pass over the sampled
# users' rows, instead of re-filtering the whole of all_data for every user.
# NOTE(review): algo was fitted on a split of these same interactions, while this
# lookup uses every target == 3 row in all_data, so posts seen during training can
# count as hits -- confirm whether the metric should use held-out rows only.
liked_rows = all_data.loc[
    (all_data['target'] == 3) & all_data['user_id'].isin(random_ids),
    ['user_id', 'post_id'],
]
liked_posts_by_user = {}
for liked_user_id, liked_post_id in zip(liked_rows['user_id'], liked_rows['post_id']):
    liked_posts_by_user.setdefault(liked_user_id, set()).add(liked_post_id)

hits = 0
total = 0

for user_id, posts in top_posts.items():
    user_actual_posts = liked_posts_by_user.get(user_id, set())

    total += 1

    # A hit: at least one of the recommended posts has target == 3 for this user
    if any(post_id in user_actual_posts for post_id, _ in posts):
        hits += 1

hitrate_at_5 = hits / total
print(f"Hitrate@5: {hitrate_at_5:.4f}")

Hitrate@5: 0.3048
