In [98]:
import pandas as pd
from google.colab import drive

In [99]:
# Mount Google Drive inside the Colab filesystem so the input CSV files can be read.
mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [100]:
# Locations of the three input tables inside the mounted Drive folder.
data_dir = '/content/drive/My Drive/Soreva'
events_file = data_dir + '/events.csv'
item_features_file = data_dir + '/item_features.csv'
user_features_file = data_dir + '/user_features.csv'

In [101]:
# Load the events, item-feature and user-feature tables (read in that order).
events_df, item_features_df, user_features_df = [
    pd.read_csv(csv_path)
    for csv_path in (events_file, item_features_file, user_features_file)
]

In [102]:
# Preview the first 20 event rows.
display(events_df.iloc[:20])

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,1505,4,0
1,0,3669,3,1
2,0,584,4,2
3,0,3390,3,3
4,0,2885,4,4
5,0,79,5,5
6,0,717,4,7
7,0,187,4,8
8,0,93,5,9
9,0,3016,5,10


In [103]:
# Preview the first five user-feature rows.
display(user_features_df.iloc[:5])

Unnamed: 0,user_id,gender,age
0,4855,F,1
1,4065,M,56
2,3331,M,25
3,5373,M,45
4,2032,M,25


In [104]:
# Preview the first five item-feature rows.
display(item_features_df.iloc[:5])

Unnamed: 0,item_id,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17
0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
4,4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [105]:
# Convert the numeric timestamp column to datetimes, reading the values as seconds
# since the Unix epoch.
# NOTE(review): the previewed values (0, 1, 2, ...) look like ordinal counters rather
# than real epoch seconds — confirm the unit.
epoch_seconds = events_df['timestamp']
events_df['timestamp'] = pd.to_datetime(epoch_seconds, unit='s')

In [106]:
# Attach user features to every event row; the left join keeps events whose
# user_id has no match in the user table.
merged_df = events_df.merge(user_features_df, how='left', on='user_id')

In [107]:
# Add the item feature columns for each event's item (left join on item_id).
merged_df = merged_df.merge(item_features_df, how='left', on='item_id')

In [108]:
# Show which columns the merged table contains.
merged_columns = merged_df.columns
print("Колонки в merged_df:", merged_columns)

Колонки в merged_df: Index(['user_id', 'item_id', 'rating', 'timestamp', 'gender', 'age', 'genre_0',
       'genre_1', 'genre_2', 'genre_3', 'genre_4', 'genre_5', 'genre_6',
       'genre_7', 'genre_8', 'genre_9', 'genre_10', 'genre_11', 'genre_12',
       'genre_13', 'genre_14', 'genre_15', 'genre_16', 'genre_17'],
      dtype='object')


In [109]:
# Gender shares per item: the fraction of each item's events that came from each
# gender value.
# Columns are selected and renamed by label ('F' / 'M') instead of by position.
# Assigning a two-element list to .columns raises when only one gender (or a third
# value) is present and depends on the column order produced by unstack().
# Any gender value other than 'F' / 'M' is dropped from the table.
gender_labels = {'F': 'female_ratio', 'M': 'male_ratio'}
gender_distribution = (
    merged_df.groupby('item_id')['gender']
    .value_counts(normalize=True)
    .unstack()
    .reindex(columns=list(gender_labels))  # guarantees both columns exist, in this order
    .fillna(0)
    .rename(columns=gender_labels)
    .rename_axis(columns=None)  # match the unnamed column index of the previous version
)

In [110]:
# Age-group shares per item.
# Ages are bucketed with right-closed bins: (0, 18] -> young, (18, 45] -> adult,
# (45, 100] -> senior. Ages outside (0, 100] become NaN and are not counted.
# The label -> output-column mapping is declared once and applied by label, so the
# output names cannot drift from the bucket labels or depend on column order.
age_labels = {'young': 'young_ratio', 'adult': 'adult_ratio', 'senior': 'senior_ratio'}
merged_df['age_group'] = pd.cut(merged_df['age'], bins=[0, 18, 45, 100], labels=list(age_labels))
age_distribution = (
    merged_df.groupby('item_id')['age_group']
    .value_counts(normalize=True)
    .unstack()
    .reindex(columns=list(age_labels))  # guarantees all three columns exist, in this order
    .fillna(0)
    .rename(columns=age_labels)
    .rename_axis(columns=None)  # match the unnamed column index of the previous version
)

In [111]:
# Collect the names of the item-feature columns that start with 'genre'.
genre_columns = list(filter(lambda name: name.startswith('genre'), item_features_df.columns))

In [112]:
# Gender shares by genre.
# NOTE(review): grouping by the whole list of genre flag columns produces one row per
# distinct combination of flags, not one row per genre — confirm this is intended.
# Columns are selected and renamed by label instead of by position, so a missing or
# unexpected gender value cannot raise a length mismatch or swap the two columns.
gender_genre_labels = {'F': 'female_ratio_genre', 'M': 'male_ratio_genre'}
gender_by_genre = (
    merged_df.groupby(genre_columns)['gender']
    .value_counts(normalize=True)
    .unstack()
    .reindex(columns=list(gender_genre_labels))  # both columns, fixed order
    .fillna(0)
    .rename(columns=gender_genre_labels)
    .rename_axis(columns=None)
)

In [113]:
# Age-group shares by genre.
# NOTE(review): grouping by the whole list of genre flag columns produces one row per
# distinct combination of flags, not one row per genre — confirm this is intended.
# Columns are selected and renamed by label instead of by position, so the result
# does not depend on the column order produced by unstack().
age_genre_labels = {
    'young': 'young_ratio_genre',
    'adult': 'adult_ratio_genre',
    'senior': 'senior_ratio_genre',
}
age_by_genre = (
    merged_df.groupby(genre_columns)['age_group']
    .value_counts(normalize=True)
    .unstack()
    .reindex(columns=list(age_genre_labels))  # all three columns, fixed order
    .fillna(0)
    .rename(columns=age_genre_labels)
    .rename_axis(columns=None)
)

In [114]:
# Per-item rating statistics: mean, median, count, lower/upper quartile, plus two
# timing measures computed from the event timestamps.
def _lower_quartile(ratings):
    """Return the 0.25 quantile of one item's ratings."""
    return ratings.quantile(0.25)


def _upper_quartile(ratings):
    """Return the 0.75 quantile of one item's ratings."""
    return ratings.quantile(0.75)


def _mean_seconds_since_first(stamps):
    """Return the mean offset, in seconds, of one item's events from its earliest event."""
    return (stamps - stamps.min()).dt.total_seconds().mean()


def _seconds_first_to_last(stamps):
    """Return the seconds elapsed between one item's earliest and latest event."""
    return (stamps.max() - stamps.min()).total_seconds()


item_stats = (
    events_df.groupby('item_id')
    .agg(
        avg_item_rating=('rating', 'mean'),
        item_rating_median=('rating', 'median'),
        item_rating_count=('rating', 'count'),
        item_rating_quantile_25=('rating', _lower_quartile),
        item_rating_quantile_75=('rating', _upper_quartile),
        avg_rating_time=('timestamp', _mean_seconds_since_first),
        rating_time_range=('timestamp', _seconds_first_to_last),
    )
    .reset_index()
)

In [115]:
# Bring the per-item rating statistics onto every event row (left join on item_id).
merged_df = merged_df.merge(item_stats, how='left', on='item_id')

In [116]:
# Difference between each event's rating and its item's mean / median rating.
event_rating = merged_df['rating']
merged_df['rating_deviation_from_mean'] = event_rating.sub(merged_df['avg_item_rating'])
merged_df['rating_deviation_from_median'] = event_rating.sub(merged_df['item_rating_median'])

In [117]:
# Average rating time per item (mean offset in seconds from the item's first event)
# and the span in seconds between the item's first and last event.
# The per-item results are attached by item_id rather than by row position. The
# previous reset_index(drop=True) assignment was only correct while item_stats kept
# a default 0..n-1 index in the same item_id order as the groupby result.
# These two columns are also produced by the item_stats aggregation; this cell
# recomputes them with the same formulas.
stamps_by_item = events_df.groupby('item_id')['timestamp']
avg_offset_by_item = stamps_by_item.apply(
    lambda stamps: (stamps - stamps.min()).dt.total_seconds().mean()
)
span_by_item = stamps_by_item.apply(
    lambda stamps: (stamps.max() - stamps.min()).total_seconds()
)
# Both Series are indexed by item_id, so map() looks each item's value up by key.
item_stats['avg_rating_time'] = item_stats['item_id'].map(avg_offset_by_item)
item_stats['rating_time_range'] = item_stats['item_id'].map(span_by_item)

In [118]:
# Genre columns: item-feature column names that start with 'genre'
# (the same selection is also made in an earlier cell).
genre_columns = []
for column_name in item_features_df.columns:
    if column_name.startswith('genre'):
        genre_columns.append(column_name)

In [119]:
# Age-group shares by genre (this repeats a computation made in an earlier cell).
# NOTE(review): grouping by the whole list of genre flag columns produces one row per
# distinct combination of flags, not one row per genre — confirm this is intended.
# Columns are selected and renamed by label instead of by position, so the result
# does not depend on the column order produced by unstack().
age_genre_labels = {
    'young': 'young_ratio_genre',
    'adult': 'adult_ratio_genre',
    'senior': 'senior_ratio_genre',
}
age_by_genre = (
    merged_df.groupby(genre_columns)['age_group']
    .value_counts(normalize=True)
    .unstack()
    .reindex(columns=list(age_genre_labels))  # all three columns, fixed order
    .fillna(0)
    .rename(columns=age_genre_labels)
    .rename_axis(columns=None)
)

In [120]:
# Build the final item table: the base item features joined, in order, with the
# rating statistics, the gender shares and the age-group shares (all left joins on item_id).
item_features_extended = (
    item_features_df
    .merge(item_stats, how='left', on='item_id')
    .merge(gender_distribution, how='left', on='item_id')
    .merge(age_distribution, how='left', on='item_id')
)

In [121]:
# Preview the first five rows of the assembled item table.
display(item_features_extended.iloc[:5])

Unnamed: 0,item_id,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,...,item_rating_count,item_rating_quantile_25,item_rating_quantile_75,avg_rating_time,rating_time_range,female_ratio,male_ratio,young_ratio,adult_ratio,senior_ratio
0,0,0,1,0,1,1,0,0,0,1,...,134.0,1.0,3.0,219.656716,1257.0,0.283582,0.716418,0.492537,0.425373,0.08209
1,1,0,0,0,0,0,0,0,1,0,...,14.0,3.0,4.0,238.5,1210.0,0.285714,0.714286,0.0,0.714286,0.285714
2,2,0,0,0,0,0,0,0,1,0,...,218.0,3.0,4.0,233.899083,1412.0,0.307339,0.692661,0.114679,0.747706,0.137615
3,3,0,0,0,0,0,0,0,1,0,...,150.0,3.0,5.0,249.34,1208.0,0.486667,0.513333,0.1,0.64,0.26
4,4,0,0,0,0,0,0,0,1,0,...,44.0,3.0,4.0,273.204545,1665.0,0.204545,0.795455,0.113636,0.772727,0.113636


In [122]:
# Write the assembled item table to Drive as CSV, without the row index.
output_path = '/content/drive/My Drive/Soreva/features_target.csv'
item_features_extended.to_csv(output_path, index=False)