# Создание обучаюшего датасета для последующих моделей

## Библиотеки

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

## Загрузка данных

In [2]:
transactions = pd.read_csv('./data_prepared/transactions_train.csv', date_parser='t_dat', dtype={'article_id': str})
articles = pd.read_csv('./data_prepared/articles.csv', dtype={'article_id': str})

## Создание датасета

In [3]:
def create_dataset(transactions: pd.DataFrame, week: int):
    """
    Функция для раздления transactions на обучающие параметры и таргет
    transactions - данные о транзакциях
    week - неделя, которая будет использоваться в качестве таргета
    """
    WEEK_HIST_MAX = 5
    
    target = transactions[transactions['week'] == week].groupby('customer_id')[['article_id', 't_dat']] \
        .apply(lambda x: x.sort_values(by='t_dat', ascending=False).iloc[0]['article_id'])
    target = target.reset_index()
    target.columns = ['customer_id', 'target']
    
    signs = transactions[(transactions['week'] > week) & (transactions['week'] <= week + 5)] \
        .groupby('customer_id')['article_id'].apply(lambda x: ' '.join(x))
    signs = signs.reset_index()
    signs.columns = ['customer_id', 'signs']
    
    trains = signs.merge(target, on='customer_id', how='left')
    trains = trains.dropna()
    trains = trains.reset_index(drop=True)
    
    return trains[['signs', 'target']]

In [4]:
result = create_dataset(transactions, 0)

In [5]:
result

Unnamed: 0,signs,target
0,0913367001 0904026001 0905914002 0640021012 06...,0827487003
1,0399136061 0732842014 0556255001 0852219003 07...,0757926001
2,0929744001 0865799005 0456163060 0933989002,0158340001
3,0832253003 0915027001 0817086002 0914959001 09...,0871519008
4,0828067001 0883033001 0399256001,0610776002
...,...,...
34555,0429313002 0907696006 0886557004 0905365002 09...,0833459002
34556,0880312005 0624486001 0887659010 0624486064 06...,0624486049
34557,0903581001 0882612004 0464679015 0797892001 08...,0396135007
34558,0724906019,0750423010


In [6]:
from tqdm import tqdm

def create_preference_matrix(dataset: pd.DataFrame):
    total_columns = articles[articles['rating'] > 100]['article_id'].unique().tolist() + ['target']

    df = pd.DataFrame(columns=total_columns, data=np.zeros((dataset.shape[0], len(total_columns))))
    df['target'] = df['target'].astype(str)
    
    for i in tqdm(range(dataset.shape[0])):
        row_articles = set(result.iloc[i]['signs'].split(' '))
        
        if not row_articles.issubset(set(total_columns)):
            continue
        
        df.loc[i, list(row_articles)] = 1
        df.loc[i, 'target'] = dataset.loc[i]['target']
    
    return df

In [7]:
preference_matrix = create_preference_matrix(result)

100%|██████████| 34560/34560 [02:54<00:00, 197.66it/s]


In [8]:
preference_matrix.head()

Unnamed: 0,0108775015,0108775044,0108775051,0110065001,0110065002,0110065011,0111565001,0111586001,0111593001,0111609001,...,0943097001,0944506001,0945995002,0946748003,0946748004,0946764003,0947509001,0949198001,0949551001,target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,757926001.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,158340001.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,610776002.0


In [None]:
preference_matrix.to_csv('./data_prepared/preference_matrix.csv', index=False)