In [None]:
# Тренировочные данные

In [1]:
import pandas as pd
from some_func import get_time_of_day, holidays

# Load the raw training events.
data_train = pd.read_csv('train.csv')
# NOTE(review): the test split is not used anywhere in this cell; the read is kept
# only so the kernel state stays the same as before.
data_test = pd.read_csv('test.csv')

# Parse the timestamp strings into datetime objects.
data_train['timestamp'] = pd.to_datetime(data_train['timestamp'])

# Day of week (Monday=0), part of the day, and a holiday flag.
# `holidays` is matched against str(date), so it presumably holds ISO 'YYYY-MM-DD' strings.
data_train['day_of_the_week'] = data_train['timestamp'].dt.dayofweek
data_train['part_of_the_day'] = data_train['timestamp'].dt.hour.apply(get_time_of_day)
data_train['holiday'] = data_train['timestamp'].dt.date.astype(str).isin(holidays)

# Season of the year. `month % 12` moves December to 0 so that the bins cover
# Dec-Feb (winter), Mar-May (spring), Jun-Aug (summer) and Sep-Nov (autumn).
# Binning the raw month with [0, 3, 6, 9, 12] labelled Jan-Mar as winter and
# December as autumn.
data_train['season'] = pd.cut(data_train['timestamp'].dt.month % 12,
                              bins=[-1, 2, 5, 8, 11],
                              labels=['winter', 'spring', 'summer', 'autumn'])

# Day of month, hour, minute, second.
data_train['day_of_month'] = data_train['timestamp'].dt.day
data_train['hour'] = data_train['timestamp'].dt.hour
data_train['min'] = data_train['timestamp'].dt.minute
data_train['sec'] = data_train['timestamp'].dt.second

# One-hot encode the categorical columns. The dtype is pinned because pandas >= 2.0
# would otherwise emit booleans (True/False in the CSV) instead of 0/1.
for source_column, prefix in [('gate_id', 'gate_id'),
                              ('day_of_the_week', 'day_of_week'),
                              ('part_of_the_day', 'part_of_day'),
                              ('season', 'season')]:
    data_train = data_train.join(
        pd.get_dummies(data_train[source_column], prefix=prefix, dtype='uint8'))
data_train['holiday'] = data_train['holiday'].astype(int)

# Seconds since the previous pass through the same gate, and since the previous record.
# The first row of each group / of the frame has no predecessor and is filled with 0.
# NOTE(review): diff() works in row order - assumes the CSV is sorted by timestamp; confirm.
data_train['time_between_passes'] = (
    data_train.groupby('gate_id')['timestamp'].diff().dt.total_seconds().fillna(0)
)
data_train['delta'] = data_train['timestamp'].diff().dt.total_seconds().fillna(0)

# One-hot encode the clock/calendar columns as well (the numeric originals are kept).
for source_column in ['hour', 'min', 'day_of_month']:
    data_train = data_train.join(
        pd.get_dummies(data_train[source_column], prefix=source_column, dtype='uint8'))

# Drop the columns that are now represented by dummies and save the result.
data_train = data_train.drop(columns=['timestamp', 'gate_id', 'day_of_the_week', 'part_of_the_day', 'season'])

data_train.to_csv('5_clear_train.csv', index=False)

In [2]:
# Preview the engineered training frame (pandas truncates the rendered rows and columns).
data_train

Unnamed: 0,row_id,user_id,holiday,day_of_month,hour,min,sec,gate_id_-1,gate_id_0,gate_id_1,...,day_of_month_22,day_of_month_23,day_of_month_24,day_of_month_25,day_of_month_26,day_of_month_27,day_of_month_28,day_of_month_29,day_of_month_30,day_of_month_31
0,0,18,0,29,9,8,54,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,18,0,29,9,9,54,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2,18,0,29,9,9,54,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,3,18,0,29,9,10,6,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,4,18,0,29,9,10,8,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37513,37513,6,1,31,20,38,56,0,0,0,...,0,0,0,0,0,0,0,0,0,1
37514,37514,6,1,31,20,39,22,0,0,0,...,0,0,0,0,0,0,0,0,0,1
37515,37515,6,1,31,20,39,23,0,0,0,...,0,0,0,0,0,0,0,0,0,1
37516,37516,6,1,31,20,39,31,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Тестовые данные

In [5]:
import pandas as pd
from datetime import datetime
from some_func import get_time_of_day, holidays
from collections import Counter

# Load the raw test events.
data_test = pd.read_csv('test.csv')

# Parse the timestamp strings into datetime objects.
data_test['timestamp'] = pd.to_datetime(data_test['timestamp'])

# Day of week (Monday=0), part of the day, and a holiday flag.
# `holidays` is matched against str(date), so it presumably holds ISO 'YYYY-MM-DD' strings.
data_test['day_of_the_week'] = data_test['timestamp'].dt.dayofweek
data_test['part_of_the_day'] = data_test['timestamp'].dt.hour.apply(get_time_of_day)
data_test['holiday'] = data_test['timestamp'].dt.date.astype(str).isin(holidays)

# Season of the year. `month % 12` moves December to 0 so that the bins cover
# Dec-Feb (winter), Mar-May (spring), Jun-Aug (summer) and Sep-Nov (autumn).
# Binning the raw month with [0, 3, 6, 9, 12] labelled Jan-Mar as winter and
# December as autumn.
data_test['season'] = pd.cut(data_test['timestamp'].dt.month % 12,
                             bins=[-1, 2, 5, 8, 11],
                             labels=['winter', 'spring', 'summer', 'autumn'])

# Day of month, hour, minute, second.
data_test['day_of_month'] = data_test['timestamp'].dt.day
data_test['hour'] = data_test['timestamp'].dt.hour
data_test['min'] = data_test['timestamp'].dt.minute
data_test['sec'] = data_test['timestamp'].dt.second

# The categorical columns are deliberately left un-encoded here: per-split get_dummies
# would produce a column set that differs from the training one. The saved file keeps
# timestamp, gate_id, day_of_the_week, part_of_the_day and season as raw columns.
data_test['holiday'] = data_test['holiday'].astype(int)

# Seconds since the previous pass through the same gate, and since the previous record.
# The first row of each group / of the frame has no predecessor and is filled with 0.
# NOTE(review): diff() works in row order - assumes the CSV is sorted by timestamp; confirm.
data_test['time_between_passes'] = (
    data_test.groupby('gate_id')['timestamp'].diff().dt.total_seconds().fillna(0)
)
data_test['delta'] = data_test['timestamp'].diff().dt.total_seconds().fillna(0)

# Save the result.
data_test.to_csv('5_clear_test.csv', index=False)

In [6]:
# Preview the engineered test frame (pandas truncates the rendered rows).
data_test

Unnamed: 0,row_id,timestamp,gate_id,day_of_the_week,part_of_the_day,holiday,season,day_of_month,hour,min,sec,time_between_passes,delta
0,37518,2023-01-03 08:21:00,9,1,morning,1,winter,3,8,21,0,0.0,0.0
1,37519,2023-01-03 08:21:00,9,1,morning,1,winter,3,8,21,0,0.0,0.0
2,37520,2023-01-03 08:21:18,5,1,morning,1,winter,3,8,21,18,0.0,18.0
3,37521,2023-01-03 08:21:19,5,1,morning,1,winter,3,8,21,19,1.0,1.0
4,37522,2023-01-03 08:21:39,10,1,morning,1,winter,3,8,21,39,0.0,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7120,44638,2023-02-24 19:43:36,11,4,evening,1,winter,24,19,43,36,9669.0,9279.0
7121,44639,2023-02-24 19:44:00,4,4,evening,1,winter,24,19,44,0,20427.0,24.0
7122,44640,2023-02-24 19:44:01,4,4,evening,1,winter,24,19,44,1,1.0,1.0
7123,44641,2023-02-24 19:44:09,9,4,evening,1,winter,24,19,44,9,20422.0,8.0


# Pipeline

In [15]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
import pandas as pd
from datetime import datetime
from some_func import get_time_of_day, holidays
import numpy as np

def get_features(data: pd.DataFrame) -> pd.DataFrame:
    """Derive calendar and inter-event timing features from raw gate-pass records.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with at least ``timestamp`` (anything ``pd.to_datetime`` accepts),
        ``gate_id`` and ``row_id`` columns. The input frame is not modified.

    Returns
    -------
    pd.DataFrame
        A copy of ``data`` without ``timestamp`` and ``row_id`` and with these
        columns appended, in this order: ``day_of_the_week`` (Monday=0),
        ``part_of_the_day`` (whatever ``get_time_of_day`` returns for the hour),
        ``holiday`` (0/1), ``season`` (ordered categorical), ``day_of_month``,
        ``hour``, ``min``, ``sec``, ``time_between_passes`` and ``delta`` (seconds).

    Raises
    ------
    KeyError
        If ``timestamp``, ``gate_id`` or ``row_id`` is missing.

    Notes
    -----
    Both time differences are computed in row order, so the frame is assumed to
    be sorted chronologically - TODO confirm against the CSV files.
    """
    data = data.copy()

    # Parse the timestamp strings into datetime objects.
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    stamp = data['timestamp']

    # Day of week, part of the day, and a 0/1 holiday flag.
    # `holidays` is matched against str(date), so it presumably holds ISO 'YYYY-MM-DD' strings.
    data['day_of_the_week'] = stamp.dt.dayofweek
    data['part_of_the_day'] = stamp.dt.hour.apply(get_time_of_day)
    data['holiday'] = stamp.dt.date.astype(str).isin(holidays).astype(int)

    # Season of the year. `month % 12` moves December to 0 so that the bins cover
    # Dec-Feb (winter), Mar-May (spring), Jun-Aug (summer) and Sep-Nov (autumn).
    # Binning the raw month with [0, 3, 6, 9, 12] labelled Jan-Mar as winter and
    # December as autumn.
    data['season'] = pd.cut(stamp.dt.month % 12,
                            bins=[-1, 2, 5, 8, 11],
                            labels=['winter', 'spring', 'summer', 'autumn'])

    # Day of month, hour, minute, second.
    data['day_of_month'] = stamp.dt.day
    data['hour'] = stamp.dt.hour
    data['min'] = stamp.dt.minute
    data['sec'] = stamp.dt.second

    # Seconds since the previous pass through the same gate, and since the previous
    # record overall; rows without a predecessor get 0.
    data['time_between_passes'] = (
        data.groupby('gate_id')['timestamp'].diff().dt.total_seconds().fillna(0)
    )
    data['delta'] = stamp.diff().dt.total_seconds().fillna(0)

    # Drop the raw columns that must not reach the model.
    return data.drop(columns=['timestamp', 'row_id'])



# Read each split once; the training file supplies both the features and the target.
train_raw = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')
y_train = train_raw['user_id'].to_numpy(dtype=int)

data_test = get_features(data_test)
data_train = get_features(train_raw.drop(columns='user_id'))
print(data_test.shape, data_train.shape)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output` and
# 1.4 removed the old name; try the new spelling first and fall back for older releases.
try:
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Encode the categorical columns and pass every other column through unchanged.
# Fitting on the training split only keeps the train and test matrices column-aligned;
# categories unseen during fit are encoded as all zeros (handle_unknown='ignore').
ohe = make_column_transformer(
    (one_hot, ['day_of_month', 'hour', 'min', 'gate_id', 'day_of_the_week', 'part_of_the_day', 'season']),
    remainder='passthrough')

vectorized_train = ohe.fit_transform(data_train)
vectorized_test = ohe.transform(data_test)
print(vectorized_test.shape, vectorized_train.shape)

vectorized_test


(7125, 11) (37518, 11)
(7125, 142) (37518, 142)




array([[0.0000e+00, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [0.0000e+00, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [0.0000e+00, 0.0000e+00, 1.0000e+00, ..., 1.8000e+01, 0.0000e+00,
        1.8000e+01],
       ...,
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 1.0000e+00,
        1.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.0000e+00, 2.0422e+04,
        8.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.0000e+00, 0.0000e+00,
        0.0000e+00]])