In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import plotly.express as px
import numpy as np
from statistics import mode
from sklearn.metrics import silhouette_score
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.preprocessing import TargetEncoder
from collections import Counter
from sklearn.ensemble import IsolationForest
pd.set_option('display.max_rows', 40)
target_enc = TargetEncoder(smooth="auto")

In [91]:
# path = ... # Ваш путь до директории с данными /path/to/data/
data = pd.read_csv('train_events.csv')
all_events = pd.read_csv('all_events.csv')
video = pd.read_csv('video_info_v2.csv')
targets = pd.read_csv('train_targets.csv')
test = pd.read_csv('test_events.csv')

# Предсказание социально-демографических характеристик пользователей Rutube
    Пользователи RUTUBE не всегда указывают свои данные, такие как возраст и пол, что затрудняет формирование портрета пользователя и создание персонализированных рекомендаций. Это ограничивает возможности платформы в предоставлении контента, который наиболее подходит интересам и потребностям пользователей, тем самым ухудшая пользовательский опыт.

    Необходимо разработать модель, которая на основе истории просмотров сможет предсказывать пол и возраст пользователя. В качестве baseline решения мы предоставляем этот ноутбук.


# Основной pipeline обработки
    Чтобы предсказать возраст и пол, мы будем использовать медианный возраст и модальное значение пола для просмотренных каналов. Для этого мы подсчитаем статистические данные для каждого author_id на основе обучающей выборки.

    Чтобы оценить качество данного подхода, разделим пользователей с известными социально-демографическими характеристиками на обучающую и валидационную выборки.

In [90]:
# константы
not_a_city = ['Oblast', 'Krai', 'Republic'] 
computers = ['Windows', 'Mac', 'other_desktop']
smarthones = ['Android', 'other_smartphone', 'iOS', 'other_tablet']
apple_devices = ['iOS', 'Mac']
os_whitelist = ['Android', 'Windows', 'iOS', 'Mac']
clients_names = all_events['ua_client_name'].value_counts()[:10].index.tolist()

In [4]:
is_region = lambda x: len(x.split()) >= 2 # регион/город

In [5]:
data['region_ind'] = data['region'].apply(is_region).astype(int)

In [6]:
data.loc[~data['ua_os'].isin(os_whitelist), 'ua_os'] = 'other'

In [7]:
data.loc[data['ua_os'] == 'other', 'ua_os'] = data.loc[data['ua_os'] == 'other'].apply(lambda x: x['ua_os'] + '_' + x['ua_device_type'], axis=1)

In [8]:
data.loc[data['ua_client_name'].apply(lambda x: sum([x==el for el in clients_names]) == 0), 'ua_client_name'] = 'other'
data.loc[data['ua_client_name'] == 'other', 'ua_client_name'] = data.loc[data['ua_client_name'] == 'other'].apply(lambda x: x['ua_client_name'] + '_' + x['ua_device_type'], axis=1)


In [9]:
data = data.merge(video, on='rutube_video_id', how='left')

In [10]:
data['duration_rate'] = data['total_watchtime'] / data['duration']

In [11]:
data['event_timestamp'] = pd.to_datetime(data['event_timestamp'])

In [12]:
regions_df = pd.read_csv('region_zones.csv')

mapped_time = []
for _, row in data.iterrows():
    region = row['region']
    time_zone = regions_df[regions_df['Region'] == region]['Time Zone'].iloc[0]
    if "+" in time_zone:
        time_zone = time_zone.replace("+", "-")
    else:
        time_zone = time_zone.replace("-", "+")
    time_zone = 'Etc/' + time_zone
    mapped_time.append(row['event_timestamp'].tz_convert(time_zone))

In [13]:
mapped_time = np.array(mapped_time)

data['event_timestamp'] = mapped_time

data['event_timestamp'] = data['event_timestamp'].astype("str")

data['event_timestamp'] = pd.to_datetime(data['event_timestamp'].apply(lambda x: x[:16]))

In [14]:
data['duration']  = data['duration'] / 1000

In [15]:
video_processed = pd.read_csv('video_info_processed.csv')
new_categories = video_processed['category_own'].apply(lambda x: eval(x)).apply(pd.Series).fillna(video_processed['category'])
new_categories['rutube_video_id'] = video_processed['rutube_video_id']
new_categories.columns = [f'category_own_{i}' for i in range(4)] + ['rutube_video_id']

In [16]:
data = data.merge(new_categories, on='rutube_video_id', how='left')

In [17]:
cat_transforms = {'top_1': lambda x: x.value_counts().index[0],
                  # 'top_2': lambda x: x.value_counts().index[1],
                  # 'top_3': lambda x: x.value_counts().index[2],
                  'top_n': lambda x: x.value_counts().index[-1],
                  'count': 'count',
                 }
num_transforms = {'mean': 'mean',
                  'count': 'count',
                  'max': 'max',
                  'min': 'min',
                  'median': 'median',
                  'std': 'std',
                  'q3': lambda x: np.quantile(x, 0.3),
                  'q7': lambda x: np.quantile(x, 0.7),
                  'q9': lambda x: np.quantile(x, 0.9),
                 }

In [18]:
cat_features = data.dtypes[data.dtypes == object].index.tolist()
num_features = data.dtypes[data.dtypes != object].index.tolist()

In [19]:
columns = [f"{col}_{ct}" for col in cat_features for ct in cat_transforms] + \
                         [f"{col}_{ct}" for col in num_features for ct in num_transforms]

In [20]:
features_group = {cf: list(cat_transforms.values()) for cf in cat_features}
features_group.update({nf: list(num_transforms.values()) for nf in num_features})

In [21]:
timeseries_features = data[['viewer_uid', 'category', 'region', 'event_timestamp']].merge(targets, on='viewer_uid', how='left')

In [22]:
def timeseries_features_func(group: str, feature: str):
    group_ts = timeseries_features.groupby(group).agg(
        event_timestamp = ('event_timestamp', list),
        features = (feature, list)
    )
    group_ts['timeseries'] = group_ts.apply(lambda x: sorted(list(zip(x['event_timestamp'], x['features'])), key=lambda x: x[0]), axis=1)
    group_ts[['event_timestamp', 'features']] = group_ts['timeseries'].apply(lambda x: list(zip(*x))).apply(pd.Series)
    group_ts.drop('timeseries', axis=1, inplace=True)
    group_ts[f'{feature}_mean'] = group_ts['features'].apply(lambda x: np.mean(x))
    group_ts[f'{feature}_std'] = group_ts['features'].apply(lambda x: np.std(x))

    morning_column_mean = []
    morning_column_std = []
    morning_mask = group_ts.apply(lambda x: [y.hour < 12 for y in x['event_timestamp']], axis=1)
    for mask, row in zip(morning_mask, group_ts['features']):
        morning_column_mean.append(np.array(row)[mask].mean())
        morning_column_std.append(np.array(row)[mask].std())
    group_ts[f'{feature}_morning_mean'] = morning_column_mean
    group_ts[f'{feature}_morning_std'] = morning_column_std
    
    day_column_mean = []
    day_column_std = []
    day_mask = group_ts.apply(lambda x: [12 <= y.hour < 18 for y in x['event_timestamp']], axis=1)
    for mask, row in zip(day_mask, group_ts['features']):
        day_column_mean.append(np.array(row)[mask].mean())
        day_column_std.append(np.array(row)[mask].std())
    group_ts[f'{feature}_day_mean'] = day_column_mean
    group_ts[f'{feature}_day_std'] = day_column_std

    even_column_mean = []
    even_column_std = []
    even_mask = group_ts.apply(lambda x: [y.hour >= 18 for y in x['event_timestamp']], axis=1)
    for mask, row in zip(even_mask, group_ts['features']):
        even_column_mean.append(np.array(row)[mask].mean())
        even_column_std.append(np.array(row)[mask].std())
    group_ts[f'{feature}_even_mean'] = even_column_mean
    group_ts[f'{feature}_even_std'] = even_column_std

    return group_ts.drop(['event_timestamp', 'features'], axis=1).reset_index()

In [23]:
category_age_ts = timeseries_features_func('category', 'age')
region_age_ts = timeseries_features_func('region', 'age')

In [24]:
# coбираем статистики относительно видео

data_video = data
video_group_info = data_video.groupby('rutube_video_id', as_index=False).agg(features_group)
video_group_info.columns = columns
video_group_info = video_group_info.fillna(0)

In [26]:
# популярность по названию

title_popularity = data.groupby('title', as_index=False).agg(
    rutube_video_id = ('rutube_video_id', 'first'),
    author = ('author_id', 'first'),
    unique_viewers = ('viewer_uid', lambda x: len(set(x))),
    all_viewers = ('viewer_uid', 'count'),
    watchtime = ('total_watchtime', 'sum'),
    duration = ('duration', 'first')
)
title_popularity['viewing_rate'] = title_popularity['watchtime'] / (title_popularity['all_viewers'] * title_popularity['duration'])

In [27]:
# популярность по категориям

cat_popularity = data.groupby('category', as_index=False).agg(
    authors = ('author_id', lambda x: len(set(x))),
    unique_viewers = ('viewer_uid', lambda x: len(set(x))),
    all_viewers = ('viewer_uid', 'count'),
    watchtime = ('total_watchtime', 'sum'),
    duration = ('duration', 'first')
)
cat_popularity['viewing_rate'] = cat_popularity['watchtime'] / (cat_popularity['all_viewers'] * cat_popularity['duration'])

In [28]:
# популярность по регионам

region_popularity = data.groupby('region', as_index=False).agg(
    authors = ('author_id', lambda x: len(set(x))),
    unique_viewers = ('viewer_uid', lambda x: len(set(x))),
    all_viewers = ('viewer_uid', 'count'),
    watchtime = ('total_watchtime', 'sum'),
    duration = ('duration', 'first')
)
region_popularity['viewing_rate'] = region_popularity['watchtime'] / (region_popularity['all_viewers'] * region_popularity['duration'])

In [29]:
# популярность по девайсам

device_popularity = data.groupby('ua_device_type', as_index=False).agg(
    authors = ('author_id', lambda x: len(set(x))),
    unique_viewers = ('viewer_uid', lambda x: len(set(x))),
    all_viewers = ('viewer_uid', 'count'),
    watchtime = ('total_watchtime', 'sum'),
    duration = ('duration', 'first')
)
device_popularity['viewing_rate'] = device_popularity['watchtime'] / (device_popularity['all_viewers'] * device_popularity['duration'])

In [30]:
# блок с матрицей интеракций

In [31]:
interact_matrix = data.groupby(['viewer_uid', 'category'], as_index=False).agg(
    count_views = ('rutube_video_id', 'count')
)

In [32]:
viewer2id = dict(zip(data['viewer_uid'].unique(), list(range(len(data['viewer_uid'].unique())))))
cat2id = dict(zip(data['category'].unique(), list(range(len(data['category'].unique())))))

In [33]:
import numpy as np
import scipy.sparse as sparse
coo = sparse.coo_matrix((interact_matrix['count_views'], 
                         (interact_matrix['viewer_uid'].map(viewer2id).values,
                          interact_matrix['category'].map(cat2id).values)))

In [34]:
coo_array = coo.toarray()

In [35]:
norm_coefs = np.sqrt((coo_array ** 2).sum(axis=1))

In [36]:
coo_array_norm = coo_array / norm_coefs[:, None]

In [37]:
U, S, Vh = np.linalg.svd(coo_array_norm, full_matrices=False)

In [38]:
isforest = IsolationForest(random_state=42, n_estimators=100, n_jobs=-1)

In [39]:
ismapper = isforest.fit_predict(U) == -1

In [40]:
kmeans = KMeans(n_clusters=40, random_state=42, n_init="auto")
kmeans.fit(U)
label_clusters = kmeans.labels_

In [41]:
label_clusters[ismapper] = -1

In [42]:
best_labels = pd.value_counts(label_clusters)

  best_labels = pd.value_counts(label_clusters)


In [43]:
filtrate_labels = list(map(lambda x: x if x in best_labels else -1, label_clusters))

In [44]:
# name_features = list(map(lambda x: 'user_cat_class_' + str(x), pd.value_counts(filtrate_labels).index))

In [45]:
name_features = list(map(lambda x: 'user_cat_class_' + str(x), filtrate_labels))

In [46]:
# аггрегация основных данных

In [47]:
data_user = data
user_group_info = data_user.groupby('viewer_uid', as_index=False).agg(features_group)
user_group_info.columns = columns
user_group_info = user_group_info.fillna(0)

In [48]:
user_group_info['viewer_uid'] = data_user['viewer_uid']

In [49]:
# объединение аггрегаций по юзерам и по видео

In [50]:
user_video_group = user_group_info.merge(video_group_info, on='rutube_video_id_top_1', how='left').\
                merge(video_group_info.rename(columns={'rutube_video_id_top_n':'rutube_video_id_top_n_x'}),
                    on='rutube_video_id_top_n_x', how='left')

In [51]:
# добавление знаний по популярности 

In [52]:
for cat in list(filter(lambda x: 'title' in x and 'count' not in x, user_video_group.columns)):
    user_video_group = user_video_group.merge(title_popularity.rename(columns={'title': cat}), on=cat, how='left', suffixes=(f'_{cat}_a', f'_{cat}_b'))

In [53]:
for cat in list(filter(lambda x: 'category' in x and 'count' not in x, user_video_group.columns)):
    user_video_group = user_video_group.merge(cat_popularity.rename(columns={'category': cat}), on=cat, how='left', suffixes=(f'_{cat}_a', f'_{cat}_b'))

In [54]:
for cat in list(filter(lambda x: 'region' in x and 'count' not in x and 'ind' not in x, user_video_group.columns)):
    user_video_group = user_video_group.merge(region_popularity.rename(columns={'region': cat}), on=cat, how='left', suffixes=(f'_{cat}_a', f'_{cat}_b'))

In [55]:
for cat in list(filter(lambda x: 'ua_device_type' in x and 'count' not in x, user_video_group.columns)):
    user_video_group = user_video_group.merge(device_popularity.rename(columns={'ua_device_type': cat}), on=cat, how='left', suffixes=(f'_{cat}_a', f'_{cat}_b'))

In [56]:
# добавление матрицы user-user

In [57]:
id2viewer = {v: k for k, v in viewer2id.items()}

In [58]:
user_inter = pd.DataFrame(U)
user_inter.index = [id2viewer[i] for i in user_inter.index]

In [59]:
user_inter = user_inter.reset_index().rename(columns={'index': 'viewer_uid'})

In [60]:
user_inter['name_features'] = name_features

In [61]:
user_inter = user_inter.sort_values(by='viewer_uid')

In [62]:
user_video_group = user_video_group.merge(user_inter, on='viewer_uid', how='left')

In [84]:
from sklearn.metrics.pairwise import cosine_similarity
usr = user_inter[user_inter.columns[1:-1]].values
sim_viewers = []
for idx1 in tqdm(range(len(usr))):
    mean_val = np.mean(cosine_similarity(usr[idx1].reshape(1, -1), usr))
    sim_viewers.append(mean_val)

100%|███████████████████████████████████| 180012/180012 [55:47<00:00, 53.77it/s]


In [85]:
pd.to_pickle(sim_viewers, 'sim_viewers.pkl')

In [86]:
user_video_group['sim_viewers'] = sim_viewers

In [87]:
train_data = user_video_group.merge(targets, on='viewer_uid', how='left')

In [88]:
train_data.isnull().sum().sum()

0

In [89]:
train_data.to_csv('prepared_train.csv', index=False)

**Все тоже самое для теста делаем**

In [92]:
test['region_ind'] = test['region'].apply(is_region).astype(int)

In [93]:
test.loc[~test['ua_os'].isin(os_whitelist), 'ua_os'] = 'other'

In [94]:
test.loc[test['ua_os'] == 'other', 'ua_os'] = test.loc[test['ua_os'] == 'other'].apply(lambda x: x['ua_os'] + '_' + x['ua_device_type'], axis=1)

In [95]:
test.loc[data['ua_client_name'].apply(lambda x: sum([x==el for el in clients_names]) == 0), 'ua_client_name'] = 'other'
test.loc[data['ua_client_name'] == 'other', 'ua_client_name'] = test.loc[test['ua_client_name'] == 'other'].apply(lambda x: x['ua_client_name'] + '_' + x['ua_device_type'], axis=1)


In [96]:
test = test.merge(video, on='rutube_video_id', how='left')

In [97]:
test['duration_rate'] = test['total_watchtime'] / test['duration']

In [98]:
test['event_timestamp'] = pd.to_datetime(test['event_timestamp'])

In [99]:
regions_df = pd.read_csv('region_zones.csv')

mapped_time = []
for _, row in test.iterrows():
    region = row['region']
    time_zone = regions_df[regions_df['Region'] == region]['Time Zone'].iloc[0]
    if "+" in time_zone:
        time_zone = time_zone.replace("+", "-")
    else:
        time_zone = time_zone.replace("-", "+")
    time_zone = 'Etc/' + time_zone
    mapped_time.append(row['event_timestamp'].tz_convert(time_zone))

In [100]:
mapped_time = np.array(mapped_time)

test['event_timestamp'] = mapped_time

test['event_timestamp'] = test['event_timestamp'].astype("str")

test['event_timestamp'] = pd.to_datetime(test['event_timestamp'].apply(lambda x: x[:16]))

In [101]:
test['duration']  = test['duration'] / 1000

In [103]:
test = test.merge(new_categories, on='rutube_video_id', how='left')

In [104]:
cat_features = test.dtypes[test.dtypes == object].index.tolist()
num_features = test.dtypes[test.dtypes != object].index.tolist()

In [105]:
columns = [f"{col}_{ct}" for col in cat_features for ct in cat_transforms] + \
                         [f"{col}_{ct}" for col in num_features for ct in num_transforms]

In [106]:
features_group = {cf: list(cat_transforms.values()) for cf in cat_features}
features_group.update({nf: list(num_transforms.values()) for nf in num_features})

In [107]:
timeseries_features = test[['viewer_uid', 'category', 'region', 'event_timestamp']]

In [114]:
# coбираем статистики относительно видео

data_video = test
video_group_info = data_video.groupby('rutube_video_id', as_index=False).agg(features_group)
video_group_info.columns = columns
video_group_info = video_group_info.fillna(0)

In [115]:
# популярность по названию

title_popularity = test.groupby('title', as_index=False).agg(
    rutube_video_id = ('rutube_video_id', 'first'),
    author = ('author_id', 'first'),
    unique_viewers = ('viewer_uid', lambda x: len(set(x))),
    all_viewers = ('viewer_uid', 'count'),
    watchtime = ('total_watchtime', 'sum'),
    duration = ('duration', 'first')
)
title_popularity['viewing_rate'] = title_popularity['watchtime'] / (title_popularity['all_viewers'] * title_popularity['duration'])

In [116]:
# популярность по категориям

cat_popularity = test.groupby('category', as_index=False).agg(
    authors = ('author_id', lambda x: len(set(x))),
    unique_viewers = ('viewer_uid', lambda x: len(set(x))),
    all_viewers = ('viewer_uid', 'count'),
    watchtime = ('total_watchtime', 'sum'),
    duration = ('duration', 'first')
)
cat_popularity['viewing_rate'] = cat_popularity['watchtime'] / (cat_popularity['all_viewers'] * cat_popularity['duration'])

In [117]:
# популярность по регионам

region_popularity = test.groupby('region', as_index=False).agg(
    authors = ('author_id', lambda x: len(set(x))),
    unique_viewers = ('viewer_uid', lambda x: len(set(x))),
    all_viewers = ('viewer_uid', 'count'),
    watchtime = ('total_watchtime', 'sum'),
    duration = ('duration', 'first')
)
region_popularity['viewing_rate'] = region_popularity['watchtime'] / (region_popularity['all_viewers'] * region_popularity['duration'])

In [118]:
# популярность по девайсам

device_popularity = test.groupby('ua_device_type', as_index=False).agg(
    authors = ('author_id', lambda x: len(set(x))),
    unique_viewers = ('viewer_uid', lambda x: len(set(x))),
    all_viewers = ('viewer_uid', 'count'),
    watchtime = ('total_watchtime', 'sum'),
    duration = ('duration', 'first')
)
device_popularity['viewing_rate'] = device_popularity['watchtime'] / (device_popularity['all_viewers'] * device_popularity['duration'])

In [119]:
interact_matrix = test.groupby(['viewer_uid', 'category'], as_index=False).agg(
    count_views = ('rutube_video_id', 'count')
)

In [120]:
viewer2id = dict(zip(test['viewer_uid'].unique(), list(range(len(test['viewer_uid'].unique())))))
cat2id = dict(zip(test['category'].unique(), list(range(len(test['category'].unique())))))

In [121]:
coo = sparse.coo_matrix((interact_matrix['count_views'], 
                         (interact_matrix['viewer_uid'].map(viewer2id).values,
                          interact_matrix['category'].map(cat2id).values)))

In [122]:
coo_array = coo.toarray()

In [123]:
norm_coefs = np.sqrt((coo_array ** 2).sum(axis=1))

In [124]:
coo_array_norm = coo_array / norm_coefs[:, None]

In [125]:
U, S, Vh = np.linalg.svd(coo_array_norm, full_matrices=False)

In [126]:
isforest = IsolationForest(random_state=42, n_estimators=100, n_jobs=-1)

In [127]:
ismapper = isforest.fit_predict(U) == -1

In [128]:
kmeans = KMeans(n_clusters=40, random_state=42, n_init="auto")
kmeans.fit(U)
label_clusters = kmeans.labels_

In [129]:
label_clusters[ismapper] = -1

In [130]:
best_labels = pd.value_counts(label_clusters)

  best_labels = pd.value_counts(label_clusters)


In [131]:
filtrate_labels = list(map(lambda x: x if x in best_labels else -1, label_clusters))

In [132]:
name_features = list(map(lambda x: 'user_cat_class_' + str(x), filtrate_labels))

In [133]:
data_user = test
user_group_info = data_user.groupby('viewer_uid', as_index=False).agg(features_group)
user_group_info.columns = columns
user_group_info = user_group_info.fillna(0)

In [134]:
user_group_info['viewer_uid'] = data_user['viewer_uid']

In [138]:
user_video_group = user_group_info.merge(video_group_info, on='rutube_video_id_top_1', how='left').\
                merge(video_group_info.rename(columns={'rutube_video_id_top_n':'rutube_video_id_top_n_x'}),
                    on='rutube_video_id_top_n_x', how='left')

In [139]:
for cat in list(filter(lambda x: 'title' in x and 'count' not in x, user_video_group.columns)):
    user_video_group = user_video_group.merge(title_popularity.rename(columns={'title': cat}), on=cat, how='left', suffixes=(f'_{cat}_a', f'_{cat}_b'))

In [140]:
for cat in list(filter(lambda x: 'category' in x and 'count' not in x, user_video_group.columns)):
    user_video_group = user_video_group.merge(cat_popularity.rename(columns={'category': cat}), on=cat, how='left', suffixes=(f'_{cat}_a', f'_{cat}_b'))

In [141]:
for cat in list(filter(lambda x: 'region' in x and 'count' not in x and 'ind' not in x, user_video_group.columns)):
    user_video_group = user_video_group.merge(region_popularity.rename(columns={'region': cat}), on=cat, how='left', suffixes=(f'_{cat}_a', f'_{cat}_b'))

In [142]:
for cat in list(filter(lambda x: 'ua_device_type' in x and 'count' not in x, user_video_group.columns)):
    user_video_group = user_video_group.merge(device_popularity.rename(columns={'ua_device_type': cat}), on=cat, how='left', suffixes=(f'_{cat}_a', f'_{cat}_b'))

In [143]:
id2viewer = {v: k for k, v in viewer2id.items()}

In [144]:
user_inter = pd.DataFrame(U)
user_inter.index = [id2viewer[i] for i in user_inter.index]

In [145]:
user_inter = user_inter.reset_index().rename(columns={'index': 'viewer_uid'})

In [146]:
user_inter['name_features'] = name_features

In [147]:
user_inter = user_inter.sort_values(by='viewer_uid')

In [148]:
user_video_group = user_video_group.merge(user_inter, on='viewer_uid', how='left')

In [149]:
from sklearn.metrics.pairwise import cosine_similarity
usr = user_inter[user_inter.columns[1:-1]].values
sim_viewers = []
for idx1 in tqdm(range(len(usr))):
    mean_val = np.mean(cosine_similarity(usr[idx1].reshape(1, -1), usr))
    sim_viewers.append(mean_val)

100%|████████████████████████████████████| 60004/60004 [06:47<00:00, 147.32it/s]


In [150]:
user_video_group['sim_viewers'] = sim_viewers

In [151]:
user_video_group.to_csv('prepared_test.csv', index=False)

# Далее идем в файл train_test.py