# Section 2.4.1 Split Train & Test Interaction Features

In [1]:
import time
import numpy as np
import pandas as pd
from typing import *
import json
import gc

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
def get_root_dir():
    """Return the directory this notebook should use as its working directory.

    If the Google Drive mount point exists (running in Colab), the project's
    notebooks folder on Drive is returned; otherwise the current directory.
    """
    drive_mounted = os.path.exists('/content/drive/MyDrive/')
    if not drive_mounted:
        return './'  # running locally
    return '/content/drive/MyDrive/AMEX Project/notebooks'  # running in Colab

# Change the process working directory (the equivalent of `cd`) to the folder
# returned by get_root_dir(); running `!cd` directly does not work for this.
os.chdir(get_root_dir())

### Loop through the full data and split into train & test

In [4]:
# Read the JSON file of train IDs and report how many entries it contains.
with open('../data/train_ids.json', 'r') as train_ids_file:
    train_ids = json.loads(train_ids_file.read())

print(len(train_ids))

458913


In [5]:
# Read the JSON file of test IDs and report how many entries it contains.
with open('../data/test_ids.json', 'r') as test_ids_file:
    test_ids = json.loads(test_ids_file.read())

print(len(test_ids))

924621


In [6]:
def train_test_split(df: pd.DataFrame, train_ids: List[str], test_ids: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Partition ``df`` by membership of its ``customer_ID`` column.

    Args:
        df: Frame containing a ``customer_ID`` column.
        train_ids: IDs whose rows are selected for the train frame.
        test_ids: IDs whose rows are selected for the test frame.

    Returns:
        ``(train, test)``: the rows of ``df`` whose ``customer_ID`` appears in
        ``train_ids`` and ``test_ids`` respectively. The shapes of both frames
        are printed before returning.
    """
    customer_ids = df['customer_ID']
    in_train = customer_ids.isin(train_ids)
    in_test = customer_ids.isin(test_ids)
    train, test = df.loc[in_train], df.loc[in_test]
    print(f'train: {train.shape}; test: {test.shape}')
    return train, test

In [7]:
def split_all_1(train_ids: List[str], test_ids: List[str]):
    """Split every ``full_<i>_1.parquet`` chunk (i = 1..8) into train and test files.

    Each chunk is read from ``../data/6-interaction/full/``, partitioned with
    ``train_test_split`` and written to
    ``../data/6-interaction/train/train_interact<i>.parquet`` and
    ``../data/6-interaction/test/test_interact<i>.parquet``.

    Args:
        train_ids: ``customer_ID`` values whose rows go to the train output.
        test_ids: ``customer_ID`` values whose rows go to the test output.
    """
    for i in range(1, 9):
        # load data
        read_path = f"../data/6-interaction/full/full_{i}_1.parquet"
        print('Splitting ', read_path)
        full = pd.read_parquet(read_path, engine='pyarrow')
        # split data & save
        train, test = train_test_split(full, train_ids, test_ids)
        train_path = f"../data/6-interaction/train/train_interact{i}.parquet"
        test_path = f"../data/6-interaction/test/test_interact{i}.parquet"
        train.to_parquet(train_path, index=False)
        test.to_parquet(test_path, index=False)
        # Release this chunk before the next one is read. Without this, the old
        # `full`, `train` and `test` frames stay referenced while the next chunk
        # is being loaded, which raises peak memory for no benefit.
        del full, train, test
        gc.collect()


In [8]:
def split_all_2(train_ids: List[str], test_ids: List[str]):
    """Split every ``full_<i>_2.parquet`` chunk (i = 1..8) into train and test files.

    Each chunk is read from ``../data/6-interaction/full/``, partitioned with
    ``train_test_split`` and written to
    ``../data/6-interaction/train/train_interact<i + 8>.parquet`` and
    ``../data/6-interaction/test/test_interact<i + 8>.parquet``.

    Args:
        train_ids: ``customer_ID`` values whose rows go to the train output.
        test_ids: ``customer_ID`` values whose rows go to the test output.
    """
    for i in range(1, 9):
        # load data
        read_path = f'../data/6-interaction/full/full_{i}_2.parquet'
        print('Splitting ', read_path)
        full = pd.read_parquet(read_path, engine='pyarrow')
        # split data & save
        # Output files are numbered 9..16 so they continue after the 1..8
        # files written by split_all_1 instead of overwriting them.
        out_index = i + 8
        train, test = train_test_split(full, train_ids, test_ids)
        train_path = f"../data/6-interaction/train/train_interact{out_index}.parquet"
        test_path = f"../data/6-interaction/test/test_interact{out_index}.parquet"
        train.to_parquet(train_path, index=False)
        test.to_parquet(test_path, index=False)
        # Release this chunk before the next one is read. Without this, the old
        # `full`, `train` and `test` frames stay referenced while the next chunk
        # is being loaded, which raises peak memory for no benefit.
        del full, train, test
        gc.collect()

In [9]:
# Split the eight full_<i>_1.parquet chunks into train/test parquet files (outputs 1-8).
split_all_1(train_ids, test_ids)

Splitting  ../data/6-interaction/full/full_1_1.parquet
train: (458913, 391); test: (924621, 391)
Splitting  ../data/6-interaction/full/full_2_1.parquet
train: (458913, 391); test: (924621, 391)
Splitting  ../data/6-interaction/full/full_3_1.parquet
train: (458913, 391); test: (924621, 391)
Splitting  ../data/6-interaction/full/full_4_1.parquet
train: (458913, 391); test: (924621, 391)
Splitting  ../data/6-interaction/full/full_5_1.parquet
train: (458913, 391); test: (924621, 391)
Splitting  ../data/6-interaction/full/full_6_1.parquet
train: (458913, 391); test: (924621, 391)
Splitting  ../data/6-interaction/full/full_7_1.parquet
train: (458913, 391); test: (924621, 391)
Splitting  ../data/6-interaction/full/full_8_1.parquet
train: (458913, 469); test: (924621, 469)


In [10]:
# Split the eight full_<i>_2.parquet chunks into train/test parquet files (outputs 9-16).
split_all_2(train_ids, test_ids)

Splitting  ../data/6-interaction/full/full_1_2.parquet
train: (458913, 391); test: (924621, 391)
Splitting  ../data/6-interaction/full/full_2_2.parquet
train: (458913, 391); test: (924621, 391)
Splitting  ../data/6-interaction/full/full_3_2.parquet
train: (458913, 391); test: (924621, 391)
Splitting  ../data/6-interaction/full/full_4_2.parquet
train: (458913, 391); test: (924621, 391)
Splitting  ../data/6-interaction/full/full_5_2.parquet
train: (458913, 391); test: (924621, 391)
Splitting  ../data/6-interaction/full/full_6_2.parquet
train: (458913, 391); test: (924621, 391)
Splitting  ../data/6-interaction/full/full_7_2.parquet
train: (458913, 391); test: (924621, 391)
Splitting  ../data/6-interaction/full/full_8_2.parquet
train: (458913, 469); test: (924621, 469)
