# 1. 라이브러리 호출

In [112]:
import numpy as np
import pandas as pd

# 2. 파일 경로 설정 및 불러오기

In [4]:
# Relative locations of the raw train/test CSV files.
train_path, test_path = './train.csv', './test.csv'

In [5]:
# Read both CSV files into DataFrames using pandas' default parsing options.
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [15]:
# Show the column labels of the training frame.
train_df.columns

Index(['Unnamed: 0', 'session', 'aid', 'ts', 'type'], dtype='object')

In [21]:
# Remove the 'Unnamed: 0' column that shows up in train_df.columns after loading.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# once the column is already gone.
train_df = train_df.drop(columns='Unnamed: 0', errors='ignore')

In [17]:
# Show the column labels of the test frame.
test_df.columns

Index(['Unnamed: 0', 'session', 'aid', 'ts', 'type'], dtype='object')

In [22]:
# Remove the 'Unnamed: 0' column that shows up in test_df.columns after loading.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# once the column is already gone.
test_df = test_df.drop(columns='Unnamed: 0', errors='ignore')

In [23]:
# Display the training frame (the rendering is truncated to the first and last rows).
train_df

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
10285556,199999,928064,1659336408967,clicks
10285557,199999,849970,1659336449078,clicks
10285558,199999,1052480,1659336547035,clicks
10285559,199999,487255,1659336561116,clicks


In [24]:
# Display the test frame (the rendering is truncated to the first and last rows).
test_df

Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000278,clicks
1,12899780,1142000,1661724000378,clicks
2,12899780,582732,1661724058352,clicks
3,12899780,973453,1661724109199,clicks
4,12899780,736515,1661724136868,clicks
...,...,...,...,...
928110,13099776,1159407,1661844072138,clicks
928111,13099776,546448,1661844142618,clicks
928112,13099777,468584,1661795832787,clicks
928113,13099778,926609,1661795832939,clicks


총 훈련 데이터: 10,285,561개 데이터 <br>
테스트 데이터: 928,115개 데이터 <br>

# 3. 데이터 확인

In [25]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric training columns.
train_df.describe()

Unnamed: 0,session,aid,ts
count,10285560.0,10285560.0,10285560.0
mean,98494.58,929110.2,1660293000000.0
std,57591.58,536480.6,756241500.0
min,0.0,1.0,1659305000000.0
25%,49073.0,467968.0,1659576000000.0
50%,98525.0,928571.0,1660198000000.0
75%,147504.0,1394671.0,1660935000000.0
max,199999.0,1855601.0,1661724000000.0


In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric test columns.
test_df.describe()

Unnamed: 0,session,aid,ts
count,928115.0,928115.0,928115.0
mean,12996560.0,926965.8,1661815000000.0
std,58238.97,538098.5,108703300.0
min,12899780.0,0.0,1661724000000.0
25%,12945450.0,462601.0,1661766000000.0
50%,12995470.0,926145.0,1661782000000.0
75%,13046680.0,1394228.0,1661795000000.0
max,13099780.0,1855600.0,1662329000000.0


In [29]:
# Count missing values per column of the training frame.
# DataFrame.sum() reduces column-wise explicitly; np.sum(DataFrame) instead
# forwards axis=None to DataFrame.sum, a form pandas 2.1 deprecated because it
# is slated to reduce over both axes and return a single scalar.
train_df.isnull().sum()

session    0
aid        0
ts         0
type       0
dtype: int64

In [30]:
# Count missing values per column of the test frame.
# DataFrame.sum() reduces column-wise explicitly; np.sum(DataFrame) instead
# forwards axis=None to DataFrame.sum, a form pandas 2.1 deprecated because it
# is slated to reduce over both axes and return a single scalar.
test_df.isnull().sum()

session    0
aid        0
ts         0
type       0
dtype: int64

결측치 없음

In [60]:
def show_type_percentage(df):
    """Print the percentage share of each event type in ``df``.

    Two things are printed:
      1. Whether the per-type counts of non-null ``aid`` values add up to the
         number of rows in ``df`` (a quick consistency check).
      2. A Series, indexed by ``type``, holding each type's share in percent.

    Args:
        df: DataFrame with at least the columns ``type`` and ``aid``.

    Returns:
        None. The results are only printed.
    """
    # Group once and reuse the counts for both the check and the percentages.
    per_type = df.groupby('type')['aid'].count()
    n_events = per_type.sum()
    print(n_events == len(df))
    print(per_type / n_events * 100)

In [61]:
# Share (in %) of each event type in the training data.
show_type_percentage(train_df)

True
type
carts      7.309130
clicks    90.617595
orders     2.073275
Name: aid, dtype: float64


In [62]:
# Share (in %) of each event type in the test data.
show_type_percentage(test_df)

True
type
carts      8.448199
clicks    90.271033
orders     1.280768
Name: aid, dtype: float64


평균적으로 clicks, carts, orders는 각각 약 90 : 7 ~ 8 : 1 ~ 2의 비율로 존재

# 4. 검증 데이터 분류

In [83]:
# Sample roughly 10% of the row labels, without replacement, as the validation set.
# A seeded Generator makes the split reproducible across kernel restarts; the
# original unseeded np.random.choice produced a different split on every run.
SEED = 42
rng = np.random.default_rng(SEED)

# Only `right` is used below; `left` is kept so both endpoints remain available.
left, right = train_df.index[0], train_df.index[-1]

# Labels are drawn from range(right + 1), so this assumes train_df still has
# its default 0..n-1 integer index.
valid_idx = rng.choice(right + 1, size=right // 10, replace=False)

In [86]:
# Sanity check: every sampled label is distinct, i.e. the number of unique
# values equals the requested sample size.
np.unique(valid_idx).size == right // 10

True

In [95]:
# Build the validation frame by looking up the sampled labels in train_df.
# valid_idx was drawn from range(right + 1), so this relies on train_df's index
# labels covering those integers.
valid_df = train_df.loc[valid_idx]
valid_df

Unnamed: 0,session,aid,ts,type
5101536,97782,939837,1660723049097,clicks
2494916,47535,1121585,1659372380851,clicks
429524,6443,1853043,1659412770416,clicks
9810053,190008,92535,1661018018143,clicks
6364037,121700,249395,1661182853108,carts
...,...,...,...,...
7626684,145702,307183,1661584573592,orders
9025798,173674,1408492,1660307125843,clicks
1547489,28405,153489,1659306569135,clicks
7119060,135934,467388,1659414367311,clicks


In [96]:
# Share (in %) of each event type in the validation data, for comparison with the training shares.
show_type_percentage(valid_df)

True
type
carts      7.365180
clicks    90.545289
orders     2.089531
Name: aid, dtype: float64


위에서 train 데이터와 매우 유사한 비율로 분류된 것을 확인할 수 있음

In [104]:
# Remove the validation rows (by index label) from the training frame so the
# two frames no longer share rows.
# NOTE(review): this mutates train_df in place, so the cell is order-dependent;
# re-running it after the index has been reset would presumably drop a
# different set of rows — confirm before re-executing out of order.
train_df.drop(valid_idx, axis=0, inplace=True)
train_df

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
10285556,199999,928064,1659336408967,clicks
10285557,199999,849970,1659336449078,clicks
10285558,199999,1052480,1659336547035,clicks
10285559,199999,487255,1659336561116,clicks


In [107]:
# Renumber the remaining training rows from 0; drop=True discards the old
# labels instead of keeping them as a column.
train_df.reset_index(drop=True, inplace=True)
train_df

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
9257000,199999,928064,1659336408967,clicks
9257001,199999,849970,1659336449078,clicks
9257002,199999,1052480,1659336547035,clicks
9257003,199999,487255,1659336561116,clicks


In [109]:
# Renumber the validation rows from 0; drop=True discards the sampled labels
# instead of keeping them as a column.
valid_df.reset_index(drop=True, inplace=True)
valid_df

Unnamed: 0,session,aid,ts,type
0,97782,939837,1660723049097,clicks
1,47535,1121585,1659372380851,clicks
2,6443,1853043,1659412770416,clicks
3,190008,92535,1661018018143,clicks
4,121700,249395,1661182853108,carts
...,...,...,...,...
1028551,145702,307183,1661584573592,orders
1028552,173674,1408492,1660307125843,clicks
1028553,28405,153489,1659306569135,clicks
1028554,135934,467388,1659414367311,clicks


# 5. 데이터 프레임 내보내기

In [110]:
# Write the train/validation split to CSV files in the working directory.
# index=False keeps the row index out of the files; without it the index is
# written as an unnamed first column, which comes back as 'Unnamed: 0' when the
# file is read with read_csv and then has to be dropped again.
# NOTE(review): any downstream reader that expects an 'Unnamed: 0' column in
# these files needs to be adjusted.
train_df.to_csv('my_train.csv', mode='w', index=False)
valid_df.to_csv('my_valid.csv', mode='w', index=False)