# [Module 1.0] 데이터 간략하게 확인

이 노트북은 주어진 세 가지 데이터 세트를 간략하게 확인합니다.
- 훈련 데이터 세트 샘플
- 전체 훈련 데이터 세트
- 테스트 데이터 세트
---


### 결론

- 훈련 데이터 세트 샘플은 100,000건으로 건수가 적어서 사용하기에 적합하지 않음
- 테스트 데이터 세트는 레이블 컬럼이 없어서 사용 불가능
- 전체 훈련 데이터를 사용하고, 필요한 만큼의 데이터를 샘플링해서 사용하는 것을 권장

## 데이터 폴더 위치 확인

In [1]:
import os
# Display the current working directory; the relative data_folder path used
# later in this notebook is resolved against it.
os.getcwd()

'/home/ec2-user/SageMaker/fraud-detector-workshop/code/phase0/prepare_data'

In [2]:
import pandas as pd
import os

# Directory holding the CSV files loaded below, relative to the working directory.
data_folder = '../../../data/AdTalking'

## 훈련 데이터 세트 샘플
- 주어진 샘플 데이터는 100,000건의 행과 8개의 컬럼으로 구성됨

In [3]:

# Load the sample of the training set and display it.
file = 'train_sample.csv'
file_path = os.path.join(data_folder, file)

# Parse click_time into datetime64 on load; attributed_time is not parsed here.
df = pd.read_csv(file_path, parse_dates=['click_time'])
df

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,87540,12,1,13,497,2017-11-07 09:30:38,,0
1,105560,25,1,17,259,2017-11-07 13:40:27,,0
2,101424,12,1,19,212,2017-11-07 18:05:24,,0
3,94584,13,1,13,477,2017-11-07 04:58:08,,0
4,68413,12,1,1,178,2017-11-09 09:00:09,,0
...,...,...,...,...,...,...,...,...
99995,124883,11,1,19,122,2017-11-09 13:25:41,,0
99996,85150,9,1,13,244,2017-11-07 11:25:43,,0
99997,18839,3,1,13,19,2017-11-08 11:38:42,,0
99998,114276,15,1,12,245,2017-11-08 17:55:21,,0


In [4]:
# Summarize column dtypes, non-null counts and memory usage of the sample frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   ip               100000 non-null  int64         
 1   app              100000 non-null  int64         
 2   device           100000 non-null  int64         
 3   os               100000 non-null  int64         
 4   channel          100000 non-null  int64         
 5   click_time       100000 non-null  datetime64[ns]
 6   attributed_time  227 non-null     object        
 7   is_attributed    100000 non-null  int64         
dtypes: datetime64[ns](1), int64(6), object(1)
memory usage: 6.1+ MB


### 레이블 분포와 Fraud 비율 확인

In [5]:
# Count rows per label value of is_attributed (0 vs 1).
df.is_attributed.value_counts()

0    99773
1      227
Name: is_attributed, dtype: int64

In [6]:
# Share of rows labelled 1. For a 0/1 column the mean equals count(1) / len(df),
# and unlike value_counts()[1] it does not raise KeyError when no row is labelled 1.
df.is_attributed.mean()

0.00227

## 전체 훈련 데이터 세트
- 데이터 로딩에 약 2분 걸림
- 총 행의 개수: 184,903,890
- Fraud 비율은 약 0.25% 정도임

In [7]:
%%time
# Load the full training set and display it.
file = 'train.csv'
file_path = os.path.join(data_folder, file)

# Same read options as the sample load: only click_time is parsed as datetime.
df_t = pd.read_csv(file_path, parse_dates=['click_time'])
df_t

CPU times: user 1min 26s, sys: 17.6 s, total: 1min 43s
Wall time: 1min 43s


Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,,0
1,17357,3,1,19,379,2017-11-06 14:33:34,,0
2,35810,3,1,13,379,2017-11-06 14:34:12,,0
3,45745,14,1,13,478,2017-11-06 14:34:52,,0
4,161007,3,1,13,379,2017-11-06 14:35:08,,0
...,...,...,...,...,...,...,...,...
184903885,121312,12,1,10,340,2017-11-09 16:00:00,,0
184903886,46894,3,1,19,211,2017-11-09 16:00:00,,0
184903887,320126,1,1,13,274,2017-11-09 16:00:00,,0
184903888,189286,12,1,37,259,2017-11-09 16:00:00,,0


In [8]:
# Summarize column dtypes and memory usage of the full training frame.
df_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184903890 entries, 0 to 184903889
Data columns (total 8 columns):
 #   Column           Dtype         
---  ------           -----         
 0   ip               int64         
 1   app              int64         
 2   device           int64         
 3   os               int64         
 4   channel          int64         
 5   click_time       datetime64[ns]
 6   attributed_time  object        
 7   is_attributed    int64         
dtypes: datetime64[ns](1), int64(6), object(1)
memory usage: 11.0+ GB


In [9]:
# Count rows per label value of is_attributed (0 vs 1) in the full training frame.
df_t.is_attributed.value_counts()

0    184447044
1       456846
Name: is_attributed, dtype: int64

In [10]:
# Share of rows labelled 1. For a 0/1 column the mean equals count(1) / len(df_t),
# and unlike value_counts()[1] it does not raise KeyError when no row is labelled 1.
df_t.is_attributed.mean()

0.002470721410998979

## 테스트 데이터 세트
- 데이터 로딩에 약 1분 걸림
- 총 행의 개수: 18,790,469
- 레이블 컬럼(is_attributed)이 없어서 Fraud 비율은 확인할 수 없음

In [11]:
# Load the test set and display it.
file = 'test.csv'
file_path = os.path.join(data_folder, file)

# Parse click_time on load, as the training loads do, so the column has the same
# datetime64 dtype in every frame instead of staying as object strings.
df_test = pd.read_csv(file_path, parse_dates=['click_time'])
df_test

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,5744,9,1,3,107,2017-11-10 04:00:00
1,1,119901,9,1,3,466,2017-11-10 04:00:00
2,2,72287,21,1,19,128,2017-11-10 04:00:00
3,3,78477,15,1,13,111,2017-11-10 04:00:00
4,4,123080,12,1,13,328,2017-11-10 04:00:00
...,...,...,...,...,...,...,...
18790464,18790464,99442,9,1,13,127,2017-11-10 15:00:00
18790465,18790465,88046,23,1,37,153,2017-11-10 15:00:00
18790466,18790467,81398,18,1,17,265,2017-11-10 15:00:00
18790467,18790466,123236,27,1,13,122,2017-11-10 15:00:00
